diff options
author | Urban Müller | 2007-07-26 13:02:24 +0000 |
---|---|---|
committer | Urban Müller | 2007-07-26 13:02:24 +0000 |
commit | 806a5297e7e99d455b97a4f0acaba2f40f470584 (patch) | |
tree | b9fc43ef227da87d873cf3676c08c49fa0dc1240 /url.class | |
parent | c3cba034c8009b65c25dd4ef5f54b18d9c8ee7d4 (diff) | |
download | itools-806a5297e7e99d455b97a4f0acaba2f40f470584.tar.gz itools-806a5297e7e99d455b97a4f0acaba2f40f470584.tar.bz2 itools-806a5297e7e99d455b97a4f0acaba2f40f470584.zip |
renamed files for autoloader
Diffstat (limited to 'url.class')
-rw-r--r-- | url.class | 580 |
1 files changed, 0 insertions, 580 deletions
<?php
/*
**	$Id$
**
**	ITools - the Internet Tools Library
**
**	Copyright (C) 1995-2007 by the ITools Authors.
**	This program is free software; you can redistribute it and/or
**	modify it under the terms of either the GNU General Public License
**	or the GNU Lesser General Public License, as published by the Free
**	Software Foundation. See http://www.gnu.org/licenses/ for details.
**
**	url.class - URL parsing, retrieval and caching functions
*/

/**
 * Represents one URL: canonicalizes it on construction and offers helpers to
 * fetch it (via wget or a raw socket), to cache fetched content on disk and
 * to build absolute / redirect / query-string URLs.
 *
 * Several helpers depend on functions defined elsewhere in ITools
 * (T_lang(), EDC(), it::error(), it::replace(), it_htmlentities_decode()).
 */
class it_url
{
    /* Examples below are for HTTP://falcon:joshua@www.Relog.CH.:80/default.asp */
    public $url;            /* E.g. http://www.relog.ch/ */
    public $protocol;       /* E.g. http */
    public $hostname;       /* E.g. relog.ch */
    public $realhostname;   /* E.g. www.relog.ch */
    public $port;           /* E.g. 80 */
    public $path;           /* E.g. / */
    public $rawurl;         /* E.g. HTTP://falcon:joshua@www.Relog.CH.:80/default.asp */
    public $user;           /* E.g. falcon */
    public $pass;           /* E.g. joshua */

    public $page;           /* Page or empty */
    public $page_read;      /* true if page read */
    public $title;          /* Page title or empty */
    public $description;    /* Page description or empty */

    public $headers;        /* Headers of page fetched by get() */
    public $data;           /* Data part, even if return code is not 200 */
    public $result;         /* Return code of get() */
    public $redir = 0;      /* Redirect count */


    /**
     * Constructor: canonicalize an URL.
     * Delegates to it_url() so that the object can be re-initialized later
     * (get() does this when it follows a redirect).
     * @param $url URL this object represents
     * @param $options Optional array; key 'encoding' names the charset of $url (only used for IDN host names)
     */
    public function __construct($url = null, $options = array())
    {
        $this->it_url($url, $options);
    }

    /**
     * Canonicalize an URL and fill in protocol, user, pass, host names, port, path and url.
     * Kept under its historic name because get() calls it directly on redirects.
     * @param $url URL this object represents
     * @param $options Optional array; key 'encoding' names the charset of $url (only used for IDN host names)
     */
    public function it_url($url, $options = array())
    {
        $this->rawurl = $url;
        $url = (string)$url;

        if (preg_match('#^([a-z]+):/+(?:([^:]*):([^@]*)@)?(.*)$#i', $url, $regs))
        {
            $this->protocol = strtolower($regs[1]);
            $this->user = $regs[2];
            $this->pass = $regs[3];
            $url = $regs[4];
        }
        else if (preg_match('#^(?:[a-z]:|/)#', $url))   # drive letter or absolute path
            $this->protocol = 'file';
        else
            $this->protocol = 'http';

        /* Default port, 0 for protocols we know no default for */
        $protoport = 0;
        if ($this->protocol == 'http')
            $protoport = 80;
        else if ($this->protocol == 'https')
            $protoport = 443;

        $this->port = $protoport;

        /* Without Net_IDNA only plain ASCII host names are recognized */
        $idn = class_exists('Net_IDNA') ? Net_IDNA::getInstance() : null;
        $pattern = $idn ? '#^([^/]+)/*(.*)$#isD' : '#^([a-z0-9_:.-]+)/*(.*)$#isD';

        if (preg_match($pattern, $url, $regs))
        {
            $hostport = explode(':', $regs[1]);
            $port = isset($hostport[1]) ? $hostport[1] : '';

            # Drop the trailing dot of fully qualified names so that www.Relog.CH. becomes www.relog.ch
            $this->realhostname = rtrim(strtolower($hostport[0]), '.');

            if ($port)
                $this->port = intval($port);

            $url = $regs[2];
        }

        $this->hostname = preg_replace('/^www\./', '', (string)$this->realhostname);

        # Get rid of common index file names
        $url = preg_replace('#(^|/)(index\.[ps]?html?|index\.php[34]?|default\.aspx?)$#', '', $url);

        $this->path = ($url === '/') ? '' : $url;

        if ($this->port != $protoport)
            $this->url = "$this->protocol://$this->realhostname:$this->port/$this->path";
        else
            $this->url = "$this->protocol://$this->realhostname/$this->path";

        if ($idn)
        {
            $realhostname = $this->realhostname;
            $encoding = isset($options['encoding']) ? $options['encoding'] : '';

            if (!preg_match('/^utf-?8$/i', $encoding))
                $realhostname = utf8_encode($realhostname);

            $encoded = $idn->encode($realhostname);

            if ($encoded != $realhostname)
                $this->realhostname = $encoded;
        }
    }


    /**
     * Build the wget command line used by read_page().
     * @param $url URL to fetch; passed as one shell-quoted argument after "--" so it can never be taken for an option
     * @param $timeout Timeout in seconds handed to wget -T
     * @param $verbosity Either '-q' (page only) or '-v' (include diagnostics, used to find redirects)
     * @return Shell command string
     */
    public static function _wget_command($url, $timeout, $verbosity)
    {
        return 'LANG=C wget 2>&1 -T ' . ((int)$timeout) . ' ' . $verbosity
            . ' -U "Mozilla/4.0 (Compatible; Relog ITools)" -O - -- ' . escapeshellarg((string)$url);
    }


    /**
     * Read the page into memory using wget, extract title and description and
     * set $this->page, $this->title and $this->description
     * @param $timeout Timeout for operation, defaults to unlimited (0)
     * @return True if page has been read and $this->page is set; 0 if the host name looks bogus
     */
    public function read_page($timeout = 0)
    {
        $this->page = null;
        $this->title = null;
        $this->description = null;

        /*
        ** If the host name does not end in a dot followed by at least one letter,
        ** it is considered bogus. This prevents 'localhost', 'www', and numerical IP addresses.
        */
        if (!preg_match('/\.[a-z]+$/iD', (string)$this->realhostname))
            return 0;

        $url = $this->rawurl;
        $tries = 0;

        while ($this->page == '')
        {
            $this->page = shell_exec(self::_wget_command($url, $timeout, '-q'));

            if ($this->page == '')  /* An error occurred. Find out what it was. */
            {
                $error = (string)shell_exec(self::_wget_command($url, $timeout, '-v'));

                if (preg_match('/Location: (\S*)/i', $error, $regs))    /* Redirect ? */
                {
                    $url = $regs[1];
                    if (!preg_match('/^[a-z]+:/i', $url))   /* Grok local redirects (in violation of RFC) */
                        $url = $this->rawurl . '/' . $url;
                }
                else
                    break;
            }

            if (++$tries > 4)   /* Avoid infinite redirect loops */
                break;
        }

        $this->page_read = 1;

        if (preg_match('#<title>([^<]*)</title>#i', (string)$this->page, $regs))
            $this->title = it_htmlentities_decode($regs[1]);

        if (preg_match('#<meta name="description"[^>]+content="([^"]*)">#i', (string)$this->page, $regs))
            $this->description = it_htmlentities_decode($regs[1]);

        return ($this->page != '');
    }


    /* Return the description of this page, reading the page first if necessary */
    public function get_description()
    {
        if (!$this->page_read)
            $this->read_page();

        return $this->description;
    }


    /* Return the title of this page, reading the page first if necessary */
    public function get_title()
    {
        if (!$this->page_read)
            $this->read_page();

        return $this->title;
    }

    /**
     * Check if a given url (currently plain http only, no TLS) can be fetched
     * Note: Redirects are treated as successful
     * @param $timeout Timeout for connection in seconds
     * @return true if the server answered with a 2xx or 3xx status line
     */
    public function is_reachable($timeout = 5)
    {
        $result = false;

        if ($fp = @fsockopen($this->realhostname, $this->port, $errno, $errstr, $timeout))
        {
            fputs($fp, "GET /$this->path HTTP/1.0\r\nHost: $this->realhostname\r\nUser-Agent: ITools\r\n\r\n");
            $line = (string)fgets($fp, 128);
            fclose($fp);

            # Status line looks like "HTTP/1.1 200 OK"; compare protocol name case-insensitively
            $result = preg_match('#^' . preg_quote($this->protocol, '#') . '/[^ ]+ +[23]#i', $line) === 1;
        }

        return $result;
    }

    /**
     * Get simple URL with timeout. Only the http protocol is handled; anything else yields false.
     * @param $p url string or parameter array with the following keys
     *   - url: url to get, defaults to constructor URL
     *   - timeout: timeout per read in milliseconds, defaults to 5000
     *   - data: post data array with key-value pairs (request becomes a POST if non-empty)
     *   - headers: array of additional request headers, taking precedence over the defaults
     * @param $timeout timeout in milliseconds used if $p carries none
     * @return contents of resulting page, considering redirects, excluding headers, or false on error
     */
    public function get($p = null, $timeout = 5000)
    {
        if (!is_array($p))
            $p = array('url' => $p);

        if (!isset($p['timeout']))
            $p['timeout'] = $timeout;

        # Without an url we operate on ourselves so that callers can inspect ->result, ->data and ->headers
        $url = !empty($p['url']) ? new it_url($p['url']) : $this;

        $url->result = $result = false;
        $url->data = null;
        $url->headers = array();

        if ($url->protocol != 'http')
            return $result;

        if (!($fp = @fsockopen($url->realhostname, $url->port, $errno, $errstr, $p['timeout'] / 1000)))
            return $result;

        # urlencode data pairs if is array
        $data = (isset($p['data']) && is_array($p['data'])) ? self::params($p['data']) : '';

        $p['headers'] = (isset($p['headers']) ? (array)$p['headers'] : array()) + array(
            'Host' => $url->realhostname,
            'User-Agent' => "Mozilla/4.0 (compatible; MSIE 6.0; ITools)",
            'Accept-Language' => T_lang(),
        );

        if ($datalen = strlen($data))
        {
            $method = "POST";
            $p['headers'] += array(
                'Content-Type' => "application/x-www-form-urlencoded",
                'Content-Length' => $datalen,
            );
        }
        else
            $method = "GET";

        if ($url->user || $url->pass)
            $p['headers'] += array('Authorization' => 'Basic ' . base64_encode($url->user . ':' . $url->pass));

        $headers = '';
        foreach ($p['headers'] as $header => $value)
            $headers .= "$header: $value\r\n";

        stream_set_timeout($fp, intval($p['timeout'] / 1000), ($p['timeout'] % 1000) * 1000);
        @fputs($fp, "$method /$url->path HTTP/1.0\r\n$headers\r\n$data");

        # Header section ends at the first empty line
        while (!feof($fp) && ($line = @fgets($fp, 10240)) && ($line = trim($line)))
        {
            if (preg_match('#^(HTTP\S+)\s(\d+)#', $line, $parts))  # Parse result code
                $url->headers[$parts[1]] = $url->result = $parts[2];
            elseif (preg_match('#^Location: (https?://[^/]*)?(/)?(.*)$#i', $line, $parts) && ($parts[1] != $url->url))  # Handle redirects (supports relative and global)
            {
                unset($p['url'], $p['headers']['Host']);

                if ($parts[1])          # absolute URL
                    $target = $parts[1] . $parts[2] . $parts[3];
                elseif ($parts[2])      # path relative to server root
                    $target = $url->protocol . '://' . $url->realhostname . $parts[2] . $parts[3];
                else                    # path relative to current directory
                    $target = $url->protocol . '://' . $url->realhostname . '/' . dirname($url->path) . '/' . $parts[3];

                $url->it_url($target);

                if (++$url->redir <= 4) /* Avoid infinite redirects */
                {
                    @fclose($fp);   # do not leak the connection while following the redirect
                    return $url->get($p);
                }
            }
            elseif (preg_match('#^([^:]+): (.*)$#', $line, $parts))
                $url->headers[$parts[1]] = $parts[2];
        }

        if ($url->result)
        {
            # Bogus HTTP/1.1 chunked answer from server (e.g. Wordpress/Apach2/PHP5)
            if (isset($url->headers['Transfer-Encoding']) && ($url->headers['Transfer-Encoding'] == "chunked"))
                $url->data .= self::_read_chunked($fp);
            else
                $url->data .= self::_read_rest($fp);

            if ($url->result < 400)
                $result = $url->data;
        }

        @fclose($fp);

        return $result;
    }

    /**
     * Read everything up to end of stream, giving up if a read times out.
     * @param $fp Open stream positioned at start of body
     * @return Data read
     */
    public static function _read_rest($fp)
    {
        $body = '';

        while (!feof($fp))
        {
            $block = @fread($fp, 20480);

            if ($block === false)
                break;

            if ($block === '')
            {
                # An empty read without EOF means the stream timeout hit; stop instead of spinning
                $meta = stream_get_meta_data($fp);
                if (!empty($meta['timed_out']))
                    break;
            }

            $body .= $block;
        }

        return $body;
    }

    /**
     * Decode a body sent with Transfer-Encoding: chunked.
     * Each chunk is "<hex size>[;extension]CRLF<data>CRLF"; a size of 0 ends the body.
     * @param $fp Open stream positioned at first chunk size line
     * @return Concatenated chunk data
     */
    public static function _read_chunked($fp)
    {
        $body = '';

        while (!feof($fp) && (($line = fgets($fp)) !== false))
        {
            if (!preg_match('/^\s*([0-9a-f]+)/i', $line, $parts))
                break;  # not a chunk size line, stream is out of sync

            if (!($len = hexdec($parts[1])))
                break;  # terminating zero-size chunk

            $chunk = "";

            while (!feof($fp) && (strlen($chunk) < $len))
            {
                $block = @fread($fp, $len - strlen($chunk));

                if (($block === false) || ($block === ''))
                    break;  # read error or timeout

                $chunk .= $block;
            }

            $body .= $chunk;

            if (strlen($chunk) < $len)
                break;  # truncated chunk, nothing sensible follows

            fgets($fp); # consume the CRLF terminating the chunk data, otherwise it is mistaken for a zero size
        }

        return $body;
    }

    /**
     * Fetch an URL with a fresh object; used by get_cache() which has no instance.
     * @param $url url to get
     * @param $timeout timeout in milliseconds
     * @return Same as get()
     */
    public static function _fetch($url, $timeout)
    {
        $fetcher = new it_url($url);

        return $fetcher->get(null, $timeout);
    }


    /**
     * Construct a local file name to cache an URL. Named args:
     * @url remote url to get
     * @cachedir path to cache directory, defaults to $GLOBALS['ULTRAHOME']/var/urlcache
     * @return cachedir/xx/md5 where md5 is the hash of current language plus url and xx its first two characters
     */
    public static function get_cache_filename($p)
    {
        if (!is_array($p))
            $p = array('url' => $p);

        $home = isset($GLOBALS['ULTRAHOME']) ? $GLOBALS['ULTRAHOME'] : '';
        $p += array('url' => null, 'cachedir' => $home . "/var/urlcache");
        $filename = md5(T_lang() . $p['url']);

        return $p['cachedir'] . "/" . substr($filename, 0, 2) . "/$filename";
    }



    /**
     * Store contents of url in a file and return file name. Provides locking. Call statically.
     * Requires www writeable var/urlcache in your service dir. Params in assoc array:
     * @p['url'] url to get
     * @p['timeout'] timeout in milliseconds, default 10000
     * @p['maxage'] maximum age of cache entries in seconds, default 86400
     * @p['cleanbefore'] maximum daytime when attempting cleanup, default 7200
     * @p['preprocess'] callback function (or array for methods) to change received file or array('function' => ..., 'in' => $src, 'out' => $dst, ...) with callback function plus args
     * @p['safety'] value 0 means dont generate alert, value 1 means generate alerts on timeouts and failures
     * @p['keepfailed'] keep old versions of files if download fails (sending alerts conservatively)
     * @p['cachedir'] directory to store cache files in. NO TRAILING SLASH
     * @p['it_error'] parameters for it_error
     * @return name of file holding the (possibly preprocessed) content, or a false value on failure
     */
    public static function get_cache($p = array())
    {
        $home = isset($GLOBALS['ULTRAHOME']) ? $GLOBALS['ULTRAHOME'] : '';
        $p += array(
            'url' => null,
            'timeout' => 10000,
            'maxage' => 86400,
            'cleanbefore' => 7200,
            'safety' => 1,
            'keepfailed' => false,
            'preprocess' => null,
            'cachedir' => $home . "/var/urlcache",
            'it_error' => array(),
        );

        $path = self::get_cache_filename($p);
        @mkdir(dirname($path));
        $age = time() - @filemtime($path);

        # expire forgotten locks
        $lockmtime = @filemtime("$path.lock");
        if ($lockmtime && time() - $lockmtime > 30)
            @unlink("$path.lock");

        if ($p['keepfailed'] && ($age > $p['maxage']) && ($dummy = @fopen("$path.lock", EDC('nocache') ? "w" : "x")))   # update our copy if we get the lock
        {
            # my job to refresh the cache entry
            fclose($dummy);

            # Touch existing file to prevent locking other getters
            if (file_exists($path))
                touch($path);

            EDC('getcache', "refresh", $p['url'], $path);
            if (($result = self::_fetch($p['url'], $p['timeout'])))
                self::_atomicwrite($path, $result);
            else
                touch($path);

            if ($p['safety'] == 1 && !$result)
            {
                $parts = @parse_url($p['url']);
                it::error($p['it_error'] + array('title' => "get_cache: download failures on {$p['url']}", 'id' => $parts['host']));    # send err only if multi failure
            }

            @unlink("$path.lock");
        }

        # Remove ancient lock or cached file if it is too old
        if (!$p['keepfailed'])
            self::_expire($path, $p['maxage']);

        if ($dummy = @fopen($path, EDC('nocache') ? "w" : "x")) # succeeds if file is missing (or we are in nocache mode)
        {
            # fill cache myself
            fclose($dummy);
            EDC('getcache', "new", $p['url'], $path);
            $result = self::_atomicwrite($path, self::_fetch($p['url'], $p['timeout']));
        }
        else
        {
            # get file from cache, potentially waiting if file is currently being transferred
            EDC('getcache', "old", $p['url'], $path);
            $result = self::_waitforpath($p + array('path' => $path));
        }

        if ($result && $p['preprocess'])
        {
            # Processed copy lives next to the source, suffixed with a short hash of the preprocess spec
            $srcpath = $path;
            $path .= substr(md5(serialize($p['preprocess'])), 0, 2);
            self::_expire($path, $p['maxage']);

            if ($dummy = @fopen($path, EDC('nocache') ? "w" : "x")) # in nocache mode, always succeed
            {
                fclose($dummy);
                EDC('getcache', "process", $p['url'], $path);
                $dstpath = "$path.preprocesstmp";

                if (is_array($p['preprocess']) && !empty($p['preprocess']['function']))  # Needs is_array as it can be a string where dereferencing gives first character!
                    call_user_func($p['preprocess']['function'], array('in' => $srcpath, 'out' => $dstpath) + $p['preprocess']);
                else
                    call_user_func($p['preprocess'], $srcpath, $dstpath);

                if (!@filesize($dstpath) || !@rename($dstpath, $path))
                {
                    @unlink($dstpath);
                    @unlink($path);
                    $result = false;
                }
                else
                    $result = $path;
            }
            else
                $result = self::_waitforpath($p + array('path' => $path));

            if ($result)
            {
                EDC('getcache', "processold", $p['url'], $path);
                touch($result, @filemtime($srcpath));   # Ensure processed is never newer than src
            }
        }

        # cache cleanup at night
        if ((time() % 86400 < $p['cleanbefore']) && (time() - @filemtime($p['cachedir'] . "/cleaned") > 80000))
        {
            touch($p['cachedir'] . "/cleaned");
            $maxagemin = intval($p['maxage'] / 60);

            # Quote both the directory and the whole inner command so odd characters in cachedir cannot break out
            $cleanup = 'cd ' . escapeshellarg($p['cachedir']) . " && sleep 10 && find -mmin +$maxagemin -print0 | xargs -0 -r rm";
            exec('nohup bash -c ' . escapeshellarg($cleanup) . ' </dev/null >/dev/null 2>&1 &');
        }

        return $result;
    }

    /**
     * Wait until a cache file has content.
     * @p['path'] file to wait for
     * @p['timeout'] maximum total wait in milliseconds
     * @p['sleeptime'] milliseconds to sleep between checks, default 100
     * @p['safety'] value 1 means report a timeout via it::error
     * @p['url'] url, only used in the error message
     * @return path if the file became non-empty in time, otherwise null
     */
    public static function _waitforpath($p)
    {
        $p += array('sleeptime' => 100, 'safety' => null, 'url' => null);   # millisecs to wait
        $result = null;

        # wait until cache is ready, then read from cache
        for ($maxpasses = $p['timeout'] / $p['sleeptime'], $passes = 0; (@filesize($p['path']) <= 0) && ($passes < $maxpasses); ++$passes)
        {
            usleep($p['sleeptime'] * 1000);
            clearstatcache();
        }

        if ($passes < $maxpasses)
            $result = $p['path'];
        else if ($p['safety'] == 1)
            it::error("timeout in it_url::get_cache(): url={$p['url']}, passes=$passes, maxpasses=$maxpasses, path={$p['path']}");

        return $result;
    }

    /**
     * Write data to a file by writing a temporary file in the same directory and renaming it.
     * @param $path destination file name
     * @param $data content to write; false removes $path instead
     * @return $path if the file was written, otherwise null
     */
    public static function _atomicwrite($path, $data)
    {
        $result = null;

        if ($data !== false)
        {
            $tmpname = tempnam(dirname($path), "writetmp");

            if ($tmpname !== false)
            {
                if (file_put_contents($tmpname, (string)$data) !== false)
                {
                    chmod($tmpname, 0664);
                    if (rename($tmpname, $path))
                        $result = $path;
                }

                if ($result === null)
                    @unlink($tmpname);  # do not leave temporary files behind on failure
            }
        }
        else
            @unlink($path);

        return $result;
    }

    /**
     * Remove ancient lock or cached file if it is too old:
     * empty files after 30 seconds, others after $maxage seconds.
     * @param $path file to check
     * @param $maxage maximum age in seconds
     */
    public static function _expire($path, $maxage)
    {
        if (!file_exists($path))
            return;

        $age = time() - @filemtime($path);
        $stale_placeholder = (@filesize($path) == 0) && ($age > 30);

        if ($stale_placeholder || ($age > $maxage))
        {
            EDC('getcache', "expire", $path);
            @unlink($path);
        }
    }

    /**
     * Make an URL absolute by using host and protocol from current Apache request (but not port number)
     * @param $url Optional URL ( foo.html, /foo.html, //host/bar.html, http://host/bar.html ), default self
     * @return absolute version of URL ( http[s]://host/bar.html )
     */
    public static function absolute($url = null)
    {
        if (!isset($url))
            $url = $_SERVER['PHP_SELF'];

        if (!preg_match('#^http#', $url))
        {
            if (strpos($url, '//') === false)
            {
                $dir = preg_replace('#/[^/]*$#D', '/', $_SERVER['PHP_SELF']);
                $url = (substr($url, 0, 1) == '/') ? $url : "$dir$url";
                $url = "//" . $_SERVER['HTTP_HOST'] . $url;
            }

            # Some servers set HTTPS=off for plain requests, so presence alone is not enough
            $secure = !empty($_SERVER['HTTPS']) && (strtolower($_SERVER['HTTPS']) != 'off');
            $url = "http" . ($secure ? 's' : '') . ":$url";
        }

        return $url;
    }

    /**
     * Craft a valid redirect URL, send Location: header and terminate execution
     * @param $url Optional URL ( foo.html, /foo.html, //host/bar.html, http://host/bar.html ), default self
     * @return This method never returns.
     */
    public static function redirect($url = null)
    {
        $target = self::absolute($url);

        if (EDC('noredir'))
            echo "<a href='" . htmlspecialchars($target) . "'>" . htmlspecialchars((string)$url) . "</a><br />";
        else
            header('Location: ' . preg_replace("/[\r\n].*/", '', $target)); # Security: cut after CR/LF

        exit;
    }

    /**
     * Urlencode but leave comma and parentheses readable
     * @param $str String to encode
     * @return Encoded string
     */
    public static function encode($str)
    {
        return strtr(urlencode((string)$str), array("%2C" => ",", "%28" => "(", "%29" => ")"));
    }

    /**
     * Create GET request from params, optionally only using given fields
     * @param $params Array to take values from, usually $_GET
     * @param $keys Keys to use; default: all
     * @return Query string without leading question mark; empty values are left out
     */
    public static function params($params, $keys = null)
    {
        return join("&", self::_params($params, $keys));
    }

    /**
     * Worker for params(): build the list of key=value pairs, recursing into nested arrays.
     * @param $params Array to take values from
     * @param $keys Keys to use; default: all
     * @return Array of encoded "key=value" strings
     */
    public static function _params($params, $keys = null)
    {
        $result = array();

        if (!isset($keys))
            $keys = array_keys($params);

        foreach ($keys as $key)
        {
            if (!isset($params[$key]))
                continue;   # requested key missing or null: nothing to emit

            if (is_array($params[$key]))
            {
                # Prefix the leading name of every nested pair: a=1 under key k becomes k[a]=1
                foreach (self::_params($params[$key]) as $value)
                {
                    if (strlen($value))
                        $result[] = it::replace(array('^([^=\[]*)' => $key . '[$1]'), $value);
                }
            }
            else if (strlen((string)$params[$key]))
                $result[] = urlencode($key) . "=" . self::encode($params[$key]);
        }

        return $result;
    }
}