diff options
Diffstat (limited to 'url.class')
-rw-r--r-- | url.class | 484 |
1 files changed, 484 insertions, 0 deletions
<?php
/*
**	$Id$
**
**	ITools - the Internet Tools Library
**
**	Copyright (C) 1995-2003 by the ITools Authors.
**	This program is free software; you can redistribute it and/or
**	modify it under the terms of either the GNU General Public License
**	or the GNU Lesser General Public License, as published by the Free
**	Software Foundation. See http://www.gnu.org/licenses/ for details.
**
**	url.class - Create an URL object and canonize it
*/

/**
 * Canonicalized URL plus helpers to fetch it (raw socket or wget), to cache
 * fetched pages on disk and to build absolute / redirect URLs.
 *
 * Relies on names defined outside this file: EDC(), it::error(),
 * it_htmlentities_decode() and, optionally, the Net_IDNA class.
 */
class it_url
{
	/* Examples below are for the input HTTP://www.Relog.CH:80/default.asp */
	public $url;			/* Canonical URL, e.g. http://www.relog.ch/ */
	public $protocol;		/* E.g. http */
	public $hostname;		/* Host without leading "www.", e.g. relog.ch */
	public $realhostname;	/* Lowercased host, e.g. www.relog.ch (a trailing dot is NOT stripped) */
	public $port;			/* E.g. 80 */
	public $path;			/* Path without leading slash and without index file, e.g. empty */
	public $rawurl;			/* URL exactly as passed in */

	public $page;			/* Page or empty */
	public $page_read;		/* true if page read */
	public $title;			/* Page title or empty */
	public $description;	/* Page description or empty */

	public $headers;		/* Headers of page fetched by get() */
	public $data;			/* Data part, even if return code is not 200 */
	public $result;			/* Return code of get() */


/**
 * Constructor: canonicalize an URL
 * @param $url URL this object represents
 * @param $options Optional assoc array; 'encoding' names the charset of $url (only used for IDN hosts)
 */
public function __construct($url, $options = array())
{
	$this->it_url($url, $options);
}


/**
 * (Re-)initialize this object from an URL. Formerly the PHP 4 constructor; kept as
 * a regular method because get() uses it to re-canonicalize after a redirect.
 * @param $url URL this object represents
 * @param $options Optional assoc array; 'encoding' names the charset of $url (only used for IDN hosts)
 */
public function it_url($url, $options = array())
{
	$this->rawurl = $url;
	$url = (string)$url;

	# Modifiers: D = "$" only matches at the very end, s = "." also matches newline (POSIX ereg semantics)
	if (preg_match('#^([a-z]+):/+(.*)$#isD', $url, $regs))
	{
		$this->protocol = strtolower($regs[1]);
		$url = $regs[2];
	}
	else if (preg_match('#^(?:[a-z]:|/)#', $url))
		$this->protocol = 'file';
	else
		$this->protocol = 'http';

	/* Default port; stays null for protocols other than http and https */
	$defaultports = array('http' => 80, 'https' => 443);
	$protoport = isset($defaultports[$this->protocol]) ? $defaultports[$this->protocol] : null;
	$this->port = $protoport;

	$idn = class_exists('Net_IDNA') ? Net_IDNA::getInstance() : null;

	# With IDN support any non-slash character may be part of the host name
	$pattern = $idn ? '#^([^/]+)/*(.*)$#isD' : '#^([a-z0-9_:.-]+)/*(.*)$#isD';

	if (preg_match($pattern, $url, $regs))
	{
		$hostport = explode(':', $regs[1]);
		$this->realhostname = strtolower($hostport[0]);

		if (!empty($hostport[1]))
			$this->port = $hostport[1];

		$url = $regs[2];
	}

	if (preg_match('#^www\.(.*)$#sD', (string)$this->realhostname, $regs))
		$this->hostname = $regs[1];
	else
		$this->hostname = $this->realhostname;

	# Strip a trailing index file (together with the slash in front of it)
	$index_files = array('index.html', 'index.htm', 'index.phtml', 'index.shtml', 'index.php3', 'index.php', 'default.asp');

	foreach ($index_files as $index_file)
		$url = preg_replace('#(?:^|/)' . preg_quote($index_file, '#') . '$#iD', '', $url);

	$this->path = ($url === '/') ? '' : $url;

	# Loose comparison on purpose: the port may be the string "80" taken from the URL
	if ($this->port != $protoport)
		$this->url = "$this->protocol://$this->realhostname:$this->port/$this->path";
	else
		$this->url = "$this->protocol://$this->realhostname/$this->path";

	if ($idn)
	{
		$realhostname = $this->realhostname;
		$encoding = isset($options['encoding']) ? $options['encoding'] : '';

		if (!preg_match('/^utf-?8$/i', $encoding))
			$realhostname = utf8_encode($realhostname);	# NOTE(review): utf8_encode() is deprecated as of PHP 8.2

		$encoded = $idn->encode($realhostname);

		if ($encoded != $realhostname)
			$this->realhostname = $encoded;
	}
}


/**
 * Read the page into memory, extract title and description and
 * set $this->page, $this->title and $this->description
 * @param $timeout Timeout for operation, defaults to unlimited (0)
 * @return True if page has been read and $this->page is set
 */
public function read_page($timeout = 0)
{
	$this->page = null;
	$this->title = null;
	$this->description = null;

	/*
	** If the host name does not end in a dot followed by at least one letter,
	** it is considered bogus. This prevents 'localhost', 'www', and numerical IP addresses.
	*/
	if (!preg_match('#\.[a-z]+$#iD', (string)$this->realhostname))
		return false;

	$url = $this->rawurl;

	for ($count = 0; ($this->page == '') && ($count < 5); $count++)	/* Limit avoids infinite redirect loops */
	{
		$this->page = (string)shell_exec(self::_wget_command($url, $timeout, '-q'));

		if ($this->page == '')	/* An error occurred. Find out what it was. */
		{
			$error = (string)shell_exec(self::_wget_command($url, $timeout, '-v'));

			if (!preg_match('#Location: ([^ ]*)#i', $error, $regs))	/* Redirect ? */
				break;

			$url = $regs[1];
			if (!preg_match('#^[a-z]+:#i', $url))	/* Kludge for Miss Kournikova's admirers: grok local redirects (in violation of RFC) */
				$url = $this->rawurl . '/' . $url;
		}
	}

	$this->page_read = 1;

	if (preg_match('#<title>([^<]*)</title>#i', $this->page, $regs))
		$this->title = it_htmlentities_decode($regs[1]);

	if (preg_match('#<meta name="description"[^>]+content="([^"]*)">#i', $this->page, $regs))
		$this->description = it_htmlentities_decode($regs[1]);

	return ($this->page != '');
}


/**
 * Build the wget command line used by read_page(). The URL is passed as one
 * quoted shell argument so that it cannot inject shell syntax.
 * @param $url URL to fetch
 * @param $timeout Timeout in seconds handed to wget -T
 * @param $verbosity Either '-q' (page only) or '-v' (protocol chatter incl. Location lines)
 * @return Shell command writing the page (and stderr) to stdout
 */
private static function _wget_command($url, $timeout, $verbosity)
{
	return 'LANG=C wget 2>&1 -T ' . ((int)$timeout) . ' ' . $verbosity . ' -U "Mozilla/4.0 (Compatible; Relog ITools)" -O - ' . escapeshellarg((string)$url);
}


/* Return the description of this page */
public function get_description()
{
	if (!$this->page_read)
		$this->read_page();

	return $this->description;
}


/* Return the title of this page */
public function get_title()
{
	if (!$this->page_read)
		$this->read_page();

	return $this->title;
}

/**
 * Check if a given url (currently http:port80-only) can be fetched
 * Note: Redirects are treated as successful
 * @param $timeout Timeout for connection and for reading the status line, in seconds
 * @return true if url could be fetched
 */
public function is_reachable($timeout = 5)
{
	$result = false;

	if ($fp = @fsockopen($this->realhostname, (int)$this->port, $errno, $errstr, $timeout))
	{
		stream_set_timeout($fp, max(1, (int)$timeout));	# Without this a silent server blocks fgets() forever
		fputs($fp, "GET /$this->path HTTP/1.0\r\nHost: $this->realhostname\r\nUser-Agent: ITools\r\n\r\n");
		$line = fgets($fp, 128);
		fclose($fp);

		#debug("it_url::is_reachable($this->rawurl: $line");
		# Status line must start with the protocol name and carry a 2xx or 3xx code
		$result = (bool)preg_match('#^' . preg_quote($this->protocol, '#') . '/[^ ]+ +[23]#i', (string)$line);
	}

	return $result;
}

/**
 * Get simple URL with timeout.
 * Instance method; to fetch an arbitrary URL in one go, use new it_url($url) and call get() on it.
 * (Calling it statically only works on PHP versions that still allow static calls of instance methods.)
 * @param $url url to get, defaults to constructor URL
 * @param $timeout timeout per read in milliseconds.
 * @param $maxredirects maximum number of redirects to follow before giving up
 * @return contents of resulting page, considering redirects, excluding headers, or false on error
 */
public function get($url = null, $timeout = 5000, $maxredirects = 5)
{
	$url = $url ? new it_url($url) : $this;

	$url->result = $result = false;
	$url->data = null;
	$url->headers = array();

	if ($url->protocol != 'http')
		return $result;

	if (!($fp = @fsockopen($url->realhostname, (int)$url->port, $errno, $errstr, $timeout / 1000)))
		return $result;

	stream_set_timeout($fp, intval($timeout / 1000), ($timeout % 1000) * 1000);
	@fputs($fp, "GET /$url->path HTTP/1.0\r\nHost: $url->realhostname\r\nUser-Agent: Mozilla/4.0 (compatible; MSIE 6.0; ITools)\r\n\r\n");

	$redirect = null;
	while (!feof($fp) && ($line = @fgets($fp, 10240)) && ($line = trim($line)))
	{
		if (preg_match('#^(HTTP\S+)\s(\d+)#', $line, $parts))	# Parse result code
			$url->headers[$parts[1]] = $url->result = $parts[2];
		# NOTE(review): the "!= $url->url" guard is kept from the original; it compares only scheme+host with the full canonical URL
		elseif (preg_match('#^Location:\s*(https?://[^/]*)?(/)?(.*)$#i', $line, $parts) && ($parts[1] != $url->url))	# Handle redirects (supports relative and global)
		{
			if ($parts[1])			# Absolute URL
				$redirect = $parts[1] . $parts[2] . $parts[3];
			else
			{
				# Keep a non-default port when resolving against the current host
				$base = $url->protocol . '://' . $url->realhostname . (($url->port && ($url->port != 80)) ? ':' . $url->port : '');

				if ($parts[2])		# Host-relative: /foo
					$redirect = $base . $parts[2] . $parts[3];
				else				# Path-relative: resolve against the directory part of the current path
					$redirect = $base . '/' . preg_replace('#[^/]*$#D', '', $url->path) . $parts[3];
			}
			break;
		}
		elseif (preg_match('#^([^:]+):\s*(.*)$#', $line, $parts))
			$url->headers[$parts[1]] = $parts[2];
	}

	if ($redirect !== null)
	{
		@fclose($fp);	# Do not leak the socket while following the redirect

		if ($maxredirects <= 0)	# Avoid infinite redirect loops
			return false;

		$url->it_url($redirect);
		return $url->get(null, $timeout, $maxredirects - 1);
	}

	if ($url->result)
	{
		$url->data = '';
		while (!feof($fp))
			$url->data .= @fread($fp, 20480);

		if ($url->result < 400)
			$result = $url->data;
	}

	@fclose($fp);

	return $result;
}


/**
 * Fetch an URL through a fresh it_url object (what a static get($url, $timeout) call used to do).
 * @param $url url to get
 * @param $timeout timeout per read in milliseconds
 * @return see get()
 */
private static function _fetch($url, $timeout)
{
	$fetcher = new it_url($url);

	return $fetcher->get(null, $timeout);
}


/**
 * Default cache directory: var/urlcache below $GLOBALS['ULTRAHOME'] (empty prefix if that global is unset).
 */
private static function _default_cachedir()
{
	return (isset($GLOBALS['ULTRAHOME']) ? $GLOBALS['ULTRAHOME'] : '') . "/var/urlcache";
}


/**
 * fopen() mode used to claim a cache slot: "x" fails if the file already exists,
 * "w" (EDC nocache mode) always succeeds.
 */
private static function _create_mode()
{
	return EDC('nocache') ? "w" : "x";
}


/**
 * Construct a local file name to cache an URL. Call statically. Named args
 * (a plain string is taken as the url):
 * @param $p['url'] remote url to get
 * @param $p['cachedir'] path to cache directory
 * @return cachedir/<first two hex digits of md5(url)>/<md5(url)>
 */
public static function get_cache_filename($p)
{
	if (!is_array($p))
		$p = array('url' => $p);

	if (!array_key_exists('cachedir', $p))	# Only touch the global when it is actually needed
		$p['cachedir'] = self::_default_cachedir();

	$hash = md5($p['url']);

	return $p['cachedir'] . "/" . substr($hash, 0, 2) . "/" . $hash;
}



/**
 * Store contents of url in a file and return file name. Provides locking. Call statically.
 * Requires www writeable var/urlcache in your service dir. Params in assoc array:
 * @param $p['url'] url to get
 * @param $p['timeout'] timeout in milliseconds, default 10000
 * @param $p['maxage'] maximum age of cache entries in seconds, default 86400
 * @param $p['cleanbefore'] maximum daytime when attempting cleanup, default 7200
 * @param $p['preprocess'] callback function (or array for methods) to change received file
 * @param $p['safety'] value 0 means dont generate alert, value 1 means generate alerts on timeouts and failures
 * @param $p['keepfailed'] keep old versions of files if download fails (sending alerts conservatively)
 * @param $p['cachedir'] directory to store cache files in. NO TRAILING SLASH
 * @return name of the cache file (of the preprocessed file if 'preprocess' is given), or a false value on failure
 */
public static function get_cache($p = array())
{
	$p += array('timeout' => 10000, 'maxage' => 86400, 'cleanbefore' => 7200, 'safety' => 1, 'keepfailed' => false, 'preprocess' => null);
	if (!array_key_exists('cachedir', $p))
		$p['cachedir'] = self::_default_cachedir();

	$path = self::get_cache_filename($p);
	@mkdir(dirname($path));
	$age = time() - (int)@filemtime($path);

	# expire forgotten locks
	$lockmtime = @filemtime("$path.lock");
	if ($lockmtime && (time() - $lockmtime > 30))
		@unlink("$path.lock");

	if ($p['keepfailed'] && ($age > $p['maxage']) && ($dummy = @fopen("$path.lock", self::_create_mode())))	# update our copy if we get the lock
	{
		# my job to refresh the cache entry
		fclose($dummy);
		EDC('getcache', "refresh", $p['url'], $path);
		if (($result = self::_fetch($p['url'], $p['timeout'])))
			self::_atomicwrite($path, $result);
		else
			touch($path);	# Keep the old copy but postpone the next refresh attempt

		if ($p['safety'] == 1)
		{
			$parts = parse_url($p['url']);
			$host = (is_array($parts) && isset($parts['host'])) ? $parts['host'] : '';
			it::error(array('title' => "get_cache: download failures on $path", 'ok_key' => md5($host), 'ok_delay' => $p['maxage'], 'ok' => $result ? 1 : 0));	# send err only if multi failure
		}
		@unlink("$path.lock");
	}

	# Remove ancient lock or cached file if it is too old
	if (!$p['keepfailed'])
		self::_expire($path, $p['maxage']);

	if ($dummy = @fopen($path, self::_create_mode()))	# succeeds if file is missing (or we are in nocache mode)
	{
		# fill cache myself
		fclose($dummy);
		EDC('getcache', "new", $p['url'], $path);
		$result = self::_atomicwrite($path, self::_fetch($p['url'], $p['timeout']));
	}
	else
	{
		# get file from cache, potentially waiting if file is currently being transferred
		EDC('getcache', "old", $p['url'], $path);
		$result = self::_waitforpath($p + array('path' => $path));
	}

	if ($result && $p['preprocess'])
	{
		$srcpath = $path;
		$path .= substr(md5(serialize($p['preprocess'])), 0, 2);	# One processed copy per distinct callback
		self::_expire($path, $p['maxage']);

		if ($dummy = @fopen($path, self::_create_mode()))	# in nocache mode, always succeed
		{
			fclose($dummy);
			EDC('getcache', "process", $p['url'], $path);
			$dstpath = "$path.preprocesstmp";
			call_user_func($p['preprocess'], $srcpath, $dstpath);

			if (!@filesize($dstpath) || !@rename($dstpath, $path))
			{
				@unlink($dstpath);
				@unlink($path);
				$result = false;
			}
			else
				$result = $path;
		}
		else
			$result = self::_waitforpath($p + array('path' => $path));

		if ($result)
		{
			EDC('getcache', "processold", $p['url'], $path);
			touch($result, filemtime($srcpath));	# Ensure processed is never newer than src
		}
	}

	# cache cleanup at night
	if ((time() % 86400 < $p['cleanbefore']) && (time() - (int)@filemtime($p['cachedir'] . "/cleaned") > 80000))
	{
		touch($p['cachedir'] . "/cleaned");
		$maxagemin = intval($p['maxage'] / 60);

		# Quote the directory for the inner shell and the whole script for the outer one
		$script = 'cd ' . escapeshellarg($p['cachedir']) . " && sleep 10 && find -mmin +$maxagemin -print0 | xargs -0 -r rm";
		exec('nohup bash -c ' . escapeshellarg($script) . ' </dev/null >/dev/null 2>&1 &');
	}

	return $result;
}

/**
 * Wait until a cache file has content. Internal helper of get_cache().
 * @param $p['path'] file to wait for
 * @param $p['timeout'] maximum time to wait in milliseconds
 * @param $p['sleeptime'] milliseconds to sleep between checks, default 100
 * @param $p['safety'] 1 means report a timeout through it::error()
 * @param $p['url'] url, only used in the error message
 * @return $p['path'] once the file is non-empty, false on timeout
 */
public static function _waitforpath($p)
{
	$p += array('sleeptime' => 100, 'safety' => 0, 'url' => '');	# sleeptime: millisecs to wait
	$result = false;

	# wait until cache is ready, then read from cache
	$maxpasses = $p['timeout'] / $p['sleeptime'];
	for ($passes = 0; !($ready = (@filesize($p['path']) > 0)) && ($passes < $maxpasses); ++$passes)
	{
		usleep($p['sleeptime'] * 1000);
		clearstatcache();
	}

	if ($ready)	# Decide on the file itself so that a file arriving on the last pass still counts
		$result = $p['path'];
	else if ($p['safety'] == 1)
		it::error("timeout in it_url::get_cache(): url={$p['url']}, passes=$passes, maxpasses=$maxpasses, path={$p['path']}");

	return $result;
}

/**
 * Write data to a temporary file next to $path and rename it into place, so that
 * readers never see a partial file. Internal helper of get_cache().
 * @param $path destination file
 * @param $data contents to write; false means "download failed" and removes $path
 * @return $path on success, false if there was nothing to write or writing failed
 */
public static function _atomicwrite($path, $data)
{
	if ($data === false)
	{
		@unlink($path);
		return false;
	}

	$tmpname = @tempnam(dirname($path), "writetmp");
	if ($tmpname === false)
		return false;

	$written = false;
	if ($cachetmp = @fopen($tmpname, "w"))
	{
		$written = (fputs($cachetmp, (string)$data) !== false);
		fclose($cachetmp);
	}

	if ($written)
	{
		@chmod($tmpname, 0664);
		if (@rename($tmpname, $path))
			return $path;
	}

	@unlink($tmpname);	# Leave $path alone: with keepfailed it may still hold the previous copy

	return false;
}

/**
 * Remove a cache or lock file that is empty for more than 30 seconds or older than $maxage.
 * Internal helper of get_cache().
 * @param $path file to check
 * @param $maxage maximum age in seconds
 */
public static function _expire($path, $maxage)
{
	if (!file_exists($path))
		return;

	$age = time() - (int)@filemtime($path);
	$stale_placeholder = (@filesize($path) == 0) && ($age > 30);

	if ($stale_placeholder || ($age > $maxage))
	{
		EDC('getcache', "expire", $path);
		@unlink($path);
	}
}

/**
 * Make an URL absolute by using host and protocol from current Apache request (but not port number)
 * @param $url Optional URL ( foo.html, /foo.html, //host/bar.html, http://host/bar.html ), default self
 * @return absolute version of URL ( http[s]://host/bar.html )
 */
public static function absolute($url = null)
{
	if (!isset($url))
		$url = $_SERVER['PHP_SELF'];

	if (preg_match('#^[a-z][a-z0-9+.-]*://#i', $url))	# Already carries a scheme
		return $url;

	if (!preg_match('#^//#', $url))	# Not protocol-relative: prepend current host (and directory)
	{
		if (!preg_match('#^/#', $url))
			$url = preg_replace('#/[^/]*$#D', '/', $_SERVER['PHP_SELF']) . $url;

		$url = "//" . $_SERVER['HTTP_HOST'] . $url;
	}

	# Some servers set HTTPS to "off" for plain http requests
	$secure = !empty($_SERVER['HTTPS']) && (strtolower($_SERVER['HTTPS']) != 'off');

	return "http" . ($secure ? 's' : '') . ":$url";
}

/**
 * Craft a valid redirect URL, send Location: header and terminate execution
 * @param $url Optional URL ( foo.html, /foo.html, //host/bar.html, http://host/bar.html ), default self
 * @return This method never returns.
 */
public static function redirect($url = null)
{
	if (EDC('noredir'))	# Debug mode: show a link instead of redirecting
		echo "<a href='" . htmlspecialchars(self::absolute($url), ENT_QUOTES) . "'>" . htmlspecialchars((string)$url, ENT_QUOTES) . "</a><br />";	# ENT_QUOTES: attribute is single-quoted
	else
		header('Location: ' . preg_replace("/[\r\n].*/s", '', self::absolute($url)));	# Security: cut after CR/LF

	exit;
}

/**
 * Urlencode but leave some chars (comma and parentheses)
 * @param $str string to encode
 * @return encoded string
 */
public static function encode($str)
{
	return strtr(urlencode($str), array("%2C" => ",", "%28" => "(", "%29" => ")"));
}

}