. ** ** url.class - URL parsing, retrieval and caching functions */

/**
 * Represents one URL: parses it into its components and offers helpers to
 * fetch it (via wget or a raw socket), cache the fetched content on disk and
 * build/encode URLs for the current request.
 *
 * Several methods (get, get_cache, params, ...) are written to be callable
 * statically as it_url::method() as well as on an instance.
 */
class it_url
{
	/* Examples below are for the raw URL HTTP://falcon:joshua@www.Relog.CH:80/default.asp */
	var $url;		/* Canonical URL, e.g. http://www.relog.ch/ */
	var $protocol;		/* Lowercased protocol, e.g. http */
	var $hostname;		/* Host without leading "www.", e.g. relog.ch */
	var $realhostname;	/* Lowercased host, e.g. www.relog.ch */
	var $port;		/* E.g. 80 */
	var $path;		/* Path without leading slash and without index file name, e.g. empty */
	var $rawurl;		/* URL exactly as given to the constructor */
	var $user;		/* E.g. falcon */
	var $pass;		/* E.g. joshua */
	var $page;		/* Page or empty */
	var $page_read;		/* true if page read */
	var $title;		/* Page title or empty */
	var $description;	/* Page description or empty */
	var $headers;		/* Headers of page fetched by get() */
	var $data;		/* Data part, even if return code is not 200 */
	var $result;		/* Return code of get() */
	var $redir = 0;		/* Redirect count */

	/**
	 * Constructor: canonicalize an URL. Also called directly by get() to
	 * re-target this object when following a redirect.
	 * @param $url URL this object represents
	 * @param $options['encoding'] Encoding of $url; anything but utf-8 is run through utf8_encode() before IDN encoding
	 */
	function it_url($url, $options = array())
	{
		$this->rawurl = $url;
		$protoport = 0;	# Default port of the protocol; stays 0 for anything but http/https
		$idn = null;	# Net_IDNA instance, only if that class has already been loaded

		if (preg_match('#^([a-z]+):/+(?:([^:]*):([^@]*)@)?(.*)$#i', $url, $regs))
		{
			$this->protocol = strtolower($regs[1]);
			$this->user = $regs[2];
			$this->pass = $regs[3];
			$url = $regs[4];
		}
		else if (preg_match('/^[a-z]:/', $url) || preg_match('#^/#', $url))
			$this->protocol = 'file';
		else
			$this->protocol = 'http';

		/* Default port */
		if ($this->protocol == 'http')
			$protoport = 80;
		else if ($this->protocol == 'https')
			$protoport = 443;

		$this->port = intval($protoport);

		if (class_exists('Net_IDNA', false))
			$idn = Net_IDNA::getInstance();

		# With IDN support any character except a slash may be part of the host name
		$pattern = $idn ? '^([^/]+)/*(.*)$' : '^([a-z0-9_:\.-]+)/*(.*)$';

		if (preg_match("#$pattern#i", $url, $regs))
		{
			$hostparts = explode(':', $regs[1]);
			$this->realhostname = strtolower($hostparts[0]);

			if (!empty($hostparts[1]))	# Explicit port given
				$this->port = intval($hostparts[1]);

			$url = $regs[2];
		}

		$this->hostname = preg_replace('/^www\./', '', (string)$this->realhostname);

		# Get rid of common index file names
		$url = preg_replace('#(^|/)(index\.[ps]?html?|index\.php[34]?|default\.aspx?)$#', '', $url);

		$this->path = preg_replace('#^/$#', '', $url);

		# Only mention the port in the canonical URL if it is not the protocol default
		if ($this->port != $protoport)
			$this->url = "$this->protocol://$this->realhostname:$this->port/$this->path";
		else
			$this->url = "$this->protocol://$this->realhostname/$this->path";

		if ($idn)
		{
			$realhostname = $this->realhostname;
			$encoding = isset($options['encoding']) ? $options['encoding'] : '';

			if (!preg_match('/^utf-?8$/i', $encoding))
				$realhostname = utf8_encode($realhostname);

			$encoded = $idn->encode($realhostname);

			if ($encoded != $realhostname)
				$this->realhostname = $encoded;
		}
	}

	/**
	 * Read the page into memory using wget, extract title and description and
	 * set $this->page, $this->title and $this->description
	 * @param $timeout Timeout for operation, defaults to unlimited (0)
	 * @return True if page has been read and $this->page is set
	 */
	function read_page($timeout = 0)
	{
		$this->page = null;
		$this->title = null;
		$this->description = null;

		/*
		** If the host name does not end in a dot followed by at least one letter,
		** it is considered bogus. This prevents 'localhost', 'www', and numerical IP addresses.
		*/
		if (!preg_match('/\.[a-z]+$/i', (string)$this->realhostname))
			return 0;

		$wget = 'LANG=C wget 2>&1 -T ' . ((int)$timeout) . ' -U "Mozilla/4.0 (Compatible; Relog ITools)" -O - ';
		$url = $this->rawurl;
		$count = 0;

		while ($this->page == '')
		{
			# The URL may come from a remote Location: header, so pass it as one quoted shell argument
			$target = escapeshellarg($url);
			$this->page = shell_exec($wget . '-q ' . $target);

			if ($this->page == '')	/* An error occurred. Find out what it was. */
			{
				$error = shell_exec($wget . '-v ' . $target);

				if (preg_match('/Location: ([^ ]*)/i', (string)$error, $regs))	/* Redirect ? */
				{
					$url = $regs[1];

					if (!preg_match('/^[a-z]+:/i', $url))	/* Grok local redirects (in violation of RFC) */
						$url = $this->rawurl.'/'.$url;
				}
				else
					break;
			}

			if (++$count > 4)	/* Avoid infinite redirect loops */
				break;
		}

		$this->page_read = 1;

		# The tag names are essential: without them the title pattern would match whatever precedes the first '<'
		if (preg_match('#<title>([^<]*)</title>#i', (string)$this->page, $regs))
			$this->title = it_htmlentities_decode($regs[1]);

		if (preg_match('/<meta name="description"[^>]+content="([^"]*)">/i', (string)$this->page, $regs))
			$this->description = it_htmlentities_decode($regs[1]);

		return ($this->page != '');
	}

	/* Return the description of this page, reading the page first if necessary */
	function get_description()
	{
		if (!$this->page_read)
			$this->read_page();

		return $this->description;
	}

	/* Return the title of this page, reading the page first if necessary */
	function get_title()
	{
		if (!$this->page_read)
			$this->read_page();

		return $this->title;
	}

	/**
	 * Check if a given url (currently http only) can be fetched
	 * Note: Redirects are treated as successful (any 2xx or 3xx status counts)
	 * @param $timeout Timeout for connection in seconds
	 * @return true value if url could be fetched
	 */
	function is_reachable($timeout = 5)
	{
		$result = false;

		if ($fp = @fsockopen($this->realhostname, $this->port, $dummy_errno, $dummy_errstr, $timeout))
		{
			fputs($fp, "GET /$this->path HTTP/1.0\r\nHost: $this->realhostname\r\nUser-Agent: ITools\r\n\r\n");
			$line = fgets($fp, 128);	# Only the status line is of interest
			fclose($fp);

			#debug("it_url::is_reachable($this->rawurl: $line");
			$result = preg_match("#^$this->protocol/[^ ]+ +[23]#i", $line);
		}

		return $result;
	}

	/**
	 * Get simple URL with timeout. Can be called statically
	 * @param $p parameter array with the following keys (or just the url as string)
	 * @param $p['url']: url to get, defaults to constructor URL
	 * @param $p['timeout']: timeout per read in seconds, defaults to 5. fractions allowed
	 * @param $p['totaltimeout']: timeout for the whole function call
	 * @param $p['data']: POST data array with key-value pairs
	 * @param $p['headers']: additional request headers as array name => value
	 * @param $timeout timeout used if $p is not an array
	 * @return contents of resulting page, considering redirects, excluding headers, or false on error
	 */
	function get($p=null, $timeout=5)
	{
		if (!is_array($p))
			$p = array('url' => $p, 'timeout' => $timeout);

		$p += array('totaltimeout' => "999999", 'timeout' => 5);

		if (!empty($p['url']))
			$url = new it_url($p['url']);
		else
			$url = $this;	# Objects are handles, so $url->result and $url->data end up in $this

		$url->result = $result = false;
		$url->data = null;
		$url->headers = array();
		$p['timeout'] = min($p['timeout'], $p['totaltimeout']);	# No operation may be longer than totaltimeout
		$endtime = time() + $p['totaltimeout'];

		if ($url->protocol == 'http')
		{
			if ($fp = @fsockopen($url->realhostname, $url->port, $dummy_errno, $dummy_errstr, $p['timeout']))
			{
				# urlencode data pairs if is array
				$data = '';
				if (isset($p['data']) && is_array($p['data']))
					$data = it_url::params($p['data']);

				# Caller supplied headers win over the defaults
				$p['headers'] = (isset($p['headers']) ? (array)$p['headers'] : array()) + array(
					'Host' => $url->realhostname,
					'User-Agent' => "Mozilla/4.0 (compatible; MSIE 6.0; ITools)",
					'Accept-Language' => T_lang(),
				);

				if ($datalen = strlen($data))
				{
					$method = "POST";
					$p['headers'] += array(
						'Content-Type' => "application/x-www-form-urlencoded",
						'Content-Length' => $datalen,
					);
				}
				else
					$method = "GET";

				if ($url->user || $url->pass)
					$p['headers'] += array('Authorization' => 'Basic ' . base64_encode($url->user . ':' . $url->pass));

				$headers = '';
				foreach ($p['headers'] as $header => $value)
					$headers .= "$header: $value\r\n";

				# Split the possibly fractional timeout into seconds and microseconds
				stream_set_timeout($fp, intval($p['timeout']), intval(($p['timeout']*1000000)%1000000));
				@fputs($fp, "$method /$url->path HTTP/1.0\r\n$headers\r\n$data");

				# Read response headers up to the first empty line
				while (!feof($fp) && ($line = @fgets($fp, 10240)) && ($line = trim($line)) && (time() < $endtime))
				{
					if (preg_match('#^(HTTP\S+)\s(\d+)#', $line, $parts))	# Parse result code
						$url->headers[$parts[1]] = $url->result = $parts[2];
					elseif (preg_match('#^Location: (https?://[^/]*)?(/)?(.*)$#i', $line, $parts) && ($parts[1] != $url->url))	# Handle redirects (supports relative and global)
					{
						unset($p['url'], $p['headers']['Host']);

						if ($parts[1])		# Full URL
							$target = $parts[1].$parts[2].$parts[3];
						else if ($parts[2])	# Path relative to the server root
							$target = $url->protocol.'://'.$url->realhostname.$parts[2].$parts[3];
						else			# Path relative to the current directory
							$target = $url->protocol.'://'.$url->realhostname.'/'.dirname($url->path).'/'.$parts[3];

						$url->it_url($target);

						if (++$url->redir <= 4)	/* Avoid infinite redirects */
						{
							@fclose($fp);	# Do not leak this connection while following the redirect
							return $url->get($p);
						}
					}
					elseif (preg_match('#^([^:]+): (.*)$#', $line, $parts))
						$url->headers[$parts[1]] = $parts[2];
				}

				if ($url->result)
				{
					if (isset($url->headers['Transfer-Encoding']) && ($url->headers['Transfer-Encoding'] == "chunked"))	# Bogus HTTP/1.1 chunked answer from server (e.g. Wordpress/Apach2/PHP5)
					{
						while (!feof($fp) && (time() < $endtime) && ($len = hexdec(trim((string)@fgets($fp)))))
						{
							$chunk = "";

							while (!feof($fp) && (strlen($chunk) < $len) && (time() < $endtime))
								$chunk .= @fread($fp, $len - strlen($chunk));

							$url->data .= $chunk;

							# Chunk data is followed by CRLF; skip it so the next read sees the next chunk size
							@fgets($fp);
						}
					}
					else
					{
						while (!feof($fp) && (time() < $endtime))
							$url->data .= @fread($fp, 20480);
					}

					if ($url->result < 400)
						$result = $url->data;
				}

				@fclose($fp);
			}
		}

		return time() < $endtime ? $result : false;
	}

	/**
	 * Construct a local directory name to cache an URL. Named args:
	 * @param $p['cachedir'] directory to store cache files in, default $ULTRAHOME/var/urlcache
	 * @param $p['id'] optional sub directory appended to cachedir
	 * @return directory name without trailing slash
	 */
	function get_cache_dir($p)
	{
		$p += array('cachedir' => $GLOBALS['ULTRAHOME'] . "/var/urlcache");
		$id = isset($p['id']) ? $p['id'] : '';

		return rtrim($p['cachedir'] . "/" . $id, "/");
	}

	/**
	 * Construct a local file name to cache an URL. Named args:
	 * @param $p['url'] remote url to get (a plain string is taken as url)
	 * @param $p['cachedir'] directory to store cache files in, @see get_cache_dir
	 * @return cache directory / first two hex digits of hash / md5 hash of language and url
	 */
	function get_cache_filename($p)
	{
		if (!is_array($p))
			$p = array('url'=>$p);

		$p['cachedir'] = it_url::get_cache_dir($p);
		$filename = md5(T_lang() . $p['url']);

		return $p['cachedir'] . "/" . substr($filename, 0, 2) . "/$filename";
	}

	/**
	 * Store contents of url in a file and return file name. Called statically.
	 * Concurrent callers are coordinated by exclusive creation of the cache file (or a .lock file);
	 * losers wait for the file to be filled.
	 * Requires webserver writeable directory in $p['cachedir']. Params in associative array p:
	 * @param $p['url']         url to get
	 * @param $p['cachedir']    directory to store cache files in, @see get_cache_dir
	 * @param $p['id']          If you need more than one type of cache (e.g. different maxage) you can specify an id
	 * @param $p['timeout']     timeout in seconds, default 10. fractions allowed
	 * @param $p['maxage']      maximum age of cache entries in seconds, default 86400
	 * @param $p['cleanbefore'] cleanup is only attempted while seconds since midnight are below this, default 7200
	 * @param $p['preprocess']  callback function (or array for methods) to change received file or array('function' => ..., 'in' => $src, 'out' => $dst, ...) with callback function plus args
	 * @param $p['safety']      value 0 means dont generate alert, value 1 means generate alerts on timeouts and failures
	 * @param $p['keepfailed']  keep old versions of files if download fails (sending alerts conservatively)
	 * @param $p['it_error']    parameters for it::error()
	 * @return name of file containing the (optionally preprocessed) content, false value on failure
	 */
	function get_cache($p = array())
	{
		$p += array('timeout' => 10, 'maxage' => 86400, 'cleanbefore' => 7200, 'safety' => 1, 'it_error' => array());
		$p['totaltimeout'] = $p['timeout'];
		$path = it_url::get_cache_filename($p);	# Must be before changing cachedir below
		$p['cachedir'] = it_url::get_cache_dir($p);
		$keepfailed = !empty($p['keepfailed']);

		@mkdir($p['cachedir']);
		@mkdir(dirname($path));

		$age = file_exists($path) ? (time() - @filemtime($path)) : 0;

		# expire forgotten locks
		$lockmtime = @filemtime("$path.lock");
		if ($lockmtime && time()-$lockmtime > 30)
			@unlink("$path.lock");

		if ($keepfailed && ($age>$p['maxage']) && ($dummy = @fopen("$path.lock", EDC('nocache') ? "w" : "x")))	# update our copy if we get the lock
		{
			# my job to refresh the cache entry
			fclose($dummy);
			touch($path);	# Touch existing file to prevent locking other getters

			EDC('getcache', "refresh", $p['url'], $path);
			if (($result = it_url::get($p)))
				it_url::_atomicwrite($path, $result);
			else
				touch($path);

			if ($p['safety'] == 1 && !$result)
			{
				$parts = @parse_url($p['url']);
				it::error($p['it_error'] + array('title'=>"get_cache: download failures on {$p['url']}", 'id'=>$parts['host']));	# send err only if multi failure
			}

			@unlink("$path.lock");
		}

		# Remove ancient lock or cached file if it is too old
		if (!$keepfailed)
			it_url::_expire($path, $p['maxage']);

		if ($dummy = @fopen($path, EDC('nocache') ? "w" : "x"))	# succeeds if file is missing (or we are in nocache mode)
		{
			# fill cache myself
			fclose($dummy);
			EDC('getcache', "new", $p['url'], $path);
			$result = it_url::_atomicwrite($path, it_url::get($p));
		}
		else
		{
			# get file from cache, potentially waiting if file is currently being transferred
			EDC('getcache', "old", $p['url'], $path);
			$result = it_url::_waitforpath($p + array('path' => $path));
		}

		if ($result && !empty($p['preprocess']))
		{
			$srcpath = $path;
			$path .= substr(md5(serialize($p['preprocess'])), 0, 2);	# One processed file per distinct preprocess setting
			it_url::_expire($path, $p['maxage']);

			if ($dummy = @fopen($path, EDC('nocache') ? "w" : "x"))	# in nocache mode, always succeed
			{
				fclose($dummy);
				EDC('getcache', "process", $p['url'], $path);
				$dstpath = "$path.preprocesstmp";

				if (is_array($p['preprocess']) && !empty($p['preprocess']['function']))	# Needs is_array as it can be a string where dereferencing gives first character!
					call_user_func($p['preprocess']['function'], array('in' => $srcpath, 'out' => $dstpath) + $p['preprocess']);
				else
					call_user_func($p['preprocess'], $srcpath, $dstpath);

				if (!@filesize($dstpath) || !@rename($dstpath, $path))
				{
					# Processing produced nothing usable: remove leftovers and report failure
					@unlink($dstpath);
					@unlink($path);
					$result = false;
				}
				else
					$result = $path;

				if ($result)
				{
					EDC('getcache', "processold", $p['url'], $path);
					touch($result, @filemtime($srcpath));	# Ensure processed is never newer than src
				}
			}
			else
				$result = it_url::_waitforpath($p + array('path' => $path));
		}

		# cache cleanup at night
		$cleanmarker = $p['cachedir'] . "/cleaned";
		if ((date('H')*3600 + date('i')*60 < $p['cleanbefore']) && (time()-@filemtime($cleanmarker) > 80000))
		{
			touch($cleanmarker);
			$maxagemin = intval($p['maxage']/60);
			$script = 'cd ' . escapeshellarg($p['cachedir']) . " && sleep 10 && find ?? -mmin +$maxagemin -print0 | xargs -0 -r rm";

			# All three standard streams must be redirected, otherwise exec() waits for the background job to finish
			exec('nohup bash -c ' . escapeshellarg($script) . ' </dev/null >/dev/null 2>&1 &');
		}

		return $result;
	}

	/**
	 * Wait until the file $p['path'] has non-zero size or $p['timeout'] seconds have passed
	 * @param $p['path'] file to wait for
	 * @param $p['timeout'] maximum time to wait in seconds
	 * @param $p['sleeptime'] seconds to sleep between checks, default 0.1
	 * @param $p['safety'] value 1 means report failure via it::error()
	 * @return $p['path'] if the file has content, null otherwise
	 */
	function _waitforpath($p)
	{
		$p += array('sleeptime' => 0.1);	# seconds to wait per pass
		$result = null;
		$maxpasses = $p['timeout'] / $p['sleeptime'];
		$passes = 0;

		# wait until cache is ready; an empty file means somebody else is still downloading
		while ((($size = @filesize($p['path'])) === 0) && ($passes < $maxpasses))
		{
			usleep(intval($p['sleeptime'] * 1000000));
			clearstatcache();	# filesize() results are cached by PHP
			++$passes;
		}

		if ($size)
			$result = $p['path'];
		else if ($p['safety'] == 1)
			it::error(($passes < $maxpasses ? "error getting url" : "timeout") . " in it_url::get_cache(): url={$p['url']}, passes=$passes, maxpasses=$maxpasses, path={$p['path']}");

		return $result;
	}

	/**
	 * Write $data to $path by writing a temporary file in the same directory and renaming it.
	 * If $data is false, $path is removed instead.
	 * @return $path if data has been written, null otherwise
	 */
	function _atomicwrite($path, $data)
	{
		$result = null;

		if ($data !== false)
		{
			$tmpname = tempnam(dirname($path), "writetmp");
			$cachetmp = fopen($tmpname, "w");
			fputs($cachetmp, (string)$data);
			fclose($cachetmp);
			chmod($tmpname, 0664);
			rename($tmpname, $path);
			$result = $path;
		}
		else
			@unlink($path);

		return $result;
	}

	/**
	 * Remove $path if it is older than $maxage seconds, or if it is empty and older than 30 seconds
	 */
	function _expire($path, $maxage)
	{
		# Remove ancient lock or cached file if it is too old
		if (file_exists($path))
		{
			$age = time() - @filemtime($path);
			$stale_empty = (@filesize($path) == 0) && ($age > 30);

			if ($stale_empty || ($age > $maxage))
			{
				EDC('getcache', "expire", $path);
				@unlink($path);
			}
		}
	}

	/**
	 * Make an URL absolute by using host and protocol from current request (but not port number)
	 * @param $url Optional URL ( foo.html, /foo.html, //host/bar.html, http://host/bar.html ), default self
	 * @return absolute version of URL ( http[s]://host/bar.html )
	 */
	function absolute($url=null)
	{
		if (!isset($url))
			$url = $_SERVER['PHP_SELF'];

		# Require the full scheme prefix so relative names like httpd.html are not mistaken for absolute URLs
		if (!preg_match('#^https?://#i', $url))
		{
			if (!preg_match('#//#', $url))	# No host given
			{
				$dir = preg_replace('#/[^/]*$#', '/', $_SERVER['PHP_SELF']);
				$url = preg_match('#^/#', $url) ? $url : "$dir$url";
				$url = "//" . $_SERVER['HTTP_HOST'] . $url;
			}

			$url = "http" . (isset($_SERVER['HTTPS']) ? 's':'') . ":$url";
		}

		return $url;
	}

	/**
	 * Craft a valid redirect URL, send Location: header and terminate execution.
	 * With EDC('noredir') set, the target is printed instead of sending the header.
	 * @param $url Optional URL ( foo.html, /foo.html, //host/bar.html, http://host/bar.html ), default self
	 * @return This method never returns.
	 */
	function redirect($url = null)
	{
		if (EDC('noredir'))
			echo htmlspecialchars((string)$url) . "\n";
		else
		{
			$url = preg_replace("/[\r\n].*/", '', it_url::absolute($url));	# Security: cut after CR/LF
			header('Location: ' . it_untaint($url, TC_SELF));
		}

		exit;
	}

	/**
	 * Urlencode but leave comma and parentheses unencoded
	 */
	function encode($str)
	{
		return strtr(urlencode($str), array("%2C"=>",", "%28"=>"(", "%29"=>")"));
	}

	/**
	 * Create GET request from params, optionally only using given fields
	 * @param $params Array to take values from, usually $_GET
	 * @param $keys Keys to use; default: all
	 * @return key=value pairs joined by &; empty values are left out
	 */
	function params($params, $keys = null)
	{
		return join("&", it_url::_params($params, $keys));
	}

	/**
	 * Helper for params(): returns the encoded key=value pairs as array.
	 * Nested arrays are flattened to key[subkey]=value.
	 */
	function _params($params, $keys = null)
	{
		$result = array();

		if (!isset($keys))
			$keys = array_keys($params);

		foreach ($keys as $key)
		{
			$value = isset($params[$key]) ? $params[$key] : null;

			if (is_array($value))
			{
				foreach (it_url::_params($value) as $pair)
				{
					if (strlen($pair))
						$result[] = it::replace(array('^([^=\[]*)' => $key . '[$1]'), $pair);	# Wrap first key part: a=b becomes key[a]=b
				}
			}
			else if (strlen((string)$value))
				$result[] = urlencode($key) . "=" . it_url::encode($value);
		}

		return $result;
	}
}
?>