summaryrefslogtreecommitdiff
path: root/url.class
diff options
context:
space:
mode:
authorUrban Müller2007-07-26 13:02:24 +0000
committerUrban Müller2007-07-26 13:02:24 +0000
commit806a5297e7e99d455b97a4f0acaba2f40f470584 (patch)
treeb9fc43ef227da87d873cf3676c08c49fa0dc1240 /url.class
parentc3cba034c8009b65c25dd4ef5f54b18d9c8ee7d4 (diff)
downloaditools-806a5297e7e99d455b97a4f0acaba2f40f470584.tar.gz
itools-806a5297e7e99d455b97a4f0acaba2f40f470584.tar.bz2
itools-806a5297e7e99d455b97a4f0acaba2f40f470584.zip
renamed files for autoloader
Diffstat (limited to 'url.class')
-rw-r--r--url.class580
1 files changed, 0 insertions, 580 deletions
diff --git a/url.class b/url.class
deleted file mode 100644
index c94f4af..0000000
--- a/url.class
+++ /dev/null
@@ -1,580 +0,0 @@
-<?php
-/*
-** $Id$
-**
-** ITools - the Internet Tools Library
-**
-** Copyright (C) 1995-2007 by the ITools Authors.
-** This program is free software; you can redistribute it and/or
-** modify it under the terms of either the GNU General Public License
-** or the GNU Lesser General Public License, as published by the Free
-** Software Foundation. See http://www.gnu.org/licenses/ for details.
-**
-** url.class - URL parsing, retrieval and caching functions
-*/
-
-class it_url
-{
- /* E.g. HTTP://falcon:joshua@www.Relog.CH.:80/default.asp */
- var $url; /* E.g. http://www.relog.ch/ */
- var $protocol; /* E.g. http */
- var $hostname; /* E.g. relog.ch */
- var $realhostname; /* E.g. www.relog.ch */
- var $port; /* E.g. 80 */
- var $path; /* E.g. / */
- var $rawurl; /* E.g. HTTP://falcon:joshua@www.Relog.CH.:80/default.asp */
- var $user; /* E.g. falcon */
- var $pass; /* E.g. joshua */
-
- var $page; /* Page or empty */
- var $page_read; /* true if page read */
- var $title; /* Page title or empty */
- var $description; /* Page description or empty */
-
- var $headers; /* Headers of page fetched by get() */
- var $data; /* Data part, even if return code is not 200 */
- var $result; /* Return code of get() */
- var $redir = 0; /* Redirect count */
-
-
-/**
- * Constructor: canonicalize an URL
- *
- * Splits $url into protocol, user, password, host name, port and path and
- * stores a normalized form in $this->url. The untouched input is kept in
- * $this->rawurl.
- *
- * @param $url URL this object represents
- * @param $options Optional settings. Key 'encoding': charset of $url; unless it
- *   is utf-8 the host name is run through utf8_encode() before IDN encoding
- */
-function it_url($url, $options = array())
-{
-	$this->rawurl = $url;
-	$protoport = 0;	# Default port of the protocol, stays 0 for protocols other than http(s)
-	$idn = null;	# IDN encoder, only set if class Net_IDNA is available
-
-	# Credentials must not contain a slash, otherwise "host:80/a@b" would be parsed as user "host", password "80/a"
-	if (preg_match('#^([a-z]+):/+(?:([^:/]*):([^@/]*)@)?(.*)$#i', $url, $regs))
-	{
-		$this->protocol = strtolower($regs[1]);
-		$this->user = $regs[2];
-		$this->pass = $regs[3];
-		$url = $regs[4];
-	}
-	else if (preg_match('#^([a-z]:|/)#', $url))	# Drive letter or absolute path
-		$this->protocol = 'file';
-	else
-		$this->protocol = 'http';
-
-	/* Default port */
-	if ($this->protocol == 'http')
-		$protoport = 80;
-	else if ($this->protocol == 'https')
-		$protoport = 443;
-
-	$this->port = $protoport;
-
-	if (class_exists('Net_IDNA'))
-		$idn = Net_IDNA::getInstance();
-
-	# With IDN support any character except a slash may be part of the host name
-	$pattern = $idn ? '#^([^/]+)/*(.*)$#isD' : '#^([a-z0-9_:\.-]+)/*(.*)$#isD';
-
-	if (preg_match($pattern, $url, $regs))
-	{
-		$hostport = explode(':', $regs[1]);
-		$this->realhostname = strtolower($hostport[0]);
-
-		if (!empty($hostport[1]))
-			$this->port = intval($hostport[1]);
-
-		$url = $regs[2];
-	}
-
-	$this->hostname = preg_replace('/^www\./', '', $this->realhostname);
-
-	# Get rid of common index file names
-	$url = preg_replace('#(^|/)(index\.[ps]?html?|index\.php[34]?|default\.aspx?)$#', '', $url);
-
-	$this->path = ($url === '/') ? '' : $url;
-
-	# Only mention the port in the canonical URL if it is not the default of the protocol
-	if ($this->port != $protoport)
-		$this->url = "$this->protocol://$this->realhostname:$this->port/$this->path";
-	else
-		$this->url = "$this->protocol://$this->realhostname/$this->path";
-
-	# $this->url keeps the readable host name, $this->realhostname gets the IDN encoded one
-	if ($idn)
-	{
-		$realhostname = $this->realhostname;
-		$encoding = isset($options['encoding']) ? $options['encoding'] : '';
-
-		if (!preg_match('/^utf-?8$/i', $encoding))
-			$realhostname = utf8_encode($realhostname);
-
-		$encoded = $idn->encode($realhostname);
-
-		if ($encoded != $realhostname)
-			$this->realhostname = $encoded;
-	}
-}
-
-
-/**
- * Read the page into memory, extract title and description and
- * set $this->page, $this->title and $this->description.
- * The page is fetched by running wget; up to five fetch attempts are made
- * while following redirects.
- * @param $timeout Timeout for operation, defaults to unlimited (0)
- * @return True if page has been read and $this->page is set, 0 if host name is considered bogus
- */
-function read_page($timeout = 0)
-{
-	$this->page = null;
-	$this->title = null;
-	$this->description = null;
-
-	/*
-	** If the URL does not contain a dot followed by at least one character,
-	** it is considered bogus. This prevents 'localhost', 'www', and numerical IP addresses.
-	*/
-	if (!preg_match('#\.[a-z]+$#iD', (string)$this->realhostname))
-		return 0;
-
-	$wget = 'LANG=C wget 2>&1 -T ' . ((int)$timeout);
-	$agent = ' -U "Mozilla/4.0 (Compatible; Relog ITools)" -O - -- ';	# "--" keeps an URL starting with a dash from being taken as wget option
-	$url = $this->rawurl;
-	$count = 0;
-
-	while ($this->page == '')
-	{
-		# escapeshellarg() quotes the URL as exactly one shell word, whatever characters it contains
-		$this->page = shell_exec($wget . ' -q' . $agent . escapeshellarg($url));
-
-		if ($this->page == '') /* An error occurred. Find out what it was. */
-		{
-			$error = shell_exec($wget . ' -v' . $agent . escapeshellarg($url));
-
-			if (preg_match('#Location: ([^ ]*)#i', (string)$error, $regs)) /* Redirect ? */
-			{
-				$url = $regs[1];
-				if (!preg_match('#^[a-z]+:#i', $url)) /* Kludge for Miss Kournikova's admirers: grok local redirects (in violation of RFC) */
-					$url = $this->rawurl.'/'.$url;
-			}
-			else
-				break;
-		}
-
-		if (++$count > 4) /* Avoid infinite redirect loops */
-			break;
-	}
-
-	$this->page_read = 1;
-
-	if (preg_match('#<title>([^<]*)</title>#i', (string)$this->page, $regs))
-		$this->title = it_htmlentities_decode($regs[1]);
-
-	if (preg_match('#<meta name="description"[^>]+content="([^"]*)">#i', (string)$this->page, $regs))
-		$this->description = it_htmlentities_decode($regs[1]);
-
-	return ($this->page != '');
-}
-
-
-/* Description of this page; the page is fetched first if that has not happened yet */
-function get_description()
-{
-	$this->page_read || $this->read_page();
-
-	return $this->description;
-}
-
-
-/* Title of this page; the page is fetched first if that has not happened yet */
-function get_title()
-{
-	$this->page_read || $this->read_page();
-
-	return $this->title;
-}
-
-/**
- * Check if a given url (currently http:port80-only) can be fetched
- * Note: Redirects are treated as successful
- * @param $timeout Timeout for connection in seconds
- * @return true if url could be fetched, i.e. server answered with a 2xx or 3xx status
- */
-function is_reachable($timeout = 5)
-{
-	$result = false;
-
-	if ($fp = @fsockopen($this->realhostname, $this->port, $errno, $errstr, $timeout))
-	{
-		fputs($fp, "GET /$this->path HTTP/1.0\r\nHost: $this->realhostname\r\nUser-Agent: ITools\r\n\r\n");
-		$line = fgets($fp, 128);	# Status line is all we need
-		fclose($fp);
-
-		#debug("it_url::is_reachable($this->rawurl: $line");
-
-		# fgets() yields false if the server hung up without answering
-		if ($line !== false)
-			$result = (bool)preg_match('#^' . preg_quote($this->protocol, '#') . '/[^ ]+ +[23]#i', $line);
-	}
-
-	return $result;
-}
-
-/**
- * Get simple URL with timeout. Can be called statically
- * @param $p URL string or parameter array with the following keys
- *   - url: url to get, defaults to constructor URL
- *   - timeout: timeout per read in milliseconds, defaults to 5000
- *   - data: post data array with key-value pairs; if it yields a non-empty query string the request is sent as POST
- *   - headers: array (name => value) of additional request headers, taking precedence over the built-in ones
- * @param $timeout Timeout in milliseconds used if $p has no key 'timeout'
- * @return contents of resulting page, considering redirects, excluding headers, or false on error.
- *   Only http is handled. A result code of 400 or higher counts as error; the body is then still stored in member data
- */
-function get($p=null, $timeout=5000)
-{
-	if (!is_array($p))
-		$p = array('url' => $p);
-
-	if (!isset($p['timeout']))
-		$p['timeout'] = $timeout;
-
-	if (!empty($p['url']))
-		$url = new it_url($p['url']);
-	else
-		$url =& $this; # Must be reference for $url->result and $url->data to work
-
-	$url->result = $result = false;
-	$url->data = null;
-	$url->headers = array();
-	$data = '';	# Urlencoded post data
-	$headers = '';	# Request headers as sent on the wire
-
-	if ($url->protocol == 'http')
-	{
-		if ($fp = @fsockopen($url->realhostname, $url->port, $errno, $errstr, $p['timeout']/1000))
-		{
-			# urlencode data pairs if is array
-			if (isset($p['data']) && is_array($p['data']))
-				$data = it_url::params($p['data']);
-
-			$p['headers'] = (isset($p['headers']) ? (array)$p['headers'] : array()) + array(
-				'Host' => $url->realhostname,
-				'User-Agent' => "Mozilla/4.0 (compatible; MSIE 6.0; ITools)",
-				'Accept-Language' => T_lang(),
-			);
-
-			if ($datalen = strlen($data))
-			{
-				$method = "POST";
-				$p['headers'] += array(
-					'Content-Type' => "application/x-www-form-urlencoded",
-					'Content-Length' => $datalen,
-				);
-			}
-			else
-				$method = "GET";
-
-			if ($url->user || $url->pass)
-				$p['headers'] += array('Authorization' => 'Basic ' . base64_encode($url->user . ':' . $url->pass));
-
-			foreach ($p['headers'] as $header => $value)
-				$headers .= "$header: $value\r\n";
-
-			stream_set_timeout($fp, intval($p['timeout']/1000), ($p['timeout']%1000)*1000);
-			@fputs($fp, "$method /$url->path HTTP/1.0\r\n$headers\r\n$data");
-
-			# Read response headers up to the empty line that separates them from the body
-			while (!feof($fp) && ($line = @fgets($fp, 10240)) && ($line = trim($line)))
-			{
-				if (preg_match('#^(HTTP\S+)\s(\d+)#', $line, $parts)) # Parse result code
-					$url->headers[$parts[1]] = $url->result = $parts[2];
-				elseif (preg_match('#^Location: (https?://[^/]*)?(/)?(.*)$#i', $line, $parts) && ($parts[1] != $url->url)) # Handle redirects (supports relative and global)
-				{
-					unset($p['url'], $p['headers']['Host']);
-					$url->it_url($parts[1] ? $parts[1].$parts[2].$parts[3] : $url->protocol.'://'.$url->realhostname.($parts[2] ? $parts[2].$parts[3] : '/'.dirname($url->path).'/'.$parts[3]));
-					if (++$url->redir <= 4) /* Avoid infinite redirects */
-					{
-						@fclose($fp); # Do not keep the socket of the redirecting request open
-						return $url->get($p);
-					}
-				}
-				elseif (preg_match('#^([^:]+): (.*)$#', $line, $parts))
-					$url->headers[$parts[1]] = $parts[2];
-			}
-
-			if ($url->result)
-			{
-				if (isset($url->headers['Transfer-Encoding']) && ($url->headers['Transfer-Encoding'] == "chunked")) # Bogus HTTP/1.1 chunked answer from server (e.g. Wordpress/Apach2/PHP5)
-				{
-					while (($line = fgets($fp)) !== false)
-					{
-						list($size) = explode(';', $line); # Chunk size may be followed by extensions
-
-						if (!($len = hexdec(trim($size)))) # Size 0 marks the last chunk
-							break;
-
-						$chunk = "";
-
-						while (!feof($fp) && (strlen($chunk) < $len))
-						{
-							$buf = @fread($fp, $len - strlen($chunk));
-
-							if (($buf === false) || ($buf === '')) # Timeout or error: feof() would never become true
-								break;
-
-							$chunk .= $buf;
-						}
-
-						$url->data .= $chunk;
-
-						if (strlen($chunk) < $len) # Truncated answer, nothing sensible left to read
-							break;
-
-						fgets($fp); # Skip CRLF that terminates chunk data so next fgets() sees the next chunk size
-					}
-				}
-				else
-				{
-					while (!feof($fp))
-					{
-						$buf = @fread($fp, 20480);
-
-						if (($buf === false) || ($buf === '')) # Timeout or error: feof() would never become true
-							break;
-
-						$url->data .= $buf;
-					}
-				}
-
-				if ($url->result < 400)
-					$result =& $url->data;
-			}
-
-			@fclose($fp);
-		}
-	}
-
-	return $result;
-}
-
-
-/**
- * Build the name of the local file that caches an URL. The name is the md5 of
- * current language plus URL, stored in a subdirectory named after the first
- * two characters of that hash. Takes an URL string or named args:
- * @url remote url to get
- * @cachedir path to cache directory, defaults to var/urlcache below $GLOBALS['ULTRAHOME']
- */
-function get_cache_filename($p)
-{
-	$args = is_array($p) ? $p : array('url' => $p);
-	$args += array('cachedir' => $GLOBALS['ULTRAHOME'] . "/var/urlcache");
-
-	$hash = md5(T_lang() . $args['url']);
-	$bucket = substr($hash, 0, 2);
-
-	return $args['cachedir'] . "/" . $bucket . "/" . $hash;
-}
-
-
-
-/**
- * Store contents of url in a file and return file name. Provides locking. Call statically.
- * Requires www writeable var/urlcache in your service dir. Params in assoc array:
- * @p['url'] url to get
- * @p['timeout'] timeout in milliseconds, default 10000
- * @p['maxage'] maximum age of cache entries in seconds, default 86400
- * @p['cleanbefore'] cleanup is only attempted while time() % 86400 is below this number of seconds, default 7200
- * @p['preprocess'] callback function (or array for methods) to change received file or array('function' => ..., 'in' => $src, 'out' => $dst, ...) with callback function plus args
- * @p['safety'] value 0 means dont generate alert, value 1 means generate alerts on timeouts and failures
- * @p['keepfailed'] keep old versions of files if download fails (sending alerts conservatively)
- * @p['cachedir'] directory to store cache files in. NO TRAILING SLASH
- * @p['it_error'] parameters for it_error
- * @return name of the cache file (of the preprocessed copy if preprocess is given), null or false on failure
- */
-function get_cache($p = array())
-{
-	$p += array('timeout'=>10000, 'maxage'=>86400, 'cleanbefore'=>7200, 'safety'=>1, 'cachedir'=>$GLOBALS['ULTRAHOME']."/var/urlcache", 'it_error'=>array(), 'keepfailed'=>false, 'preprocess'=>null);
-
-	$path = it_url::get_cache_filename($p);
-	@mkdir(dirname($path));
-	$age = time() - @filemtime($path);
-
-	# expire forgotten locks
-	$lockmtime = @filemtime("$path.lock");
-	if ($lockmtime && time()-$lockmtime > 30)
-		@unlink("$path.lock");
-
-	if ($p['keepfailed'] && ($age>$p['maxage']) && ($dummy = @fopen("$path.lock", EDC('nocache') ? "w" : "x"))) # update our copy if we get the lock
-	{
-		# my job to refresh the cache entry
-		fclose($dummy);
-
-		# Touch existing file to prevent locking other getters
-		if (file_exists($path))
-			touch($path);
-
-		EDC('getcache', "refresh", $p['url'], $path);
-		if (($result = it_url::get($p['url'], $p['timeout'])))
-			it_url::_atomicwrite($path, $result);
-		else
-			touch($path); # Keep the old copy and mark it fresh so the download is not retried on every call
-
-		if ($p['safety'] == 1 && !$result)
-		{
-			$parts = @parse_url($p['url']);
-			it::error($p['it_error'] + array('title'=>"get_cache: download failures on {$p['url']}", 'id'=>$parts['host'])); # send err only if multi failure
-		}
-		@unlink("$path.lock");
-	}
-
-	# Remove ancient lock or cached file if it is too old
-	if (!$p['keepfailed'])
-		it_url::_expire($path, $p['maxage']);
-
-	if ($dummy = @fopen($path, EDC('nocache') ? "w" : "x")) # succeeds if file is missing (or we are in nocache mode)
-	{
-		# fill cache myself
-		fclose($dummy);
-		EDC('getcache', "new", $p['url'], $path);
-		$result = it_url::_atomicwrite($path, it_url::get($p['url'], $p['timeout']));
-	}
-	else
-	{
-		# get file from cache, potentially waiting if file is currently being transferred
-		EDC('getcache', "old", $p['url'], $path);
-		$result = it_url::_waitforpath($p + array('path' => $path));
-	}
-
-	if ($result && $p['preprocess'])
-	{
-		# Processed copy lives next to the source, suffixed with a short hash of the preprocess spec
-		$srcpath = $path;
-		$path .= substr(md5(serialize($p['preprocess'])), 0, 2);
-		it_url::_expire($path, $p['maxage']);
-
-		if ($dummy = @fopen($path, EDC('nocache') ? "w" : "x")) # in nocache mode, always succeed
-		{
-			fclose($dummy);
-			EDC('getcache', "process", $p['url'], $path);
-			$dstpath = "$path.preprocesstmp";
-
-			if (is_array($p['preprocess']) && $p['preprocess']['function']) # Needs is_array as it can be a string where dereferencing gives first character!
-				call_user_func($p['preprocess']['function'], array('in' => $srcpath, 'out' => $dstpath) + $p['preprocess']);
-			else
-				call_user_func($p['preprocess'], $srcpath, $dstpath);
-
-			if (!@filesize($dstpath) || !@rename($dstpath, $path))
-			{
-				@unlink($dstpath);
-				@unlink($path);
-				$result = false;
-			}
-			else
-				$result = $path;
-		}
-		else
-			$result = it_url::_waitforpath($p + array('path' => $path));
-
-		if ($result)
-		{
-			EDC('getcache', "processold", $p['url'], $path);
-			touch($result, @filemtime($srcpath)); # Ensure processed is never newer than src
-		}
-	}
-
-	# cache cleanup at night
-	if ((time()%86400 < $p['cleanbefore']) && (time()-@filemtime($p['cachedir'] . "/cleaned") > 80000))
-	{
-		touch($p['cachedir'] . "/cleaned");
-		$maxagemin = intval($p['maxage']/60);
-
-		# Quote twice: directory as one word for the inner shell, whole script as one word for bash -c
-		$cleanup = 'cd ' . escapeshellarg($p['cachedir']) . " && sleep 10 && find -mmin +$maxagemin -print0 | xargs -0 -r rm";
-		exec('nohup bash -c ' . escapeshellarg($cleanup) . ' </dev/null >/dev/null 2>&1 &');
-	}
-
-	return $result;
-}
-
-/**
- * Wait until a cache file is non-empty, i.e. has been filled by whoever fetches it
- * @param $p Named args: 'path' file to wait for, 'timeout' maximum wait in millisecs,
- *   'sleeptime' polling interval in millisecs (default 100), 'safety' 1 to report a
- *   timeout via it::error(), 'url' only used in that error message
- * @return $p['path'] if file became ready in time, otherwise null
- */
-function _waitforpath($p)
-{
-	$p += array('sleeptime' => 100, 'timeout' => 0, 'safety' => 0, 'url' => ''); # sleeptime: millisecs to wait
-	$result = null;
-
-	# wait until cache is ready, then read from cache
-	for ($maxpasses = $p['timeout'] / $p['sleeptime'], $passes = 0; (@filesize($p['path']) <= 0) && ($passes < $maxpasses); ++$passes)
-	{
-		usleep($p['sleeptime'] * 1000);
-		clearstatcache(); # Otherwise filesize() keeps returning the cached value
-	}
-
-	if ($passes < $maxpasses)
-		$result = $p['path'];
-	else if ($p['safety'] == 1)
-		it::error("timeout in it_url::get_cache(): url={$p['url']}, passes=$passes, maxpasses=$maxpasses, path={$p['path']}");
-
-	return $result;
-}
-
-/**
- * Write data to a temporary file in the target directory and rename it to $path,
- * so readers never see a partially written file
- * @param $path Name of file to create
- * @param $data Contents to write; false means: remove $path instead
- * @return $path if file has been written completely, otherwise null
- */
-function _atomicwrite($path, $data)
-{
-	$result = null;
-
-	if ($data !== false)
-	{
-		$data = (string)$data;
-		$tmpname = tempnam(dirname($path), "writetmp");
-
-		if (($tmpname !== false) && ($cachetmp = @fopen($tmpname, "w")))
-		{
-			$written = fputs($cachetmp, $data);
-			fclose($cachetmp);
-			@chmod($tmpname, 0664); # Best effort, tempnam() creates files readable by owner only
-
-			# Only report success if everything arrived on disk under the final name
-			if (($written === strlen($data)) && @rename($tmpname, $path))
-				$result = $path;
-		}
-
-		if (($result === null) && ($tmpname !== false))
-			@unlink($tmpname); # Do not leave debris of failed writes behind
-	}
-	else
-		@unlink($path);
-
-	return $result;
-}
-
-/**
- * Delete a cache file that is older than $maxage seconds, or that is still
- * empty more than 30 seconds after its last modification
- * @param $path File to check
- * @param $maxage Maximum age in seconds
- */
-function _expire($path, $maxage)
-{
-	if (!file_exists($path))
-		return;
-
-	$abandoned = (@filesize($path) == 0) && ((time() - @filemtime($path)) > 30);
-
-	if ($abandoned || (time() - @filemtime($path) > $maxage))
-	{
-		EDC('getcache', "expire", $path);
-		@unlink($path);
-	}
-}
-
-/**
- * Make an URL absolute by using host and protocol from current Apache request (but not port number)
- * URLs that already carry a scheme (http:, https:, or anything of the form scheme://) are returned unchanged
- * @param $url Optional URL ( foo.html, /foo.html, //host/bar.html, http://host/bar.html ), default self
- * @return absolute version of URL ( http[s]://host/bar.html )
- */
-function absolute($url=null)
-{
-	if (!isset($url))
-		$url = $_SERVER['PHP_SELF'];
-
-	# Must look at the scheme only: a relative "httpd.html" is not absolute
-	if (!preg_match('#^(https?:|[a-z][a-z0-9+.-]*://)#i', $url))
-	{
-		if (!preg_match('#^//#', $url)) # Host missing; a double slash further back (e.g. in query string) does not count
-		{
-			$dir = preg_replace('#/[^/]*$#D', '/', $_SERVER['PHP_SELF']);
-			$url = (substr($url, 0, 1) == '/') ? $url : "$dir$url";
-			$url = "//" . $_SERVER['HTTP_HOST'] . $url;
-		}
-
-		# Some servers set HTTPS to "off" for plain http requests
-		$https = !empty($_SERVER['HTTPS']) && (strtolower($_SERVER['HTTPS']) != 'off');
-		$url = "http" . ($https ? 's' : '') . ":$url";
-	}
-
-	return $url;
-}
-
-/**
- * Craft a valid redirect URL, send Location: header and terminate execution.
- * If EDC('noredir') is set, a link to the target is printed instead of the header.
- * @param $url Optional URL ( foo.html, /foo.html, //host/bar.html, http://host/bar.html ), default self
- * @return This method never returns.
- */
-function redirect($url = null)
-{
-	$target = it_url::absolute($url);
-
-	if (EDC('noredir'))
-		echo "<a href='" . htmlspecialchars($target, ENT_QUOTES) . "'>" . htmlspecialchars((string)$url, ENT_QUOTES) . "</a><br />"; # ENT_QUOTES: attribute is single-quoted
-	else
-		header('Location: '.preg_replace("/[\r\n].*/", '', $target)); # Security: cut after CR/LF
-
-	exit;
-}
-
-/**
- * Like urlencode() but comma and parentheses are left unencoded
- * @param $str String to encode
- * @return Encoded string
- */
-function encode($str)
-{
-	$escaped = array("%2C", "%28", "%29");
-	$plain = array(",", "(", ")");
-
-	return str_replace($escaped, $plain, urlencode($str));
-}
-
-/**
- * Build a query string (key=value pairs joined by &) from an array
- * @param $params Array to take values from, usually $_GET
- * @param $keys Keys to use; default: all
- * @return Query string without leading question mark
- */
-function params($params, $keys = null)
-{
-	$pairs = it_url::_params($params, $keys);
-
-	return implode("&", $pairs);
-}
-
-/**
- * Helper for params(): turn an array into a list of urlencoded key=value strings.
- * Nested arrays are handled recursively, their keys get wrapped as outer[inner].
- * Values of length zero are left out.
- * @param $params Array to take values from
- * @param $keys Keys to use; default: all
- * @return Array of key=value strings
- */
-function _params($params, $keys = null)
-{
-	$pairs = array();
-	$names = isset($keys) ? $keys : array_keys($params);
-
-	foreach ($names as $name)
-	{
-		if (is_array($params[$name]))
-		{
-			# Prefix the key of every pair from the nested array with our own key
-			foreach (it_url::_params($params[$name]) as $sub)
-			{
-				if (strlen($sub))
-					$pairs[] = it::replace(array('^([^=\[]*)' => $name . '[$1]'), $sub);
-			}
-		}
-		else if (strlen($params[$name]))
-			$pairs[] = urlencode($name) . "=" . it_url::encode($params[$name]);
-	}
-
-	return $pairs;
-}
-
-}
-
-?>