Moved itools to live branch

author: Christian Schneider 2006-10-26 13:35:12 +0000
committer: Christian Schneider 2006-10-26 13:35:12 +0000
commit: a5a19fd672bc0b8113d620669b557f17dccd343a (patch)
tree: 876ba4fec8362ac2e9374f61b9b7f67fcd2b8e59 /url.class
download: itools-a5a19fd672bc0b8113d620669b557f17dccd343a.tar.gz
itools-a5a19fd672bc0b8113d620669b557f17dccd343a.tar.bz2
itools-a5a19fd672bc0b8113d620669b557f17dccd343a.zip
1 files changed, 484 insertions, 0 deletions
diff --git a/url.class b/url.class
new file mode 100644
index 0000000..d11f9dd
--- /dev/null
+++ b/url.class
@@ -0,0 +1,484 @@
+<?php
+/*
+**	$Id$
+**
+**	ITools - the Internet Tools Library
+**
+**	Copyright (C) 1995-2003 by the ITools Authors.
+**	This program is free software; you can redistribute it and/or
+**	modify it under the terms of either the GNU General Public License
+**	or the GNU Lesser General Public License, as published by the Free
+**	Software Foundation. See http://www.gnu.org/licenses/ for details.
+**
+**	url.class - Create a URL object and canonicalize it
+*/
+
+class it_url
+{
+	/* Examples below are for the input HTTP://www.Relog.CH:80/default.asp */
+	var $url;		/* Canonical URL built by the constructor, e.g. http://www.relog.ch/ */
+	var $protocol;		/* Lowercased protocol, e.g. http */
+	var $hostname;		/* Host name without a leading "www.", e.g. relog.ch */
+	var $realhostname;	/* Lowercased host name, e.g. www.relog.ch (IDNA-encoded if Net_IDNA is loaded) */
+	var $port;		/* Port given in the URL or default port of the protocol, e.g. 80 */
+	var $path;		/* Path without leading slash and without trailing index file, e.g. empty here */
+	var $rawurl;		/* URL exactly as passed to the constructor, e.g. HTTP://www.Relog.CH:80/default.asp */
+
+	var $page;		/* Page contents fetched by read_page() or empty */
+	var $page_read;		/* Set to 1 once read_page() has tried to fetch the page */
+	var $title;		/* Page title extracted by read_page() or empty */
+	var $description;	/* Meta description extracted by read_page() or empty */
+
+	var $headers;		/* Headers of page fetched by get(), keyed by header name */
+	var $data;		/* Data part fetched by get(), even if return code is not 200 */
+	var $result;		/* Return code of get(), false if no response was read */
+
+
+/**
+ * Constructor: split a URL into its parts and canonicalize it.
+ * Lowercases protocol and host name, applies the default port for http and
+ * https, strips a trailing index file (index.html, default.asp, ...) from the
+ * path and, if the Net_IDNA class is loaded, IDNA-encodes the host name.
+ * Also called as a plain method by get() to re-initialize the object on redirects.
+ * @param $url URL this object represents
+ * @param $options Optional assoc array. Key 'encoding' names the charset of $url;
+ *        unless it is utf8/utf-8 the host name is run through utf8_encode() before IDNA encoding
+ */
+function it_url($url, $options = array())
+{
+	$this->rawurl = $url;
+
+	/* Protocol: explicit scheme, local file (drive letter or leading slash), else http */
+	if (preg_match('#^([a-z]+):/+(.*)$#isD', $url, $regs))
+	{
+		$this->protocol = strtolower($regs[1]);
+		$url = $regs[2];
+	}
+	else if (preg_match('#^(?:[a-z]:|/)#', $url))
+		$this->protocol = 'file';
+	else
+		$this->protocol = 'http';
+
+	/* Default port of the protocol, null for protocols without a known default */
+	$defaultports = array('http' => 80, 'https' => 443);
+	$protoport = isset($defaultports[$this->protocol]) ? $defaultports[$this->protocol] : null;
+	$this->port = $protoport;
+
+	/* Net_IDNA is optional; without it only plain ASCII host names are recognized */
+	$idn = class_exists('Net_IDNA') ? Net_IDNA::getInstance() : null;
+	$pattern = $idn ? '#^([^/]+)/*(.*)$#isD' : '#^([a-z0-9_:.-]+)/*(.*)$#isD';
+
+	if (preg_match($pattern, $url, $regs))
+	{
+		/* $regs[1] is host[:port]; the port part is optional */
+		$hostport = explode(':', $regs[1]);
+		$this->realhostname = strtolower($hostport[0]);
+
+		if (!empty($hostport[1]))
+			$this->port = $hostport[1];
+
+		$url = $regs[2];
+	}
+
+	if (preg_match('#^www\.(.*)$#sD', (string)$this->realhostname, $regs))
+		$this->hostname = $regs[1];
+	else
+		$this->hostname = $this->realhostname;
+
+	/* Strip a trailing index file. Names are quoted so that '.' only matches a literal dot */
+	$index_files = array('index.html', 'index.htm', 'index.phtml', 'index.shtml', 'index.php3', 'index.php', 'default.asp');
+
+	foreach ($index_files as $index_file)
+		$url = preg_replace('#(^|/)' . preg_quote($index_file, '#') . '$#iD', '', $url);
+
+	$this->path = preg_replace('#^/$#D', '', $url);
+
+	/* The port is only part of the canonical URL if it differs from the protocol default */
+	if ($this->port != $protoport)
+		$this->url = "$this->protocol://$this->realhostname:$this->port/$this->path";
+	else
+		$this->url = "$this->protocol://$this->realhostname/$this->path";
+
+	/* Note: $this->url above keeps the unencoded host name, only realhostname gets IDNA-encoded */
+	if ($idn)
+	{
+		$realhostname = $this->realhostname;
+		$encoding = isset($options['encoding']) ? $options['encoding'] : '';
+
+		if (!preg_match('/^utf-?8$/i', $encoding))
+			$realhostname = utf8_encode($realhostname);
+
+		$encoded = $idn->encode($realhostname);
+
+		if ($encoded != $realhostname)
+			$this->realhostname = $encoded;
+	}
+}
+
+
+/**
+ * Read the page into memory, extract title and description and
+ * set $this->page, $this->title and $this->description.
+ * The page is fetched with the external wget command, starting from the raw
+ * URL; at most five fetches are made while following redirects.
+ * @param $timeout Timeout for operation (passed to wget -T), defaults to unlimited (0)
+ * @return True if page has been read and $this->page is set, 0 if the host name is considered bogus
+ */
+function read_page($timeout = 0)
+{
+	$this->page = null;
+	$this->title = null;
+	$this->description = null;
+
+	/*
+	** If the host name does not end in a dot followed by at least one letter,
+	** it is considered bogus. This prevents 'localhost', 'www', and numerical IP addresses.
+	*/
+	if (!preg_match('#\.[a-z]+$#iD', (string)$this->realhostname))
+		return 0;
+
+	$wget = 'LANG=C wget 2>&1 -T ' . ((int)$timeout) . ' -U "Mozilla/4.0 (Compatible; Relog ITools)" -O - ';
+	$url = $this->rawurl;
+	$count = 0;
+
+	while ($this->page == '')
+	{
+		/* escapeshellarg() passes the URL as exactly one argument, whatever quotes or blanks it contains */
+		$this->page = shell_exec($wget . '-q ' . escapeshellarg($url));
+
+		if ($this->page == '')	/* An error occurred. Find out what it was. */
+		{
+			$error = (string)shell_exec($wget . '-v ' . escapeshellarg($url));
+
+			if (preg_match('#Location: ([^ ]*)#i', $error, $regs)) /* Redirect ? */
+			{
+				$url = $regs[1];
+				if (!preg_match('#^[a-z]+:#i', $url))	/* Grok local redirects without protocol (in violation of RFC) */
+					$url = $this->rawurl.'/'.$url;
+			}
+			else
+				break;
+		}
+
+		if (++$count > 4)	/* Avoid infinite redirect loops */
+			break;
+	}
+
+	$this->page_read = 1;
+
+	if (preg_match('#<title>([^<]*)</title>#i', (string)$this->page, $regs))
+		$this->title = it_htmlentities_decode($regs[1]);
+
+	if (preg_match('#<meta name="description"[^>]+content="([^"]*)">#i', (string)$this->page, $regs))
+		$this->description = it_htmlentities_decode($regs[1]);
+
+	return ($this->page != '');
+}
+
+
+/* Return the description of this page, reading the page first if that has not been done yet */
+function get_description()
+{
+	if ($this->page_read)
+		return $this->description;
+
+	$this->read_page();
+
+	return $this->description;
+}
+
+
+/* Return the title of this page, reading the page first if that has not been done yet */
+function get_title()
+{
+	if ($this->page_read)
+		return $this->title;
+
+	$this->read_page();
+
+	return $this->title;
+}
+
+/**
+ * Check if a given url (currently http:port80-only) can be fetched.
+ * Sends a plain HTTP/1.0 GET request and looks at the status line only.
+ * Note: Redirects are treated as successful (any 2xx or 3xx status counts)
+ * @param $timeout Timeout for connection in seconds
+ * @return true if url could be fetched, false otherwise
+ */
+function is_reachable($timeout = 5)
+{
+	$result = false;
+
+	if ($fp = @fsockopen($this->realhostname, $this->port, $errno, $errstr, $timeout))
+	{
+		fputs($fp, "GET /$this->path HTTP/1.0\r\nHost: $this->realhostname\r\nUser-Agent: ITools\r\n\r\n");
+		$line = fgets($fp, 128);
+		fclose($fp);
+
+		/* fgets() yields false if the server closed the connection without answering */
+		if (is_string($line))
+			$result = (bool)preg_match('#^' . preg_quote($this->protocol, '#') . '/[^ ]+ +[23]#i', $line);
+	}
+
+	return $result;
+}
+
+/**
+ * Get simple URL with timeout. Can be called statically.
+ * Only the http protocol is handled; for anything else false is returned.
+ * Sets result, headers and data of the URL object that was fetched.
+ * @param $url url to get, defaults to constructor URL
+ * @param $timeout timeout per read in milliseconds.
+ * @param $maxredirects maximum number of redirects to follow before giving up
+ * @return contents of resulting page, considering redirects, excluding headers, or false on error
+ */
+function get($url=null, $timeout=5000, $maxredirects=10)
+{
+	if ($url)
+		$url = new it_url($url);
+	else
+		$url =& $this;	# Must be reference for $url->result and $url->data to work
+
+	$url->result = $result = false;
+	unset($url->data);
+	$url->headers = array();
+
+	if ($url->protocol == 'http')
+	{
+		if ($fp = @fsockopen($url->realhostname, $url->port, $errno, $errstr, $timeout/1000))
+		{
+			stream_set_timeout($fp, intval($timeout/1000), ($timeout%1000)*1000);
+			@fputs($fp, "GET /$url->path HTTP/1.0\r\nHost: $url->realhostname\r\nUser-Agent: Mozilla/4.0 (compatible; MSIE 6.0; ITools)\r\n\r\n");
+
+			# Read header lines up to the empty line separating them from the body
+			while (!feof($fp) && ($line = @fgets($fp, 10240)) && ($line = trim($line)))
+			{
+				if (preg_match('#^(HTTP\S+)\s(\d+)#', $line, $parts)) # Parse result code
+					$url->headers[$parts[1]] = $url->result = $parts[2];
+				elseif (preg_match('#^Location: (https?://[^/]*)?(/)?(.*)$#i', $line, $parts) && ($parts[1] != $url->url)) # Handle redirects (supports relative and global)
+				{
+					@fclose($fp);	# Do not leak the socket while following the redirect
+
+					if ($maxredirects <= 0)	# Avoid infinite redirect loops
+						return false;
+
+					# Absolute target as is; otherwise same host with absolute path or path relative to current directory
+					$url->it_url($parts[1] ? $parts[1].$parts[2].$parts[3] : $url->protocol.'://'.$url->realhostname.($parts[2] ? $parts[2].$parts[3] : '/'.dirname($url->path).'/'.$parts[3]));
+					return $url->get(null, $timeout, $maxredirects - 1);
+				}
+				elseif (preg_match('#^([^:]+): (.*)$#', $line, $parts))
+					$url->headers[$parts[1]] = $parts[2];
+			}
+
+			if ($url->result)
+			{
+				$url->data = '';
+
+				while (!feof($fp))
+					$url->data .= @fread($fp, 20480);
+
+				# Error pages are kept in $url->data but not returned
+				if ($url->result < 400)
+					$result =& $url->data;
+			}
+
+			@fclose($fp);
+		}
+	}
+
+	return $result;
+}
+
+
+/**
+ * Build the name of the local file used to cache an URL. The name is the md5
+ * hash of the URL, placed in a subdirectory named after the first two
+ * characters of that hash. Takes either the URL itself or named args:
+ * @url remote url to get
+ * @cachedir path to cache directory, defaults to $GLOBALS['ULTRAHOME']/var/urlcache
+ */
+function get_cache_filename($p)
+{
+	$args = is_array($p) ? $p : array('url' => $p);
+	$args += array('cachedir' => $GLOBALS['ULTRAHOME'] . "/var/urlcache");
+	$hash = md5($args['url']);
+
+	return $args['cachedir'] . "/" . substr($hash, 0, 2) . "/" . $hash;
+}
+
+
+
+/**
+ * Store contents of url in a file and return file name. Provides locking. Call statically.
+ * Requires www writeable var/urlcache in your service dir. Params in assoc array:
+ * @url url to get
+ * @timeout timeout in milliseconds, default 10000
+ * @maxage maximum age of cache entries in seconds, default 86400
+ * @cleanbefore cleanup is only attempted while time() % 86400 is below this value (seconds), default 7200
+ * @preprocess callback function (or array for methods) to change received file; called with (source path, destination path)
+ * @safety value 0 means dont generate alert, value 1 (default) means generate alerts on timeouts and failures
+ * @keepfailed keep old versions of files if download fails (sending alerts conservatively)
+ * @cachedir directory to store cache files in. NO TRAILING SLASH
+ * @return path of the cached (or preprocessed) file; false or null if it could not be provided
+ */
+function get_cache($p = array())
+{
+	# Fill in defaults for all options the caller did not supply
+	$p += array('timeout'=>10000, 'maxage'=>86400, 'cleanbefore'=>7200, 'safety'=>1, 'cachedir'=>$GLOBALS['ULTRAHOME']."/var/urlcache");
+
+	$path =  it_url::get_cache_filename($p);
+	@mkdir(dirname($path));	# Two-character subdirectory of the entry; errors (e.g. already exists) are suppressed
+	$age = time() - @filemtime($path);	# A missing file makes filemtime() fail, resulting in a huge age
+
+	# expire forgotten locks (older than 30 seconds)
+	$lockmtime = @filemtime("$path.lock");
+	if ($lockmtime && time()-$lockmtime > 30)
+		@unlink("$path.lock");
+
+	# keepfailed mode: refresh an outdated entry in place. fopen mode "x" fails if the lock
+	# file already exists, so only one process gets the job.
+	# NOTE(review): EDC() is defined elsewhere; EDC('nocache') presumably is a debug switch forcing a refetch - confirm
+	if ($p['keepfailed'] && ($age>$p['maxage']) && ($dummy = @fopen("$path.lock", EDC('nocache') ? "w" : "x"))) # update our copy if we get the lock
+	{ 
+		# my job to refresh the cache entry
+		fclose($dummy);
+		EDC('getcache', "refresh", $p['url'], $path);
+		if (($result = it_url::get($p['url'], $p['timeout'])))
+			it_url::_atomicwrite($path, $result);
+		else 
+			touch($path);	# Download failed: keep old contents but reset the age so the next call does not retry at once
+
+		# Report the outcome (success as well as failure) keyed by host name.
+		# NOTE(review): it::error() is defined elsewhere; the ok_* keys presumably throttle alerts - confirm
+		$parts = parse_url($p['url']);
+		if ($p['safety'] == 1)
+			it::error(array('title'=>"get_cache: download failures on $path", 'ok_key'=>md5($parts['host']), 'ok_delay'=>$p['maxage'], 'ok'=>$result ? 1 : 0)); # send err only if multi failure
+		@unlink("$path.lock");
+	}
+
+	# Remove ancient lock or cached file if it is too old
+	if (!$p['keepfailed'])
+		it_url::_expire($path, $p['maxage']);
+
+	# Creating the file exclusively leaves an empty placeholder; _waitforpath() in other
+	# processes waits until the file has a size greater than zero
+	if ($dummy = @fopen($path, EDC('nocache') ? "w" : "x")) # succeeds if file is missing (or we are in nocache mode)
+	{
+		# fill cache myself
+		fclose($dummy);
+		EDC('getcache', "new", $p['url'], $path);
+		$result = it_url::_atomicwrite($path, it_url::get($p['url'], $p['timeout']));
+	}
+	else
+	{
+		# get file from cache, potentially waiting if file is currently being transferred
+		EDC('getcache', "old", $p['url'], $path);
+		$result = it_url::_waitforpath($p + array('path' => $path));
+	}
+
+	# Optional second stage: the preprocessed version is cached in a file of its own whose
+	# name is the source path plus two hex characters derived from the callback
+	if ($result && $p['preprocess'])
+	{
+		$srcpath = $path;
+		$path .= substr(md5(serialize($p['preprocess'])), 0, 2);
+		it_url::_expire($path, $p['maxage']);
+
+		if ($dummy = @fopen($path, EDC('nocache') ? "w" : "x")) # in nocache mode, always succeed
+		{
+			# The callback writes to a temporary file which then replaces the placeholder
+			fclose($dummy);
+			EDC('getcache', "process", $p['url'], $path);
+			$dstpath = "$path.preprocesstmp";
+			call_user_func($p['preprocess'], $srcpath, $dstpath);
+
+			# An empty or missing result counts as failure: remove temporary file and placeholder
+			if (!@filesize($dstpath) || !@rename($dstpath, $path))
+			{
+				@unlink($dstpath);
+				@unlink($path);
+				$result = false;
+			}
+			else
+				$result = $path;
+		}
+		else
+			$result = it_url::_waitforpath($p + array('path' => $path));
+
+		# Runs for freshly processed files as well as for ones taken from the cache
+		if ($result)
+		{
+			EDC('getcache', "processold", $p['url'], $path);
+			touch($result, filemtime($srcpath));	# Ensure processed is never newer than src
+		}
+	}
+
+	# cache cleanup at night: at most once per 80000 seconds (tracked by mtime of the
+	# "cleaned" marker file), a background shell removes everything in cachedir that
+	# has not been modified for maxage (converted to minutes)
+	if ((time()%86400 < $p['cleanbefore']) && (time()-@filemtime($p['cachedir'] . "/cleaned") > 80000))
+	{
+		touch($p['cachedir'] . "/cleaned");
+		$maxagemin = intval($p['maxage']/60);
+		exec("nohup bash -c 'cd {$p['cachedir']} && sleep 10 && find -mmin +$maxagemin -print0 | xargs -0 -r rm' </dev/null >/dev/null 2>&1 &");
+	}
+
+	return $result;
+}
+
+/**
+ * Wait until a cache file is filled (has a size greater than zero), polling
+ * every sleeptime milliseconds for at most timeout milliseconds. Named args:
+ * @path file to wait for
+ * @timeout maximum time to wait in milliseconds
+ * @sleeptime milliseconds to sleep between checks, default 100
+ * @safety value 1 means report a timeout via it::error()
+ * @url url the file belongs to, only used in the timeout message
+ * @return path of the file if it became available in time, null on timeout
+ */
+function _waitforpath($p)
+{
+	$p += array('sleeptime' => 100); # millisecs to wait
+
+	$result = null;
+	$maxpasses = $p['timeout'] / $p['sleeptime'];
+
+	# wait until cache is ready, then read from cache
+	for ($passes = 0; (@filesize($p['path']) <= 0) && ($passes < $maxpasses); ++$passes)
+	{
+		usleep($p['sleeptime'] * 1000);
+		clearstatcache();	# Otherwise filesize() keeps returning the cached value
+	}
+
+	if ($passes < $maxpasses)
+		$result = $p['path'];
+	else if ($p['safety'] == 1)
+		it::error("timeout in it_url::get_cache(): url={$p['url']}, passes=$passes, maxpasses=$maxpasses, path={$p['path']}");
+
+	return $result;
+}
+
+/**
+ * Write data to a file atomically: the data goes to a temporary file in the
+ * same directory which is then renamed to its final name, so readers never
+ * see a partially written file.
+ * @param $path name of file to write
+ * @param $data contents to write, or false to remove $path instead
+ * @return $path if the file has been written, null otherwise
+ */
+function _atomicwrite($path, $data)
+{
+	$result = null;
+
+	if ($data !== false)
+	{
+		$tmpname = tempnam(dirname($path), "writetmp");
+
+		if ($tmpname && ($cachetmp = fopen($tmpname, "w")))
+		{
+			$written = fputs($cachetmp, $data);
+			fclose($cachetmp);
+			chmod($tmpname, 0664);	# tempnam() creates files readable by the owner only
+
+			# Only a completely written file may replace the old one
+			if (($written === strlen((string)$data)) && rename($tmpname, $path))
+				$result = $path;
+		}
+
+		# Do not leave temporary files behind on failure
+		if (!$result && $tmpname && file_exists($tmpname))
+			unlink($tmpname);
+	}
+	else if (file_exists($path))
+		unlink($path);
+
+	return $result;
+}
+
+/**
+ * Remove a cache or lock file if it is outdated: either it is still empty
+ * more than 30 seconds after its last modification or it is older than $maxage.
+ * @param $path file to check
+ * @param $maxage maximum age in seconds
+ */
+function _expire($path, $maxage)
+{
+	if (!file_exists($path))
+		return;
+
+	$age = time() - @filemtime($path);
+	$stale_empty = (@filesize($path) == 0) && ($age > 30);
+
+	if ($stale_empty || ($age > $maxage))
+	{
+		EDC('getcache', "expire", $path);
+		@unlink($path);
+	}
+}
+
+/**
+ * Make an URL absolute by using host and protocol from current Apache request (but not port number)
+ * URLs which already start with a protocol (scheme://) are returned unchanged.
+ * @param $url Optional URL ( foo.html, /foo.html, //host/bar.html, http://host/bar.html ), default self
+ * @return absolute version of URL ( http[s]://host/bar.html )
+ */
+function absolute($url=null)
+{
+	if (!isset($url))
+		$url = $_SERVER['PHP_SELF'];
+
+	if (!preg_match('#^[a-z][a-z0-9+.-]*://#i', $url))
+	{
+		# Only a leading // denotes a host; a // further back (e.g. in a query string) does not
+		if (!preg_match('#^//#', $url))
+		{
+			$dir = preg_replace('#/[^/]*$#D', '/', $_SERVER['PHP_SELF']);
+			$url = preg_match('#^/#', $url) ? $url : "$dir$url";
+			$url = "//" . $_SERVER['HTTP_HOST'] . $url;
+		}
+
+		# Some servers set HTTPS to "off" for plain http requests instead of leaving it unset
+		$https = isset($_SERVER['HTTPS']) && ($_SERVER['HTTPS'] != '') && (strtolower($_SERVER['HTTPS']) != 'off');
+		$url = "http" . ($https ? 's':'') . ":$url";
+	}
+
+	return $url;
+}
+
+/**
+ * Craft a valid redirect URL, send Location: header and terminate execution.
+ * If EDC('noredir') is set, a link to the target is printed instead of sending the header.
+ * @param $url Optional URL ( foo.html, /foo.html, //host/bar.html, http://host/bar.html ), default self
+ * @return This method never returns.
+ */
+function redirect($url = null)
+{
+	if (EDC('noredir'))
+		echo "<a href='" . htmlspecialchars(it_url::absolute($url), ENT_QUOTES) . "'>" . htmlspecialchars($url, ENT_QUOTES) . "</a><br />";	# ENT_QUOTES: the attribute is single-quoted
+	else
+		header('Location: '.preg_replace("/[\r\n].*/", '', it_url::absolute($url)));	# Security: cut after CR/LF
+
+	exit;
+}
+
+/**
+ * Urlencode a string but leave comma and parentheses unencoded
+ * @param $str string to encode
+ * @return encoded string
+ */
+function encode($str)
+{
+	$keep = array('%2C' => ',', '%28' => '(', '%29' => ')');
+
+	return str_replace(array_keys($keep), array_values($keep), urlencode($str));
+}
+
+}
+
+?>
author	Christian Schneider	2006-10-26 13:35:12 +0000
committer	Christian Schneider	2006-10-26 13:35:12 +0000
commit	a5a19fd672bc0b8113d620669b557f17dccd343a (patch)
tree	876ba4fec8362ac2e9374f61b9b7f67fcd2b8e59 /url.class
download	itools-a5a19fd672bc0b8113d620669b557f17dccd343a.tar.gz itools-a5a19fd672bc0b8113d620669b557f17dccd343a.tar.bz2 itools-a5a19fd672bc0b8113d620669b557f17dccd343a.zip