<?php
/*
**	$Id$
**
**	Copyright (C) 1995-2007 by the ITools Authors.
**	This file is part of ITools - the Internet Tools Library
**
**	ITools is free software; you can redistribute it and/or modify
**	it under the terms of the GNU General Public License as published by
**	the Free Software Foundation; either version 3 of the License, or
**	(at your option) any later version.
**
**	ITools is distributed in the hope that it will be useful,
**	but WITHOUT ANY WARRANTY; without even the implied warranty of
**	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
**	GNU General Public License for more details.
**
**	You should have received a copy of the GNU General Public License
**	along with this program.  If not, see <http://www.gnu.org/licenses/>.
**
**	url.class - URL parsing, retrieval and caching functions
*/

class it_url
{
	/* Examples below refer to HTTP://falcon:joshua@www.Relog.CH:80/default.asp */
	var $url;		/* Canonicalized URL, e.g. http://www.relog.ch/ */
	var $protocol;		/* Lowercased protocol, e.g. http */
	var $hostname;		/* Host name without leading "www.", e.g. relog.ch */
	var $realhostname;	/* Lowercased host name, e.g. www.relog.ch */
	var $port;		/* E.g. 80 */
	var $path;		/* Part after the host with common index file names stripped, e.g. empty */
	var $rawurl;		/* URL exactly as passed to the constructor */
	var $user;		/* E.g. falcon */
	var $pass;		/* E.g. joshua */

	var $page;		/* Page or empty */
	var $page_read;		/* true if page read */
	var $title;		/* Page title or empty */
	var $description;	/* Page description or empty */

	var $headers;		/* Headers of page fetched by get() */
	var $data;		/* Data part, even if return code is not 200 */
	var $result;		/* Return code of get() */
	var $redir = 0;		/* Redirect count */


/**
 * Constructor for PHP versions that no longer treat a method named like
 * the class as constructor. Delegates to it_url().
 * @param $url URL this object represents
 * @param $options Passed on to it_url()
 */
function __construct($url = null, $options = array())
{
	$this->it_url($url, $options);
}


/**
 * Constructor: canonicalize an URL. Also called by get() to re-initialize
 * the object when following a redirect, so it must stay callable as method.
 * @param $url URL this object represents
 * @param $options['encoding'] Encoding of $url, only used for IDNA encoding; anything but utf-8 is passed through utf8_encode()
 */
function it_url($url, $options = array())
{
	$this->rawurl = $url;
	$protoport = 0;	# Default port of the protocol; stays 0 if protocol has no known default
	$idn = null;

	# Split off protocol and optional user[:pass]@ part. Credentials may not
	# contain slashes, otherwise an @ in the path would be taken for credentials
	if (preg_match('#^([a-z]+):/+(?:([^:/@]*)(?::([^/@]*))?@)?(.*)$#i', (string)$url, $regs))
	{
		$this->protocol = strtolower($regs[1]);
		$this->user = $regs[2];
		$this->pass = $regs[3];
		$url = $regs[4];
	}
	else if (preg_match('#^(?:[a-z]:|/)#', (string)$url))	# Drive letter or absolute path
		$this->protocol = 'file';
	else
		$this->protocol = 'http';

	/* Default port */
	if ($this->protocol == 'http')
		$protoport = 80;
	else if ($this->protocol == 'https')
		$protoport = 443;

	$this->port = $protoport;

	if (class_exists('Net_IDNA', false))
		$idn = Net_IDNA::getInstance();

	# With IDNA support any character is allowed in host names
	$pattern = $idn ? '#^([^/]+)/*(.*)$#is' : '#^([a-z0-9_:.-]+)/*(.*)$#is';

	if (preg_match($pattern, (string)$url, $regs))
	{
		$hostport = explode(':', $regs[1]);
		$this->realhostname = strtolower($hostport[0]);

		if (!empty($hostport[1]))
			$this->port = intval($hostport[1]);

		$url = $regs[2];
	}

	$this->hostname = preg_replace('/^www\./', '', (string)$this->realhostname);

	# Get rid of common index file names
	$url = preg_replace('#(^|/)(index\.[ps]?html?|index\.php[34]?|default\.aspx?)$#', '', (string)$url);

	$this->path = preg_replace('#^/$#D', '', $url);

	# Only mention the port if it differs from the protocol's default
	if ($this->port != $protoport)
		$this->url = "$this->protocol://$this->realhostname:$this->port/$this->path";
	else
		$this->url = "$this->protocol://$this->realhostname/$this->path";

	if ($idn)
	{
		$realhostname = $this->realhostname;
		$encoding = isset($options['encoding']) ? $options['encoding'] : '';

		if (!preg_match('/^utf-?8$/i', $encoding))
			$realhostname = utf8_encode($realhostname);

		$encoded = $idn->encode($realhostname);

		if ($encoded != $realhostname)
			$this->realhostname = $encoded;
	}
}


/**
 * Read the page into memory using wget, extract title and description and
 * set $this->page, $this->title and $this->description
 * @param $timeout Timeout for operation, defaults to unlimited (0)
 * @return True if page has been read and $this->page is set
 */
function read_page($timeout = 0)
{
	$this->page = null;
	$this->title = null;
	$this->description = null;

	/*
	** If the host name does not end in a dot followed by at least one letter,
	** it is considered bogus. This prevents 'localhost', 'www', and numerical IP addresses.
	*/
	if (!preg_match('/\.[a-z]+$/iD', (string)$this->realhostname))
		return false;

	$url = $this->rawurl;
	$count = 0;

	while ($this->page == '')
	{
		# escapeshellarg() quotes the URL as one single shell argument
		$cmd = 'LANG=C wget 2>&1 -T ' . ((int)$timeout) . ' -q -U "Mozilla/4.0 (Compatible; Relog ITools)" -O - ' . escapeshellarg((string)$url);
		$this->page = `$cmd`;

		if ($this->page == '')	/* An error occurred. Find out what it was. */
		{
			# Same request in verbose mode to get at wget's diagnostics
			$cmd = 'LANG=C wget 2>&1 -T' . ((int)$timeout) . ' -v -U "Mozilla/4.0 (Compatible; Relog ITools)" -O - ' . escapeshellarg((string)$url);
			$error = `$cmd`;

			if (preg_match('/Location: ([^ ]*)/i', (string)$error, $regs)) /* Redirect ? */
			{
				$url = $regs[1];
				if (!preg_match('/^[a-z]+:/i', $url))	/* Kludge for Miss Kournikova's admirers: grok local redirects (in violation of RFC) */
					$url = $this->rawurl.'/'.$url;
			}
			else
				break;
		}

		if (++$count > 4)	/* Avoid infinite redirect loops */
			break;
	}

	$this->page_read = 1;

	if (preg_match('#<title>([^<]*)</title>#i', (string)$this->page, $regs))
		$this->title = it_htmlentities_decode($regs[1]);

	if (preg_match('#<meta name="description"[^>]+content="([^"]*)">#i', (string)$this->page, $regs))
		$this->description = it_htmlentities_decode($regs[1]);

	return ($this->page != '');
}


/* Return the description of this page, reading the page first if necessary */
function get_description()
{
	if (!$this->page_read)
		$this->read_page();

	return $this->description;
}


/* Return the title of this page, reading the page first if necessary */
function get_title()
{
	if (!$this->page_read)
		$this->read_page();

	return $this->title;
}

/**
 * Check if a given url (currently http:port80-only) can be fetched
 * Note: Redirects are treated as succesful
 * @param $timeout Timeout for connection in seconds
 * @return true if url could be fetched
 */
function is_reachable($timeout = 5)
{
	$result = false;

	if ($fp = @fsockopen($this->realhostname, $this->port, $errno, $errstr, $timeout))
	{
		fputs($fp, "GET /$this->path HTTP/1.0\r\nHost: $this->realhostname\r\nUser-Agent: ITools\r\n\r\n");
		$line = fgets($fp, 128);	# Status line; false if server sent nothing
		fclose($fp);

		#debug("it_url::is_reachable($this->rawurl: $line");
		# Status codes 2xx and 3xx count as reachable
		$result = (bool)preg_match('#^' . preg_quote($this->protocol, '#') . '/[^ ]+ +[23]#i', (string)$line);
	}

	return $result;
}

/**
 * Get simple URL with timeout. Can be called statically
 * Sets result, headers and data of the URL object used for the request.
 * @param $p parameter array with the following keys
 * @param $p['url']: url to get, defaults to constructor URL
 * @param $p['timeout']: timeout per read in seconds, defaults to 5. fractions allowed
 * @param $p['totaltimeout']: timeout for the whole function call
 * @param $p['data']: POST data array with key-value pairs
 * @param $p['headers']: array of additional request headers, taking precedence over the default ones
 * @return contents of resulting page, considering redirects, excluding headers, or false on error
 */
function get($p=null, $timeout=5)
{
	if (!is_array($p))
		$p = array('url' => $p, 'timeout' => $timeout);

	$p += array('totaltimeout' => "999999", 'timeout' => 5, 'url' => null, 'data' => null, 'headers' => array());

	if ($p['url'])
		$url = new it_url($p['url']);
	else
		$url = $this;	# Objects are handles: $url->result and $url->data end up in $this

	$url->result = $result = false;
	$url->data = null;
	$url->headers = array();
	$p['timeout'] = min($p['timeout'], $p['totaltimeout']);	# No operation may be longer than totaltimeout
	$endtime = time() + $p['totaltimeout'];

	if ($url->protocol == 'http')
	{
		if ($fp = @fsockopen($url->realhostname, $url->port, $errno, $errstr, $p['timeout']))
		{
			# urlencode data pairs if is array
			$data = is_array($p['data']) ? it_url::params($p['data']) : '';

			# Caller supplied headers win over the defaults
			$p['headers'] = (array)$p['headers'] + array(
				'Host' => $url->realhostname,
				'User-Agent' => "Mozilla/4.0 (compatible; MSIE 6.0; ITools)",
				'Accept-Language' => T_lang(),
			);

			if ($datalen = strlen($data))
			{
				$method = "POST";
				$p['headers'] += array(
					'Content-Type' => "application/x-www-form-urlencoded",
					'Content-Length' => $datalen,
				);
			}
			else
				$method = "GET";

			if ($url->user || $url->pass)
				$p['headers'] += array('Authorization' => 'Basic ' . base64_encode($url->user . ':' . $url->pass));

			$headers = '';
			foreach ($p['headers'] as $header => $value)
				$headers .= "$header: $value\r\n";

			stream_set_timeout($fp, intval($p['timeout']), intval(($p['timeout']*1000000)%1000000));
			@fputs($fp, "$method /$url->path HTTP/1.0\r\n$headers\r\n$data");

			# Read response headers up to the empty line separating them from the body
			while (!feof($fp) && ($line = @fgets($fp, 10240)) && ($line = trim($line)) && (time() < $endtime))
			{
				if (preg_match('#^(HTTP\S+)\s(\d+)#', $line, $parts)) # Parse result code
					$url->headers[$parts[1]] = $url->result = $parts[2];
				elseif (preg_match('#^Location: (https?://[^/]*)?(/)?(.*)$#i', $line, $parts) && ($parts[1] != $url->url)) # Handle redirects (supports relative and global)
				{
					unset($p['url'], $p['headers']['Host']);
					$url->it_url($parts[1] ? $parts[1].$parts[2].$parts[3] : $url->protocol.'://'.$url->realhostname.($parts[2] ? $parts[2].$parts[3] : '/'.dirname($url->path).'/'.$parts[3]));
					if (++$url->redir <= 4)  /* Avoid infinite redirects */
					{
						@fclose($fp);	# Don't leak the connection of the redirecting response
						return $url->get($p);
					}
				}
				elseif (preg_match('#^([^:]+): (.*)$#', $line, $parts))
					$url->headers[$parts[1]] = $parts[2];
			}

			if ($url->result)
			{
				if (isset($url->headers['Transfer-Encoding']) && ($url->headers['Transfer-Encoding'] == "chunked"))	# Bogus HTTP/1.1 chunked answer from server (e.g. Wordpress/Apach2/PHP5)
				{
					while (!feof($fp) && (time() < $endtime) && (($line = @fgets($fp)) !== false))
					{
						# Chunk header is the size in hex, optionally followed by ";extension"
						if (!preg_match('/^\s*([0-9a-f]+)/i', $line, $parts) || !($len = hexdec($parts[1])))
							break;	# Size 0 marks the last chunk

						$chunk = "";

						while (!feof($fp) && (strlen($chunk) < $len) && (time() < $endtime))
							$chunk .= @fread($fp, $len - strlen($chunk));

						$url->data .= $chunk;

						if (strlen($chunk) < $len)
							break;	# Truncated chunk: connection closed or out of time

						@fgets($fp);	# Skip CRLF after chunk data so next read gets the next chunk size
					}
				}
				else
				{
					while (!feof($fp) && (time() < $endtime))
						$url->data .= @fread($fp, 20480);
				}

				if ($url->result < 400)
					$result = $url->data;
			}

			@fclose($fp);
		}
	}

	return time() < $endtime ? $result : false;
}


/**
 * Construct a local file name to cache an URL. Named args:
 * @param $p['url'] remote url to get
 * @param $p['cachedir'] path to cache directory
 * @return File name below cachedir, in a subdirectory named after the first two characters of the hash of language and url
 */
function get_cache_filename($p)
{
	if (!is_array($p))
		$p = array('url'=>$p);
	$p += array('cachedir' => $GLOBALS['ULTRAHOME'] . "/var/urlcache");
	$filename = md5(T_lang() . $p['url']);

	return $p['cachedir'] . "/" . substr($filename, 0, 2) . "/$filename";
}


/**
 * Store contents of url in a file and return file name. Threadsafe: Provides locking. Called statically.
 * Requires webserver writeable directory in $p['cachedir']. Params in associative array p:
 * @param $p['url']         url to get
 * @param $p['cachedir']    directory to store cache files in. NO TRAILING SLASH
 * @param $p['timeout']     timeout in seconds, default 10. fractions allowed
 * @param $p['maxage']      maximum age of cache entries in seconds, default 86400
 * @param $p['cleanbefore'] maximum daytime when attempting cleanup, default 7200
 * @param $p['preprocess']  callback function (or array for methods) to change received file or array('function' => ..., 'in' => $src, 'out' => $dst, ...) with callback function plus args
 * @param $p['safety']      value 0 means dont generate alert, value 1 means generate alerts on timeouts and failures
 * @param $p['keepfailed']  keep old versions of files if download fails (sending alerts conservatively)
 * @param $p['it_error']    parameters for it::error()
 * @return Name of file containing the (possibly preprocessed) data, or a false value on failure
 */
function get_cache($p = array())
{
	$p += array('timeout'=>10, 'maxage'=>86400, 'cleanbefore'=>7200, 'safety'=>1, 'cachedir'=>$GLOBALS['ULTRAHOME']."/var/urlcache", 'it_error'=>array(), 'keepfailed'=>false, 'preprocess'=>null);
	$p['totaltimeout'] = $p['timeout'];

	$path = it_url::get_cache_filename($p);
	@mkdir(dirname($path));
	$age = file_exists($path) ? (time() - @filemtime($path)) : 0;

	# expire forgotten locks
	$lockmtime = @filemtime("$path.lock");
	if ($lockmtime && time()-$lockmtime > 30)
		@unlink("$path.lock");

	# Mode "x" fails if the file exists, so only one process gets the lock
	if ($p['keepfailed'] && ($age>$p['maxage']) && ($dummy = @fopen("$path.lock", EDC('nocache') ? "w" : "x"))) # update our copy if we get the lock
	{
		# my job to refresh the cache entry
		fclose($dummy);

		# Touch existing file to prevent locking other getters
		touch($path);

		EDC('getcache', "refresh", $p['url'], $path);
		if (($result = it_url::get($p)))
			it_url::_atomicwrite($path, $result);
		else
			touch($path);

		if ($p['safety'] == 1 && !$result)
		{
			$parts = @parse_url($p['url']);
			it::error($p['it_error'] + array('title'=>"get_cache: download failures on {$p['url']}", 'id'=>$parts['host'])); # send err only if multi failure
		}
		@unlink("$path.lock");
	}

	# Remove ancient lock or cached file if it is too old
	if (!$p['keepfailed'])
		it_url::_expire($path, $p['maxage']);

	if ($dummy = @fopen($path, EDC('nocache') ? "w" : "x")) # succeeds if file is missing (or we are in nocache mode)
	{
		# fill cache myself; the empty file just created tells others to wait
		fclose($dummy);
		EDC('getcache', "new", $p['url'], $path);
		$result = it_url::_atomicwrite($path, it_url::get($p));
	}
	else
	{
		# get file from cache, potentially waiting if file is currently being transferred
		EDC('getcache', "old", $p['url'], $path);
		$result = it_url::_waitforpath($p + array('path' => $path));
	}

	if ($result && $p['preprocess'])
	{
		# Preprocessed data gets its own cache file, named after source file and preprocess spec
		$srcpath = $path;
		$path .= substr(md5(serialize($p['preprocess'])), 0, 2);
		it_url::_expire($path, $p['maxage']);

		if ($dummy = @fopen($path, EDC('nocache') ? "w" : "x")) # in nocache mode, always succeed
		{
			fclose($dummy);
			EDC('getcache', "process", $p['url'], $path);
			$dstpath = "$path.preprocesstmp";

			if (is_array($p['preprocess']) && !empty($p['preprocess']['function']))	# Needs is_array as it can be a string where dereferencing gives first character!
				call_user_func($p['preprocess']['function'], array('in' => $srcpath, 'out' => $dstpath) + $p['preprocess']);
			else
				call_user_func($p['preprocess'], $srcpath, $dstpath);

			# Empty or missing output counts as failure
			if (!@filesize($dstpath) || !@rename($dstpath, $path))
			{
				@unlink($dstpath);
				@unlink($path);
				$result = false;
			}
			else
				$result = $path;
		}
		else
			$result = it_url::_waitforpath($p + array('path' => $path));

		if ($result)
		{
			EDC('getcache', "processold", $p['url'], $path);
			touch($result, @filemtime($srcpath));	# Ensure processed is never newer than src
		}
	}

	# cache cleanup at night
	if ((date('H')*3600 + date('i')*60 < $p['cleanbefore']) && (time()-@filemtime($p['cachedir'] . "/cleaned") > 80000))
	{
		touch($p['cachedir'] . "/cleaned");
		$maxagemin = intval($p['maxage']/60);
		exec("nohup bash -c 'cd {$p['cachedir']} && sleep 10 && find -mmin +$maxagemin -print0 | xargs -0 -r rm' </dev/null >/dev/null 2>&1 &");
	}

	return $result;
}

/**
 * Wait while a cache file exists but is still empty
 * @param $p['path'] File to wait for
 * @param $p['timeout'] Maximum time to wait in seconds
 * @param $p['sleeptime'] Seconds to wait per pass, default 0.1
 * @param $p['safety'] If 1, report failure with it::error()
 * @param $p['url'] Url the file belongs to, for error message only
 * @return $p['path'] if file has content, otherwise null
 */
function _waitforpath($p)
{
	$p += array('sleeptime' => 0.1, 'timeout' => 0, 'safety' => 0, 'url' => ''); # sleeptime: seconds to wait per pass
	$result = null;
	$maxpasses = $p['timeout'] / $p['sleeptime'];

	# wait until cache is ready, then read from cache. A missing file (size false) ends the wait at once
	for ($passes = 0; (($size = @filesize($p['path'])) === 0) && ($passes < $maxpasses); ++$passes)
	{
		usleep((int)($p['sleeptime'] * 1000000));
		clearstatcache();
	}

	if ($size)
		$result = $p['path'];
	else if ($p['safety'] == 1)
		it::error(($passes < $maxpasses ? "error getting url" : "timeout") . " in it_url::get_cache(): url={$p['url']}, passes=$passes, maxpasses=$maxpasses, path={$p['path']}");

	return $result;
}

/**
 * Write data to a temporary file in the target directory and rename it to $path,
 * so readers never see a partially written file
 * @param $path Name of file to create
 * @param $data Data to write. If false, $path is removed instead
 * @return $path if file was written, otherwise null
 */
function _atomicwrite($path, $data)
{
	$result = null;

	if ($data !== false)
	{
		$data = (string)$data;
		$tmpname = tempnam(dirname($path), "writetmp");
		$cachetmp = ($tmpname !== false) ? fopen($tmpname, "w") : false;

		if ($cachetmp)
		{
			$written = fputs($cachetmp, $data);
			fclose($cachetmp);
			chmod($tmpname, 0664);

			# Only replace the target if everything made it to disk
			if (($written === strlen($data)) && rename($tmpname, $path))
				$result = $path;
		}

		if (!$result && ($tmpname !== false))
			@unlink($tmpname);	# Don't leave stray temporary files behind
	}
	else
		@unlink($path);

	return $result;
}

/**
 * Remove file if it is older than $maxage seconds, or if it is empty and older than 30 seconds
 * @param $path File to check
 * @param $maxage Maximum age in seconds
 */
function _expire($path, $maxage)
{
	if (!file_exists($path))
		return;

	$age = time() - @filemtime($path);

	# Remove ancient lock or cached file if it is too old
	if (((@filesize($path) == 0) && ($age > 30)) || ($age > $maxage))
	{
		EDC('getcache', "expire", $path);
		@unlink($path);
	}
}

/**
 * Make an URL absolute by using host an protocol from current Apache request (but not port number)
 * @param $url Optional URL ( foo.html, /foo.html, //host/bar.html, http://host/bar.html ), default self
 * @return absolute version of URL ( http[s]://host/bar.html )
 */
function absolute($url=null)
{
	if (!isset($url))
		$url = $_SERVER['PHP_SELF'];

	if (!preg_match('#^https?://#i', $url))
	{
		if (strncmp($url, '//', 2) != 0)	# Only a leading // makes an URL protocol-relative, not one in e.g. the query string
		{
			if (substr($url, 0, 1) != '/')	# Relative to directory of current script
				$url = preg_replace('#/[^/]*$#D', '/', $_SERVER['PHP_SELF']) . $url;

			$url = "//" . $_SERVER['HTTP_HOST'] . $url;
		}

		# Some servers set HTTPS to "off" for plain http requests
		$https = !empty($_SERVER['HTTPS']) && (strtolower($_SERVER['HTTPS']) != 'off');
		$url = "http" . ($https ? 's' : '') . ":$url";
	}

	return $url;
}

/**
 * Craft a valid redirect URL, send Location: header and terminate execution.
 * If EDC('noredir') is set, a link to the target is printed instead of redirecting
 * @param $url Optional URL ( foo.html, /foo.html, //host/bar.html, http://host/bar.html ), default self
 * @return This method never returns.
 */
function redirect($url = null)
{
	if (EDC('noredir'))
		echo "<a href='" . htmlspecialchars(it_url::absolute($url), ENT_QUOTES) . "'>" . htmlspecialchars((string)$url, ENT_QUOTES) . "</a><br />";	# ENT_QUOTES as attribute is in single quotes
	else
	{
		$url = preg_replace("/[\r\n].*/s", '', it_url::absolute($url));	# Security: cut after CR/LF
		header('Location: ' . it_untaint($url, TC_SELF));
	}

	exit;
}

/**
 * Urlencode but leave comma and parentheses as they are
 * @param $str String to encode
 * @return Encoded string
 */
function encode($str)
{
	return strtr(urlencode($str), array("%2C"=>",", "%28"=>"(", "%29"=>")"));
}

/**
 * Create GET request from params, optionally only using given fields
 * @param $params Array to take values from, usually $_GET
 * @param $keys Keys to use; default: all
 * @return String of key=value pairs joined by &; empty values are left out
 */
function params($params, $keys = null)
{
	return join("&", it_url::_params($params, $keys));
}

/**
 * Helper for params(): create list of encoded key=value pairs
 * @param $params Array to take values from, may contain nested arrays
 * @param $keys Keys to use; default: all
 * @return Array of strings of the form key=value
 */
function _params($params, $keys = null)
{
	$result = array();

	if (!isset($keys))
		$keys = array_keys($params);

	foreach ($keys as $key)
	{
		$param = isset($params[$key]) ? $params[$key] : null;

		if (is_array($param))
		{
			# Pairs of nested arrays get their key wrapped as key[subkey]
			foreach (it_url::_params($param) as $value)
			{
				if (strlen($value))
					$result[] = it::replace(array('^([^=\[]*)' => $key . '[$1]'), $value);
			}
		}
		else if (strlen((string)$param))
			$result[] = urlencode($key) . "=" . it_url::encode($param);
	}

	return $result;
}

}

?>