<?php
/*
**	Copyright (C) 1995-2022 by the ITools Authors.
**	This file is part of ITools - the Internet Tools Library
**
**	ITools is free software; you can redistribute it and/or modify
**	it under the terms of the GNU General Public License as published by
**	the Free Software Foundation; either version 3 of the License, or
**	(at your option) any later version.
**
**	ITools is distributed in the hope that it will be useful,
**	but WITHOUT ANY WARRANTY; without even the implied warranty of
**	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
**	GNU General Public License for more details.
**
**	You should have received a copy of the GNU General Public License
**	along with this program.  If not, see <http://www.gnu.org/licenses/>.
**
**	url.class - URL parsing, retrieval and caching functions
*/

class it_url
{
	/* Example values below are for rawurl HTTP://falcon:joshua@www.Relog.CH.:80/default.asp */
	var $url;		/* Canonicalized URL, e.g. http://www.relog.ch/default.asp */
	var $protocol;		/* E.g. http */
	var $hostname;		/* Hostname without leading www., e.g. relog.ch */
	var $realhostname;	/* Full hostname, e.g. www.relog.ch */
	var $port;		/* E.g. 80, set even if not given explicitly */
	var $explicitport;	/* E.g. :80, only set if explicitly given in rawurl, includes leading colon */
	var $path;		/* Path and query without leading slash, e.g. default.asp */
	var $rawurl;		/* URL as passed to the constructor, e.g. HTTP://falcon:joshua@www.Relog.CH.:80/default.asp */
	var $user;		/* E.g. falcon */
	var $pass;		/* E.g. joshua */
	var $cookies;		/* key => values of cookies from server */
	var $headers;		/* Headers of page fetched by get() */
	var $data;		/* Data part, even if return code is not 200 */
	var $status;		/* HTTP response code of get() */
	var $result;		/* Deprecated, copy of $status */
	var $redir = 0;		/* Redirect count */
	var $header;		/* Raw HTTP response header of last request */
	var $errstr;		/* Error string of last request */
	var $curlinfo;		/* Result of curl_getinfo() for last request */

	static $forceretry = "^(5..)$";

/**
 * Constructor: canonicalize a URL
 * @param $url URL this object represents
 */
function __construct($url = null)
{
	$this->rawurl = $url;
	$comp = parse_url($url);
	$this->protocol = strtolower($comp['scheme']) ?: "http";
	$protoport = $this->protocol == 'https' ? 443 : 80;			# port according to protocol
	$this->port = intval($comp['port'] ?: $protoport);			# this is set even in default case
	$this->explicitport = $comp['port'] ? ':' . $comp['port'] : '';		# only set if explicitly specified in url, contains leading :
	$this->user = $comp['user'];
	$this->pass = $comp['pass'];
	$this->realhostname = strtolower($comp['host']);
	$this->hostname = preg_replace('/^www\./', '', $this->realhostname);
	$this->path = ltrim($comp['path'] . ($comp['query'] ? '?' . $comp['query'] : ''), '/');	# $this->path is named poorly, it includes path and query
	$this->url = "$this->protocol://$this->realhostname" . ($this->port != $protoport ? $this->explicitport : '') . "/$this->path";
	$this->realhostname = idn_to_ascii($this->realhostname ?: "0", IDNA_DEFAULT, INTL_IDNA_VARIANT_UTS46) ?: $this->realhostname;		# punycode or original
}
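
/*
 * Usage sketch (illustrative, not part of the original code; the URL below is a
 * made-up example showing how the fields above get filled):
 *
 *	$u = new it_url('HTTP://user:secret@www.Example.COM:8080/foo.html?q=1');
 *	# $u->protocol = "http", $u->port = 8080, $u->explicitport = ":8080"
 *	# $u->realhostname = "www.example.com", $u->hostname = "example.com"
 *	# $u->path = "foo.html?q=1" (path plus query, no leading slash)
 *	# $u->url = "http://www.example.com:8080/foo.html?q=1"
 */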


/**
 * Check if a given url can be fetched
 * Note: Redirects are treated as successful
 * @param $p parameter array passed on to get(), or a plain URL string
 * @return true if url could be fetched (HTTP status 2xx or 3xx)
 */
static function is_reachable($p = [])
{
	$result = static::get((is_array($p) ? $p : ['url' => $p]) + ['maxlength' => 1000, 'totaltimeout' => 5, 'assoc' => true, 'it_error' => false]);
	return $result['status'] >= 200 && $result['status'] < 400;
}
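
/*
 * Usage sketch (illustrative; example.com is a placeholder):
 *
 *	if (it_url::is_reachable("https://www.example.com/"))
 *		...;	# got a 2xx/3xx response within the short limits set above
 */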

# internal
static function _postprocess($data, $p)
{
	if ($p['postprocess'])
		$data = ($t = $p['postprocess']($data, ['it_error' => $p['retries'] > 0 ? false : (array)$p['it_error'] + ['title' => "invalid content from " . $p['url']]])) && $p['checkonly'] ? $data : $t;

	return $data;
}

/**
 * Get simple URL with timeout and one retry. Can be called statically. Times out and calls it::error for all errors
 *
 * Request initiation
 * @param $p parameter array with the following keys
 * @param $p['url']           url to get, defaults to constructor URL
 * @param $p['headers']       optional assoc array of HTTP headers to send, e.g. ['Host' => "foo"]
 * @param $p['data']          POST data array with key-value pairs
 * @param $p['files']         [fieldname => filename] of files to upload
 * @param $p['maxlength']     maximum length of response
 * @param $p['filemtime']     Add HTTP header to only fetch when newer than this, otherwise return true instead of data
 * @param $p['accept_encoding'] Contents of the "Accept-Encoding: " header. Enables decoding of the response. Set to null to disable, "" (default) for all supported encodings.
 * @param $p['protocols']     Array of protocols to accept, defaults to ['http', 'https'], @see curl_opts for other values
 *
 * Problem handling
 * @param $p['retries']       Number of retries if download fails, default 1
 * @param $p['timeout']       inactivity timeout seconds, default 5. fractions ok. silent by default
 * @param $p['totaltimeout']  timeout for the whole attempt, but see $p['retries']
 * @param $p['retrysleep']    Number of seconds to wait before retry (additional to fetchsleep), fractions ok
 * @param $p['safety']        DEPRECATED. 0 = ignore errors, 1 = errors, 2 = fatals
 * @param $p['it_error']      extra arguments for it_error or false to ignore errors
 * @param $p['fetchsleep']    Number of seconds to wait after fetch, fractions ok
 * @param $p['body_on_fail']  Return body of page even if http status code is >= 400, e.g. some JSON APIs return 404 with JSON data
 *
 * Result processing
 * @param $p['assoc']         Return [ 'data' => string, 'status' => int, 'cookies' => array, 'headers' => array, 'errstr' => string ] instead of just data
 * @param $p['writefunction'] function to be called whenever data is received (for server-sent-events etc.)
 * @param $p['postprocess']   function called with content and $p which has it_error. returns content or null (which triggers retry)
 * @param $p['followlocation'] Follow redirects [true]
 *
 * @return Content of resulting page (considering redirects, excluding headers), or false on error, or array if 'assoc' => true
 */
static function get($p = [])
{
	return (new static)->_get($p);
}
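
/*
 * Usage sketches for get() (illustrative; URLs are placeholders, keys are the ones
 * documented above):
 *
 *	# Plain fetch, returns the body or false:
 *	$html = it_url::get(['url' => "https://www.example.com/", 'timeout' => 5, 'retries' => 2]);
 *
 *	# POST with full response details:
 *	$r = it_url::get([
 *		'url'     => "https://www.example.com/api",
 *		'data'    => ['q' => "zurich"],
 *		'headers' => ['Accept' => "application/json"],
 *		'assoc'   => true,
 *	]);
 *	# $r['status'], $r['data'], $r['headers'], $r['cookies'], $r['errstr']
 */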

/**
 * Non-static alias for get so we can make get() static
 */
function _get($p = [])
{
	$p = is_string($p) ? ['url' => $p, 'timeout' => 5] : $p;
	$p += array('retries' => 1);

	if (($filter = EDC('req')) && ($filter == 1 || strstr($p['url'], "/$filter.")))
		if ($p['url'])
			ED($p);
		else
			ED($this->url, $p);

	if ($p['url'])
		$this->__construct($p['url']);

	$result = $this->request($p + ['followlocation' => true]);
	$result = self::_postprocess($result, $p);

	if ($p['retries'] > 0 && self::retry_warranted($result, $this->status))
	{
		usleep($p['retrysleep']*1000000);
		$result = $this->_get(array('retries' => $p['retries'] - 1) + $p);
	}

	if (($filter = EDC('res')) && strstr($p['url'], it::replace(array('1' => ":"), $filter)))
		ED($result);

	usleep($p['fetchsleep'] * 1000000);

	if ($p['assoc'])
		$result = [ 'status' => intval($this->status) ?: 503, 'data' => $result !== false ? $this->data : null, 'headers' => $this->headers, 'cookies' => $this->cookies, 'errstr' => $this->errstr ];

	EDC('curlinfo', $this->status, $this->headers, $this->cookies, $this->errstr);

	return $result;
}

static function retry_warranted($result, $status)
{
	return $result ? it::match(self::$forceretry, $status) : !it::match('^(204|4..)$', $status);
}

function parse_http_header($header)
{
	foreach (explode("\n", trim($header)) as $line)
	{
		$line = trim($line);
		if (preg_match('#^(HTTP)\S+\s(\d+)#', $line, $parts)) # Parse result code
			$this->headers[$parts[1]] = $this->status = $this->result = $parts[2];
		else if (preg_match('#^([^:]+): (.*)$#', $line, $parts))
			$this->headers[ucwords($parts[1], '-')] = $parts[2];
		if (strtolower($parts[1]) == 'set-cookie' && preg_match('/^([^=]+)=([^;]*)/', $parts[2], $cookie))
			$this->cookies[$cookie[1]] = $cookie[2];
	}
}

static function _default_headers($url, $p)
{
	$search_subrequest = it::match('search\.ch/', $p['url']);
	if ((!it::is_devel() || EDC('subreqcheck')) && $p['url'] && !$p['headers']['Accept-Language'] && T_lang() != T_defaultlang() && $search_subrequest && !it::match('/login\b|banner\.html|machines\.txt|mbtiles\.php|/fonts/|/itjs/|/images/|\.(de|fr|en|it)(\.js|\.html|\.txt|\.php|\.ics|\.pdf|\.json|\.csv|\.gif|\.jpg|\.png)', $p['url']))
		it::error(['title' => "Subrequest without language override", 'body' => [ $p ]]);

	$headers = array_filter([
		'Host' => $url->realhostname . $url->explicitport,
		'User-Agent' => "Mozilla/5.0 (compatible; ITools; Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582)",
		'Accept-Language' => $p['headers']['Accept-Language'] ?? ($search_subrequest ? T_defaultlang() : T_lang()), # can prevent loading of it_text
		'Referer' => it::match('([-\w]+\.\w+)$', $url->hostname) == it::match('([-\w]+\.\w+)$', $_SERVER['HTTP_HOST']) ? it::replace(['%[0-9a-f]?$' => ''], substr(static::absolute(U($_GET)), 0, 8000)) : null,	# Truncate overly long referers leading to failed subrequest but make sure it is still properly urlencoded
		'X-Ultra-Https' => $_SERVER['HTTPS'],
	]);

	if (is_int($p['filemtime']))
		$headers['If-Modified-Since'] = date("r", $p['filemtime']);
	return $headers;
}

static function curl_opts($p=array())
{
	$p += [
		'totaltimeout' => "999999",
		'timeout' => 5,
		'followlocation' => !$p['files'], # disallow redirects for file uploads as recommended by https://curl.se/libcurl/security.html
		'accept_encoding' => '',          # set header to accept any supported encoding and enable automatic decompression
		'protocols' => ['http', 'https'], # Array with allowed protocols, see list below
	];
	$protocols = [
		'file'  => CURLPROTO_FILE,
		'ftp'   => CURLPROTO_FTP,
		'ftps'  => CURLPROTO_FTPS,
		'http'  => CURLPROTO_HTTP,
		'https' => CURLPROTO_HTTPS,
		'scp'   => CURLPROTO_SCP,
		'sftp'  => CURLPROTO_SFTP,
	];

	$add = [];

	if (it::grep("[\n\r]", it::map('"$k$v"', $p['headers'])))
		it::error(['title' => "Newline in headers", 'body' => $p['headers']]);

	foreach ($p['headers'] as $header => $value)
		$headers[] = strtr("$header: $value", "\n\r", '  ');

	# file upload
	foreach ((array)$p['files'] as $field => $filename)
		$p['data'][$field] = new CURLFile($filename, mime_content_type($filename));

	if ($p['data'])
		$add += [ CURLOPT_POSTFIELDS => $p['data'] ];

	if ($p['pass'] || $p['user'])
		$add += [ CURLOPT_HTTPAUTH => CURLAUTH_BASIC, CURLOPT_USERPWD => $p['user'] . ':' . $p['pass'] ];

	if ($p['writefunction'])
	{
		$add += [
			CURLOPT_RETURNTRANSFER   => false,
			CURLOPT_WRITEFUNCTION    => $p['writefunction'],
		];
	}

	if ($p['sslkey'])
		$add += [CURLOPT_SSLKEY => $p['sslkey']];

	if ($p['sslcert'])
		$add += [CURLOPT_SSLCERT => $p['sslcert']];

	$add += EDC('curlinfo') ? [CURLINFO_HEADER_OUT => 1] : [];
	$add += [CURLOPT_COOKIEFILE => ""];

	if ($p['verbose'] || EDC('curlverbose'))
		$add += [ CURLOPT_VERBOSE => true ];

	if (isset($p['accept_encoding']))
		$add += [CURLOPT_ENCODING => $p['accept_encoding']]; # NOTE: the curl library renamed the option to CURLOPT_ACCEPT_ENCODING, in php both are possible, CURLOPT_ENCODING is documented

	return $add + [
		CURLOPT_HEADER => false,
		CURLOPT_RETURNTRANSFER => true,
		CURLOPT_TIMEOUT_MS => $p['totaltimeout'] * 1000,	# use _MS to support fractions of seconds
		CURLOPT_LOW_SPEED_LIMIT => 5,
		CURLOPT_LOW_SPEED_TIME => $p['timeout'],
		CURLOPT_FOLLOWLOCATION => $p['followlocation'],
		CURLOPT_MAXREDIRS => 20,
		CURLOPT_HTTPHEADER => $headers,
		CURLOPT_CUSTOMREQUEST => $p['method'] ?: null,
		CURLOPT_NOBODY => $p['method'] == 'HEAD',
		CURLOPT_SAFE_UPLOAD => true,	# disable special meaning of @value in POST forms (security)
		CURLOPT_PROTOCOLS => array_reduce($p['protocols'], fn($c, $v) => $c | $protocols[$v], 0),

		CURLOPT_CAPATH         => '/etc/ssl/certs/',
		CURLOPT_SSL_VERIFYPEER => !$p['allow_insecure_ssl'],
		CURLOPT_SSL_VERIFYHOST => $p['allow_insecure_ssl'] ? 0 : 2,
	];
}

/*
 * Drop-in replacement for request() using curl
 *
 * @param $p['data']         POST data array with key-value pairs
 * @param $p['files']        [fieldname => filename] of files to upload
 * @param $p['method']       HTTP method to use instead of the default, e.g. "HEAD"
 * @param $p['verbose']      generate and capture curl verbose output in $this->verbose and alert mails
 */

function request($p=array())
{
	static $curl_handles = [];
	$url = $this;
	if ($p['url'])
		$this->__construct($p['url']);
	$this->errstr = "";

	$url->headers = array();
	$p['headers'] = array_filter((array)$p['headers'] + self::_default_headers($url, $p), 'strlen');
	$opts = self::curl_opts($p + array('user' => $this->user, 'pass' => $this->pass, 'followlocation' => false));
	if ($p['verbose'])
	{
		$stderr = it::fopen("php://memory", "r+");
		$opts += [CURLOPT_STDERR => $stderr, CURLOPT_VERBOSE => 1];
	}
	if (!($curl = $curl_handles[getmypid()]))
		$curl = $curl_handles[getmypid()] = curl_init($url->url);
	else
	{
		curl_reset($curl);
		curl_setopt($curl, CURLOPT_URL, $url->url);
	}


	# FIXME 2025-01 NG just use CURLOPT_MAXFILESIZE if we have curl 8.4
	$content = "";
	if ($p['maxlength'] && !$p['writefunction'])
	{
		$opts[CURLOPT_WRITEFUNCTION] = function ($dummy, $data) use ($p, &$content) {
			static $space;
			$write = min($space ??= $p['maxlength'], strlen($data));
			$content .= substr($data, 0, $write);
			$space -= $write;
			return $write;
		};
	}

	$opts[CURLOPT_HEADERFUNCTION] = function ($dummy, $data) use (&$header) {
		$header .= $data;
		return strlen($data);
	};

	curl_setopt_array($curl, $opts);

	$got = curl_exec($curl);
	$body = $origbody = $p['maxlength'] && $got ? $content : $got;

	$this->curlinfo = curl_getinfo($curl);
	EDC('curlinfo', $this->curlinfo);

	if ($body !== false || curl_errno($curl) == 23)	# 23 = CURLE_WRITE_ERROR, i.e. our write callback stopped the transfer after maxlength bytes
	{
		$url->header = array_slice(explode("\r\n\r\n", trim($header)), -1)[0] . "\r\n\r\n";
		$url->data = $body;

		$url->parse_http_header($url->header);

		# Change result status for content longer than maxlength to 204 as we do not return partial data but still want to indicate success e.g. for is_reachable
		if ($p['maxlength'] && $url->status == 200 && strlen($content) && !$body)
			$url->status = $this->result = 204;

		if ($p['filemtime'] && ($url->status == 304))
		{
			$result = true;	# Not modified, success but no data
		}
		else if ($url->status == 414)
		{
			it::error((array)$p['it_error'] + ['title' => "Request-URI Too Long: " . substr($url->url, 0, 100) . "...(truncated " . (strlen($url->url) - 100) . " bytes)", 'body' => curl_getinfo($curl) + ($p['verbose'] ? ['verbose' => $this->verbose] : [])]);
			$this->errstr = "HTTP Status " . $url->status;
		}
		else
		{
			if ($url->status >= 400 && (!$p['body_on_fail'] || $p['keepfailed']))
				$body = $url->data = false;
			$result =& $url->data;
			$this->errstr = "HTTP Status " . $url->status;
		}
	}
	else
	{
		$result = $this->status = $this->result = false;
		$this->errstr = trim(curl_strerror(curl_errno($curl)) . "(" . curl_errno($curl) . ") " . curl_error($curl));
	}

	if ($p['verbose'])
	{
		rewind($stderr);
		$this->verbose = stream_get_contents($stderr);
		fclose($stderr);
	}

	if ($body === false && $p['retries'] <= 0 && self::retry_warranted($result, $this->status))
	{
		it::error((array)$p['it_error'] + [
			'title' => "problem " . ($p['method'] ?: "gett") . "ing $url->url: " . $this->errstr,
			'body' => $this->curlinfo + ($p['verbose'] ? ['verbose' => $this->verbose] : []) + ['body' => @grapheme_substr($origbody, 0, 2000)],
		]);
	}

	return $result;
}
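
/*
 * Usage sketch for request() (illustrative; normally reached indirectly through get(),
 * which adds retries and postprocessing on top):
 *
 *	$u = new it_url("https://www.example.com/upload");
 *	$body = $u->request(['data' => ['note' => "hello"], 'files' => ['attachment' => "/tmp/report.pdf"]]);
 *	# $u->status, $u->headers and $u->cookies are filled as a side effect
 */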



/**
 * Get multiple URLs in parallel with timeout. Needs to be called statically
 * @param $p parameter array with the following keys (same as it_url::get)
 * @param $p['urls']           array/generator of urls to get
 * @param $p['timeout']        timeout per read in seconds, defaults to 5. (TODO: fractions allowed?)
 * @param $p['totaltimeout']   timeout for the whole attempt (fractions ok). see $p['retry']
 * @param $p['followlocation'] follow redirects [true]
 * @param $p['headers']        optional array of HTTP headers to send
 * @param $p['parallel']       max number of parallel requests
 * @param $p['noresults']      do not keep results around
 * @return array of contents (or false for errors like timeouts) of resulting pages using same
 *         keys as the urls input array, considering redirects, excluding headers
 */
static function get_multi($p=null)
{
	static $curl_multi_handles = [];
	$p += array('retries' => 1);

	EDC('req', $p);

	$url = new it_url;
	$p['headers'] = (array)$p['headers'] + array_diff_key(self::_default_headers($url, $p), ['Host' => null]);

	$opts = self::curl_opts($p);

	if (!($mh = $curl_multi_handles[getmypid()]))
		$mh = $curl_multi_handles[getmypid()] = curl_multi_init();

	$keys = $handles = $urls = $retries = [];
	$addhandle = function ($key, $url) use (&$keys, &$handles, &$urls, $opts, $mh) {
		$urls[$key] = $url;
		$handle = curl_init();
		curl_setopt($handle, CURLOPT_URL, it::replace([ '^//' => "http://" ], is_array($url) ? $url['url'] : $url)); # HTTP OK
		curl_setopt_array($handle, $opts);
		curl_multi_add_handle($mh, $handle);
		$keys[(int)$handle] = $key;
		$handles[$key] = $handle;
	};
	$closehandle = function ($key) use (&$keys, &$handles, $mh) {
		curl_multi_remove_handle($mh, $handles[$key]);
		curl_close($handles[$key]);
		unset($keys[(int)$handles[$key]]);
		unset($handles[$key]);
	};

	if (!$p['noresults'])
		$keyorder = array_keys($p['urls']);

	if (is_array($p['urls']))
		$iterator  = (new ArrayObject($p['urls']))->getIterator();
	else
		$iterator = $p['urls'];

	$parallel = $p['parallel'] ?: PHP_INT_MAX;
	while (count($handles) < $parallel && $iterator->valid())
	{
		$addhandle($iterator->key(), $iterator->current());
		$iterator->next();
	}

	$start = gettimeofday(true);

	# curl_multi loop copied from example at https://php.net/manual/en/function.curl-multi-exec.php
	$active = null;
	do {
		$mrc = curl_multi_exec($mh, $active);
	} while ($mrc == CURLM_CALL_MULTI_PERFORM);

	$timeout = 0.001;	# Very short timeout to work around problem with first select call on cURL 7.25.0
	while (!$abort && (($active && $mrc == CURLM_OK) || count($handles) > 0 || $sleepuntils))
	{
		if (curl_multi_select($mh, $timeout) == -1)
			usleep($timeout * 1000000);

		do {
			$mrc = curl_multi_exec($mh, $active);

			while (($info = curl_multi_info_read($mh)) !== false)
			{
				if ($info['msg'] == CURLMSG_DONE)
				{
					$key = $keys[(int)$info['handle']];
					$content = curl_multi_getcontent($info['handle']);
					$status = curl_getinfo($handles[$key], CURLINFO_RESPONSE_CODE);
					if ($status >= 400 && (!$p['body_on_fail'] || $p['keepfailed']))
						$content = false;
					else if (isset($p['postprocess']))
						$content = $p['postprocess']($content, ['it_error' => $retries[$key] < $p['retries'] ? false : (array)$p['it_error'] + ['title' => "invalid content from " . $urls[$key]]]);

					EDC('reqtimings', $key, $info['result'], (gettimeofday(true) - $start) * 1000);
					if ($info['result'] == CURLE_OK && $content !== null)
					{
						if (!$p['noresults'])
							$results_unordered[$key] = $content;

						if (self::retry_warranted($content, $status) && $retries[$key]++ < $p['retries'])
						{
							$sleepuntils[$key] = microtime(true) + $p['retrysleep'];
						}
						else
						{
							if (is_array($urls[$key]) && ($handler = $urls[$key]['handler']))
								$abort = $handler($info['handle'], $content);
							unset($urls[$key]);
						}
						$closehandle($key);
					}
					else if($retries[$key]++ < $p['retries'])
					{
						$closehandle($key);  # closehandle must be called before addhandle as we use the same key
						$sleepuntils[$key] = microtime(true) + $p['retrysleep'];
					}
					else
					{
						$results_unordered[$key] = false;
						unset($urls[$key]);
						$closehandle($key);
					}

					if (!$abort && count($handles) < $parallel && $iterator->valid())
					{
						$addhandle($iterator->key(), $iterator->current());
						$iterator->next();
					}
				}
			}
		} while ($mrc == CURLM_CALL_MULTI_PERFORM);

		foreach ((array)$sleepuntils as $key => $time)
		{
			if (microtime(true) >= $time && count($handles) < $parallel)
			{
				$addhandle($key, $urls[$key]);
				unset($sleepuntils[$key]);
			}
			$active = 1;
		}
		usleep($sleepuntils ? 100000 : 0);

		$timeout = 0.1;	# Longer delay to avoid busy loop but shorter than default of 1s in case we still hit cURL 7.25.0 problem
	}

	foreach ($handles as $key => $dummy)
		$closehandle($key);
	curl_multi_close($mh);

	$result = $p['noresults'] ? null : it::filter_keys($results_unordered, $keyorder, ['reorder' => true]);
	EDC('res', $result);

	return $result;
}
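
/*
 * Usage sketch for get_multi() (illustrative URLs; result keys mirror the keys of
 * $p['urls'] as documented above):
 *
 *	$pages = it_url::get_multi([
 *		'urls'     => ['home' => "https://www.example.com/", 'news' => "https://www.example.com/news"],
 *		'timeout'  => 5,
 *		'parallel' => 10,
 *	]);
 *	# $pages['home'] and $pages['news'] contain the bodies, or false on failure
 */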

/**
 * Construct a local directory name to cache a URL. Named args:
 * @param $p['cachedir']    directory to store cache files in, defaults to $ULTRAHOME/var/urlcache
 * @param $p['id']          If you need more than one type of cache (e.g. different maxage) you can specify an id
 */
static function get_cache_dir($p)
{
	$p += array('cachedir' => $GLOBALS['ULTRAHOME'] . "/var/urlcache", 'id' => "default");
	return rtrim($p['cachedir'] . "/" . $p['id'], "/");
}


/**
 * Construct a local file name to cache a URL. Takes language into account. Named args:
 * @param $p['url']            remote url to get
 * @param $p['cachedir']       directory to store cache files in, @see get_cache_dir
 * @param $p['cachefilename']  Use this filename instead of calculating one if this is given
 * @param $p['headers']        optional request headers, included in the hash (Authorization is ignored)
 * @param $p['data']           POST data array with key-value pairs
 * @param $p['id']             If you need more than one type of cache (e.g. different maxage) you can specify an id
 */
static function get_cache_filename($p)
{
	if (!is_array($p))
		$p = array('url' => $p);

	$p['cachedir'] = it_url::get_cache_dir($p);
	unset($p['headers']['Authorization']); # prevent ever changing filenames due to changing Bearer tokens
	$filename = $p['cachefilename'] ?: md5(T_lang() . T_defaultlang() . $p['url'] . ($p['headers'] ? serialize($p['headers']) : "") . ($p['data'] ? serialize($p['data']) : "") . $_SERVER['HTTP_X_SERVICE_PATH']);

	return $p['cachedir'] . "/" . substr($filename, 0, 2) . "/$filename";
}


/**
 * Store contents of url in a file and return file name. Threadsafe: Provides locking. Called statically.
 * Requires webserver-writable directory in $p['cachedir']. Sends it::error on failure by default. Params:
 * @param $p['url']         url to get
 * @param $p['id']          dirname for cached files; same id should have same expire policy
 * @param $p['headers']     optional array of HTTP headers to send
 * @param $p['cachedir']    directory to store cache files in, @see get_cache_dir
 * @param $p['timeout']     timeout in seconds, default 10. fractions allowed
 * @param $p['maxage']      maximum age of cache entries in seconds, default 23 hours. id mandatory if given
 * @param $p['randomexpire'] chance to randomly expunge an entry, 0..1
 * @param $p['cleanbefore'] maximum seconds since midnight when initiating expire, default 10800
 * @param $p['preprocess']  callback function (or array for methods) to change received file or array('function' => ..., 'in' => $src, 'out' => $dst, ...) with callback function plus args
 * @param $p['safety']      DEPRECATED. see $p['it_error']
 * @param $p['it_error']    parameters for it::error(), false means ignore errors, anything else gets passed to it::error() if errors occur
 * @param $p['keepfailed']  keep old versions of files if download fails
 * @param $p['returnheaders'] Return array($path, $headers, $cachemiss) instead of simply $path
 * @param $p['postprocess'] UNSUPPORTED, use ::get_cache_contents
 * @param $p['lock']        prevent multiple requests to same url from different processes [true]
 * @return Cache filename or false if fetch failed
 */
static function get_cache($p = array())
{
	if (!$p['id'] && $p['maxage'])
		it::error("calling get_cache with maxage and without id");

	$p += ['timeout' => 10, 'maxage' => 23 * 3600, 'cleanbefore' => 10800, 'lock' => true, 'it_error' => $p['safety'] === 0 ? false : ($p['safety'] == 2 ? ['fatal' => true] : [])];
	$p['totaltimeout'] = $p['timeout'];
	$path = it_url::get_cache_filename($p);	# Must be before changing cachedir below
	$p['cachedir'] = it_url::get_cache_dir($p);

	@mkdir($p['cachedir']);
	@mkdir(dirname($path));

	if (!is_writable(dirname($path)))
		it::error("parent dir not writable: " . trim(it::exec('ls -ld {dir} 2>&1', ['dir' => dirname($path)])));

	if (($filemtime = it_url::_expired($path, $p['maxage'], $p['randomexpire'])) || ($p['returnheaders'] && !file_exists(("$path.json")))) # Outdated (non-zero int) or non-existent (true)?
	{
		$fileexists = $filemtime !== true;

		if ($lock = !$p['lock'] ?: it_url::_lock($path))
		{
			# Touch existing file to prevent locking other getters while refreshing
			if ($fileexists)
				touch($path);

			EDC('getcache', "new", $filemtime, $p['url'], $path);
			$url = new it_url;
			$data = $url->_get($p + ['checkonly' => true, 'filemtime' => EDC('nocache') ? null : $filemtime]);
			if ($p['assoc'] ? ($data['status'] < 500 || $data['data']) : $data)
			{
				$success = true;
				$isnewfile = it_url::_atomicwrite($path, $p['assoc'] ? ($data['status'] === 304 ? true : it::json_encode($data)) : $data);	# $data === true means not modified (no new data fetched) and instructs _atomicwrite to just touch the file
				if ($p['returnheaders'])
					it::file_put("$path.json", it::json_encode($url->headers));
			}
			else if ($p['keepfailed'])
				$success = $fileexists;
			else
				@unlink($path);	# Expired and failed to get

			it_url::_unlock($path, $lock);
		}
		else
		{
			# Wait for file currently being transferred
			EDC('getcache', "wait", $p['url'], $path);
			$success = it_url::_waitforlockedfile($path, $p);

			# If file could not be fetched by other thread but exists and we are in keepfailed mode then return old file
			if (!$success && $p['keepfailed'])
				$success = $fileexists;

		}
	}
	else
	{
		# Get file from cache
		EDC('getcache', "cached", $p['url'], $path);
		$success = true;	# Up to date
	}

	# Read headers before $path is modified for preprocessing
	if ($p['returnheaders'])
		$headers = it::json_decode(it::file_get("$path.json"), ['assoc' => true]);

	if ($success && $p['preprocess'])
	{
		$srcpath = $path;
		$path .= substr(md5(serialize($p['preprocess'])), 0, 2);

		if ($filemtime = $isnewfile ? true : it_url::_expired($path, $p['maxage']))	# Outdated (non-zero int) or non-existent (true)?
		{
			if ($lock = !$p['lock'] ?: it_url::_lock($path))
			{
				# Touch existing file to prevent locking other getters while refreshing
				if ($filemtime !== true)
					touch($path);

				EDC('getcache', "processnew", $p['url'], $path);
				$dstpath = "$path.preprocesstmp";

				if (is_array($p['preprocess']) && $p['preprocess']['function'])	# Needs is_array as it can be a string where dereferencing gives first character!
					$p['preprocess']['function'](['in' => $srcpath, 'out' => $dstpath] + $p['preprocess']);
				else
					$p['preprocess']($srcpath, $dstpath);

				if (!($success = @filesize($dstpath) && @rename($dstpath, $path)))
				{
					@unlink($dstpath);
					if (!$p['keepfailed'])
						@unlink($path);
					$success = file_exists($path);
				}

				it_url::_unlock($path, $lock);
			}
			else
			{
				# Wait for file currently being processed
				EDC('getcache', "processwait", $p['url'], $path);
				$success = it_url::_waitforlockedfile($path, $p);
			}
		}
	}

	# cache cleanup, preferably at night
	$isnight = date('H') >= 1 && date('H')*3600 + date('i')*60 < $p['cleanbefore'];
	if (time() - @filemtime($p['cachedir'] . "/cleaned") > ($isnight ? 80000 : 2*80000))
	{
		it::file_put($p['cachedir'] . "/cleaned", ""); # touch could have permission problems
		$maxagemin = intval($p['maxage']/60);
		exec("nohup bash -c 'cd {$p['cachedir']} && for i in [0-9a-f][0-9a-f]; do sleep 20; ionice -c 3 find \$i -mmin +$maxagemin -type f -delete; done' </dev/null >/dev/null 2>&1 &");
	}

	if (EDC('getcachelog'))
		it::log('debug', 'getcachelog', $p['id'], $p['url'], !$isnewfile ? "" : "fetched=" . mb_substr(is_string($data) ? $data : "(assoc)", 0, 400));

	### EDC('getcache', $success, $path); # too verbose
	return $success ? ($p['returnheaders'] || $p['returncachemiss'] ? [$path, $headers, (bool)$isnewfile] : $path) : false;
}
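
/*
 * Usage sketch for get_cache() (illustrative; assumes a webserver-writable cache
 * directory as described above):
 *
 *	$path = it_url::get_cache(['url' => "https://www.example.com/logo.png", 'id' => "logos", 'maxage' => 3600]);
 *	if ($path)
 *		$png = it::file_get_contents($path);
 *	# With 'returnheaders' => true the call returns [$path, $headers, $cachemiss] instead of just $path
 */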

/**
 * Fetch a file, cache it and return contents
 * @param $p same parameters as it_url::get_cache(), plus:
 * @param $p['assoc']         Return [ 'data' => string, 'status' => int, 'cookies' => array, 'headers' => array, 'errstr' => string, 'cachemiss' => bool ] instead of just data
 * @return @see it_url::get()
 */
static function get_cache_contents($p)
{
	[$fn, $dummy, $cachemiss] = self::get_cache($p + ['returncachemiss' => true]);
	if ($fn)
	{
		$result = it::file_get_contents($fn);
		if ($p['assoc'])
		{
			$response = it::json_decode($result, ['assoc' => true]);
			$response['data'] = self::_postprocess($response['data'], $p);
			$result = $response + ['cachemiss' => $cachemiss];
		}
		else
			$result = self::_postprocess($result, $p);
	}
	else
		$result = it::error((array)$p['it_error'] + ['title' => $p['safety'] === 0 ? false : "failed getting " . static::absolute($p['url']), 'body' => $p]);

	return $result;
}
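
/*
 * Usage sketch for get_cache_contents() (illustrative URL):
 *
 *	$feed = it_url::get_cache_contents(['url' => "https://www.example.com/feed.json", 'id' => "feeds", 'maxage' => 600]);
 *	# With 'assoc' => true the cached response comes back as
 *	# ['data' => ..., 'status' => ..., 'headers' => ..., 'cookies' => ..., 'errstr' => ..., 'cachemiss' => ...]
 */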

/**
 * Check whether file at given path is older than maxage
 * @param $path File to check
 * @param $maxage Maximum age of file in seconds
 * @return Not expired: false | Non-existent file: true | Timestamp of expired file
 */
static function _expired($path, $maxage, $randomexpire = 0)
{
	if ($result = EDC('nocache') ? false : @filemtime($path))
	{
		if (time() - $result >= $maxage || rand(0, 100000) <= $randomexpire * 100000)
			EDC('getcache', "expired", $maxage, $path);
		else
			$result = false;
	}
	else	# File does not exist yet
		$result = true;

	return $result;
}

/**
 * Acquire lock for a given file
 * @param $path File to lock
 * @return Lock handle if successfully locked file
 */
static function _lock($path)
{
	if (!($fh = it::fopen("$path.lock", "w")))
		return false;

	if (!flock($fh, LOCK_EX | LOCK_NB))
		return false;

	return $fh;
}

/**
 * Release lock on a file
 * @param $path File to unlock
 * @param $lock Handle to lock acquired by _lock
 */
static function _unlock($path, $lock)
{
	if (is_resource($lock))
	{
		fclose($lock);
		@unlink("$path.lock");
	}
}

/**
 * Wait for file which is currently locked
 * @param $path File to wait for
 * @param $p Wait parameters, @see get_cache()
 * @return Whether lock was released within timeout and file is still there
 */
static function _waitforlockedfile($path, $p)
{
	$sleeptime = 0.1; # seconds to wait per pass

	# wait until cache is ready, then read from cache
	for ($maxpasses = $p['timeout'] / $sleeptime, $passes = 0; !($lock = self::_lock("$path")) && ($passes < $maxpasses); ++$passes)
		usleep($sleeptime * 1000000);

	if (!$lock)
		it::error((array)$p['it_error'] + ['title' => ($passes < $maxpasses ? "error getting url" : "timeout") . " in it_url::get_cache(): url={$p['url']}, passes=$passes, maxpasses=$maxpasses, path=$path"]);
	else
		self::_unlock($path, $lock);

	return $lock && file_exists($path);
}

/**
 * Write data to tmp file and atomically rename it to destination
 * @param $path Destination file to write data to
 * @param $data Data to write | true to just touch file
 * @return True if data was written to file
 */
static function _atomicwrite($path, $data)
{
	$result = false;

	if ($data === true)	# Not modified, no new data, just update timestamp
		touch($path);
	else if ($data !== false)
	{
		$tmpname = tempnam(dirname($path), "writetmp");
		fputs($cachetmp = it::fopen($tmpname, "w"), $data);
		fclose($cachetmp);
		chmod($tmpname, 0664);
		$result = rename($tmpname, $path);
	}
	else
		@unlink($path);

	return $result;
}

/**
 * Make a URL absolute by using host and protocol from the current Apache request (but not its port number)
 * @param $url Optional URL ( foo.html, /foo.html, //host/bar.html, https://host/bar.html ), default self
 * @param $proto_force Optional protocol to enforce, default protocol of current request or http if in script context
 * @param $prefix Optional prefix prepended to $_SERVER['PHP_SELF'] when resolving relative URLs
 * @return absolute version of URL ( http[s]://host/bar.html )
 */
static function absolute($url = null, $proto_force = null, $prefix = '')
{
	if (!isset($url))
		$url = $prefix . $_SERVER['PHP_SELF'];

	if (list($proto_url, $urltmp) = it::match('^(\w+):(.*)$', $url))
	{
		$url = $urltmp;
		$proto = $proto_force ?: $proto_url;
	}
	else
		$proto = $proto_force ?: (isset($_SERVER['HTTPS']) ? 'https' : 'http');

	if (!preg_match('#^//#', $url))
	{
		$dir = preg_replace('#/[^/]*$#', '/', $prefix . $_SERVER['PHP_SELF']);
		$url = preg_match('#^/#', $url) ? $url : "$dir$url";
		$url = "//" . $_SERVER['HTTP_HOST'] . $url;
	}

	return "$proto:$url";
}
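
/*
 * Usage sketches (illustrative; assume a plain-http request to www.example.com
 * with the current script at /dir/page.php):
 *
 *	it_url::absolute("foo.html");				# "http://www.example.com/dir/foo.html"
 *	it_url::absolute("/bar.html");				# "http://www.example.com/bar.html"
 *	it_url::absolute("//cdn.example.com/x.js", "https");	# "https://cdn.example.com/x.js"
 */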

/**
 * Craft a valid redirect URL, send Location: header and terminate execution
 * @param $url  Optional URL ( foo.html, /foo.html, //host/bar.html, https://host/bar.html ), default self
 * @param $type Type of redirect, "temporary" or "permanent", default temporary
 * @return This method never returns.
 */
static function redirect($url = null, $type = "temporary")
{
	$codes = array('permanent' => 301, 'temporary' => 303);	# NOTE: HTTP 303 is called "See Other", rather than Temporary (which would be HTTP 307), but is the behaviour one usually wants for temporary redirects
	if (!($code = $codes[$type]) || !$url)
		it::fatal("invalid redirect type or missing redirect url");

	$url = preg_replace("/[\r\n].*/", '', static::absolute($url));	# Security: cut after CR/LF

	#if (!$_POST && $url == $_SERVER['SCRIPT_URI'])
	#	it::error("redirect to self. " . $_SERVER['SCRIPT_URI'] . " -> $url");

	if (EDC('noredir'))
	{
		if (!function_exists('a'))
			new it_html();
		echo a(array('href' => $url), Q($url)) . Q(" (HTTP/1.1 $code, $type redirect)") . br() . Q("Trace: " . it_debug::backtrace());
	}
	else
		header('Location: ' . it_untaint($url, TC_SELF), true, $code);
	exit;
}
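
/*
 * Usage sketch (illustrative; both calls terminate the script):
 *
 *	it_url::redirect("/login.html");				# 303 to the absolute version of /login.html
 *	it_url::redirect("https://www.example.com/", "permanent");	# 301 Moved Permanently
 */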

/**
 * Urlencode but leave some chars (comma and parentheses) unencoded
 */
static function encode($str)
{
	return strtr(urlencode($str), array("%2C"=>",", "%28"=>"(", "%29"=>")"));
}

/**
 * Create GET request from params, optionally only using given fields
 * @param $params Array to take values from, usually $_GET. Values of zero length are ignored.
 * @param $keys Keys to use; default: all
 */
static function params($params, $keys = null)
{
	return implode("&", it_url::_params($params, $keys));
}
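
/*
 * Usage sketches for params() (illustrative values):
 *
 *	it_url::params(['q' => "new york", 'page' => 2, 'empty' => ""]);
 *	# "q=new+york&page=2" (zero-length values are dropped)
 *
 *	it_url::params(['filter' => ['lang' => "de"]]);
 *	# "filter%5Blang%5D=de" (nested arrays become filter[lang]=... with encoded brackets)
 */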

static function _params($params, $keys = null, $finalize = true)
{
	$result = array();

	if (!isset($keys))
		$keys = array_keys($params);

	foreach ($keys as $key)
	{
		if (is_array($params[$key]))
		{
			foreach (it_url::_params($params[$key], null, false) as $value)
			{
				if (strlen($value))
					$result[] = it::replace(array('^([^=\[]*)' => urlencode($key) . '[$1]'), $value);
			}
		}
		else if (strlen($params[$key]))
			$result[] = urlencode($key) . "=" . it_url::encode($params[$key]);
	}

	if ($finalize)
		$result = preg_replace(['#\[#', '#\]#'], ['%5B', '%5D'], $result);

	return $result;
}


/**
 * Similar to it::parse_str but leaves . and space in arg names intact
 */
static function parse_str($query)
{
	foreach (explode('&', $query) as $arg)
	{
		list($key, $value) = explode('=', $arg, 2);
		$result[it::urldecode($key)] = it::urldecode($value);
	}

	return (array)$result;
}
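
/*
 * Usage sketch (illustrative; PHP's native parse_str would mangle "foo.bar" into "foo_bar"):
 *
 *	it_url::parse_str("foo.bar=1&baz=2");
 *	# ['foo.bar' => "1", 'baz' => "2"]
 */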


/**
 * Convert url into array with base url in $result[0] and GET params
 */
static function parse($url)
{
	list($path, $query) = explode("?", $url, 2);
	return (array)$path + (array)it::parse_str((string)$query);
}
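
/*
 * Usage sketch (illustrative):
 *
 *	it_url::parse("/search?q=bern&page=2");
 *	# [0 => "/search", 'q' => "bern", 'page' => "2"]
 */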

}

?>