. ** ** url.class - URL parsing, retrieval and caching functions */

/**
 * URL value object plus HTTP fetching (single and parallel, via curl) and
 * an on-disk cache for fetched documents.
 */
class it_url
{
	/* E.g. HTTP://falcon:joshua@www.Relog.CH.:80/default.asp */
	var $url;		/* E.g. http://www.relog.ch/ */
	var $protocol;		/* E.g. http */
	var $hostname;		/* E.g. relog.ch */
	var $realhostname;	/* E.g. www.relog.ch */
	var $port;		/* E.g. 80 */
	var $explicitport;	/* E.g. :80, explicitly set in rawurl */
	var $path;		/* E.g. / */
	var $rawurl;		/* E.g. HTTP://falcon:joshua@www.Relog.CH.:80/default.asp */
	var $user;		/* E.g. falcon */
	var $pass;		/* E.g. joshua */
	var $cookies;		/* key => values of cookies from server */
	var $headers;		/* Headers of page fetched by get() */
	var $data;		/* Data part, even if return code is not 200 */
	var $result;		/* HTTP response code of get() */
	var $redir = 0;		/* Redirect count */
	var $header;		/* http header */
	var $errstr;		/* request error string */
	var $curlinfo;		/* curl_getinfo() of the last request() */
	var $verbose;		/* curl verbose output captured by request() when $p['verbose'] is set */

	static $retryable = "^(5..)$";	/* Regex of HTTP status codes that trigger a retry */

	/**
	 * Constructor: canonicalize an URL
	 * @param $url URL this object represents
	 */
	function __construct($url = null)
	{
		$this->rawurl = $url;
		$comp = parse_url($url);
		$this->protocol = strtolower($comp['scheme']) ?: "http";
		$protoport = $this->protocol == 'https' ? 443 : 80;		# port according to protocol
		$this->port = intval($comp['port'] ?: $protoport);		# this is set even in default case
		$this->explicitport = $comp['port'] ? ':' . $comp['port'] : '';	# only set if explicitly specified in url, contains leading :
		$this->user = $comp['user'];
		$this->pass = $comp['pass'];
		$this->realhostname = strtolower($comp['host']);
		$this->hostname = preg_replace('/^www\./', '', $this->realhostname);
		$this->path = ltrim($comp['path'] . ($comp['query'] ? '?' . $comp['query'] : ''), '/');	# $this->path is named poorly, it includes path and query
		$this->url = "$this->protocol://$this->realhostname" . ($this->port != $protoport ? $this->explicitport : '') . "/$this->path";

		# Nothing to encode for an empty host (e.g. `new it_url` without url); the result is the same as
		# before, but we avoid handing an empty domain to intl, which newer versions appear to reject
		if ($this->realhostname !== '')
			$this->realhostname = idn_to_ascii($this->realhostname, IDNA_DEFAULT, INTL_IDNA_VARIANT_UTS46) ?: $this->realhostname;	# punycode or original
	}

	/**
	 * Check if a given url (currently http:port80-only) can be fetched
	 * Note: Redirects are treated as successful
	 * @param $p parameter array passed on to get (or a plain url string)
	 * @return true if url could be fetched
	 */
	static function is_reachable($p = [])
	{
		$result = static::get((is_array($p) ? $p : ['url' => $p]) + ['maxlength' => 1000, 'totaltimeout' => 5, 'assoc' => true, 'it_error' => false]);

		return $result['status'] >= 200 && $result['status'] < 400;
	}

	/**
	 * Internal: run the optional $p['postprocess'] callback over fetched data.
	 * The callback receives the data and an array with 'it_error' (false while retries are left).
	 * With $p['checkonly'] a truthy callback result only validates: the unprocessed data is returned.
	 * @return processed data, original data (checkonly) or the falsy callback result
	 */
	static function _postprocess($data, $p)
	{
		if ($p['postprocess'])
			$data = ($t = $p['postprocess']($data, ['it_error' => $p['retries'] > 0 ? false : (array)$p['it_error'] + ['title' => "invalid content from " . $p['url']]])) && $p['checkonly'] ? $data : $t;

		return $data;
	}

	/**
	 * Get simple URL with timeout and one retry. Can be called statically. Times out, calls it::error for all errs
	 *
	 * Request initiation
	 * @param $p parameter array with the following keys
	 * @param $p['url']             url to get, defaults to constructor URL
	 * @param $p['headers']         optional assoc array of HTTP headers to send, e.g. ['Host' => "foo"]
	 * @param $p['data']            POST data array with key-value pairs
	 * @param $p['files']           [fieldname => filename] of files to upload
	 * @param $p['maxlength']       maximum length of response
	 * @param $p['filemtime']       Add HTTP header to only fetch when newer than this, otherwise return true instead of data
	 * @param $p['accept_encoding'] Contents of the "Accept-Encoding: " header. Enables decoding of the response. Set to null to disable, "" (default) for all supported encodings.
	 * @param $p['protocols']       Array of protocols to accept, defaults to ['http', 'https'], @see curl_opts for other values
	 *
	 * Problem handling
	 * @param $p['retries']         Number of retries if download fails, default 1
	 * @param $p['timeout']         inactivity timeout seconds, default 5. fractions ok. silent by default
	 * @param $p['totaltimeout']    timeout for the whole attempt but see $p['retries']
	 * @param $p['retrysleep']      Number of seconds to wait before retry (additional to fetchsleep), fractions ok
	 * @param $p['safety']          DEPRECATED. 0 = ignore errors, 1 = errors, 2 = fatals
	 * @param $p['it_error']        extra arguments for it_error or false to ignore errors
	 * @param $p['fetchsleep']      Number of seconds to wait after fetch, fractions ok
	 * @param $p['body_on_fail']    Return body of page even if http status code is >= 400, e.g. some JSON APIs return 404 with JSON data
	 *
	 * Result processing
	 * @param $p['assoc']           Return [ 'data' => string, 'status' => int, 'cookies' => array, 'headers' => array, 'errstr' => string ] instead of just data
	 * @param $p['writefunction']   function to be called whenever data is received (for server-sent-events etc.)
	 * @param $p['postprocess']     function called with content and $p which has it_error. returns content or null (which triggers retry)
	 * @param $p['followlocation']  Follow redirects [true]
	 *
	 * @return Content of resulting page (considering redirects, excluding headers or false on error) or array if 'assoc' => true
	 */
	static function get($p = [])
	{
		return (new static)->_get($p);
	}

	/**
	 * Non-static alias for get so we can make get() static
	 * @param $p parameter array (or plain url string), @see get()
	 * @return @see get()
	 */
	function _get($p = [])
	{
		$p = is_string($p) ? ['url' => $p, 'timeout' => 5] : $p;
		$p += array('retries' => 1);

		if (($filter = EDC('req')) && ($filter == 1 || strstr($p['url'], "/$filter.")))
			if ($p['url'])
				ED($p);
			else
				ED($this->url, $p);

		if ($p['url'])
			$this->__construct($p['url']);

		$result = $this->request($p + ['followlocation' => true]);
		$result = self::_postprocess($result, $p);

		# FIXME 2024-07 UM some failures never send errs in request() because retries > 0
		if ($p['retries'] > 0 && ((!$result && !it::match('^(204|4..)$', $this->result)) || it::match(self::$retryable, $this->result)))
		{
			usleep($p['retrysleep']*1000000);
			$result = $this->_get(array('retries' => $p['retries'] - 1) + $p);
		}

		if (($filter = EDC('res')) && strstr($p['url'], it::replace(array('1' => ":"), $filter)))
			ED($result);

		usleep($p['fetchsleep'] * 1000000);

		if ($p['assoc'])
			$result = [
				'status' => intval($this->result) ?: 503,
				'data' => $result !== false ? $this->data : null,
				'headers' => $this->headers,
				'cookies' => $this->cookies,
				'errstr' => $this->errstr
			];

		EDC('curlinfo', $this->result, $this->headers, $this->cookies, $this->errstr);

		return $result;
	}

	/**
	 * Parse a raw HTTP response header block into $this->result (status code),
	 * $this->headers (name => value, status stored under key 'HTTP') and $this->cookies
	 * @param $header Header lines separated by newlines
	 */
	function parse_http_header($header)
	{
		foreach (explode("\n", trim($header)) as $line)
		{
			$line = trim($line);
			if (preg_match('#^(HTTP)\S+\s(\d+)#', $line, $parts))	# Parse result code
				$this->headers[$parts[1]] = $this->result = $parts[2];
			else if (preg_match('#^([^:]+): (.*)$#', $line, $parts))
			{
				$this->headers[ucwords($parts[1], '-')] = $parts[2];

				# Only a real header line can carry a cookie; checking here avoids reading $parts of a non-matching line
				if (strtolower($parts[1]) == 'set-cookie' && preg_match('/^([^=]+)=([^;]*)/', $parts[2], $cookie))
					$this->cookies[$cookie[1]] = $cookie[2];
			}
		}
	}

	/**
	 * Build the default request headers (Host, User-Agent, Accept-Language, Referer, X-Ultra-Https).
	 * Empty values are dropped. Adds If-Modified-Since if $p['filemtime'] is an int.
	 * Reports an it::error for search.ch subrequests that lack an explicit Accept-Language override.
	 * @param $url it_url object the request goes to
	 * @param $p Request parameters, @see get()
	 * @return assoc array of header name => value
	 */
	static function _default_headers($url, $p)
	{
		$search_subrequest = it::match('search\.ch/', $p['url']);
		if ((!it::is_devel() || EDC('subreqcheck')) && $p['url'] && !$p['headers']['Accept-Language'] && T_lang() != T_defaultlang() && $search_subrequest && !it::match('/login\b|banner\.html|machines\.txt|mbtiles\.php|/fonts/|/itjs/|/images/|\.(de|fr|en|it)(\.js|\.html|\.txt|\.php|\.ics|\.pdf|\.json|\.csv|\.gif|\.jpg|\.png)', $p['url']))
			it::error(['title' => "Subrequest without language override", 'body' => [ $p ]]);

		$headers = array_filter([
			'Host' => $url->realhostname . $url->explicitport,
			'User-Agent' => "Mozilla/5.0 (compatible; ITools; Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582)",
			'Accept-Language' => $p['headers']['Accept-Language'] ?? ($search_subrequest ? T_defaultlang() : T_lang()),	# can prevent loading of it_text
			'Referer' => it::match('([-\w]+\.\w+)$', $url->hostname) == it::match('([-\w]+\.\w+)$', $_SERVER['HTTP_HOST']) ? static::absolute(U($_GET)) : null,
			'X-Ultra-Https' => $_SERVER['HTTPS'],
		]);

		if (is_int($p['filemtime']))
			$headers['If-Modified-Since'] = date("r", $p['filemtime']);

		return $headers;
	}

	/**
	 * Translate request parameters into an array of curl options
	 * @param $p Request parameters, @see get(). Additionally understood here:
	 *   user, pass (basic auth), sslkey, sslcert, verbose, method, allow_insecure_ssl
	 * @return array suitable for curl_setopt_array()
	 */
	static function curl_opts($p=array())
	{
		$p += [
			'totaltimeout' => "999999",
			'timeout' => 5,
			'followlocation' => !$p['files'],	# disallow redirects for file uploads as recommended by https://curl.se/libcurl/security.html
			'accept_encoding' => '',		# set header to accept any supported encoding and enable automatic decompression
			'protocols' => ['http', 'https'],	# Array with allowed protocols, see list below
		];
		$protocols = [
			'file' => CURLPROTO_FILE,
			'ftp' => CURLPROTO_FTP,
			'ftps' => CURLPROTO_FTPS,
			'http' => CURLPROTO_HTTP,
			'https' => CURLPROTO_HTTPS,
			'scp' => CURLPROTO_SCP,
			'sftp' => CURLPROTO_SFTP,
		];
		$add = [];
		$headers = [];	# stays an (empty) array if caller passed no headers

		if (it::grep("[\n\r]", it::map('"$k$v"', $p['headers'])))
			it::error(['title' => "Newline in headers", 'body' => $p['headers']]);

		# Replace BOTH CR and LF: the three-argument strtr() form silently ignores characters of the
		# longer argument, so a one-character replacement string would leave "\r" in place
		foreach ((array)$p['headers'] as $header => $value)
			$headers[] = strtr("$header: $value", ["\n" => ' ', "\r" => ' ']);

		# file upload
		foreach ((array)$p['files'] as $field => $filename)
			$p['data'][$field] = new CURLFile($filename, mime_content_type($filename));

		if ($p['data'])
			$add += [ CURLOPT_POSTFIELDS => $p['data'] ];

		if ($p['pass'] || $p['user'])
			$add += [ CURLOPT_HTTPAUTH => CURLAUTH_BASIC, CURLOPT_USERPWD => $p['user'] . ':' . $p['pass'] ];

		if ($p['writefunction'])
		{
			$add += [
				CURLOPT_RETURNTRANSFER => false,
				CURLOPT_WRITEFUNCTION => $p['writefunction'],
			];
		}

		if ($p['sslkey'])
			$add += [CURLOPT_SSLKEY => $p['sslkey']];
		if ($p['sslcert'])
			$add += [CURLOPT_SSLCERT => $p['sslcert']];

		$add += EDC('curlinfo') ? [CURLINFO_HEADER_OUT => 1] : [];
		$add += [CURLOPT_COOKIEFILE => ""];

		if ($p['verbose'] || EDC('curlverbose'))
			$add += [ CURLOPT_VERBOSE => true ];

		if (isset($p['accept_encoding']))
			$add += [CURLOPT_ENCODING => $p['accept_encoding']];	# NOTE: the curl library renamed the option to CURLOPT_ACCEPT_ENCODING, in php both are possible, CURLOPT_ENCODING is documented

		# Options collected in $add take precedence over the defaults below
		return $add + [
			CURLOPT_HEADER => false,
			CURLOPT_RETURNTRANSFER => true,
			CURLOPT_TIMEOUT_MS => $p['totaltimeout'] * 1000,	# use _MS to support fractions of seconds
			CURLOPT_LOW_SPEED_LIMIT => 5,
			CURLOPT_LOW_SPEED_TIME => $p['timeout'],
			CURLOPT_FOLLOWLOCATION => $p['followlocation'],
			CURLOPT_MAXREDIRS => 20,
			CURLOPT_HTTPHEADER => $headers,
			CURLOPT_CUSTOMREQUEST => $p['method'] ?: null,
			CURLOPT_NOBODY => $p['method'] == 'HEAD',
			CURLOPT_SAFE_UPLOAD => true,	# disable special meaning of @value in POST forms (security)
			CURLOPT_PROTOCOLS => array_reduce($p['protocols'], fn($c, $v) => $c | $protocols[$v], 0),
			CURLOPT_CAPATH => '/etc/ssl/certs/',
			CURLOPT_SSL_VERIFYPEER => !$p['allow_insecure_ssl'],
			CURLOPT_SSL_VERIFYHOST => $p['allow_insecure_ssl'] ? 0 : 2,
		];
	}

	/**
	 * drop in replacement for request using curl
	 *
	 * Performs one HTTP request and fills $this->header, headers, cookies, data, result, errstr, curlinfo.
	 * The curl handle is kept per process id and reused between calls.
	 *
	 * @param $p['data']    POST data array with key-value pairs
	 * @param $p['files']   [fieldname => filename] of files to upload
	 * @param $p['method']  different HTTP method
	 * @param $p['verbose'] generate and capture curl verbose output in $this->verbose and alert mails
	 * @return body data, true for "304 not modified" with $p['filemtime'], false on failure (null for HTTP 414)
	 */
	function request($p=array())
	{
		static $curl_handles = [];

		$result = null;	# stays null on the 414 path which sets no result
		$header = "";

		$url = $this;
		if ($p['url'])
			$this->__construct($p['url']);

		$this->errstr = "";
		$url->headers = array();
		$p['headers'] = array_filter((array)$p['headers'] + self::_default_headers($url, $p), 'strlen');
		$opts = self::curl_opts($p + array('user' => $this->user, 'pass' => $this->pass, 'followlocation' => false));

		if ($p['verbose'])
		{
			$stderr = it::fopen("php://memory", "r+");
			$opts += [CURLOPT_STDERR => $stderr, CURLOPT_VERBOSE => 1];
		}

		if (!($curl = $curl_handles[getmypid()]))
			$curl = $curl_handles[getmypid()] = curl_init($url->url);
		else
		{
			curl_reset($curl);
			curl_setopt($curl, CURLOPT_URL, $url->url);
		}

		// FIXME 2025-01 NG just use CURLOPT_MAXFILESIZE if we have curl 8.4
		$content = "";
		if ($p['maxlength'] && !$p['writefunction'])
		{
			# Remaining byte budget. It must start at maxlength: counting down from an unset value went
			# negative after the first chunk, so every multi-chunk response was aborted as "too long"
			$space = $p['maxlength'];
			$opts[CURLOPT_WRITEFUNCTION] = function ($dummy, $data) use (&$space, &$content) {
				$write = min($space, strlen($data));
				$content .= substr($data, 0, $write);
				$space -= $write;
				return $write;	# returning less than strlen($data) makes curl abort with write error 23
			};
		}

		$opts[CURLOPT_HEADERFUNCTION] = function ($dummy, $data) use (&$header) {
			$header .= $data;
			return strlen($data);
		};

		curl_setopt_array($curl, $opts);

		$got = curl_exec($curl);
		if ($p['maxlength'] && $got)
			$got = $content;

		$this->curlinfo = curl_getinfo($curl);
		EDC('curlinfo', $this->curlinfo);

		if ($got !== false || curl_errno($curl) == 23)	# 23 = write error, raised by the maxlength callback above
		{
			$url->header = array_slice(explode("\r\n\r\n", trim($header)), -1)[0] . "\r\n\r\n";	# keep only the last header block
			$url->data = $got;

			$url->parse_http_header($url->header);

			# Change result status for content longer than maxlength to 204 as we do not return partial data but still want to indicate success e.g. for is_reachable
			if ($p['maxlength'] && $url->result == 200 && strlen($content) && !$got)
				$url->result = 204;

			if ($p['filemtime'] && ($url->result == 304))
			{
				$result = true;	# Not modified, success but no data
			}
			else if ($url->result == 414)
			{
				it::error((array)$p['it_error'] + ['title' => "Request-URI Too Long: " . substr($url->url, 0, 100) . "...(truncated " . (strlen($url->url) - 100) . " bytes)", 'body' => curl_getinfo($curl) + ($p['verbose'] ? ['verbose' => $this->verbose] : [])]);
				$this->errstr = "HTTP Status " . $url->result;
			}
			else
			{
				if ($url->result >= 400 && (!$p['body_on_fail'] || $p['keepfailed']))
					$got = $url->data = false;

				$result =& $url->data;
				$this->errstr = "HTTP Status " . $url->result;
			}
		}
		else
		{
			$result = $this->result = false;
			$this->errstr = trim("(" . curl_errno($curl) . ") " . curl_error($curl));
		}

		if ($p['verbose'])
		{
			rewind($stderr);
			$this->verbose = stream_get_contents($stderr);
			fclose($stderr);
		}

		# Only report once no retries are left, @see _get()
		if ($got === false && $p['retries'] <= 0)
		{
			it::error((array)$p['it_error'] + ['title' => "problem " . ($p['method'] ?: "gett") . "ing $url->url with curl: " . curl_strerror(curl_errno($curl)) . " (" . curl_errno($curl) . ") " . curl_error($curl), 'body' => ['Error description' => $this->errstr] + curl_getinfo($curl) + ($p['verbose'] ? ['verbose' => $this->verbose] : [])]);
		}

		return $result;
	}

	/**
	 * Get multiple URL in parallel with timeout. Needs to be called statically
	 * @param $p parameter array with the following keys (same as it_url::get)
	 * @param $p['urls']           array/generator of urls to get. An entry may also be an array with keys
	 *                             'url' and 'handler'; handler is called with (curl handle, content) for a
	 *                             finished request and a truthy return value aborts the whole run
	 * @param $p['timeout']        timeout per read in seconds, defaults to 5. (TODO: fractions allowed?)
	 * @param $p['totaltimeout']   timeout for the whole attempt (fractions ok). see $p['retries']
	 * @param $p['followlocation'] follow redirects [true]
	 * @param $p['headers']        optional array of HTTP headers to send
	 * @param $p['parallel']       max number of parallel requests
	 * @param $p['noresults']      do not keep results around
	 * @return array of contents (or false for errors like timeouts) of resulting page using same
	 *         keys as the urls input array, considering redirects, excluding headers
	 */
	static function get_multi($p=null)
	{
		static $curl_multi_handles = [];

		$p = (array)$p + array('retries' => 1);	# cast: the default null cannot be used with +

		EDC('req', $p);

		$url = new it_url;
		$p['headers'] = (array)$p['headers'] + array_diff_key(self::_default_headers($url, $p), ['Host' => null]);
		$opts = self::curl_opts($p);

		if (!($mh = $curl_multi_handles[getmypid()]))
			$mh = $curl_multi_handles[getmypid()] = curl_multi_init();

		$keys = $handles = $urls = $retries = $sleepuntils = [];
		$abort = false;
		$results_unordered = null;

		$addhandle = function ($key, $url) use (&$keys, &$handles, &$urls, $opts, $mh) {
			$urls[$key] = $url;
			$handle = curl_init();
			curl_setopt($handle, CURLOPT_URL, it::replace([ '^//' => "http://" ], is_array($url) ? $url['url'] : $url));
			curl_setopt_array($handle, $opts);
			curl_multi_add_handle($mh, $handle);
			$keys[(int)$handle] = $key;
			$handles[$key] = $handle;
		};
		$closehandle = function ($key) use (&$keys, &$handles, $mh) {
			curl_multi_remove_handle($mh, $handles[$key]);
			curl_close($handles[$key]);
			unset($keys[(int)$handles[$key]]);
			unset($handles[$key]);
		};

		$urllist = $p['urls'] ?? [];
		if (is_array($urllist))
		{
			$keyorder = array_keys($urllist);
			$iterator = (new ArrayObject($urllist))->getIterator();
			$recordkeys = false;
		}
		else
		{
			# Generators have no array_keys(); remember the keys in the order they are consumed instead
			$keyorder = [];
			$iterator = $urllist;
			$recordkeys = true;
		}

		# Start the request for the iterator's current entry and advance
		$addnext = function () use ($iterator, $addhandle, &$keyorder, $recordkeys) {
			if ($recordkeys)
				$keyorder[] = $iterator->key();
			$addhandle($iterator->key(), $iterator->current());
			$iterator->next();
		};

		$parallel = $p['parallel'] ?: PHP_INT_MAX;
		while (count($handles) < $parallel && $iterator->valid())
			$addnext();

		$start = gettimeofday(true);

		# curl_multi loop copied from example at http://php.net/manual/en/function.curl-multi-exec.php
		$active = null;
		do {
			$mrc = curl_multi_exec($mh, $active);
		} while ($mrc == CURLM_CALL_MULTI_PERFORM);

		$timeout = 0.001;	# Very short timeout to work around problem with first select call on cURL 7.25.0
		while (!$abort && (($active && $mrc == CURLM_OK) || count($handles) > 0 || $sleepuntils))
		{
			if (curl_multi_select($mh, $timeout) == -1)
				usleep($timeout * 1000000);

			do {
				$mrc = curl_multi_exec($mh, $active);

				while (($info = curl_multi_info_read($mh)) !== false)
				{
					if ($info['msg'] == CURLMSG_DONE)
					{
						$key = $keys[(int)$info['handle']];
						$content = curl_multi_getcontent($info['handle']);
						if (isset($p['postprocess']))
							$content = $p['postprocess']($content, ['it_error' => $retries[$key] < $p['retries'] ? false : (array)$p['it_error'] + ['title' => "invalid content from " . $urls[$key]]]);

						EDC('reqtimings', $key, $info['result'], (gettimeofday(true) - $start) * 1000);
						if ($info['result'] == CURLE_OK && $content !== null)
						{
							if (!$p['noresults'])
								$results_unordered[$key] = $content;

							if (it::match(self::$retryable, curl_getinfo($handles[$key], CURLINFO_RESPONSE_CODE)) && $retries[$key]++ < $p['retries'])
							{
								$sleepuntils[$key] = microtime(true) + $p['retrysleep'];
							}
							else
							{
								if (is_array($urls[$key]) && ($handler = $urls[$key]['handler']))
									$abort = $handler($info['handle'], $content);
								unset($urls[$key]);
							}
							$closehandle($key);
						}
						else if($retries[$key]++ < $p['retries'])
						{
							$closehandle($key);	# closehandle must be called before addhandle as we use the same key
							$sleepuntils[$key] = microtime(true) + $p['retrysleep'];
						}
						else
						{
							$results_unordered[$key] = false;
							unset($urls[$key]);
							$closehandle($key);
						}

						if (!$abort && count($handles) < $parallel && $iterator->valid())
							$addnext();
					}
				}
			} while ($mrc == CURLM_CALL_MULTI_PERFORM);

			# Re-queue requests whose retry delay has passed
			foreach ((array)$sleepuntils as $key => $time)
			{
				if (microtime(true) >= $time && count($handles) < $parallel)
				{
					$addhandle($key, $urls[$key]);
					unset($sleepuntils[$key]);
				}
				$active = 1;
			}
			usleep($sleepuntils ? 100000 : 0);

			$timeout = 0.1;	# Longer delay to avoid busy loop but shorter than default of 1s in case we still hit cURL 7.25.0 problem
		}

		foreach ($handles as $key => $dummy)
			$closehandle($key);
		curl_multi_close($mh);

		$result = $p['noresults'] ? null : it::filter_keys($results_unordered, $keyorder, ['reorder' => true]);

		EDC('res', $result);

		return $result;
	}

	/**
	 * Construct a local directory name to cache an URL. Named args:
	 * @param $p['cachedir'] directory to store cache files in, defaults to $ULTRAHOME/var/urlcache
	 * @param $p['id']       If you need more than one type of cache (e.g. different maxage) you can specify an id
	 * @return Directory name without trailing slash
	 */
	static function get_cache_dir($p)
	{
		$p += array('cachedir' => $GLOBALS['ULTRAHOME'] . "/var/urlcache", 'id' => "default");

		return rtrim($p['cachedir'] . "/" . $p['id'], "/");
	}

	/**
	 * Construct a local file name to cache an URL. Takes language into account. Named args:
	 * @param $p['url']           remote url to get
	 * @param $p['cachedir']      directory to store cache files in, @see get_cache_dir
	 * @param $p['cachefilename'] Use this filename instead of calculating your own if this is given
	 * @param $p['data']          POST data array with key-value pairs
	 * @param $p['id']            If you need more than one type of cache (e.g. different maxage) you can specify an id
	 * @return Path of cache file: cache dir / first two characters of filename / filename
	 */
	static function get_cache_filename($p)
	{
		if (!is_array($p))
			$p = array('url' => $p);

		$p['cachedir'] = it_url::get_cache_dir($p);
		unset($p['headers']['Authorization']);	# prevent ever changing filenames due to changing Bearer tokens
		$filename = $p['cachefilename'] ?: md5(T_lang() . T_defaultlang() . $p['url'] . ($p['headers'] ? serialize($p['headers']) : "") . ($p['data'] ? serialize($p['data']) : "") . $_SERVER['HTTP_X_SERVICE_PATH']);

		return $p['cachedir'] . "/" . substr($filename, 0, 2) . "/$filename";
	}

	/**
	 * Store contents of url in a file and return file name. Threadsafe: Provides locking. Called statically.
	 * Requires webserver writeable directory in $p['cachedir']. Sends it::error on fails by default. Params:
	 * @param $p['url']           url to get
	 * @param $p['id']            dirname for cached files; same id should have same expire policy
	 * @param $p['headers']       optional array of HTTP headers to send
	 * @param $p['cachedir']      directory to store cache files in, @see get_cache_dir
	 * @param $p['timeout']       timeout in seconds, default 10. fractions allowed
	 * @param $p['maxage']        maximum age of cache entries in seconds, default 23 hours. id mandatory if given
	 * @param $p['randomexpire']  chance to randomly expunge an entry, 0..1
	 * @param $p['cleanbefore']   maximum seconds since midnight when initiating expire, default 10800
	 * @param $p['preprocess']    callback function (or array for methods) to change received file or array('function' => ..., 'in' => $src, 'out' => $dst, ...) with callback function plus args
	 * @param $p['safety']        DEPRECATED. see $p['it_error']
	 * @param $p['it_error']      parameters for it::error(), false means ignore errors, anything else gets passed to it::error() if errors occur
	 * @param $p['keepfailed']    keep old versions of files if download fails
	 * @param $p['returnheaders'] Return array($path, $headers, $isnewfile) instead of simply $path
	 * @param $p['postprocess']   UNSUPPORTED, use ::get_cache_contents
	 * @param $p['lock']          prevent multiple requests to same url from different processes [true]
	 * @return Cache filename or false if fetch failed
	 */
	static function get_cache($p = array())
	{
		if (!$p['id'] && $p['maxage'])
			it::error("calling get_cache with maxage and without id");

		$p += ['timeout' => 10, 'maxage' => 23 * 3600, 'cleanbefore' => 10800, 'lock' => true, 'it_error' => $p['safety'] === 0 ? false : ($p['safety'] == 2 ? ['fatal' => true] : [])];
		$p['totaltimeout'] = $p['timeout'];
		$path = it_url::get_cache_filename($p);	# Must be before changing cachedir below
		$p['cachedir'] = it_url::get_cache_dir($p);

		# Not every path below assigns these
		$success = $isnewfile = false;
		$data = $headers = null;

		@mkdir($p['cachedir']);
		@mkdir(dirname($path));

		if (!is_writable(dirname($path)))
			it::error("parent dir not writable: " . trim(it::exec('ls -ld {dir} 2>&1', ['dir' => dirname($path)])));

		if (($filemtime = it_url::_expired($path, $p['maxage'], $p['randomexpire'])) || ($p['returnheaders'] && !file_exists(("$path.json"))))	# Outdated(non-zero int) or non-existant(true)?
		{
			$fileexists = $filemtime !== true;
			if ($lock = !$p['lock'] ?: it_url::_lock($path, $p))
			{
				# Touch existing file to prevent locking other getters while refreshing
				if ($fileexists)
					touch($path);

				EDC('getcache', "new", $filemtime, $p['url'], $path);
				$url = new it_url;
				$data = $url->_get($p + ['checkonly' => true, 'filemtime' => EDC('nocache') ? null : $filemtime]);
				if ($p['assoc'] ? ($data['status'] < 500 || $data['data']) : $data)
				{
					$success = true;
					$isnewfile = it_url::_atomicwrite($path, $p['assoc'] ? ($data['status'] === 304 ? true : it::json_encode($data)) : $data);	# $data === true means not modified (no new data fetched) and instructs _atomicwrite to just touch the file
					if ($p['returnheaders'])
						it::file_put("$path.json", it::json_encode($url->headers));
				}
				else if ($p['keepfailed'])
					$success = $fileexists;
				else
					@unlink($path);	# Expired and failed to get

				it_url::_unlock($path, $lock);
			}
			else
			{
				# Wait for file currently being transferred
				EDC('getcache', "wait", $p['url'], $path);
				$success = it_url::_waitforlockedfile($path, $p);

				# If file could not be fetched by other thread but exists and we are in keepfailed mode then return old file
				if (!$success && $p['keepfailed'])
					$success = $fileexists;
			}
		}
		else
		{
			# Get file from cache
			EDC('getcache', "cached", $p['url'], $path);
			$success = true;	# Up to date
		}

		# Read headers before $path is modified for preprocessing
		if ($p['returnheaders'])
			$headers = it::json_decode(it::file_get("$path.json"), ['assoc' => true]);

		if ($success && $p['preprocess'])
		{
			$srcpath = $path;
			$path .= substr(md5(serialize($p['preprocess'])), 0, 2);

			if ($filemtime = $isnewfile ? true : it_url::_expired($path, $p['maxage']))	# Outdated(non-zero int) or non-existant(true)?
			{
				if ($lock = !$p['lock'] ?: it_url::_lock($path, $p))
				{
					# Touch existing file to prevent locking other getters while refreshing
					if ($filemtime !== true)
						touch($path);

					EDC('getcache', "processnew", $p['url'], $path);
					$dstpath = "$path.preprocesstmp";

					if (is_array($p['preprocess']) && $p['preprocess']['function'])	# Needs is_array as it can be a string where dereferencing gives first character!
						$p['preprocess']['function'](['in' => $srcpath, 'out' => $dstpath] + $p['preprocess']);
					else
						$p['preprocess']($srcpath, $dstpath);

					if (!($success = @filesize($dstpath) && @rename($dstpath, $path)))
					{
						@unlink($dstpath);
						if (!$p['keepfailed'])
							@unlink($path);

						$success = file_exists($path);
					}

					it_url::_unlock($path, $lock);
				}
				else
				{
					# Wait for file currently being processed
					EDC('getcache', "processwait", $p['url'], $path);
					$success = it_url::_waitforlockedfile($path, $p);
				}
			}
		}

		# cache cleanup, preferably at night
		$isnight = date('H') >= 1 && date('H')*3600 + date('i')*60 < $p['cleanbefore'];
		if (time() - @filemtime($p['cachedir'] . "/cleaned") > ($isnight ? 80000 : 2*80000))
		{
			it::file_put($p['cachedir'] . "/cleaned", "");	# touch could have permission problems
			$maxagemin = intval($p['maxage']/60);

			# stdin/stdout/stderr must all be redirected or exec() waits for the background job to finish;
			# cachedir is escaped as it is caller supplied
			$cleanup = 'cd ' . escapeshellarg($p['cachedir']) . ' && for i in [0-9a-f][0-9a-f]; do sleep 20; ionice -c 3 find $i -mmin +' . $maxagemin . ' -type f -delete; done';
			exec('nohup bash -c ' . escapeshellarg($cleanup) . ' </dev/null >/dev/null 2>&1 &');
		}

		if (EDC('getcachelog'))
			it::log('debug', 'getcachelog', $p['id'], $p['url'], !$isnewfile ? "" : "fetched=" . mb_substr(is_string($data) ? $data : "(assoc)", 0, 400));

		### EDC('getcache', $success, $path); # too verbose
		return $success ? ($p['returnheaders'] || $p['returncachemiss'] ? [$path, $headers, (bool)$isnewfile] : $path) : false;
	}

	/**
	 * Fetch a file, cache it and return contents
	 * @param @see it_url::get_cache()
	 * @param $p['assoc'] Return [ 'data' => string, 'status' => int, 'cookies' => array, 'headers' => array, 'errstr' => string, 'cachemiss' => bool ] instead of just data
	 * @return @see it_url::get()
	 */
	static function get_cache_contents($p)
	{
		[$fn, $dummy, $cachemiss] = self::get_cache($p + ['returncachemiss' => true]);
		if ($fn)
		{
			$result = it::file_get_contents($fn);
			if ($p['assoc'])
			{
				$response = it::json_decode($result, ['assoc' => true]);
				$response['data'] = self::_postprocess($response['data'], $p);
				$result = $response + ['cachemiss' => $cachemiss];
			}
			else
				$result = self::_postprocess($result, $p);
		}
		else
			$result = it::error((array)$p['it_error'] + ['title' => $p['safety'] === 0 ? false : "failed getting " . static::absolute($p['url']), 'body' => $p]);

		return $result;
	}

	/**
	 * Check whether file at given path is older than maxage
	 * @param $path         File to check
	 * @param $maxage       Maximum age of file in seconds
	 * @param $randomexpire Chance (0..1) to treat a fresh file as expired anyway
	 * @return Not expired: false | Non-existant file: true | Timestamp of expired file
	 */
	static function _expired($path, $maxage, $randomexpire = 0)
	{
		if ($result = EDC('nocache') ? false : @filemtime($path))
		{
			# rand(0, 99999) < chance * 100000: never true for chance 0, always true for chance 1
			if (time() - $result >= $maxage || rand(0, 99999) < $randomexpire * 100000)
				EDC('getcache', "expired", $maxage, $path);
			else
				$result = false;
		}
		else	# File does not exists yet
			$result = true;

		return $result;
	}

	/**
	 * Acquire lock for a given file
	 * @param $path File to lock
	 * @param $p    Parameters, only $p['it_error'] is used
	 * @return Lock handle if successfully locked file, false otherwise.
	 *         A lock file older than 30 seconds is reported and the handle returned although flock failed.
	 */
	static function _lock($path, $p = [])
	{
		$mtime = @filemtime("$path.lock");
		if (!($fh = it::fopen("$path.lock", "w")))
			return false;

		if (!flock($fh, LOCK_EX | LOCK_NB))
		{
			if ($mtime && (time() - $mtime > 30))
				it::error((array)$p['it_error'] + ['title' => "stale lock epired for $path"]);	# FIXME 2024-07 DF remove stale lock expiration if never triggered
			else
				return false;
		}

		return $fh;
	}

	/**
	 * Release lock on a file
	 * @param $path File to unlock
	 * @param $lock Handle to lock acquired by _lock
	 */
	static function _unlock($path, $lock)
	{
		if (is_resource($lock))
		{
			fclose($lock);
			@unlink("$path.lock");
		}
	}

	/**
	 * Wait for file which is currently locked
	 * @param $path File to wait for
	 * @param $p    Wait parameters, @see get_cache()
	 * @return Whether lock was released within timeout and file is still there
	 */
	static function _waitforlockedfile($path, $p)
	{
		$sleeptime = 0.1;	# seconds to wait per pass

		# wait until cache is ready, then read from cache
		for ($maxpasses = $p['timeout'] / $sleeptime, $passes = 0; !($lock = self::_lock("$path", $p)) && ($passes < $maxpasses); ++$passes)
			usleep($sleeptime * 1000000);

		if (!$lock)
			it::error((array)$p['it_error'] + ['title' => ($passes < $maxpasses ? "error getting url" : "timeout") . " in it_url::get_cache(): url={$p['url']}, passes=$passes, maxpasses=$maxpasses, path=$path"]);
		else
			self::_unlock($path, $lock);

		return $lock && file_exists($path);
	}

	/**
	 * Write data to tmp file and atomically rename it to destination
	 * @param $path Destination file to write data to
	 * @param $data Data to write | true to just touch file | false to remove the file
	 * @return True if data was written to file
	 */
	static function _atomicwrite($path, $data)
	{
		$result = false;

		if ($data === true)	# Not modified, no new data, just update timestamp
			touch($path);
		else if ($data !== false)
		{
			$tmpname = tempnam(dirname($path), "writetmp");
			fputs($cachetmp = it::fopen($tmpname, "w"), $data);
			fclose($cachetmp);
			chmod($tmpname, 0664);
			$result = rename($tmpname, $path);
		}
		else
			@unlink($path);

		return $result;
	}

	/**
	 * Make an URL absolute by using host and protocol from current Apache request (but not port number)
	 * @param $url         Optional URL ( foo.html, /foo.html, //host/bar.html, http://host/bar.html ), default self
	 * @param $proto_force Optional protocol to enforce, default protocol of current request or http if in script context
	 * @param $prefix      Optional string prepended to PHP_SELF when deriving the current url/directory
	 * @return absolute version of URL ( http[s]://host/bar.html )
	 */
	static function absolute($url = null, $proto_force = null, $prefix = '')
	{
		if (!isset($url))
			$url = $prefix . $_SERVER['PHP_SELF'];

		if (list($proto_url, $urltmp) = it::match('^(\w+):(.*)$', $url))
		{
			$url = $urltmp;
			$proto = $proto_force ?: $proto_url;
		}
		else
			$proto = $proto_force ?: (isset($_SERVER['HTTPS']) ? 'https' : 'http');

		if (!preg_match('#^//#', $url))
		{
			$dir = preg_replace('#/[^/]*$#', '/', $prefix . $_SERVER['PHP_SELF']);
			$url = preg_match('#^/#', $url) ? $url : "$dir$url";
			$url = "//" . $_SERVER['HTTP_HOST'] . $url;
		}

		return "$proto:$url";
	}

	/**
	 * Craft a valid redirect URL, send Location: header and terminate execution
	 * @param $url  Optional URL ( foo.html, /foo.html, //host/bar.html, http://host/bar.html ), default self
	 * @param $type Type of redirect, "temporary" or "permanent", default temporary
	 * @return This method never returns.
	 */
	static function redirect($url = null, $type = "temporary")
	{
		$codes = array('permanent' => 301, 'temporary' => 303);	# NOTE: HTTP 303 is called "See Other", rather than Temporary (which would be HTTP 307), but is the behaviour one usually wants for temporary redirects

		if (!($code = $codes[$type]) || !$url)
			it::fatal("invalid redirect type or missing redirect url");

		$url = preg_replace("/[\r\n].*/", '', static::absolute($url));	# Security: cut after CR/LF

		#if (!$_POST && $url == $_SERVER['SCRIPT_URI'])
		#	it::error("redirect to self. " . $_SERVER['SCRIPT_URI'] . " -> $url");

		if (EDC('noredir'))
		{
			if (!function_exists('a'))
				new it_html();

			echo a(array('href' => $url), Q($url)) . Q(" (HTTP/1.1 $code, $type redirect)") . br() . Q("Trace: " . it_debug::backtrace());
		}
		else
			header('Location: ' . it_untaint($url, TC_SELF), true, $code);

		exit;
	}

	/**
	 * Urlencode but leave some chars (comma and parentheses)
	 */
	static function encode($str)
	{
		return strtr(urlencode($str), array("%2C"=>",", "%28"=>"(", "%29"=>")"));
	}

	/**
	 * Create GET request from params, optionally only using given fields
	 * @param $params Array to take values from, usually $_GET. Values of zero length are ignored.
	 * @param $keys   Keys to use; default: all
	 * @return Query string without leading "?"
	 */
	static function params($params, $keys = null)
	{
		return implode("&", it_url::_params($params, $keys));
	}

	/**
	 * Internal worker for params(): returns list of "key=value" strings, recursing into nested arrays
	 * @param $finalize Percent-encode the square brackets of array keys (done once, at the outermost level)
	 */
	static function _params($params, $keys = null, $finalize = true)
	{
		$result = array();

		if (!isset($keys))
			$keys = array_keys($params);

		foreach ($keys as $key)
		{
			if (is_array($params[$key]))
			{
				foreach (it_url::_params($params[$key], null, false) as $value)
				{
					if (strlen($value))
						$result[] = it::replace(array('^([^=\[]*)' => urlencode($key) . '[$1]'), $value);
				}
			}
			else if (strlen($params[$key]))
				$result[] = urlencode($key) . "=" . it_url::encode($params[$key]);
		}

		if ($finalize)
			$result = preg_replace(['#\[#', '#\]#'], ['%5B', '%5D'], $result);

		return $result;
	}

	/**
	 * Similar to it::parse_str but leaves . and space in arg names intact
	 * @param $query Query string without leading "?"
	 * @return assoc array of decoded key => decoded value
	 */
	static function parse_str($query)
	{
		foreach (explode('&', $query) as $arg)
		{
			list($key, $value) = array_pad(explode('=', $arg, 2), 2, null);	# pad: args without "=" have no value
			$result[it::urldecode($key)] = it::urldecode($value);
		}

		return (array)$result;
	}

	/**
	 * Convert url into array with base url in $result[0] and GET params
	 */
	static function parse($url)
	{
		list($path, $query) = array_pad(explode("?", $url, 2), 2, null);	# pad: url may have no query part

		return (array)$path + (array)it::parse_str((string)$query);
	}
}

?>