From 0ce1010d26e4c22cda2f3576ea45e3564119d004 Mon Sep 17 00:00:00 2001 From: Christian Schneider Date: Fri, 31 Jul 2009 17:12:13 +0000 Subject: New version of get_cache (using If-Not-Modified-Since support in get) --- it_url.class | 212 ++++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 129 insertions(+), 83 deletions(-) diff --git a/it_url.class b/it_url.class index 063aed0..7035291 100644 --- a/it_url.class +++ b/it_url.class @@ -225,6 +225,7 @@ function is_reachable($timeout = 5) * @param $p['url']: url to get, defaults to constructor URL * @param $p['timeout']: timeout per read in seconds, defaults to 5. fractions allowed * @param $p['totaltimeout']: timeout for the whole function call + * @param $p['filemtime']: Add HTTP header to only fetch when newer than this, otherwise return true instead of data * @param $p['data']: POST data array with key-value pairs * @return contents of resulting page, considering redirects, excluding headers, or false on error */ @@ -260,6 +261,9 @@ function get($p=null, $timeout=5) 'Accept-Language' => T_lang(), ); + if (is_int($p['filemtime'])) + $p['headers']['If-Modified-Since'] = date("r", $p['filemtime']); + if ($datalen = strlen($data)) { $method = "POST"; @@ -297,7 +301,7 @@ function get($p=null, $timeout=5) if ($url->result) { - if ($url->headers['Transfer-Encoding'] == "chunked") # Bogus HTTP/1.1 chunked answer from server (e.g. Wordpress/Apach2/PHP5) + if ($url->headers['Transfer-Encoding'] == "chunked") # Bogus HTTP/1.1 chunked answer from server (e.g. 
Wordpress/Apache2/PHP5) { while ($len = hexdec(fgets($fp))) { @@ -315,7 +319,9 @@ function get($p=null, $timeout=5) $url->data .= @fread($fp, 20480); } - if ($url->result < 400) + if ($p['filemtime'] && ($url->result == 304)) + $result = true; # Not modified, success but no data + else if ($url->result < 400) $result =& $url->data; } @@ -375,93 +381,83 @@ function get_cache($p = array()) { $p += array('timeout' => 10, 'maxage' => 86400, 'cleanbefore' => 7200, 'safety' => 1, 'it_error' => array()); $p['totaltimeout'] = $p['timeout']; - $path = it_url::get_cache_filename($p); # Must be before changing cachedir below + $path = it_url::get_cache_filename($p); # Must be before changing cachedir below $p['cachedir'] = it_url::get_cache_dir($p); @mkdir($p['cachedir']); @mkdir(dirname($path)); - $age = file_exists($path) ? (time() - @filemtime($path)) : 0; - - # expire forgotten locks - $lockmtime = @filemtime("$path.lock"); - if ($lockmtime && time()-$lockmtime > 30) - @unlink("$path.lock"); - if ($p['keepfailed'] && ($age>$p['maxage']) && ($dummy = @fopen("$path.lock", EDC('nocache') ? "w" : "x"))) # update our copy if we get the lock - { - # my job to refresh the cache entry - fclose($dummy); - - # Touch existing file to prevent locking other getters - touch($path); + if ($filemtime = it_url::_expired($path, $p['maxage'])) # Outdated(non-zero int) or non-existent(true)? + { + if ($lock = it_url::_lock($path)) + { + # Touch existing file to prevent locking other getters while refreshing + if ($filemtime !== true) + touch($path); - EDC('getcache', "refresh", $p['url'], $path); - if (($result = it_url::get($p))) - it_url::_atomicwrite($path, $result); - else - touch($path); + EDC('getcache', "new", $filemtime, $p['url'], $path); + if ($result = it_url::get($p + array('filemtime' => EDC('nocache') ? 
null : $filemtime))) # => true means not modified (no new data fetched) + $newfile = it_url::_atomicwrite($path, $result); + else if (!$p['keepfailed']) + @unlink($path); # Expired and failed to get - if ($p['safety'] == 1 && !$result) + it_url::_unlock($path, $lock); + } + else { - $parts = @parse_url($p['url']); - it::error($p['it_error'] + array('title'=>"get_cache: download failures on {$p['url']}", 'id'=>$parts['host'])); # send err only if multi failure + # Wait for file currently being transferred + EDC('getcache', "wait", $p['url'], $path); + $result = it_url::_waitforlock($path, $p); } - @unlink("$path.lock"); - } - - # Remove ancient lock or cached file if it is too old - if (!$p['keepfailed']) - it_url::_expire($path, $p['maxage']); - - if ($dummy = @fopen($path, EDC('nocache') ? "w" : "x")) # succeeds if file is missing (or we are in nocache mode) - { - # fill cache myself - fclose($dummy); - EDC('getcache', "new", $p['url'], $path); - $result = it_url::_atomicwrite($path, it_url::get($p)); } else { - # get file from cache, potentially waiting if file is currently being transferred - EDC('getcache', "old", $p['url'], $path); - $result = it_url::_waitforpath($p + array('path' => $path)); + # Get file from cache + EDC('getcache', "cached", $p['url'], $path); + $result = true; # Up to date } if ($result && $p['preprocess']) { $srcpath = $path; $path .= substr(md5(serialize($p['preprocess'])), 0, 2); - it_url::_expire($path, $p['maxage']); - if ($dummy = @fopen($path, EDC('nocache') ? "w" : "x")) # in nocache mode, always succeed + if ($filemtime = $newfile ? true : it_url::_expired($path, $p['maxage'])) # Outdated(non-zero int) or non-existent(true)? 
{ - fclose($dummy); - EDC('getcache', "process", $p['url'], $path); - $dstpath = "$path.preprocesstmp"; + if ($result === true && $filemtime !== true) # Source not modified, destination exists => touch + { + EDC('getcache', "processtouch", $p['url'], $path); + touch($path); + } + else if ($lock = it_url::_lock($path)) + { + # Touch existing file to prevent locking other getters while refreshing + if ($filemtime !== true) + touch($path); - if (is_array($p['preprocess']) && $p['preprocess']['function']) # Needs is_array as it can be a string where dereferencing gives first character! - call_user_func($p['preprocess']['function'], array('in' => $srcpath, 'out' => $dstpath) + $p['preprocess']); - else - call_user_func($p['preprocess'], $srcpath, $dstpath); + EDC('getcache', "processnew", $p['url'], $path); + $dstpath = "$path.preprocesstmp"; - if (!@filesize($dstpath) || !@rename($dstpath, $path)) - { - @unlink($dstpath); - @unlink($path); - $result = false; + if (is_array($p['preprocess']) && $p['preprocess']['function']) # Needs is_array as it can be a string where dereferencing gives first character! + call_user_func($p['preprocess']['function'], array('in' => $srcpath, 'out' => $dstpath) + $p['preprocess']); + else + call_user_func($p['preprocess'], $srcpath, $dstpath); + + if (!($result = @filesize($dstpath) && @rename($dstpath, $path))) + { + @unlink($dstpath); + @unlink($path); + } + + it_url::_unlock($path, $lock); } else - $result = $path; - - if ($result) { - EDC('getcache', "processold", $p['url'], $path); - touch($result, @filemtime($srcpath)); # Ensure processed is never newer than src + # Wait for file currently being processed + EDC('getcache', "processwait", $p['url'], $path); + $result = it_url::_waitforlock($path, $p); } } - else - $result = it_url::_waitforpath($p + array('path' => $path)); - } # cache cleanup at night @@ -472,38 +468,98 @@ function get_cache($p = array()) exec("nohup bash -c 'cd {$p['cachedir']} && sleep 10 && find ?? 
-mmin +$maxagemin -print0 | xargs -0 -r rm' /dev/null 2>&1 &"); } + EDC('getcache', $result, $path); + return $result ? $path : false; +} + +/** + * Check whether file at given path is older than maxage + * @param $path File to check + * @param $maxage Maximum age of file in seconds + * @return Not expired: false | Non-existent file: true | Timestamp of expired file + */ +function _expired($path, $maxage) +{ + if ($result = EDC('nocache') ? false : @filemtime($path)) + { + if (time() - $result > $maxage) + EDC('getcache', "expired", $path); + else + $result = false; + } + else # File does not exist yet + $result = true; + return $result; } -function _waitforpath($p) +/** + * Acquire lock for a given file + * @param $path File to lock + * @return Lock handle if successfully locked file + */ +function _lock($path) +{ + # expire forgotten locks + if (($mtime = @filemtime("$path.lock")) && (time() - $mtime > 30)) + @unlink("$path.lock"); + + return @fopen("$path.lock", EDC('nocache') ? "w" : "x"); +} + +/** + * Release lock on a file + * @param $path File to unlock + * @param $lock Handle to lock acquired by _lock + */ +function _unlock($path, $lock) +{ + fclose($lock); + @unlink("$path.lock"); +} + +/** + * Wait for lock on a file to be released + * @param $path File to wait for lock + * @param $p Wait parameters, see @get_cache + * @return Whether lock was released within timeout + */ +function _waitforlock($path, $p) { - $p += array('sleeptime' => 0.1); # seconds to wait per pass + $sleeptime = 0.1; # seconds to wait per pass # wait until cache is ready, then read from cache - for ($maxpasses = $p['timeout'] / $p['sleeptime'], $passes = 0; (($size = @filesize($p['path'])) === 0) && ($passes < $maxpasses); ++$passes) + for ($maxpasses = $p['timeout'] / $sleeptime, $passes = 0; ($result = file_exists("$path.lock")) && ($passes < $maxpasses); ++$passes) { - usleep($p['sleeptime'] * 1000000); + usleep($sleeptime * 1000000); clearstatcache(); } - if ($size) - $result = 
$p['path']; - else if ($p['safety'] == 1) + if ($result && $p['safety'] == 1) it::error(($passes < $maxpasses ? "error getting url" : "timeout") . " in it_url::get_cache(): url={$p['url']}, passes=$passes, maxpasses=$maxpasses, path={$p['path']}"); - return $result; + return !$result; } +/** + * Write data to tmp file and atomically rename it to destination + * @param $path Destination file to write data to + * @param $data Data to write | true to just touch file + * @return True if data was written to file + */ function _atomicwrite($path, $data) { - if ($data !== false) + $result = false; + + if ($data === true) # Not modified, no new data, just update timestamp + touch($path); + else if ($data !== false) { $tmpname = tempnam(dirname($path), "writetmp"); fputs($cachetmp = fopen($tmpname, "w"), $data); fclose($cachetmp); chmod($tmpname, 0664); - rename($tmpname, $path); - $result = $path; + $result = rename($tmpname, $path); } else @unlink($path); @@ -511,16 +567,6 @@ function _atomicwrite($path, $data) return $result; } -function _expire($path, $maxage) -{ - # Remove ancient lock or cached file if it is too old - if (file_exists($path) && (((@filesize($path) == 0) && (time() - @filemtime($path)) > 30) || (time() - @filemtime($path) > $maxage))) - { - EDC('getcache', "expire", $path); - @unlink($path); - } -} - /** * Make an URL absolute by using host an protocol from current Apache request (but not port number) * @param $url Optional URL ( foo.html, /foo.html, //host/bar.html, http://host/bar.html ), default self -- cgit v1.2.3