summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- it_url.class 212
1 files changed, 129 insertions, 83 deletions
diff --git a/it_url.class b/it_url.class
index 063aed0..7035291 100644
--- a/it_url.class
+++ b/it_url.class
@@ -225,6 +225,7 @@ function is_reachable($timeout = 5)
* @param $p['url']: url to get, defaults to constructor URL
* @param $p['timeout']: timeout per read in seconds, defaults to 5. fractions allowed
* @param $p['totaltimeout']: timeout for the whole function call
+ * @param $p['filemtime']: Add HTTP header to only fetch when newer than this, otherwise return true instead of data
* @param $p['data']: POST data array with key-value pairs
* @return contents of resulting page, considering redirects, excluding headers, or false on error
*/
@@ -260,6 +261,9 @@ function get($p=null, $timeout=5)
'Accept-Language' => T_lang(),
);
+ if (is_int($p['filemtime']))
+ $p['headers']['If-Modified-Since'] = date("r", $p['filemtime']);
+
if ($datalen = strlen($data))
{
$method = "POST";
@@ -297,7 +301,7 @@ function get($p=null, $timeout=5)
if ($url->result)
{
- if ($url->headers['Transfer-Encoding'] == "chunked") # Bogus HTTP/1.1 chunked answer from server (e.g. Wordpress/Apach2/PHP5)
+ if ($url->headers['Transfer-Encoding'] == "chunked") # Bogus HTTP/1.1 chunked answer from server (e.g. Wordpress/Apache2/PHP5)
{
while ($len = hexdec(fgets($fp)))
{
@@ -315,7 +319,9 @@ function get($p=null, $timeout=5)
$url->data .= @fread($fp, 20480);
}
- if ($url->result < 400)
+ if ($p['filemtime'] && ($url->result == 304))
+ $result = true; # Not modified, success but no data
+ else if ($url->result < 400)
$result =& $url->data;
}
@@ -375,93 +381,83 @@ function get_cache($p = array())
{
$p += array('timeout' => 10, 'maxage' => 86400, 'cleanbefore' => 7200, 'safety' => 1, 'it_error' => array());
$p['totaltimeout'] = $p['timeout'];
- $path = it_url::get_cache_filename($p); # Must be before changing cachedir below
+ $path = it_url::get_cache_filename($p); # Must be before changing cachedir below
$p['cachedir'] = it_url::get_cache_dir($p);
@mkdir($p['cachedir']);
@mkdir(dirname($path));
- $age = file_exists($path) ? (time() - @filemtime($path)) : 0;
-
- # expire forgotten locks
- $lockmtime = @filemtime("$path.lock");
- if ($lockmtime && time()-$lockmtime > 30)
- @unlink("$path.lock");
- if ($p['keepfailed'] && ($age>$p['maxage']) && ($dummy = @fopen("$path.lock", EDC('nocache') ? "w" : "x"))) # update our copy if we get the lock
- {
- # my job to refresh the cache entry
- fclose($dummy);
-
- # Touch existing file to prevent locking other getters
- touch($path);
+ if ($filemtime = it_url::_expired($path, $p['maxage'])) # Outdated(non-zero int) or non-existant(true)?
+ {
+ if ($lock = it_url::_lock($path))
+ {
+ # Touch existing file to prevent locking other getters while refreshing
+ if ($filemtime !== true)
+ touch($path);
- EDC('getcache', "refresh", $p['url'], $path);
- if (($result = it_url::get($p)))
- it_url::_atomicwrite($path, $result);
- else
- touch($path);
+ EDC('getcache', "new", $filemtime, $p['url'], $path);
+ if ($result = it_url::get($p + array('filemtime' => EDC('nocache') ? null : $filemtime))) # => true means not modified (no new data fetched)
+ $newfile = it_url::_atomicwrite($path, $result);
+ else if (!$p['keepfailed'])
+ @unlink($path); # Expired and failed to get
- if ($p['safety'] == 1 && !$result)
+ it_url::_unlock($path, $lock);
+ }
+ else
{
- $parts = @parse_url($p['url']);
- it::error($p['it_error'] + array('title'=>"get_cache: download failures on {$p['url']}", 'id'=>$parts['host'])); # send err only if multi failure
+ # Wait for file currently being transferred
+ EDC('getcache', "wait", $p['url'], $path);
+ $result = it_url::_waitforlock($path, $p);
}
- @unlink("$path.lock");
- }
-
- # Remove ancient lock or cached file if it is too old
- if (!$p['keepfailed'])
- it_url::_expire($path, $p['maxage']);
-
- if ($dummy = @fopen($path, EDC('nocache') ? "w" : "x")) # succeeds if file is missing (or we are in nocache mode)
- {
- # fill cache myself
- fclose($dummy);
- EDC('getcache', "new", $p['url'], $path);
- $result = it_url::_atomicwrite($path, it_url::get($p));
}
else
{
- # get file from cache, potentially waiting if file is currently being transferred
- EDC('getcache', "old", $p['url'], $path);
- $result = it_url::_waitforpath($p + array('path' => $path));
+ # Get file from cache
+ EDC('getcache', "cached", $p['url'], $path);
+ $result = true; # Up to date
}
if ($result && $p['preprocess'])
{
$srcpath = $path;
$path .= substr(md5(serialize($p['preprocess'])), 0, 2);
- it_url::_expire($path, $p['maxage']);
- if ($dummy = @fopen($path, EDC('nocache') ? "w" : "x")) # in nocache mode, always succeed
+ if ($filemtime = $newfile ? true : it_url::_expired($path, $p['maxage'])) # Outdated(non-zero int) or non-existant(true)?
{
- fclose($dummy);
- EDC('getcache', "process", $p['url'], $path);
- $dstpath = "$path.preprocesstmp";
+ if ($result === true && $filemtime !== true) # Source not modified, destination exists => touch
+ {
+ EDC('getcache', "processtouch", $p['url'], $path);
+ touch($path);
+ }
+ else if ($lock = it_url::_lock($path))
+ {
+ # Touch existing file to prevent locking other getters while refreshing
+ if ($filemtime !== true)
+ touch($path);
- if (is_array($p['preprocess']) && $p['preprocess']['function']) # Needs is_array as it can be a string where dereferencing gives first character!
- call_user_func($p['preprocess']['function'], array('in' => $srcpath, 'out' => $dstpath) + $p['preprocess']);
- else
- call_user_func($p['preprocess'], $srcpath, $dstpath);
+ EDC('getcache', "processnew", $p['url'], $path);
+ $dstpath = "$path.preprocesstmp";
- if (!@filesize($dstpath) || !@rename($dstpath, $path))
- {
- @unlink($dstpath);
- @unlink($path);
- $result = false;
+ if (is_array($p['preprocess']) && $p['preprocess']['function']) # Needs is_array as it can be a string where dereferencing gives first character!
+ call_user_func($p['preprocess']['function'], array('in' => $srcpath, 'out' => $dstpath) + $p['preprocess']);
+ else
+ call_user_func($p['preprocess'], $srcpath, $dstpath);
+
+ if (!($result = @filesize($dstpath) && @rename($dstpath, $path)))
+ {
+ @unlink($dstpath);
+ @unlink($path);
+ }
+
+ it_url::_unlock($path, $lock);
}
else
- $result = $path;
-
- if ($result)
{
- EDC('getcache', "processold", $p['url'], $path);
- touch($result, @filemtime($srcpath)); # Ensure processed is never newer than src
+ # Wait for file currently being processed
+ EDC('getcache', "processwait", $p['url'], $path);
+ $result = it_url::_waitforlock($path, $p);
}
}
- else
- $result = it_url::_waitforpath($p + array('path' => $path));
-
}
# cache cleanup at night
@@ -472,38 +468,98 @@ function get_cache($p = array())
exec("nohup bash -c 'cd {$p['cachedir']} && sleep 10 && find ?? -mmin +$maxagemin -print0 | xargs -0 -r rm' </dev/null >/dev/null 2>&1 &");
}
+ EDC('getcache', $result, $path);
+ return $result ? $path : false;
+}
+
+/**
+ * Check whether file at given path is older than maxage
+ * @param $path File to check
+ * @param $maxage Maximum age of file in seconds
+ * @return Not expired: false | Non-existent file: true | Timestamp of expired file
+ */
+function _expired($path, $maxage)
+{
+ if ($result = EDC('nocache') ? false : @filemtime($path))
+ {
+ if (time() - $result > $maxage)
+ EDC('getcache', "expired", $path);
+ else
+ $result = false;
+ }
+ else # File does not exists yet
+ $result = true;
+
return $result;
}
-function _waitforpath($p)
+/**
+ * Acquire lock for a given file
+ * @param $path File to lock
+ * @return Lock handle if successfully locked file
+ */
+function _lock($path)
+{
+ # expire forgotten locks
+ if (($mtime = @filemtime("$path.lock")) && (time() - $mtime > 30))
+ @unlink("$path.lock");
+
+ return @fopen("$path.lock", EDC('nocache') ? "w" : "x");
+}
+
+/**
+ * Release lock on a file
+ * @param $path File to unlock
+ * @param $lock Handle to lock acquired by _lock
+ */
+function _unlock($path, $lock)
+{
+ fclose($lock);
+ @unlink("$path.lock");
+}
+
+/**
+ * Wait until the lock file "$path.lock" has disappeared, polling every 0.1 seconds
+ * @param $path File whose lock to wait for
+ * @param $p Wait parameters as in get_cache: uses 'timeout' (seconds), 'safety' and 'url'
+ * @return True if the lock was released within the timeout, false if it still exists
+ */
+function _waitforlock($path, $p)
{
- $p += array('sleeptime' => 0.1); # seconds to wait per pass
+ $sleeptime = 0.1; # seconds to wait per pass
# wait until cache is ready, then read from cache
- for ($maxpasses = $p['timeout'] / $p['sleeptime'], $passes = 0; (($size = @filesize($p['path'])) === 0) && ($passes < $maxpasses); ++$passes)
+ for ($maxpasses = $p['timeout'] / $sleeptime, $passes = 0; ($result = file_exists("$path.lock")) && ($passes < $maxpasses); ++$passes)
{
- usleep($p['sleeptime'] * 1000000);
+ usleep($sleeptime * 1000000);
clearstatcache();
}
- if ($size)
- $result = $p['path'];
- else if ($p['safety'] == 1)
+ if ($result && $p['safety'] == 1)
- it::error(($passes < $maxpasses ? "error getting url" : "timeout") . " in it_url::get_cache(): url={$p['url']}, passes=$passes, maxpasses=$maxpasses, path={$p['path']}");
+ it::error(($passes < $maxpasses ? "error getting url" : "timeout") . " in it_url::get_cache(): url={$p['url']}, passes=$passes, maxpasses=$maxpasses, path=$path"); # path is now a separate argument, $p has no 'path' key
- return $result;
+ return !$result;
}
+/**
+ * Write data to tmp file and atomically rename it to destination
+ * @param $path Destination file to write data to
+ * @param $data Data to write | true to just touch file
+ * @return True if data was written to file
+ */
function _atomicwrite($path, $data)
{
- if ($data !== false)
+ $result = false;
+
+ if ($data === true) # Not modified, no new data, just update timestamp
+ touch($path);
+ else if ($data !== false)
{
$tmpname = tempnam(dirname($path), "writetmp");
fputs($cachetmp = fopen($tmpname, "w"), $data);
fclose($cachetmp);
chmod($tmpname, 0664);
- rename($tmpname, $path);
- $result = $path;
+ $result = rename($tmpname, $path);
}
else
@unlink($path);
@@ -511,16 +567,6 @@ function _atomicwrite($path, $data)
return $result;
}
-function _expire($path, $maxage)
-{
- # Remove ancient lock or cached file if it is too old
- if (file_exists($path) && (((@filesize($path) == 0) && (time() - @filemtime($path)) > 30) || (time() - @filemtime($path) > $maxage)))
- {
- EDC('getcache', "expire", $path);
- @unlink($path);
- }
-}
-
/**
* Make an URL absolute by using host an protocol from current Apache request (but not port number)
* @param $url Optional URL ( foo.html, /foo.html, //host/bar.html, http://host/bar.html ), default self