diff options
Diffstat (limited to 'urlcache/urlcache.class')
-rw-r--r-- | urlcache/urlcache.class | 225 |
1 files changed, 225 insertions, 0 deletions
<?php
/*
**	$Id$
**
**	Relog Internet Tools 3 Library ("ITOOLS3")
**
**	it_urlcache.class - Interface to URL content cache in DB
**
**	This class relies on a DB table with the following scheme:
**		url CHAR(255) NOT NULL,
**		updateinterval INT NOT NULL,
**		lastupdate INT NOT NULL,
**		nextupdate INT NOT NULL,
**		contentmd5 CHAR(32) NOT NULL,
**		content TEXT NOT NULL,
**		PRIMARY KEY (url),
**		KEY (nextupdate)
**
**	$Log$
**	Revision 1.2  2002/01/22 17:42:14  weber
**	Changed set_socket_blocking() to socket_set_blocking()
**
**	Revision 1.1  2000/01/28 02:37:55  cschneid
**	Added URL cache class and simple script to be put into crontab
**
*/

/* PRIVATE */
define("_IT_URLCACHE_DEFAULT_INTERVAL", 300);	/* Initial/minimum update interval (same unit as time()) */
define("_IT_URLCACHE_MAX_CONNECTIONS", 30);	/* Max. number of URLs fetched in parallel */

class it_urlcache extends it_db_record
{
	/* PRIVATE */
	var $default_interval = _IT_URLCACHE_DEFAULT_INTERVAL;
	var $max_connections = _IT_URLCACHE_MAX_CONNECTIONS;

/*
 * Set the update interval used for newly registered URLs. It is also the
 * lower bound the interval of a changing URL is clamped to.
 */
function set_default_interval($default_interval)
{
	$this->default_interval = $default_interval;
}


/*
 * Set the maximum number of URLs update_cache() keeps open at the same time.
 */
function set_max_connections($max_connections)
{
	$this->max_connections = $max_connections;
}


/*
 * Register an URL to be fetched by background process.
 * Note: get_content() will return an empty string until the url is fetched
 * for the first time
 */
function register($url)
{
	if (!$this->read($url))
	{
		/*
		 * All columns of the table are NOT NULL, so give every one an
		 * explicit value instead of relying on implicit DB defaults.
		 * nextupdate 0 makes the next update_cache() run pick the URL up.
		 */
		$tags = array(
			"url" => $url,
			"updateinterval" => $this->default_interval,
			"lastupdate" => 0,
			"nextupdate" => 0,
			"contentmd5" => "",
			"content" => ""
		);
		$this->create($tags);
	}
}


/*
 * Removes the url from the cache. It won't be fetched again until register()
 * is called again.
 */
function unregister($url)
{
	if ($this->read($url))
		$this->delete();
}


/*
 * Get content of url. Returns empty string if it was never successfully
 * fetched or if the url is not in the cache at all.
 */
function get_content($url)
{
	/* Don't hand out data of a previously loaded record if the read fails */
	if (!$this->read($url))
		return "";

	return $this->data['content'];
}


/*
 * Try to refetch all URLs in the database whose nextupdate time has passed.
 * Note: This is intended for background process use only as it may take
 * a while to finish...
 */
function update_cache()
{
	$now = time();
	$pages = array();

	$result = $this->table->safe_sql_select("WHERE nextupdate < $now", "url");

	while (list($url) = $this->table->db->fetch_array($result))
		$pages[] = $url;

	$this->table->db->free($result);

	$this->_fetch_urls($pages, $now);
}


/*
 * PRIVATE
 * Store freshly fetched $content for $url and adapt its update interval:
 * - content unchanged (same md5): only reschedule; if the content has been
 *   stable for more than two intervals, the interval grows by factor 1.5
 * - content changed: store it; if the previous change was less than half
 *   an interval ago, the interval shrinks by factor 1.5 (but never below
 *   the default interval)
 * URLs not yet in the table are created with the default interval.
 * $now is the timestamp of the current update run.
 */
function _update_cache($url, $content, $now)
{
	$contentmd5 = md5($content);

	if ($this->read($url))
	{
		/* Columns are INT; keep all arithmetic results integral */
		$interval = (int)$this->data['updateinterval'];
		$age = $now - (int)$this->data['lastupdate'];	/* Time since content last changed */

		if ($this->data['contentmd5'] == $contentmd5)
		{
			if ($age > ($interval * 2))
				$interval = (int)round($interval * 1.5);

			$tags = array("updateinterval" => $interval, "nextupdate" => $now + $interval);
		}
		else
		{
			if ($age < ($interval / 2))
				$interval = (int)round($interval / 1.5);

			if ($interval < $this->default_interval)
				$interval = $this->default_interval;

			$tags = array("updateinterval" => $interval, "lastupdate" => $now, "nextupdate" => $now + $interval, "contentmd5" => $contentmd5, "content" => $content);
		}

		$this->update($tags);
	}
	else
	{
		$tags = array("url" => $url, "updateinterval" => $this->default_interval, "lastupdate" => $now, "nextupdate" => $now + $this->default_interval, "contentmd5" => $contentmd5, "content" => $content);
		$this->create($tags);
	}
}


/*
 * PRIVATE
 * Open the next URL of $pages that can be opened, starting at index $pos.
 * $pos is advanced past every URL that was tried, so URLs which fail to
 * open are skipped. The returned stream is in non-blocking mode.
 * Returns array("fd" => stream, "url" => url) or false if no URL is left.
 */
function _open_next_url($pages, &$pos)
{
	$total = count($pages);

	while ($pos < $total)
	{
		$url = $pages[$pos++];
		debug("Opening $url", 10);

		if ($fd = fopen($url, "r"))
		{
			socket_set_blocking($fd, false);
			debug("Opened $url", 10);
			return array("fd" => $fd, "url" => $url);
		}
	}

	return false;
}


/*
 * PRIVATE
 * Fetch all URLs in array $pages using up to max_connections parallel
 * non-blocking connections and update the cache database accordingly.
 * $now is the timestamp of the current update run.
 */
function _fetch_urls($pages, $now)
{
	$pos = 0;		/* Index of next not yet tried entry in $pages */
	$fds = array();		/* Connection slots: open stream or 0 if slot is idle */
	$urls = array();	/* URL being fetched in each slot */
	$content = array();	/* Data received so far in each slot */
	$count = 0;		/* Number of currently open streams */

	/* Fill the initial slots; URLs failing to open don't use up a slot */
	while (($count < $this->max_connections) && ($conn = $this->_open_next_url($pages, $pos)))
	{
		$fds[] = $conn['fd'];
		$urls[] = $conn['url'];
		$content[] = "";
		$count++;
	}

	while ($count > 0)
	{
		$read = 0;	/* Bytes received in this round over all slots */

		for ($i = 0; $i < count($fds); $i++)
		{
			if (!$fds[$i])
				continue;

			if (!feof($fds[$i]))
			{
				$data = fread($fds[$i], 4096);
				$content[$i] .= $data;
				$read += strlen($data);
				debug("Read " . strlen($data) . " from $urls[$i]", 10);
			}

			if (feof($fds[$i]))
			{
				fclose($fds[$i]);
				$fds[$i] = 0;
				$count--;
				$this->_update_cache($urls[$i], $content[$i], $now);
				debug("Finished $urls[$i]: " . strlen($content[$i]) . " bytes", 10);

				/* Schedule next url for retrieval, reusing this slot */
				if ($conn = $this->_open_next_url($pages, $pos))
				{
					$fds[$i] = $conn['fd'];
					$urls[$i] = $conn['url'];
					$content[$i] = "";
					$count++;
				}
			}
		}

		/* We are in non-blocking mode, be nice */
		if (($count > 0) && ($read == 0))
		{
			sleep(1);
			debug("Sleeping...", 10);
		}
	}
}

} /* End class it_urlcache */