summaryrefslogtreecommitdiff
path: root/urlcache/urlcache.class
diff options
context:
space:
mode:
authorChristian Schneider2006-10-26 13:35:12 +0000
committerChristian Schneider2006-10-26 13:35:12 +0000
commita5a19fd672bc0b8113d620669b557f17dccd343a (patch)
tree876ba4fec8362ac2e9374f61b9b7f67fcd2b8e59 /urlcache/urlcache.class
downloaditools-a5a19fd672bc0b8113d620669b557f17dccd343a.tar.gz
itools-a5a19fd672bc0b8113d620669b557f17dccd343a.tar.bz2
itools-a5a19fd672bc0b8113d620669b557f17dccd343a.zip
Moved itools to live branch
Diffstat (limited to 'urlcache/urlcache.class')
-rw-r--r--urlcache/urlcache.class225
1 files changed, 225 insertions, 0 deletions
diff --git a/urlcache/urlcache.class b/urlcache/urlcache.class
new file mode 100644
index 0000000..790efa4
--- /dev/null
+++ b/urlcache/urlcache.class
@@ -0,0 +1,225 @@
+<?php
+/*
+** $Id$
+**
+** Relog Internet Tools 3 Library ("ITOOLS3")
+**
+** it_urlcache.class - Interface to URL content cache in DB
+**
+** This class relies on a DB table with the following scheme:
+** url CHAR(255) NOT NULL,
+** updateinterval INT NOT NULL,
+** lastupdate INT NOT NULL,
+** nextupdate INT NOT NULL,
+** contentmd5 CHAR(32) NOT NULL,
+** content TEXT NOT NULL,
+** PRIMARY KEY (url),
+** KEY (nextupdate)
+**
+** $Log$
+** Revision 1.2 2002/01/22 17:42:14 weber
+** Changed set_socket_blocking() to socket_set_blocking()
+**
+** Revision 1.1 2000/01/28 02:37:55 cschneid
+** Added URL cache class and simple script to be put into crontab
+**
+*/
+
+/* PRIVATE */
+/* Initial value of it_urlcache::$default_interval: the update interval in seconds given to newly created cache records */
+define("_IT_URLCACHE_DEFAULT_INTERVAL", 300);
+/* Initial value of it_urlcache::$max_connections: upper bound on the number of URLs _fetch_urls() opens at once */
+define("_IT_URLCACHE_MAX_CONNECTIONS", 30);
+
+class it_urlcache extends it_db_record
+{
+ /* PRIVATE */
+ var $default_interval = _IT_URLCACHE_DEFAULT_INTERVAL;
+ var $max_connections = _IT_URLCACHE_MAX_CONNECTIONS;
+
+/*
+ * Set the update interval (in seconds) that is assigned to newly created
+ * cache records. _update_cache() also uses it as the lower bound when it
+ * shortens the interval of a record whose content changed.
+ */
+function set_default_interval($default_interval)
+{
+ $this->default_interval = $default_interval;
+}
+
+
+/*
+ * Set the maximum number of URLs that _fetch_urls() keeps open at the
+ * same time while refreshing the cache.
+ */
+function set_max_connections($max_connections)
+{
+ $this->max_connections = $max_connections;
+}
+
+
+/*
+ * Register an URL to be fetched by the background process. An URL that
+ * already has a record in the cache table is left untouched.
+ * Note: get_content() will return an empty string until the url is fetched
+ * for the first time
+ */
+function register($url)
+{
+    /* Already known: nothing to do */
+    if ($this->read($url))
+        return;
+
+    $this->create(array(
+        "url" => $url,
+        "updateinterval" => $this->default_interval
+    ));
+}
+
+
+/*
+ * Drop the url from the cache if it has a record there. It will not be
+ * fetched again until register() is called for it once more.
+ */
+function unregister($url)
+{
+    $known = $this->read($url);
+
+    if ($known)
+    {
+        $this->delete();
+    }
+}
+
+
+/*
+ * Get the cached content of url.
+ * Returns an empty string if the url is not in the cache or was never
+ * successfully fetched.
+ */
+function get_content($url)
+{
+    /*
+     * Do not touch $this->data when the record could not be read: it
+     * would hold no content entry (or possibly one left over from an
+     * earlier read) instead of the content belonging to $url.
+     */
+    if (!$this->read($url))
+        return "";
+
+    return isset($this->data['content']) ? $this->data['content'] : "";
+}
+
+
+/*
+ * Refetch every URL in the database whose nextupdate time has passed.
+ * Note: This is intended for background process use only as it may take
+ * a while to finish...
+ */
+function update_cache()
+{
+    $due = array();
+    $now = time();
+
+    $rows = $this->table->safe_sql_select("WHERE nextupdate < $now", "url");
+
+    /* Only the url column is selected, so it is the first field of each row */
+    while ($row = $this->table->db->fetch_array($rows))
+    {
+        $due[] = $row[0];
+    }
+
+    $this->table->db->free($rows);
+
+    $this->_fetch_urls($due, $now);
+}
+
+
+/*
+ * PRIVATE
+ * Store freshly fetched $content for $url and schedule the next update.
+ *
+ * $url     URL that was fetched
+ * $content Data read from that URL
+ * $now     Timestamp (as returned by time()) of the current update run
+ *
+ * The update interval adapts to how often the content changes: it grows by
+ * a factor of 1.5 when unchanged content is older than twice the interval,
+ * and shrinks by the same factor (but never below the default interval)
+ * when the content changed in less than half the interval.
+ */
+function _update_cache($url, $content, $now)
+{
+    $contentmd5 = md5($content);
+
+    if (!$this->read($url))
+    {
+        /* No record yet: create one using the default interval */
+        $tags = array("url" => $url, "updateinterval" => $this->default_interval, "lastupdate" => $now, "nextupdate" => $now + $this->default_interval, "contentmd5" => $contentmd5, "content" => $content);
+        $this->create($tags);
+        return;
+    }
+
+    $interval = $this->data['updateinterval'];
+    $age = $now - $this->data['lastupdate'];    /* Time since content last changed */
+
+    /*
+     * Strict comparison: with == two different digests that both look
+     * like "0e<digits>" are compared as numbers and considered equal.
+     */
+    if ($this->data['contentmd5'] === $contentmd5)
+    {
+        /* Unchanged for a long time: check less often */
+        if ($age > ($interval * 2))
+            $interval = (int)round($interval * 1.5);    /* Columns are INT */
+
+        $tags = array("updateinterval" => $interval, "nextupdate" => $now + $interval);
+    }
+    else
+    {
+        /* Changed quickly: check more often, but not below the default */
+        if ($age < ($interval / 2))
+            $interval = (int)round($interval / 1.5);    /* Columns are INT */
+
+        if ($interval < $this->default_interval)
+            $interval = $this->default_interval;
+
+        $tags = array("updateinterval" => $interval, "lastupdate" => $now, "nextupdate" => $now + $interval, "contentmd5" => $contentmd5, "content" => $content);
+    }
+
+    $this->update($tags);
+}
+
+
+/*
+ * PRIVATE
+ * Fetch the URLs in array $pages and update the cache database accordingly.
+ *
+ * $pages List of URLs to fetch
+ * $now   Timestamp (as returned by time()) passed on to _update_cache()
+ *
+ * At most $this->max_connections URLs are kept open at the same time. All
+ * streams are read in non-blocking mode; whenever one reaches EOF its
+ * content is handed to _update_cache() and the slot is reused for the next
+ * URL that can be opened. URLs that cannot be opened are skipped.
+ */
+function _fetch_urls($pages, $now)
+{
+    $pages = array_values($pages);    /* We walk the list by position */
+    $pos = 0;
+    $fds = array();
+    $urls = array();
+    $content = array();
+    $count = 0;    /* Number of currently open streams */
+
+    /* Fill the initial slots, skipping over URLs which fail to open */
+    while (($count < $this->max_connections) && ($opened = $this->_open_next_url($pages, $pos)))
+    {
+        $fds[] = $opened[0];
+        $urls[] = $opened[1];
+        $content[] = "";
+        $count++;
+    }
+
+    while ($count > 0)
+    {
+        $read = 0;
+
+        for ($i = 0; $i < count($fds); $i++)
+        {
+            if (!$fds[$i])
+                continue;    /* Slot is currently unused */
+
+            if (!feof($fds[$i]))
+            {
+                /* fread() returns false on failure, treat as no data */
+                $data = (string)fread($fds[$i], 4096);
+                $content[$i] .= $data;
+                $read += strlen($data);
+                debug("Read " . strlen($data) . " from $urls[$i]", 10);
+            }
+
+            if (feof($fds[$i]))
+            {
+                fclose($fds[$i]);
+                $fds[$i] = 0;
+                $count--;
+                $this->_update_cache($urls[$i], $content[$i], $now);
+                debug("Finished $urls[$i]: " . strlen($content[$i]) . " bytes", 10);
+
+                /* Schedule next url for retrieval */
+                if ($opened = $this->_open_next_url($pages, $pos))
+                {
+                    $fds[$i] = $opened[0];
+                    $urls[$i] = $opened[1];
+                    $content[$i] = "";
+                    $count++;
+                }
+            }
+        }
+
+        /* We are in non-blocking mode, be nice */
+        if (($count > 0) && ($read == 0))
+        {
+            sleep(1);
+            debug("Sleeping...", 10);
+        }
+    }
+}
+
+
+/*
+ * PRIVATE
+ * Open the next URL of $pages, starting at position $pos, and switch it to
+ * non-blocking mode. URLs which cannot be opened are skipped so that one
+ * failure never keeps the URLs queued behind it from being fetched.
+ * $pos is advanced past every URL that was tried.
+ * Returns array($fd, $url) or false if no URL is left to open.
+ */
+function _open_next_url($pages, &$pos)
+{
+    while ($pos < count($pages))
+    {
+        $url = $pages[$pos++];
+        debug("Opening $url", 10);
+
+        if ($fd = fopen($url, "r"))
+        {
+            /* socket_set_blocking() is an alias which was removed in PHP 7 */
+            stream_set_blocking($fd, false);
+            debug("Opened $url", 10);
+            return array($fd, $url);
+        }
+    }
+
+    return false;
+}
+
+} /* End class it_urlcache */