summaryrefslogtreecommitdiff
path: root/urlcache
diff options
context:
space:
mode:
authorChristian Schneider2006-10-26 13:35:12 +0000
committerChristian Schneider2006-10-26 13:35:12 +0000
commita5a19fd672bc0b8113d620669b557f17dccd343a (patch)
tree876ba4fec8362ac2e9374f61b9b7f67fcd2b8e59 /urlcache
downloaditools-a5a19fd672bc0b8113d620669b557f17dccd343a.tar.gz
itools-a5a19fd672bc0b8113d620669b557f17dccd343a.tar.bz2
itools-a5a19fd672bc0b8113d620669b557f17dccd343a.zip
Moved itools to live branch
Diffstat (limited to 'urlcache')
-rw-r--r--urlcache/.cvsignore2
-rw-r--r--urlcache/Makefile48
-rw-r--r--urlcache/urlcache.class225
-rw-r--r--urlcache/urlcache.php28
-rw-r--r--urlcache/urlcache.sql14
5 files changed, 317 insertions, 0 deletions
diff --git a/urlcache/.cvsignore b/urlcache/.cvsignore
new file mode 100644
index 0000000..b8e8856
--- /dev/null
+++ b/urlcache/.cvsignore
@@ -0,0 +1,2 @@
+*.slib
+*.lib
diff --git a/urlcache/Makefile b/urlcache/Makefile
new file mode 100644
index 0000000..ef66132
--- /dev/null
+++ b/urlcache/Makefile
@@ -0,0 +1,48 @@
+##
+## $Id$
+##
+## Makefile for itools/urlcache.lib
+##
+## $Log$
+## Revision 1.1 2000/01/28 02:37:54 cschneid
+## Added URL cache class and simple script to be put into crontab
+##
+##
+
+# External tools: cpp preprocesses the concatenated sources, phpcompile
+# (optional, see the $(LIB) rule) turns the result into the final library.
+CPP= cpp
+QUIETMAKE= $(MAKE) -s
+PHPCOMPILE= /usr/local/bin/phpcompile
+
+# Library name, sub-directories providing further sources (none here) and
+# the class files this library is built from.
+MODULE= urlcache
+SUBDIRS=
+CLASSES= urlcache.class
+
+#
+# Library creation rules, do not change stuff below...
+#
+# SLIB is the intermediate "source library", LIB the installed result which
+# is written to the parent directory.
+SLIB= $(MODULE).slib
+LIB= ../$(MODULE).lib
+
+all: $(LIB)
+
+# Run the source library through phpcompile if it is installed and
+# executable, otherwise simply copy it. Progress goes to stderr.
+$(LIB): $(SLIB)
+ @if [ -x $(PHPCOMPILE) ]; then (echo Compiling $(SLIB) to $(LIB) ...) 1>&2; $(PHPCOMPILE) <$(SLIB) >$(LIB); else (echo $(PHPCOMPILE) not found, copying $(SLIB) to $(LIB) ...) 1>&2; cp $(SLIB) $(LIB); fi
+
+# Build the source library: collect the output of "make cat" in every
+# sub-directory plus all class files, preprocess with cpp, strip leading
+# whitespace, blank lines and the individual PHP open/close tags, and wrap
+# everything in one single <?php ... ?> pair.
+# The literal word DUMMY in the for-lists keeps them non-empty when SUBDIRS
+# or CLASSES is empty; it is skipped by the test -d / test -f checks.
+# The DUMMY prerequisite has an empty rule, so this target is remade on
+# every run (as long as no file named DUMMY exists).
+$(SLIB): $(CLASSES) DUMMY
+ @(echo Creating $(SLIB) from $(SUBDIRS) $(CLASSES) ...) 1>&2
+ @echo "<?php" >$(SLIB)
+ @(for dir in DUMMY $(SUBDIRS); do (test -d $$dir && cd $$dir && $(QUIETMAKE) cat); done; for class in DUMMY $(CLASSES); do test -f $$class && cat $$class; done) | $(CPP) -P -undef | perl -ne 's/^\s+//g; print unless /^\s*$$/' | grep -v "^<?php" | grep -v "^?>" >>$(SLIB)
+ @echo "?>" >>$(SLIB)
+
+# Recurse into a sub-directory and run its default target quietly.
+$(SUBDIRS)::
+ @(cd $@; $(QUIETMAKE))
+
+# Empty rule used as a "force" prerequisite, see $(SLIB).
+DUMMY:
+
+# Print the source library to stdout; used by a parent Makefile that lists
+# this directory in its SUBDIRS.
+cat: $(SLIB)
+ @cat $(SLIB)
+
+# Remove generated files here and in all sub-directories; failures in
+# sub-directories are ignored.
+clean:
+ @(echo Cleaning $(SLIB) $(LIB) ...) 1>&2
+ @rm -f $(SLIB) $(LIB)
+ @for dir in DUMMY $(SUBDIRS); do (test -d $$dir && cd $$dir && $(QUIETMAKE) $@) || :; done
diff --git a/urlcache/urlcache.class b/urlcache/urlcache.class
new file mode 100644
index 0000000..790efa4
--- /dev/null
+++ b/urlcache/urlcache.class
@@ -0,0 +1,225 @@
+<?php
+/*
+** $Id$
+**
+** Relog Internet Tools 3 Library ("ITOOLS3")
+**
+** it_urlcache.class - Interface to URL content cache in DB
+**
+** This class relies on a DB table with the following scheme:
+** url CHAR(255) NOT NULL,
+** updateinterval INT NOT NULL,
+** lastupdate INT NOT NULL,
+** nextupdate INT NOT NULL,
+** contentmd5 CHAR(32) NOT NULL,
+** content TEXT NOT NULL,
+** PRIMARY KEY (url),
+** KEY (nextupdate)
+**
+** $Log$
+** Revision 1.2 2002/01/22 17:42:14 weber
+** Changed set_socket_blocking() to socket_set_blocking()
+**
+** Revision 1.1 2000/01/28 02:37:55 cschneid
+** Added URL cache class and simple script to be put into crontab
+**
+*/
+
+/* PRIVATE */
+/*
+ * Defaults for it_urlcache, used to initialize its member variables:
+ * - _IT_URLCACHE_DEFAULT_INTERVAL: update interval in seconds assigned to
+ *   newly created cache records
+ * - _IT_URLCACHE_MAX_CONNECTIONS: upper limit of URLs being fetched at the
+ *   same time
+ */
+define("_IT_URLCACHE_DEFAULT_INTERVAL", 300);
+define("_IT_URLCACHE_MAX_CONNECTIONS", 30);
+
+/*
+ * URL content cache stored in a DB table (one record per URL, keyed by url).
+ * Front end code uses register()/unregister()/get_content(), a background
+ * process periodically calls update_cache() to refetch URLs which are due.
+ */
+class it_urlcache extends it_db_record
+{
+	/* PRIVATE */
+	var $default_interval = _IT_URLCACHE_DEFAULT_INTERVAL;	/* Seconds between fetches for new URLs, also minimum for changing URLs */
+	var $max_connections = _IT_URLCACHE_MAX_CONNECTIONS;	/* Max. number of URLs fetched in parallel */
+
+
+/*
+ * Set update interval (in seconds) used for newly created records. The
+ * interval of an URL whose content changed is never reduced below this value.
+ */
+function set_default_interval($default_interval)
+{
+	$this->default_interval = $default_interval;
+}
+
+
+/*
+ * Set maximum number of URLs which are fetched at the same time
+ */
+function set_max_connections($max_connections)
+{
+	$this->max_connections = $max_connections;
+}
+
+
+/*
+ * Register an URL to be fetched by background process. Does nothing if the
+ * URL is already registered.
+ * Note: get_content() will return an empty string until the url is fetched
+ * for the first time
+ */
+function register($url)
+{
+	if (!$this->read($url))
+	{
+		$tags = array("url" => $url, "updateinterval" => $this->default_interval);
+		$this->create($tags);
+	}
+}
+
+
+/*
+ * Removes the url from the cache. It won't be fetched again until register()
+ * is called again.
+ */
+function unregister($url)
+{
+	if ($this->read($url))
+		$this->delete();
+}
+
+
+/*
+ * Get content of url. Returns empty string if the url is not in the cache or
+ * was never successfully fetched.
+ */
+function get_content($url)
+{
+	/*
+	 * No record for this url: $this->data does not describe it (it may
+	 * still hold a record read earlier), so return the documented default
+	 */
+	if (!$this->read($url))
+		return "";
+
+	return $this->data['content'];
+}
+
+
+/*
+ * Try to refetch all URLs in the database whose nextupdate time has passed.
+ * Note: This is intended for background process use only as it may take
+ * a while to finish...
+ */
+function update_cache()
+{
+	$now = time();
+	$pages = array();
+
+	/* $now is an integer produced by time(), no quoting needed */
+	$result = $this->table->safe_sql_select("WHERE nextupdate < $now", "url");
+
+	while (list($url) = $this->table->db->fetch_array($result))
+		$pages[] = $url;
+
+	$this->table->db->free($result);
+
+	$this->_fetch_urls($pages, $now);
+}
+
+
+/*
+ * PRIVATE
+ * Store freshly fetched $content of $url and schedule the next fetch.
+ * - Unknown url: record is created with the default interval
+ * - Content unchanged (same md5): only the schedule is updated; the interval
+ *   grows by factor 1.5 if the content is older than twice the interval
+ * - Content changed: content, md5 and lastupdate are stored; the interval
+ *   shrinks by factor 1.5 if the previous content was younger than half the
+ *   interval, but never below the default interval
+ * Note: lastupdate is the time of the last content change, not of the last
+ * fetch.
+ */
+function _update_cache($url, $content, $now)
+{
+	$contentmd5 = md5($content);
+
+	if (!$this->read($url))
+	{
+		$interval = $this->default_interval;
+		$tags = array("url" => $url, "updateinterval" => $interval, "lastupdate" => $now, "nextupdate" => $now + $interval, "contentmd5" => $contentmd5, "content" => $content);
+		$this->create($tags);
+		return;
+	}
+
+	$interval = $this->data['updateinterval'];
+	$age = $now - $this->data['lastupdate'];
+
+	if ($this->data['contentmd5'] == $contentmd5)
+	{
+		/* updateinterval and nextupdate are INT columns, keep values integral */
+		if ($age > ($interval * 2))
+			$interval = (int)round($interval * 1.5);
+
+		$tags = array("updateinterval" => $interval, "nextupdate" => $now + $interval);
+	}
+	else
+	{
+		if ($age < ($interval / 2))
+			$interval = (int)round($interval / 1.5);
+
+		if ($interval < $this->default_interval)
+			$interval = $this->default_interval;
+
+		$tags = array("updateinterval" => $interval, "lastupdate" => $now, "nextupdate" => $now + $interval, "contentmd5" => $contentmd5, "content" => $content);
+	}
+
+	$this->update($tags);
+}
+
+
+/*
+ * PRIVATE
+ * Open the next URL of $pages which can be opened, starting at index $pos.
+ * $pos (passed by reference) is advanced past every URL that was tried, so
+ * URLs failing to open are skipped instead of blocking a connection slot.
+ * Returns array(file descriptor in non-blocking mode, url) or false if none
+ * of the remaining URLs could be opened.
+ */
+function _open_next_url($pages, &$pos)
+{
+	while ($pos < count($pages))
+	{
+		$url = $pages[$pos++];
+		debug("Opening $url", 10);
+
+		if ($fd = fopen($url, "r"))
+		{
+			socket_set_blocking($fd, false);
+			debug("Opened $url", 10);
+			return array($fd, $url);
+		}
+	}
+
+	return false;
+}
+
+
+/*
+ * PRIVATE
+ * Fetch all URLs in array $pages, at most max_connections of them in
+ * parallel, and update the cache database with the content of every URL
+ * that could be read. $now is the timestamp used for the cache records.
+ */
+function _fetch_urls($pages, $now)
+{
+	$pos = 0;		/* Index of next url in $pages to be opened */
+	$fds = array();		/* Per slot: open file descriptor or 0 if idle */
+	$urls = array();	/* Per slot: url being fetched */
+	$content = array();	/* Per slot: data read so far */
+	$count = 0;		/* Number of slots currently in use */
+
+	/* Occupy up to max_connections slots with URLs that could be opened */
+	while (($count < $this->max_connections) && ($slot = $this->_open_next_url($pages, $pos)))
+	{
+		$fds[] = $slot[0];
+		$urls[] = $slot[1];
+		$content[] = "";
+		$count++;
+	}
+
+	while ($count > 0)
+	{
+		$read = 0;
+
+		for ($i = 0; $i < count($fds); $i++)
+		{
+			if (!$fds[$i])
+				continue;	/* Idle slot */
+
+			if (!feof($fds[$i]))
+			{
+				$data = fread($fds[$i], 4096);
+				$content[$i] .= $data;
+				$read += strlen($data);
+				debug("Read " . strlen($data) . " from $urls[$i]", 10);
+			}
+
+			if (feof($fds[$i]))
+			{
+				fclose($fds[$i]);
+				$fds[$i] = 0;
+				$count--;
+				$this->_update_cache($urls[$i], $content[$i], $now);
+				debug("Finished $urls[$i]: " . strlen($content[$i]) . " bytes", 10);
+
+				/* Reuse the slot for the next url that can be opened */
+				if ($slot = $this->_open_next_url($pages, $pos))
+				{
+					$fds[$i] = $slot[0];
+					$urls[$i] = $slot[1];
+					$content[$i] = "";
+					$count++;
+				}
+			}
+		}
+
+		/* We are in non-blocking mode, be nice */
+		if (($count > 0) && ($read == 0))
+		{
+			sleep(1);
+			debug("Sleeping...", 10);
+		}
+	}
+}
+
+} /* End class it_urlcache */
diff --git a/urlcache/urlcache.php b/urlcache/urlcache.php
new file mode 100644
index 0000000..fb6fd33
--- /dev/null
+++ b/urlcache/urlcache.php
@@ -0,0 +1,28 @@
+#!/www/server/bin/php -q
+<?php
+/*
+** $Id$
+**
+** ITools - the Internet Tools Library
+**
+** Copyright (C) 1995-2003 by the ITools Authors.
+** This program is free software; you can redistribute it and/or
+** modify it under the terms of either the GNU General Public License
+** or the GNU Lesser General Public License, as published by the Free
+** Software Foundation. See http://www.gnu.org/licenses/ for details.
+**
+** urlcache.php - Script to be called from crontab to keep url cache up-to-date
+*/
+
+/*
+ * Abort the run after 4 minutes of execution time.
+ * NOTE(review): presumably chosen to stay below the crontab period so runs
+ * do not overlap - confirm against the actual crontab entry.
+ */
+set_time_limit(4 * 60);
+
+require("itools/itools.lib");
+require("itools/urlcache.lib");
+
+/*
+ * NOTE(review): the e-mail address and the database credentials below are
+ * hard-coded in the source. Consider moving them to a configuration file
+ * which is not under version control.
+ * Presumably the it_debug arguments are debug level and recipient and the
+ * it_db arguments are database, user and password - verify against itools.lib.
+ */
+$it_debug = new it_debug(10, "weber@search.ch");
+$it_db = new it_db("urlcache", "urlcache", "JKhsad34H");
+
+/* Cache records live in table "urlcache" and are keyed by column "url" */
+$table = new it_db_table($it_db, "urlcache");
+$it_urlcache = new it_urlcache($table, "url");
+
+/* Refetch every URL whose next update is due */
+$it_urlcache->update_cache();
+
+?>
diff --git a/urlcache/urlcache.sql b/urlcache/urlcache.sql
new file mode 100644
index 0000000..fcf8659
--- /dev/null
+++ b/urlcache/urlcache.sql
@@ -0,0 +1,14 @@
+-- Table backing the it_urlcache class: one row per cached URL.
+-- WARNING: running this script discards any existing cache content.
+DROP TABLE IF EXISTS urlcache;
+
+CREATE TABLE urlcache
+(
+    -- URL of the cached page
+    url             CHAR(255)  NOT NULL PRIMARY KEY,
+    -- Seconds between two fetches
+    updateinterval  INT        NOT NULL,
+    -- Unix timestamp of the last content change
+    lastupdate      INT        NOT NULL,
+    -- Unix timestamp after which the URL is fetched again
+    nextupdate      INT        NOT NULL,
+    -- MD5 hex digest of content, used to detect changes
+    contentmd5      CHAR(32)   NOT NULL,
+    -- Content as fetched from the URL
+    content         TEXT       NOT NULL,
+
+    -- Used to select the URLs which are due for an update
+    INDEX (nextupdate)
+);