From 9376ddee188693e983df55dc87e427938c1f52af Mon Sep 17 00:00:00 2001 From: Urban Müller Date: Wed, 16 Sep 2015 16:12:38 +0200 Subject: support keepfailed in conjunction with failed preprocessors --- it_url.class | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'it_url.class') diff --git a/it_url.class b/it_url.class index 984f621..33bf4b1 100644 --- a/it_url.class +++ b/it_url.class @@ -702,7 +702,8 @@ function get_cache($p = array()) if (!($result = @filesize($dstpath) && @rename($dstpath, $path))) { @unlink($dstpath); - @unlink($path); + if (!$p['keepfailed']) + @unlink($path); } it_url::_unlock($path, $lock); -- cgit v1.2.3 From 0f9e9ea62699f0d82ab11eb4377eb889857ac3bc Mon Sep 17 00:00:00 2001 From: Urban Müller Date: Wed, 16 Sep 2015 16:57:40 +0200 Subject: correct returncode in keepfailed case --- it_url.class | 1 + 1 file changed, 1 insertion(+) (limited to 'it_url.class') diff --git a/it_url.class b/it_url.class index 33bf4b1..b0320a6 100644 --- a/it_url.class +++ b/it_url.class @@ -704,6 +704,7 @@ function get_cache($p = array()) @unlink($dstpath); if (!$p['keepfailed']) @unlink($path); + $result = file_exists($path); } it_url::_unlock($path, $lock); -- cgit v1.2.3 From 5efecd703cebc19ed1fb08725f243920e57271a0 Mon Sep 17 00:00:00 2001 From: Christian A. Weber Date: Tue, 13 Oct 2015 12:52:46 +0200 Subject: remove ultra-obsolete read_page(), get_description() and get_title() api (introduced for Myax Knowledge Manager) --- it_url.class | 81 +----------------------------------------------------------- 1 file changed, 1 insertion(+), 80 deletions(-) (limited to 'it_url.class') diff --git a/it_url.class b/it_url.class index b0320a6..e58ad14 100644 --- a/it_url.class +++ b/it_url.class @@ -32,13 +32,7 @@ class it_url var $rawurl; /* E.g. HTTP://falcon:joshua@www.Relog.CH.:80/default.asp */ var $user; /* E.g. falcon */ var $pass; /* E.g. joshua */ - - var $page; /* Page or empty */ - var $page_read; /* true if page read */ - var $title; /* Page title or empty */ - var $description; /* Page description or empty */ var $cookies; /* key => values of cookies from server */ - var $headers; /* Headers of page fetched by get() */ var $data; /* Data part, even if return code is not 200 */ var $result; /* Return code of get() */ @@ -48,6 +42,7 @@ class it_url /** * Constructor: canonicalize an URL * @param $url URL this object represents + * @param $options['encoding'] encoding of hostname ('utf-8', 'iso-8859-1' etc.) */ function it_url($url = null, $options = array()) { @@ -122,80 +117,6 @@ function it_url($url = null, $options = array()) } -/** - * Read the page into memory, extract title and description and - * set $this->page, $this->title and $this->description - * @param $timeout Timeout for operation, defaults to unlimited (0) - * @return True if page has been read and $this->page is set - */ -function read_page($timeout = 0) -{ - unset($this->page); - unset($this->title); - unset($this->description); - - /* - ** If the URL does not contain a dot followed by at least one character, - ** it is considered bogus. This prevents 'localhost', 'www', and numerical IP addresses. - */ - if (!preg_match('/\.[a-z]+$/i', $this->realhostname)) - return 0; - - $url = $this->rawurl; - while ($this->page == '') - { - $cmd = 'LANG=C wget 2>&1 -T ' . ((int)$timeout) . ' -q -U "Mozilla/4.0 (Compatible; Relog ITools)" -O - ' . preg_replace("/[ \t]/", '\\ ', escapeshellcmd("$url")); - $this->page = `$cmd`; - - if ($this->page == '') /* An error occurred. Find out what it was. */ - { - $cmd = 'LANG=C wget 2>&1 -T' . ((int)$timeout) . ' -v -U "Mozilla/4.0 (Compatible; Relog ITools)" -O - ' . preg_replace("/[ \t]/", '\\ ', escapeshellcmd($url)); - $error = `$cmd`; - if (preg_match('/Location: ([^ ]*)/i', $error, $regs)) /* Redirect ? */ - { - $url = $regs[1]; - if (!preg_match('/^[a-z]+:/i', $url)) /* Kludge for Miss Kournikova's admirers: grok local redirects (in violation of RFC) */ - $url = $this->rawurl.'/'.$url; - } - else - break; - } - - if (++$count > 4) /* Avoid infinite redirect loops */ - break; - } - - $this->page_read = 1; - - if (preg_match('#([^<]*)#i', $this->page, $regs)) - $this->title = it_htmlentities_decode($regs[1]); - - if (preg_match('/]+content="([^"]*)">/i', $this->page, $regs)) - $this->description = it_htmlentities_decode($regs[1]); - - return ($this->page != ''); -} - - -/* Return the description of this page */ -function get_description() -{ - if (!$this->page_read) - $this->read_page(); - - return $this->description; -} - - -/* Return the title of this page */ -function get_title() -{ - if (!$this->page_read) - $this->read_page(); - - return $this->title; -} - /** * Check if a given url (currently http:port80-only) can be fetched * Note: Redirects are treated as succesful -- cgit v1.2.3 From 65cfeaef1fd38b5494238ea039655c7623d7b4cd Mon Sep 17 00:00:00 2001 From: Urban Müller Date: Fri, 16 Oct 2015 15:46:21 +0200 Subject: more compact getcache debug output --- it_url.class | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'it_url.class') diff --git a/it_url.class b/it_url.class index e58ad14..0f44639 100644 --- a/it_url.class +++ b/it_url.class @@ -647,7 +647,7 @@ function get_cache($p = array()) exec("nohup bash -c 'cd {$p['cachedir']} && sleep 10 && find ?? -mmin +$maxagemin -print0 | xargs -0 -r rm' /dev/null 2>&1 &"); } - EDC('getcache', $result, $path); + ### EDC('getcache', $result, $path); # too verbose return $result ? ($p['returnheaders'] ? array($path, $headers) : $path) : false; } @@ -671,7 +671,7 @@ function _expired($path, $maxage) if ($result = EDC('nocache') ? false : @filemtime($path)) { if (time() - $result > $maxage) - EDC('getcache', "expired", $path); + EDC('getcache', "expired", $maxage, $path); else $result = false; } -- cgit v1.2.3 From 29852dbc1e993bc5115604ab21c253f7ffe6b0d3 Mon Sep 17 00:00:00 2001 From: Urban Müller Date: Mon, 19 Oct 2015 18:20:00 +0200 Subject: optional cache hit rate logging --- it_url.class | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'it_url.class') diff --git a/it_url.class b/it_url.class index 0f44639..a54d1f6 100644 --- a/it_url.class +++ b/it_url.class @@ -628,6 +628,7 @@ function get_cache($p = array()) $result = file_exists($path); } + $cachemiss = 1; it_url::_unlock($path, $lock); } else @@ -647,6 +648,9 @@ function get_cache($p = array()) exec("nohup bash -c 'cd {$p['cachedir']} && sleep 10 && find ?? -mmin +$maxagemin -print0 | xargs -0 -r rm' /dev/null 2>&1 &"); } + if (EDC('getcachelog')) + it::log('debug', 'getcachelog', "miss=" . intval($cachemiss), $p['url']); + ### EDC('getcache', $result, $path); # too verbose return $result ? ($p['returnheaders'] ? array($path, $headers) : $path) : false; } -- cgit v1.2.3 From bb29c3272cfedc6bb5de1a216d587deccb1c8a49 Mon Sep 17 00:00:00 2001 From: Urban Müller Date: Mon, 9 Nov 2015 14:10:05 +0100 Subject: allow delay before retry --- it_url.class | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'it_url.class') diff --git a/it_url.class b/it_url.class index a54d1f6..fb5a1ff 100644 --- a/it_url.class +++ b/it_url.class @@ -156,6 +156,7 @@ function is_reachable($timeout = 5) * @param $p['filemtime'] Add HTTP header to only fetch when newer than this, otherwise return true instead of data * @param $p['data']: POST data array with key-value pairs * @param $p['retries']: Number of retries if download fails, default 1 + * @param $p['retrysleep'] Number of seconds to wait before retry, fractions ok * @return contents of resulting page, considering redirects, excluding headers, or false on error */ function get($p=null, $timeout=5) @@ -190,7 +191,10 @@ function get($p=null, $timeout=5) } if (!$result && $p['retries'] > 0 && $url->result < 400) + { + usleep($p['retrysleep']*1000000); $result = $url->get(array('retries' => $p['retries'] - 1) + $p); + } if (($filter = EDC('res')) && strstr($p['url'], it::replace(array('1' => ":"), $filter))) ED($result); -- cgit v1.2.3