rawurl = $url;
if (eregi('^([a-z]+):/+(.*)$', $url, $regs))
{
$this->protocol = strtolower($regs[1]);
$url = $regs[2];
}
else if (ereg('^[a-z]:', $url) || ereg('^/', $url))
{
$this->protocol = 'file';
}
else
$this->protocol = 'http';
/* Default port */
if ($this->protocol == 'http')
$protoport = 80;
else if ($this->protocol == 'https')
$protoport = 443;
$this->port = $protoport;
if (class_exists('Net_IDNA'))
$idn = Net_IDNA::getInstance();
if ($idn)
$pattern = '^([^/]+)/*(.*)$';
else
$pattern = '^([a-z0-9_:\.-]+)/*(.*)$';
if (eregi($pattern, $url, $regs))
{
list($hostname, $port) = explode(':', $regs[1]);
$this->realhostname = strtolower($hostname);
if ($port)
$this->port = $port;
$url = $regs[2];
}
if (ereg('^www\.(.*)$', $this->realhostname, $regs))
$this->hostname = $regs[1];
else
$this->hostname = $this->realhostname;
$index_files = array('index.html', 'index.htm', 'index.phtml', 'index.shtml', 'index.php3', 'index.php', 'default.asp');
for ($i = 0; $i < count($index_files); $i++)
{
$url = eregi_replace("^$index_files[$i]\$", '', $url);
$url = eregi_replace("/$index_files[$i]\$", '', $url);
}
$this->path = ereg_replace('^/$', '', $url);
if ($this->port != $protoport)
$this->url = "$this->protocol://$this->realhostname:$this->port/$this->path";
else
$this->url = "$this->protocol://$this->realhostname/$this->path";
if ($idn)
{
$realhostname = $this->realhostname;
if (!preg_match('/^utf-?8$/i', $options['encoding']))
$realhostname = utf8_encode($realhostname);
$encoded = $idn->encode($realhostname);
if ($encoded != $realhostname)
$this->realhostname = $encoded;
}
}
/**
* Read the page into memory, extract title and description and
* set $this->page, $this->title and $this->description.
*
* The page is fetched by shelling out to wget. If the quiet run produces no
* output, wget is run again verbosely and its output is searched for a
* "Location:" header so that the redirect can be followed.
*
* NOTE(review): this method relies on ereg()/eregi()/ereg_replace(), which were
* removed in PHP 7.0.
*
* @param $timeout Timeout for operation, defaults to unlimited (0)
* @return True if page has been read and $this->page is set; 0 if the host name
*         fails the plausibility check below
*/
function read_page($timeout = 0)
{
/* Forget the results of any earlier call */
unset($this->page);
unset($this->title);
unset($this->description);
/*
** If the host name does not end in a dot followed by at least one letter,
** it is considered bogus. This prevents 'localhost', 'www', and numerical IP addresses.
*/
if (!eregi('\.[a-z]+$', $this->realhostname))
return 0;
$url = $this->rawurl;
/* Each pass either fills $this->page, follows one redirect, or gives up */
while ($this->page == '')
{
/* Quiet run (-q): stderr is merged into the captured output, body goes to stdout (-O -) */
$cmd = 'LANG=C wget 2>&1 -T ' . ((int)$timeout) . ' -q -U "Mozilla/4.0 (Compatible; Relog ITools)" -O - ' . ereg_replace("[ \t]", '\\ ', escapeshellcmd("$url"));
$this->page = `$cmd`;
if ($this->page == '') /* An error occurred. Find out what it was. */
{
/* Verbose run (-v) of the same request; its output is only used to look for a redirect */
$cmd = 'LANG=C wget 2>&1 -T' . ((int)$timeout) . ' -v -U "Mozilla/4.0 (Compatible; Relog ITools)" -O - ' . ereg_replace("[ \t]", '\\ ', escapeshellcmd($url));
$error = `$cmd`;
if (eregi('Location: ([^ ]*)', $error, $regs)) /* Redirect ? */
{
$url = $regs[1];
if (!eregi('^[a-z]+:', $url)) /* Kludge for Miss Kournikova's admirers: grok local redirects (in violation of RFC) */
$url = $this->rawurl.'/'.$url;
}
else
break;
}
/* NOTE(review): $count is never initialised in this method; the first ++ starts it from an unset value */
if (++$count > 4) /* Avoid infinite redirect loops */
break;
}
/* Set regardless of outcome, so get_title()/get_description() do not fetch again */
$this->page_read = 1;
/*
** NOTE(review): the pattern below starts with a line break and the one after it
** starts with ']'. Both look as if markup was stripped from the literals
** (presumably a title tag and a description meta tag) - verify against upstream.
*/
if (eregi('
([^<]*)', $this->page, $regs))
$this->title = it_htmlentities_decode($regs[1]);
if (eregi(']+content="([^"]*)">', $this->page, $regs))
$this->description = it_htmlentities_decode($regs[1]);
return ($this->page != '');
}
/* Description of this page; the page is fetched first if that has not happened yet */
function get_description()
{
    $already_loaded = $this->page_read;
    if (!$already_loaded)
    {
        $this->read_page();
    }

    return $this->description;
}
/* Title of this page; the page is fetched first if that has not happened yet */
function get_title()
{
    if ($this->page_read)
    {
        return $this->title;
    }

    $this->read_page();
    return $this->title;
}
/**
 * Check if a given url (currently http:port80-only) can be fetched.
 *
 * Opens a socket to the host, sends a minimal HTTP/1.0 GET for the path and
 * inspects the status line of the reply. Status codes starting with 2 or 3
 * count as reachable, so redirects are treated as successful.
 *
 * @param $timeout Timeout for the connection attempt in seconds
 * @return true if url could be fetched, false otherwise
 */
function is_reachable($timeout = 5)
{
    $result = false;

    if ($fp = @fsockopen($this->realhostname, $this->port, $errno, $errstr, $timeout))
    {
        fputs($fp, "GET /$this->path HTTP/1.0\r\nHost: $this->realhostname\r\nUser-Agent: ITools\r\n\r\n");
        $line = fgets($fp, 128);
        fclose($fp);

        # fgets() yields false when the peer closes without answering
        if ($line !== false)
        {
            # Status line looks like "HTTP/1.0 200 OK"; the protocol name is matched
            # case-insensitively. preg_* replaces eregi(), which no longer exists in PHP 7+.
            $pattern = '#^' . preg_quote($this->protocol, '#') . '/[^ ]+ +[23]#i';
            $result = (preg_match($pattern, $line) === 1);
        }
    }

    return $result;
}
/**
* Get simple URL with timeout. Can be called statically
*
* Only URLs whose protocol is 'http' are fetched; anything else returns false.
* Response headers are collected in ->headers, the status code in ->result and
* the body in ->data of the it_url object used for the request.
*
* @param $p Either a URL string or a parameter array with the following keys
* - url: url to get, defaults to constructor URL
* - timeout: timeout per read in milliseconds, defaults to $timeout
* - data: post data, either an array with key-value pairs or a ready-made string
* @param $timeout Timeout in milliseconds used when $p carries no 'timeout' key, defaults to 5000
* @return contents of resulting page, considering redirects, excluding headers, or false on error
*         (a status code of 400 or above also yields false)
*/
function get($p=null, $timeout=5000)
{
# Accept a bare URL in place of the parameter array
if (!is_array($p))
$p = array('url' => $p);
if (!isset($p['timeout']))
$p['timeout'] = $timeout;
if ($p['url'])
$url = new it_url($p['url']);
else
$url =& $this; # Must be reference for $url->result and $url->data to work
# Reset per-request state on the object
$url->result = $result = false;
unset($url->data);
$url->headers = array();
if ($url->protocol == 'http')
{
# fsockopen() takes seconds, $p['timeout'] is in milliseconds
if ($fp = @fsockopen($url->realhostname, $url->port, $errno, $errstr, $p['timeout']/1000))
{
# urlencode data pairs if is array
if (is_array($p['data']))
{
$data_pairs = array();
foreach ($p['data'] as $key => $value)
$data_pairs[] = "$key=".urlencode($value);
$p['data'] = implode('&', $data_pairs);
}
# $data holds the extra POST headers, a blank line and the body; it is appended right after the User-Agent header below
$data = !empty($p['data']) ? "\r\nContent-Type: application/x-www-form-urlencoded\r\nContent-Length: " . strlen($p['data']) . "\r\n\r\n" . $p['data'] : "";
# Read timeout split into whole seconds and remaining microseconds
stream_set_timeout($fp, intval($p['timeout']/1000), ($p['timeout']%1000)*1000);
# Empty $data means GET, otherwise POST. Note that "\r\n\r\n" is written after $data in both cases
@fputs($fp, (empty($data)?'GET':'POST') . " /$url->path HTTP/1.0\r\nHost: $url->realhostname\r\nUser-Agent: Mozilla/4.0 (compatible; MSIE 6.0; ITools)$data\r\n\r\n");
# Header loop: stops at EOF, on a failed read, or at the first line that is empty after trim()
while (!feof($fp) && ($line = @fgets($fp, 10240)) && ($line = trim($line)))
{
if (preg_match('#^(HTTP\S+)\s(\d+)#', $line, $parts)) # Parse result code
$url->headers[$parts[1]] = $url->result = $parts[2];
elseif (preg_match('#^Location: (https?://[^/]*)?(/)?(.*)$#', $line, $parts) && ($parts[1] != $url->url)) # Handle redirects (supports relative and global)
{
# Drop the explicit URL so the recursive call works on $url itself
unset($p['url']);
# Re-initialise $url for the redirect target: absolute target as given, host-relative target
# on the current host, anything else relative to the directory of the current path.
# NOTE(review): it_url() is presumably the PHP4-style constructor defined outside this view - confirm.
# NOTE(review): the socket $fp is not closed on this path and nothing visible here limits the number of redirects followed.
$url->it_url($parts[1] ? $parts[1].$parts[2].$parts[3] : $url->protocol.'://'.$url->realhostname.($parts[2] ? $parts[2].$parts[3] : '/'.dirname($url->path).'/'.$parts[3]));
return $url->get($p);
}
elseif (preg_match('#^([^:]+): (.*)$#', $line, $parts))
$url->headers[$parts[1]] = $parts[2];
}
# A status line was seen: read the remainder of the stream as body
if ($url->result)
{
while (!feof($fp))
$url->data .= @fread($fp, 20480);
# Body is always kept in $url->data but only returned for status codes below 400
if ($url->result < 400)
$result =& $url->data;
}
@fclose($fp);
}
}
return $result;
}
/**
 * Build the local file name under which a URL is cached.
 *
 * The name is <cachedir>/<first two hex digits of md5(url)>/<md5(url)>.
 *
 * @param $p Either the URL itself or an array with named args:
 * - url: remote url to get
 * - cachedir: path to cache directory, defaults to $GLOBALS['ULTRAHOME']/var/urlcache
 * @return Path of the cache file for that URL
 */
function get_cache_filename($p)
{
    $args = is_array($p) ? $p : array('url' => $p);
    $defaults = array('cachedir' => $GLOBALS['ULTRAHOME'] . "/var/urlcache");
    $args = $args + $defaults; # keys already present in $args win

    $hash = md5($args['url']);
    $bucket = substr($hash, 0, 2); # two-character subdirectory spreads files over 256 directories

    return $args['cachedir'] . "/" . $bucket . "/" . $hash;
}
/**
* Store contents of url in a file and return file name. Provides locking. Call statically.
* Requires www writeable var/urlcache in your service dir. Params in assoc array:
* @url url to get
* @timeout timeout in milliseconds, default 10000
* @maxage maximum age of cache entries in seconds, default 86400
* @cleanbefore maximum daytime when attempting cleanup, default 7200
* @preprocess callback function (or array for methods) to change received file; called with source path and destination path
* @safety value 0 means dont generate alert, value 1 means generate alerts on timeouts and failures
* @keepfailed keep old versions of files if download fails (sending alerts conservatively)
* @cachedir directory to store cache files in. NO TRAILING SLASH
* @return Name of the cache file (of the preprocessed copy if @preprocess is given), or a false-ish value on failure
*
* NOTE(review): EDC() is defined outside this view; here EDC('nocache') switches the
* fopen() mode from "x" to "w" and EDC('getcache', ...) presumably emits debug output - confirm.
*/
function get_cache($p = array())
{
$p += array('timeout'=>10000, 'maxage'=>86400, 'cleanbefore'=>7200, 'safety'=>1, 'cachedir'=>$GLOBALS['ULTRAHOME']."/var/urlcache");
$path = it_url::get_cache_filename($p);
# Creates only the last path component (the two-character bucket); failures are silenced
@mkdir(dirname($path));
# If the file is missing, filemtime() fails and $age becomes the current timestamp
$age = time() - @filemtime($path);
# expire forgotten locks
$lockmtime = @filemtime("$path.lock");
if ($lockmtime && time()-$lockmtime > 30)
@unlink("$path.lock");
# keepfailed mode: an outdated entry is refreshed in place by whoever manages to create the lock file ("x" fails if it exists)
if ($p['keepfailed'] && ($age>$p['maxage']) && ($dummy = @fopen("$path.lock", EDC('nocache') ? "w" : "x"))) # update our copy if we get the lock
{
# my job to refresh the cache entry
fclose($dummy);
EDC('getcache', "refresh", $p['url'], $path);
if (($result = it_url::get($p['url'], $p['timeout'])))
it_url::_atomicwrite($path, $result);
else
touch($path); # download failed: keep the old content but reset its age
$parts = parse_url($p['url']);
# Reported on success as well as failure; 'ok' carries the outcome
if ($p['safety'] == 1)
it::error(array('title'=>"get_cache: download failures on $path", 'ok_key'=>md5($parts['host']), 'ok_delay'=>$p['maxage'], 'ok'=>$result ? 1 : 0)); # send err only if multi failure
@unlink("$path.lock");
}
# Remove ancient lock or cached file if it is too old
if (!$p['keepfailed'])
it_url::_expire($path, $p['maxage']);
# Exclusive create doubles as the lock: whoever creates the (empty) file downloads, everybody else waits for it to fill
if ($dummy = @fopen($path, EDC('nocache') ? "w" : "x")) # succeeds if file is missing (or we are in nocache mode)
{
# fill cache myself
fclose($dummy);
EDC('getcache', "new", $p['url'], $path);
$result = it_url::_atomicwrite($path, it_url::get($p['url'], $p['timeout']));
}
else
{
# get file from cache, potentially waiting if file is currently being transferred
EDC('getcache', "old", $p['url'], $path);
$result = it_url::_waitforpath($p + array('path' => $path));
}
# Optional second stage: a processed copy is cached next to the raw file using the same create-or-wait scheme
if ($result && $p['preprocess'])
{
$srcpath = $path;
# Processed file name = raw file name plus two hex digits derived from the callback
$path .= substr(md5(serialize($p['preprocess'])), 0, 2);
it_url::_expire($path, $p['maxage']);
if ($dummy = @fopen($path, EDC('nocache') ? "w" : "x")) # in nocache mode, always succeed
{
fclose($dummy);
EDC('getcache', "process", $p['url'], $path);
$dstpath = "$path.preprocesstmp";
call_user_func($p['preprocess'], $srcpath, $dstpath);
# An empty or missing output, or a failed rename, counts as failure and removes both files
if (!@filesize($dstpath) || !@rename($dstpath, $path))
{
@unlink($dstpath);
@unlink($path);
$result = false;
}
else
$result = $path;
}
else
$result = it_url::_waitforpath($p + array('path' => $path));
if ($result)
{
EDC('getcache', "processold", $p['url'], $path);
touch($result, filemtime($srcpath)); # Ensure processed is never newer than src
}
}
# cache cleanup at night
# Runs when the time of day (time() % 86400) is below cleanbefore seconds and the "cleaned" marker is older than 80000 seconds
if ((time()%86400 < $p['cleanbefore']) && (time()-@filemtime($p['cachedir'] . "/cleaned") > 80000))
{
touch($p['cachedir'] . "/cleaned");
$maxagemin = intval($p['maxage']/60);
# Background job deleting everything under cachedir that is older than maxage.
# NOTE(review): cachedir is inserted into the shell command unescaped, and "/dev/null" is passed
# as a plain argument here - redirection operators may have been lost from this literal; verify against upstream.
exec("nohup bash -c 'cd {$p['cachedir']} && sleep 10 && find -mmin +$maxagemin -print0 | xargs -0 -r rm' /dev/null 2>&1 &");
}
return $result;
}
/**
 * Wait until the file at $p['path'] has a non-zero size or the timeout elapses.
 *
 * Used while another process is still filling a cache file. Keys of $p:
 * - path: file to wait for
 * - timeout: total time to wait in milliseconds
 * - sleeptime: pause between checks in milliseconds, default 100
 * - safety: if 1, a timeout is reported through it::error()
 * - url: only used in the timeout message
 *
 * @param $p Assoc array as described above
 * @return $p['path'] once the file has content, null on timeout
 */
function _waitforpath($p)
{
    $p += array('sleeptime' => 100); # millisecs to wait

    # Explicit default: the timeout path used to return an undefined variable
    $result = null;

    # wait until cache is ready, then read from cache
    $maxpasses = $p['timeout'] / $p['sleeptime'];
    for ($passes = 0; (@filesize($p['path']) <= 0) && ($passes < $maxpasses); ++$passes)
    {
        usleep($p['sleeptime'] * 1000);
        clearstatcache(); # filesize() results are cached per request
    }

    if ($passes < $maxpasses)
        $result = $p['path'];
    else if (isset($p['safety']) && $p['safety'] == 1) # 'safety' is optional for direct callers
        it::error("timeout in it_url::get_cache(): url={$p['url']}, passes=$passes, maxpasses=$maxpasses, path={$p['path']}");

    return $result;
}
/**
 * Replace the file at $path with $data in one step, or remove it.
 *
 * The data is written to a temporary file in the same directory, which is
 * then renamed over $path, so readers never see a partially written file.
 *
 * @param $path Destination file
 * @param $data Content to store; false means "no content" and removes $path instead
 * @return $path if the content was stored, null if $data was false or writing failed
 */
function _atomicwrite($path, $data)
{
    # Explicit default: the failure paths used to return an undefined variable
    $result = null;

    if ($data === false)
    {
        if (file_exists($path))
            unlink($path);
        return $result;
    }

    $tmpname = tempnam(dirname($path), "writetmp");
    if ($tmpname === false)
        return $result;

    $written = false;
    if ($cachetmp = fopen($tmpname, "w"))
    {
        $written = (fputs($cachetmp, $data) !== false);
        $written = fclose($cachetmp) && $written;
    }

    if ($written)
    {
        chmod($tmpname, 0664); # tempnam() creates the file with owner-only permissions
        if (rename($tmpname, $path))
            $result = $path;
    }

    # Do not leave the temporary file behind when anything went wrong
    if ($result === null)
        @unlink($tmpname);

    return $result;
}
/**
 * Delete $path if it has outlived its usefulness.
 *
 * A file is removed when it is older than $maxage seconds, or when it is
 * empty and older than 30 seconds (an abandoned placeholder).
 *
 * @param $path File to check
 * @param $maxage Maximum age in seconds
 */
function _expire($path, $maxage)
{
    if (!file_exists($path))
        return;

    $is_stale_placeholder = (@filesize($path) == 0) && ((time() - @filemtime($path)) > 30);
    $is_too_old = (time() - @filemtime($path)) > $maxage;

    if ($is_stale_placeholder || $is_too_old)
    {
        EDC('getcache', "expire", $path);
        @unlink($path);
    }
}
/**
 * Make an URL absolute by using host and protocol from current Apache request (but not port number)
 *
 * - http://... and https://... URLs are returned unchanged
 * - //host/path gets the protocol of the current request
 * - /path gets protocol and host of the current request
 * - anything else is taken relative to the directory of $_SERVER['PHP_SELF']
 *
 * @param $url Optional URL ( foo.html, /foo.html, //host/bar.html, http://host/bar.html ), default self
 * @return absolute version of URL ( http[s]://host/bar.html )
 */
function absolute($url=null)
{
    if (!isset($url))
        $url = $_SERVER['PHP_SELF'];

    # Already absolute. Requiring "://" keeps relative names like "http-errors.html" relative
    if (preg_match('#^https?://#i', $url))
        return $url;

    # Only a leading "//" marks a protocol-relative URL; "//" further in (e.g. in a query string) does not
    if (!preg_match('#^//#', $url))
    {
        if (!preg_match('#^/#', $url))
        {
            # Directory of the running script, including the trailing slash
            $dir = preg_replace('#/[^/]*$#D', '/', $_SERVER['PHP_SELF']);
            $url = "$dir$url";
        }
        $url = "//" . $_SERVER['HTTP_HOST'] . $url;
    }

    # Some servers set HTTPS to "off" for plain requests, so isset() alone is not enough
    $secure = !empty($_SERVER['HTTPS']) && (strtolower($_SERVER['HTTPS']) != 'off');

    return "http" . ($secure ? 's' : '') . ":$url";
}
/**
* Craft a valid redirect URL, send Location: header and terminate execution
*
* When EDC('noredir') is set, no header is sent; the HTML-escaped URL is printed instead.
*
* @param $url Optional URL ( foo.html, /foo.html, //host/bar.html, http://host/bar.html ), default self
* @return This method never returns.
*/
function redirect($url = null)
{
# NOTE(review): EDC() is defined outside this view; presumably a debug switch - confirm.
# NOTE(review): the echo below starts with an empty string literal and ends with a bare line break,
# which suggests markup (presumably a link around the URL) was lost from it; verify against upstream.
if (EDC('noredir'))
echo "" . htmlspecialchars($url) . "
";
else
header('Location: '.preg_replace("/[\r\n].*/", '', it_url::absolute($url))); # Security: cut after CR/LF
# Terminates the script in both branches
exit;
}
/**
 * Urlencode a string, but keep comma and parentheses readable
 *
 * @param $str String to encode
 * @return urlencode()d string in which "," "(" and ")" appear literally
 */
function encode($str)
{
    $escaped = array("%2C", "%28", "%29");
    $literal = array(",", "(", ")");

    return str_replace($escaped, $literal, urlencode($str));
}
}
?>