summary | refs | log | tree | commit | diff
path: root/it_url.class
diff options
context:
space:
mode:
Diffstat (limited to 'it_url.class')
-rw-r--r-- it_url.class 123
1 files changed, 98 insertions, 25 deletions
diff --git a/it_url.class b/it_url.class
index 7ed0bce..4d3c70b 100644
--- a/it_url.class
+++ b/it_url.class
@@ -27,6 +27,7 @@ class it_url
var $hostname; /* E.g. relog.ch */
var $realhostname; /* E.g. www.relog.ch */
var $port; /* E.g. 80 */
+ var $explicitport; /* E.g. 80, explicitly set in rawurl */
var $path; /* E.g. / */
var $rawurl; /* E.g. HTTP://falcon:joshua@www.Relog.CH.:80/default.asp */
var $user; /* E.g. falcon */
@@ -81,14 +82,17 @@ function it_url($url = null, $options = array())
else
$pattern = '^([a-z0-9_:\.-]+)/*(.*)$';
+ $this->explicitport = '';
if (preg_match("#$pattern#is", $url, $regs))
{
list($hostname, $port) = explode(':', $regs[1]);
$this->realhostname = strtolower($hostname);
- if ($port)
+ if ($port) {
$this->port = intval($port);
+ $this->explicitport = ":" . $port;
+ }
$url = $regs[2];
}
@@ -251,20 +255,16 @@ function get($p=null, $timeout=5)
$url = new it_url($p['url']);
if ($url->protocol == 'http')
- {
$result = $url->request($p);
- if ($url->headers['Location'] && preg_match('#^(https?://[^/]*)?(/)?(.*)$#i', $url->headers['Location'], $parts) && ($parts[1] != $url->url)) # Handle redirects (supports relative and global)
- {
- unset($p['url'], $p['headers']['Host']);
- $url->it_url($parts[1] ? $parts[1].$parts[2].$parts[3] : $url->protocol.'://'.$url->realhostname.($parts[2] ? $parts[2].$parts[3] : '/'.dirname($url->path).'/'.$parts[3]));
- if (++$url->redir <= 4) /* Avoid infinite redirects */
- return $url->get($p);
- }
- }
else
+ $result = $url->request_curl($p);
+
+ if ($url->headers['Location'] && preg_match('#^(https?://[^/]*)?(/)?(.*)$#i', $url->headers['Location'], $parts) && ($parts[1] != $url->url)) # Handle redirects (supports relative and global)
{
- $results = self::get_multi(array('urls' => array('one' => $p['url'])) + $p);
- $result = $results['one'];
+ unset($p['url'], $p['headers']['Host']);
+ $url->it_url($parts[1] ? $parts[1].$parts[2].$parts[3] : $url->protocol.'://'.$url->realhostname.($parts[2] ? $parts[2].$parts[3] : '/'.dirname($url->path).'/'.$parts[3]));
+ if (++$url->redir <= 4) /* Avoid infinite redirects */
+ return $url->get($p);
}
if (!$result && $p['retries'] > 0 && $url->result < 400)
@@ -310,7 +310,7 @@ function request($p=array())
$data = $p['data'];
$p['headers'] = (array)$p['headers'] + array(
- 'Host' => $url->realhostname . ($url->port != 80 ? ":" . $url->port : ''),
+ 'Host' => $url->realhostname . $url->explicitport,
'User-Agent' => "Mozilla/5.0 (compatible; MSIE 9.0; ITools)",
'Accept-Language' => $p['headers']['Accept-Language'] ?: T_lang(), # can prevent loading of it_text
);
@@ -390,6 +390,86 @@ function request($p=array())
return $result;
}
+/**
+ * Build the cURL option array used by the curl based request functions
+ *
+ * @param $p['headers'] Array of header name => value pairs to send (optional)
+ * @param $p['maxlength'] Abort the transfer once this many bytes have been downloaded (optional)
+ * @param $p['timeout'] Abort if slower than 5 bytes/s for this many seconds, default 5
+ * @param $p['totaltimeout'] Maximum number of seconds for the whole transfer, default 999999
+ * @return Array of curl option => value, ready for curl_setopt_array()
+ */
+static function curl_opts($p=array())
+{
+    $p += array('totaltimeout' => "999999", 'timeout' => 5);
+
+    # Always hand curl a real array: a null CURLOPT_HTTPHEADER makes
+    # curl_setopt_array() fail and skip all options that follow it
+    $headers = array();
+    foreach ((array)$p['headers'] as $header => $value)
+        $headers[] = "$header: $value";
+
+    $add = array();
+    if ($p['maxlength']) {
+        $maxlength = $p['maxlength'];
+        $add = [
+            #CURLOPT_BUFFERSIZE => 1024 * 1024 * 10,
+            CURLOPT_NOPROGRESS => false, # progress callback is only invoked when this is off
+            CURLOPT_PROGRESSFUNCTION => function ($dummy0, $dummy1, $size, $dummy2, $dummy3) use ($maxlength) { return $size < $maxlength ? 0 : 1; }, # non-zero return aborts the transfer
+        ];
+    }
+
+    # Entries in $add win over the defaults below since + keeps the keys of the left operand
+    return $add + [
+        CURLOPT_HEADER => false,
+        CURLOPT_RETURNTRANSFER => true,
+        CURLOPT_TIMEOUT => $p['totaltimeout'],
+        CURLOPT_LOW_SPEED_LIMIT => 5,
+        CURLOPT_LOW_SPEED_TIME => $p['timeout'],
+        CURLOPT_FOLLOWLOCATION => false,
+        CURLOPT_HTTPHEADER => $headers,
+        # NOTE(review): certificate and host name verification are switched off for https - confirm this is intended
+        CURLOPT_SSL_VERIFYPEER => 0,
+        CURLOPT_SSL_VERIFYHOST => 0,
+        CURLINFO_HEADER_OUT => 1,
+    ];
+}
+
+/**
+ * Drop in replacement for request() using curl
+ *
+ * @param $p['url'] URL to fetch; if given, this object is re-initialized with it
+ * @param $p['headers'] Array of header name => value pairs, take precedence over the default Host, User-Agent and Accept-Language
+ * @param $p['maxlength'] Maximum number of body bytes accepted, longer responses return false
+ * @param $p['timeout'] Passed on to curl_opts()
+ * @param $p['totaltimeout'] Passed on to curl_opts()
+ * @param $p['safety'] If 1, report a failed transfer via it::error()
+ * @param $p['it_error'] Additional arguments for the it::error() call
+ * @return Response body, or false if the transfer failed, was empty or exceeded maxlength
+ *
+ * todo (not supported here yet):
+ * @param $p['filemtime'] Add HTTP header to only fetch when newer than this, otherwise return true instead of data
+ * @param $p['data'] POST data array with key-value pairs
+ * @param $p['method'] different HTTP method
+ */
+function request_curl($p=array())
+{
+    $url = $this;
+    if ($p['url'])
+        $this->it_url($p['url']);
+
+    $p['headers'] = (array)$p['headers'] + array(
+        'Host' => $url->realhostname . $url->explicitport,
+        'User-Agent' => "Mozilla/4.0 (compatible; MSIE 7.0; ITools)",
+        'Accept-Language' => $p['headers']['Accept-Language'] ?: T_lang(), # can prevent loading of it_text
+    );
+
+    # Response headers are needed in the output so they can be parsed below
+    $opts = [CURLOPT_FOLLOWLOCATION => false, CURLOPT_HEADER => 1] + self::curl_opts($p);
+
+    $curl = curl_init($url->rawurl);
+    curl_setopt_array($curl, $opts);
+
+    $got = curl_exec($curl);
+    $error = curl_error($curl); # fetch before the handle is closed
+
+    EDC('curlinfo', curl_getinfo($curl));
+    curl_close($curl);
+
+    if ($got === false && $p['safety'] == 1)
+        it::error(array('title' => "problem getting $url->url with curl: " . $error) + (array)$p['it_error']);
+
+    if (!$got)
+        return false;
+
+    # First blank line separates header block from body; pad so a response without body does not raise a notice
+    list($url->header, $url->data) = array_pad(explode("\r\n\r\n", $got, 2), 2, null);
+    $url->parse_http_header($url->header);
+
+    if ($p['maxlength'] && (strlen($url->data) > $p['maxlength']))
+        return false;
+
+    return $url->data;
+}
+
+
/**
* Get multiple URL in parallel with timeout. Needs to be called statically
@@ -403,22 +483,15 @@ function request($p=array())
*/
function get_multi($p=null)
{
- $p += array('totaltimeout' => "999999", 'timeout' => 5, 'retries' => 1);
+ $p += array('retries' => 1);
+
$p['headers'] = (array)$p['headers'] + array(
'User-Agent' => "Mozilla/4.0 (compatible; MSIE 7.0; ITools)",
'Accept-Language' => T_lang(),
);
- foreach ($p['headers'] as $header => $value)
- $headers[] = "$header: $value";
- $opts = array(
- CURLOPT_HEADER => false,
- CURLOPT_RETURNTRANSFER => true,
- CURLOPT_TIMEOUT => $p['totaltimeout'],
- CURLOPT_LOW_SPEED_LIMIT => 5,
- CURLOPT_LOW_SPEED_TIME => $p['timeout'],
- CURLOPT_FOLLOWLOCATION => true,
- CURLOPT_HTTPHEADER => $headers,
- );
+
+ $opts = [CURLOPT_FOLLOWLOCATION => true] + self::curl_opts($p);
+
$mh = curl_multi_init();
$urls = array();