diff options
-rw-r--r-- | it_url.class | 123 |
1 files changed, 98 insertions, 25 deletions
diff --git a/it_url.class b/it_url.class index 7ed0bce..4d3c70b 100644 --- a/it_url.class +++ b/it_url.class @@ -27,6 +27,7 @@ class it_url var $hostname; /* E.g. relog.ch */ var $realhostname; /* E.g. www.relog.ch */ var $port; /* E.g. 80 */ + var $explicitport; /* E.g. 80, explicitly set in rawurl */ var $path; /* E.g. / */ var $rawurl; /* E.g. HTTP://falcon:joshua@www.Relog.CH.:80/default.asp */ var $user; /* E.g. falcon */ @@ -81,14 +82,17 @@ function it_url($url = null, $options = array()) else $pattern = '^([a-z0-9_:\.-]+)/*(.*)$'; + $this->explicitport = ''; if (preg_match("#$pattern#is", $url, $regs)) { list($hostname, $port) = explode(':', $regs[1]); $this->realhostname = strtolower($hostname); - if ($port) + if ($port) { $this->port = intval($port); + $this->explicitport = ":" . $port; + } $url = $regs[2]; } @@ -251,20 +255,16 @@ function get($p=null, $timeout=5) $url = new it_url($p['url']); if ($url->protocol == 'http') - { $result = $url->request($p); - if ($url->headers['Location'] && preg_match('#^(https?://[^/]*)?(/)?(.*)$#i', $url->headers['Location'], $parts) && ($parts[1] != $url->url)) # Handle redirects (supports relative and global) - { - unset($p['url'], $p['headers']['Host']); - $url->it_url($parts[1] ? $parts[1].$parts[2].$parts[3] : $url->protocol.'://'.$url->realhostname.($parts[2] ? $parts[2].$parts[3] : '/'.dirname($url->path).'/'.$parts[3])); - if (++$url->redir <= 4) /* Avoid infinite redirects */ - return $url->get($p); - } - } else + $result = $url->request_curl($p); + + if ($url->headers['Location'] && preg_match('#^(https?://[^/]*)?(/)?(.*)$#i', $url->headers['Location'], $parts) && ($parts[1] != $url->url)) # Handle redirects (supports relative and global) { - $results = self::get_multi(array('urls' => array('one' => $p['url'])) + $p); - $result = $results['one']; + unset($p['url'], $p['headers']['Host']); + $url->it_url($parts[1] ? $parts[1].$parts[2].$parts[3] : $url->protocol.'://'.$url->realhostname.($parts[2] ? $parts[2].$parts[3] : '/'.dirname($url->path).'/'.$parts[3])); + if (++$url->redir <= 4) /* Avoid infinite redirects */ + return $url->get($p); } if (!$result && $p['retries'] > 0 && $url->result < 400) @@ -310,7 +310,7 @@ function request($p=array()) $data = $p['data']; $p['headers'] = (array)$p['headers'] + array( - 'Host' => $url->realhostname . ($url->port != 80 ? ":" . $url->port : ''), + 'Host' => $url->realhostname . $url->explicitport, 'User-Agent' => "Mozilla/5.0 (compatible; MSIE 9.0; ITools)", 'Accept-Language' => $p['headers']['Accept-Language'] ?: T_lang(), # can prevent loading of it_text ); @@ -390,6 +390,86 @@ function request($p=array()) return $result; } +static function curl_opts($p=array()) +{ + $p += array('totaltimeout' => "999999", 'timeout' => 5); + + foreach ($p['headers'] as $header => $value) + $headers[] = "$header: $value"; + + if ($p['maxlength']) { + $maxlength = $p['maxlength']; + $add = [ + #CURLOPT_BUFFERSIZE => 1024 * 1024 * 10, + CURLOPT_NOPROGRESS => false, + CURLOPT_PROGRESSFUNCTION => function ($dummy0, $dummy1, $size, $dummy2, $dummy3) use ($maxlength) { return $size < $maxlength ? 0 : 1; }, + ]; + } + + return (array)$add + [ + CURLOPT_HEADER => false, + CURLOPT_RETURNTRANSFER => true, + CURLOPT_TIMEOUT => $p['totaltimeout'], + CURLOPT_LOW_SPEED_LIMIT => 5, + CURLOPT_LOW_SPEED_TIME => $p['timeout'], + CURLOPT_FOLLOWLOCATION => false, + CURLOPT_HTTPHEADER => $headers, + CURLOPT_SSL_VERIFYPEER => 0, + CURLOPT_SSL_VERIFYHOST => 0, + CURLINFO_HEADER_OUT => 1, + ]; +} + +/* + * drop in replacement for request using curl + * + * todo: + * @param $p['filemtime'] Add HTTP header to only fetch when newer than this, otherwise return true instead of data + * @param $p['data'] POST data array with key-value pairs + * @param $p['method'] different HTTP method +*/ + +function request_curl($p=array()) +{ + $url = $this; + if ($p['url']) + $this->it_url($p['url']); + + + $p['headers'] = (array)$p['headers'] + array( + 'Host' => $url->realhostname . $url->explicitport, + 'User-Agent' => "Mozilla/4.0 (compatible; MSIE 7.0; ITools)", + 'Accept-Language' => T_lang(), + ); + + $opts = [CURLOPT_FOLLOWLOCATION => false, CURLOPT_HEADER => 1] + self::curl_opts($p); + + $curl = curl_init($url->rawurl); + curl_setopt_array($curl, $opts); + + $got = curl_exec($curl); + + EDC('curlinfo', curl_getinfo($curl)); + + if ($got === false && $p['safety'] == 1) + it::error(array('title' => "problem getting $url->url with curl: " . curl_error($curl)) + (array)$p['it_error']); + + if ($got) { + list($url->header, $url->data) = explode("\r\n\r\n", $got, 2); + + $url->parse_http_header($url->header); + + if ($p['maxlength'] && (strlen($this->data) > $p['maxlength'])) + $result = false; + else + $result =& $url->data; + } else + $result = false; + + return $result; +} + + /** * Get multiple URL in parallel with timeout. Needs to be called statically @@ -403,22 +483,15 @@ function request($p=array()) */ function get_multi($p=null) { - $p += array('totaltimeout' => "999999", 'timeout' => 5, 'retries' => 1); + $p += array('retries' => 1); + $p['headers'] = (array)$p['headers'] + array( 'User-Agent' => "Mozilla/4.0 (compatible; MSIE 7.0; ITools)", 'Accept-Language' => T_lang(), ); - foreach ($p['headers'] as $header => $value) - $headers[] = "$header: $value"; - $opts = array( - CURLOPT_HEADER => false, - CURLOPT_RETURNTRANSFER => true, - CURLOPT_TIMEOUT => $p['totaltimeout'], - CURLOPT_LOW_SPEED_LIMIT => 5, - CURLOPT_LOW_SPEED_TIME => $p['timeout'], - CURLOPT_FOLLOWLOCATION => true, - CURLOPT_HTTPHEADER => $headers, - ); + + $opts = [CURLOPT_FOLLOWLOCATION => true] + self::curl_opts($p); + $mh = curl_multi_init(); $urls = array(); |