diff options
-rw-r--r-- | it.class | 74 | ||||
-rw-r--r-- | it_dbi.class | 4 | ||||
-rw-r--r-- | it_html.class | 38 | ||||
-rw-r--r-- | it_xml.class | 11 | ||||
-rw-r--r-- | itjs.class | 23 | ||||
-rw-r--r-- | itjs.php | 10 | ||||
-rw-r--r-- | itjs/it.js | 18 | ||||
-rwxr-xr-x | tests/exec.t | 2 | ||||
-rwxr-xr-x | tests/getopt.t | 2 | ||||
-rwxr-xr-x | tests/it.t | 335 | ||||
-rwxr-xr-x | tests/it_html.t | 59 | ||||
-rwxr-xr-x | tests/it_url.t | 24 | ||||
-rwxr-xr-x | tests/it_xml.t | 35 |
13 files changed, 423 insertions, 212 deletions
@@ -123,6 +123,7 @@ static function timerlog($label = '') * @param $p['graceperiod'] number of seconds within which additional errors are ignored if id is set * @param $p['timewindow'] number of seconds after graceperiod within which the second error must occur if id is set * @param $p['backtraceskip'] number of stack levels to drop + * @param $p['skipfiles'] files to skip in backtrace * @param $p['blockmail'] number of seconds to block mails after having sent a mail [3600] * @param $p['blockmailid'] block mail for $p['blockmail'] seconds with same id. Default: $p['to'] * @param $p['omitdebuginfo'] Do not add stack dump, locals and environment to output [false] @@ -193,7 +194,7 @@ static function error($p = array(), $body = null, $to = null) # $body and $to de if ($toscreen || $sendmail) { - $trace = it_debug::backtrace($p['backtraceskip']); # moved in here for performance in mass error case + $trace = it_debug::backtrace(array('skiplevels' => $p['backtraceskip'], 'skipfiles' => $p['skipfiles'])); # moved in here for performance in mass error case if (strlen($p['body']) > 500000) { @@ -230,7 +231,7 @@ static function error($p = array(), $body = null, $to = null) # $body and $to de it::mail(array('To' => $p['to'], 'Subject' => substr($p['title'], 0, 80), 'Body' => $body) + (($cc = $GLOBALS['it_defaultconfig']['error_cc']) ? array('Cc' => $cc) : array())); } else if ($_SERVER['REMOTE_ADDR']) # toscreen mode: web - echo "<pre>{$p['title']}\n".rtrim($body)."</pre>"; + echo "<pre>" . htmlspecialchars($p['title'] . "\n" . rtrim($body), ENT_COMPAT, "iso-8859-1") . "</pre>"; # works with iso-8859-1 or utf-8, UTF8SAFE else # toscreen mode: shell (outputs to stderr) error_log($p['title'] . " in " . ($trace ? $trace : "{$p['file']}:{$p['line']} Url: $url") . " " . (EDC('verbose') ? D($p['locals']) : "")); } @@ -334,7 +335,6 @@ static function convertregex($pattern, $p = null) * @param $string String to match * @param $p['offset_capture'] Set flag preg_offset_capture (returns offsets with the matches). * @param $p['all'] Return every match as array instead of first match. - * @param $p['locale'] Use given locale (default: de_CH), mainly affects handling of iso-latin chars * @param $p contains pattern modifiers, @see convertregex() * @return Matched string or false */ @@ -346,21 +346,18 @@ static function match($pattern, $string, $p = null) { $flags = $p['offset_capture'] ? PREG_OFFSET_CAPTURE : 0; - $oldlocale = setlocale(LC_CTYPE, 0); - setlocale(LC_CTYPE, $p['locale'] ? $p['locale'] : "de_CH"); - if ($p['all']) $r = preg_match_all(it::convertregex($pattern, $p), $string, $m, $flags | PREG_PATTERN_ORDER, $p['offset']); else $r = preg_match(it::convertregex($pattern, $p), $string, $m, $flags, $p['offset']); - - setlocale(LC_CTYPE, $oldlocale); } if (!$r) # no match { if (preg_last_error() == PREG_BACKTRACK_LIMIT_ERROR) it::error("Exceeded pcre.backtrack_limit of " . ini_get('pcre.backtrack_limit') . " bytes"); + else if (preg_last_error() == PREG_BAD_UTF8_ERROR) + it::error("Input to it::match is not valid utf-8"); $result = $p['all'] ? array() : null; } @@ -384,23 +381,66 @@ static function match($pattern, $string, $p = null) */ static function replace($replacements, $string, $p = array()) { + $encoding = ini_get('default_charset') == 'utf-8' ? 'u' : ''; foreach ($replacements as $pattern => $dummy) - $patterns[] = !preg_match('/\\\\[wb]|[!\x80-\xff]|\[\[:/i', $pattern) && !$p ? "!$pattern!i" : it::convertregex($complex = $pattern, $p); + $patterns[] = !preg_match('/\\\\[wb]|[!\x80-\xff]|\[\[:/i', $pattern) && !$p ? "!$pattern!i$encoding" : it::convertregex($pattern, $p); + + $result = preg_replace($patterns, $replacements, $string, isset($p['limit']) ? $p['limit'] : -1); + + if ($result === null && preg_last_error() == PREG_BAD_UTF8_ERROR) + it::error("Input to it::replace is not valid utf-8"); - if (!$complex && !$p) - $result = preg_replace($patterns, $replacements, $string); + return $result; +} + +/** + * Returns only the array elements matching the given regex + * @param $pattern Regex to match against + * @param $array array to grep + * @return New array + */ +static function grep($pattern, $array, $p = array()) +{ + if (!preg_match('/\\\\[wb]|[!\x80-\xff]|\[\[:/i', $pattern) && !$p) + $result = preg_grep('!' . $pattern . '!i' . (ini_get('default_charset') == 'utf-8' ? 'u' : ''), $array); # fast path for simple patterns else - { - $oldlocale = setlocale(LC_CTYPE, 0); - setlocale(LC_CTYPE, 'de_CH'); - $result = preg_replace($patterns, $replacements, $string, isset($p['limit']) ? $p['limit'] : -1); - setlocale(LC_CTYPE, $oldlocale); - } + $result = preg_grep(it::convertregex($pattern, $p), $array); return $result; } /** + * Convert string to utf8 if it was not already utf-8 before + * @param $value String to convert + * @return Same string in utf-8 encoding + */ +function any2utf8($value) +{ + return grapheme_strlen($value) === null ? utf8_encode($value) : $value; +} + +/** + * Uppercase first character similar to ucfirst() but for mbstring.internal_encoding + */ +static function ucfirst($string) +{ + return mb_strtoupper(mb_substr($string, 0, 1)) . mb_substr($string, 1); +} + +/** + * Uppercase first character of each word similar to ucwords() but for mbstring.internal_encoding + */ +static function ucwords($string) +{ + return preg_replace_callback('/\b\w/u', function($m) { return mb_strtoupper($m[0]); }, mb_strtolower($string)); +} + +static function substr_replace($string, $replacement, $start, $length) +{ + return grapheme_substr($string, 0, $start) . $replacement . grapheme_substr($string, $start + $length); +} + +/** * Extract key => value pairs from assoc array by key * @param $array array to filter * @param $keys array or comma separated list of keys to keep diff --git a/it_dbi.class b/it_dbi.class index ba0606c..ced925b 100644 --- a/it_dbi.class +++ b/it_dbi.class @@ -247,7 +247,11 @@ function _set($tags, $allfields = false) if (substr($field, 0, 1) == '-') # Unquoted value (always added) $r[] = substr($field, 1)."=$value"; else if ($allfields || ($value !== $this->_data[$field])) + { + if ($this->_p['charset'] == "utf8") # NOTE: Mysql charset is simply utf8, not utf-8 + $value = it_html::fix_encoding($value); $r[] = "`$field`=".(isset($value) ? $this->escape_string($value) : 'NULL'); + } } return $r ? 'SET '.implode(', ', $r) : ''; diff --git a/it_html.class b/it_html.class index 92aa9ba..4c9f3e7 100644 --- a/it_html.class +++ b/it_html.class @@ -38,7 +38,7 @@ function it_html($p = array()) { # Default configuration of html class $this->p = $p + array( - 'charset' => 'iso-8859-1', + 'charset' => ini_get('default_charset') ?: 'iso-8859-1', 'doctype' => null, # Custom doctype (will usually be calculated from htmltype) 'head' => '', # Code to put into head() section 'htmltype' => 'xhtml', # 'html' (=old-style), 'xhtml' or 'xhtml-mobile' @@ -48,7 +48,7 @@ function it_html($p = array()) 'name' => 'it_html', # Name of global variable $this is assigned to (string), XXX Copy and paste in configure() to keep PHP4 compatibility 'nonewlinetags' => 'a,b,em,img,input,label,span,noscript', # tags that do not like newlines after them 'notexported' => 'configure,sanitize',# Those methods are not exported - 'prettyprint' => false, # Should output be prettily indented? + 'prettyprint' => it::is_devel(), # Should output be prettily indented? 'show_boot_dom' => false, # If true, append invisible <div id="it_boot_dom"> at the end of body 'show_content_type' => true, # If true, add <meta http-equiv="Content-Type" ...> header 'show_favicon' => true, # If true, add <link> tag to /favicon.ico if it exists @@ -258,6 +258,21 @@ function _parse_args($args) } +# internal +function fix_encoding($string, $silent = false) +{ + if (grapheme_strlen($string) === null) + list($string, $error) = array(utf8_encode($string), utf8_encode("incorrectly utf8-encoded: " . trim($string))); + else if ($string && preg_match('/[\x80-\xff]/', $string) && grapheme_strlen(utf8_decode($string)) !== null && utf8_encode(utf8_decode($string)) === $string) + list($string, $error) = array(utf8_decode($string), utf8_encode("doubly utf8-encoded: " . trim($string))); + + if ($error && !$silent) + it::error(array('title' => $error, 'skipfiles' => "it_html")); + + return $string; +} + + /** * function div($args...) * Return a <div>...</div> element @@ -312,10 +327,12 @@ function _tag($name, $args) else $result .= " />$newline"; + if ($GLOBALS['debug_utf8check'] && $GLOBALS['it_html']->p['charset'] == "utf-8") + $result = self::fix_encoding($result); + return $result; } - /** * Return a <tag> containing optional data. * @param $name tag name ('style', etc.) @@ -414,9 +431,11 @@ function _strip_tags($html) function sanitize($html) { $result = ""; + $charset = $GLOBALS['it_html']->p['charset'] ? $GLOBALS['it_html']->p['charset'] : 'iso-8859-1'; + if ($charset == "utf-8") + $html = it::any2utf8($html); $html = it::replace(array('[\0\s]+' => " "), $html); # \s also matches \r and \n $urlpattern = 'https?://[^">]+'; - $charset = $GLOBALS['it_html']->p['charset'] ? $GLOBALS['it_html']->p['charset'] : 'iso-8859-1'; if ($tag = it::match("(.*)<(div|p|i|b)\b[^>]*>(.*?)</\\2>(.*)", $html)) { @@ -450,11 +469,11 @@ function sanitize($html) } /** - * Decode all entities, ensure latin-1 encoding + * Decode all entities to encoding set for it_html */ function entity_decode($string) { - $string = preg_replace('/&#(8217|65533);/', "'", html_entity_decode($string)); + $string = preg_replace('/&#(8217|65533);/', "'", html_entity_decode($string, ENT_COMPAT, $GLOBALS['it_html']->p['charset'])); $string = preg_replace_callback('/�*([0-9a-f]+);/i', function($m) { return hexdec($m[1]) <= 255 ? chr(hexdec($m[1])) : " "; }, $string); $string = preg_replace_callback('/�*([0-9]+);/', function($m) { return $m[1] <= 255 ? chr($m[1]) : " "; }, $string); @@ -476,8 +495,13 @@ function latinize($string) */ function Q($string) { - if (preg_match('/[<>&"\x00-\x08\x0a-\x0c\x0e-\x1f\x80-\x9f]/', $string)) # WARNING: copy/pasted to _tag() + if (preg_match('/[<>&"\x00-\x08\x0a-\x0c\x0e-\x1f\x80-\xff]/', $string)) # WARNING: copy/pasted to _tag() + { + if ($GLOBALS['debug_utf8check'] && $GLOBALS['it_html']->p['charset'] == "utf-8") + $string = self::fix_encoding($string); + $string = htmlspecialchars($GLOBALS['it_html']->p['charset'] == "iso-8859-1" ? it_html::latinize($string) : $string, ENT_COMPAT, $GLOBALS['it_html']->p['charset']); + } return $GLOBALS['debug_q'] && $string ? "<span style='background:#8FF'>$string</span>" : $string; } diff --git a/it_xml.class b/it_xml.class index 0679c69..f854682 100644 --- a/it_xml.class +++ b/it_xml.class @@ -29,7 +29,7 @@ class it_xml * @param $p associative array * @param $p['forcearray'] xml tags to ALWAYS return as array * @param $p['safety'] 2 causes program abort with invalid xml, 1 (default) causes error report, 0 just returns false - * @param $p['encoding'] Output character encoding (e.g. UTF-8, default: ISO-8859-1) + * @param $p['encoding'] Output character encoding (utf-8, iso-8859-1 or us-ascii, default: ini_get('default_charset') * @param $p['prefix'] Optional prefix for class names * @param $p['lowercase'] Lowercase all tag and attribute names * @return XML object tree or null on failure @@ -49,23 +49,22 @@ function it_xml($xmldata = "", $p = array()) function create($xmldata, $p = array()) { $xml = new it_xml; - return $xml->from_xml($xmldata, array('factory' => true) + $p) ? $xml->_root : null; } function from_xml($xmldata, $p) { - $this->_p = $p + array('encoding' => "ISO-8859-1", 'safety' => 1); + $this->_p = $p + array('encoding' => ini_get('default_charset'), 'safety' => 1); $this->_arrayforce = array_flip((array)$this->_p['forcearray']); $this->_stack = array(); unset($this->error); - $parser = xml_parser_create($this->_p['encoding']); + $parser = xml_parser_create(); xml_set_object($parser, $this); xml_set_element_handler($parser, "start_element", "end_element"); xml_set_character_data_handler($parser, "character_data"); xml_parser_set_option($parser, XML_OPTION_CASE_FOLDING, 0); xml_parser_set_option($parser, XML_OPTION_TARGET_ENCODING, $this->_p['encoding']); - + $result = true; if (is_resource($xmldata)) @@ -123,7 +122,7 @@ function _sanitize($xmldata, $isutf8 = null) if (!preg_match('/^(<\?xml|\xEF\xBB\xBF|\xFE\xFF|\xFF\xFE|\x00\x00\xFE\xFF|\x00\x00\xFF\xFE)/', $xmldata)) $xmldata = '<?xml version="1.0" encoding="' . $this->_p['encoding'] . '"?>' . $xmldata; - $isutf8 = (!preg_match('/^<\?xml[^>]* encoding=/i', $xmldata) || preg_match('/^<\?xml[^>]* encoding=.UTF-8/i', $xmldata)); + $isutf8 = (!preg_match('/^<\?xml[^>]* encoding=/i', $xmldata) || preg_match('/^<\?xml[^>]* encoding=.utf-8/i', $xmldata)); } # Decode illegal entities but protect semantically important ones @@ -23,12 +23,18 @@ class itjs { +static $charset; /** * Send HTTP headers (content-type) to transmit javascript code */ -function send_headers($charset = 'iso-8859-1') +function send_headers($charset = null) { + if (!$charset) + $charset = ini_get('default_charset') ?: 'iso-8859-1'; + + self::$charset = $charset; + if (!preg_match('/Opera/', $_SERVER['HTTP_USER_AGENT']) && !$_REQUEST['itjs_iframe']) # text/plain breaks Opera 8.51/Linux and IFrame fallback header("Content-Type: text/plain; charset=$charset"); # Berni reported some Firewalls to require this @@ -67,16 +73,17 @@ function serialize($values, $envelope = false) */ function encode($values) { - $texts = ($values === array_values($values)) ? "[]0 " : "{}1\n"; # Numerical or associative array static $jskeyword = array("abstract" => 1, "boolean" => 1, "break" => 1, "byte" => 1, "case" => 1, "catch" => 1, "char" => 1, "class" => 1, "const" => 1, "continue" => 1, "debugger" => 1, "default" => 1, "delete" => 1, "do" => 1, "double" => 1, "each" => 1, "else" => 1, "enum" => 1, "export" => 1, "extends" => 1, "false" => 1, "final" => 1, "finally" => 1, "float" => 1, "for" => 1, "function" => 1, "goto" => 1, "if" => 1, "implements" => 1, "import" => 1, "in" => 1, "instanceof" => 1, "int" => 1, "interface" => 1, "long" => 1, "namespace" => 1, "native" => 1, "new" => 1, "null" => 1, "package" => 1, "private" => 1, "protected" => 1, "public" => 1, "return" => 1, "short" => 1, "static" => 1, "super" => 1, "switch" => 1, "synchronized" => 1, "this" => 1, "throw" => 1, "throws" => 1, "transient" => 1, "true" => 1, "try" => 1, "typeof" => 1, "var" => 1, "void" => 1, "volatile" => 1, "while" => 1, "with" => 1, "xml" => 1); - $result = $texts{0}; + $charset = self::$charset ?: ini_get('default_charset'); + $texts = ($values === array_values($values)) ? "[]0 " : "{}1\n"; # Numerical or associative array + $result = $texts[0]; foreach ($values as $key => $value) { $result .= $separator; - if ($texts{2}) + if ($texts[2]) { if ($jskeyword[$key] || !preg_match('/^[a-z_]\w*$/i', $key)) $key = "'$key'"; @@ -91,17 +98,19 @@ function encode($values) else if (!is_array($value)) { $quote = (strval(intval($value)) === strval($value)) ? "" : '"'; - $string = strtr($value, array("\0" => '\\0', "\x84" => '\\"', "\x93" => '\\"',"\x94" => '\\"', '"' => '\\"', "</"=>"<\\/", "\n" => '\\n', "\r" => '\\r', "\t" => '\\t', "\\" => '\\\\')); + if (strtolower($charset) != "utf-8") + $value = strtr($value, array("\x84" => '"', "\x93" => '"', "\x94" => '"')); + $string = strtr($value, array("\0" => '\\0', '"' => '\\"', "</"=>"<\\/", "\n" => '\\n', "\r" => '\\r', "\t" => '\\t', "\\" => '\\\\')); $string = $GLOBALS['itjs_defaultconfig']['latin2unicode'] ? preg_replace_callback('/([\xa0-\xff])/', function($m) { return sprintf("\\u%04x", ord($m[1])); }, $string) : $string; $result .= $quote . $string . $quote; } else $result .= itjs::encode($value); - $separator = "," . $texts{3}; + $separator = "," . $texts[3]; } - $result .= $texts{1}; + $result .= $texts[1]; return $result; } @@ -33,7 +33,7 @@ foreach ($files as $file) { ob_start(); # Needs to capture inside loop to guarantee file order if (!(it::match('W3C_CSS_Validator', $_SERVER['HTTP_USER_AGENT']) && it::match('jquery-ui\.css', $file))) - $data .= it::replace(array('^1$' => ""), @include($file)); + $data .= it::replace(array('^1$' => ""), @include($file), array('utf8' => false)); $data .= ob_get_clean(); } @@ -50,9 +50,8 @@ else if (it::match('\.css', $_SERVER['PHP_SELF'])) $data .= "\n#it_boot_dom { display:none }\n"; # Append magic style for it_boot if (!it::match('^devel', $GLOBALS['ULTRASERVERTYPE'])) $data = it::replace(array('[ \t]*([{};])[ \t]*' => '$1', '/\*.*?\*/' => ""), $data); - if (it::match('W3C_CSS_Validator', $_SERVER['HTTP_USER_AGENT']) || EDC('w3c')) { + if (it::match('W3C_CSS_Validator', $_SERVER['HTTP_USER_AGENT']) || EDC('w3c')) $data = it::replace(array('@-.*' => "", 'background[^;}]*(gradient|rgba)[^;}]*;?' => "", '(filter:\s*(progid|none)|text-overflow:|zoom:|-webkit-|display:-moz-|-moz-|-o-|cursor:|border-radius:|behavior:|\w+:expression)[^;}]*;?' => "", 'html\.ie6.*' => "", '([^/])\*(\w)' => '$1$2'), $data); - } } else if (it::match('\.htc$', $file)) { @@ -64,7 +63,8 @@ else if (!it::match('\.html$', $file)) if ($_REQUEST['boot'] && !$_REQUEST['retry']) ob_start('ob_gzhandler'); - header("Content-Type: application/x-javascript; charset=iso-8859-1"); + $charset = ini_get('default_charset') ?: 'iso-8859-1'; + header("Content-Type: application/x-javascript; charset=$charset"); } @header("Etag: $checksum"); @@ -80,7 +80,7 @@ if ($checksum != $_SERVER['HTTP_IF_NONE_MATCH']) { $data .= "window.it_boot_init();\n"; if (!$_REQUEST['script']) - $data = sprintf("/*sln:%d*/\n%s/*eln:%d*/", strlen($data), strtr($data, array('%' => "%25", '.' => "%2e", 'e' => "%65", 'i' => "%69")), strlen($data)); # Protect from Firewalls/Proxies altering Javascript source code + $data = sprintf("/*sln:%d*/\n%s/*eln:%d*/", grapheme_strlen($data), strtr($data, array('%' => "%25", '.' => "%2e", 'e' => "%65", 'i' => "%69")), grapheme_strlen($data)); # Protect from Firewalls/Proxies altering Javascript source code } echo it_untaint($data); @@ -280,6 +280,24 @@ function it_url_encode(str) } /** + * Unicode-safe equivalent of unescape() + * @param str string URL encoded string to be decoded + */ +function it_url_decode(str) +{ + str = str.replace(/\+/g, '%20'); + + // catch URI malformed errors + try { + if (window.decodeURIComponent) + return decodeURIComponent(str); + } + catch(e) {} + + return unescape(str); +} + +/** * Patch PNG transparency for IE 5.5-6 on the given image */ function it_pngfix(img, w, h, mode) diff --git a/tests/exec.t b/tests/exec.t index 689bd26..9425420 100755 --- a/tests/exec.t +++ b/tests/exec.t @@ -11,7 +11,7 @@ is(it::shell_command("echo {arg}", array('arg' => '')), "echo ''", "quote empty foreach (array("", "C", "de_CH", "de_CH.utf8") as $locale) { setlocale(LC_ALL, $locale); - $arg = "prepost"; + $arg = "preüpost"; if (it::match('utf8', $locale)) $arg = utf8_encode($arg); is(it::exec("echo " . $arg), $arg . "\n", "exec with umlaut (locale '$locale')"); diff --git a/tests/getopt.t b/tests/getopt.t index 7a84588..d67738d 100755 --- a/tests/getopt.t +++ b/tests/getopt.t @@ -17,7 +17,7 @@ function getopt_ok($argv, $exp, $name) return is($got['argument'], $exp, $name); } -foreach (array("" => "blah gnaber", " (umlaute)" => "pre post") as $variant => $testarg) { +foreach (array("" => "blah gnaber", " (umlaute)" => "pre üäpost") as $variant => $testarg) { getopt_ok(array('-a', $testarg), $testarg, "Short version" . $variant); getopt_ok(array('--argument', $testarg), $testarg, "Long version with space" . $variant); getopt_ok(array("--argument=$testarg"), $testarg, "Long version with equal" . $variant); @@ -3,252 +3,317 @@ # Tests for it.class -function match($regex, $string, $expect, $name) + +# +# tests for it::match() +# +$oldcharset = ini_get('default_charset'); +$oldlocale = setlocale(LC_CTYPE, 0); + +ini_set('default_charset', 'utf-8'); +setlocale(LC_CTYPE, 'de_CH'); # required becuase we're checking German umlauts in latin1 mode + + +function match($regex, $string, $expect, $name, $p = array()) { $GLOBALS['TEST_MORE_LEVEL'] = 1; - $pass = is (it::match($regex, $string), $expect, $name); + $pass = is (it::match($regex, $string, $p), $expect, $name); if (!$pass) { - diag(" regex given: $regex"); + diag(" regex given: $regex" . ($p ? " " .D($p) : "")); diag(" regex converted: " . it::convertregex($regex)); } $GLOBALS['TEST_MORE_LEVEL'] = 0; } + match( 'b', 'aaaabaaaa', 'b', 'simple regex' - ); +); + match( 'a/b', ' a/b ', 'a/b', 'regex with /' ); + match( 'aa(bb)aa(cc)aa(dd)qq', 'aabbaaccaaddqq', - array( 'bb', 'cc', 'dd' ), + array('bb', 'cc', 'dd'), 'return array of captures' - ); +); + match( '\bblah\b', ' blah ', 'blah', 'match \b at spaces' - ); +); + match( '\bblah\b', 'blah', 'blah', 'match \b at end of string' - ); +); + match( '\bblah\b', 'ablahc', false, 'don\'t match \b at word chars' - ); +); + match( - '\bblah\b', 'blah', + '\bblah\b', 'Üblahä', false, - 'don\'t match \b at umlaute in latin1' - ); + 'don\'t match \b at umlaute' +); + match( '\Bblah\B', ' blah ', false, 'don\'t match \B at spaces' - ); +); + match( '\Bblah\B', 'blah', false, 'don\'t match \B at end of string' - ); +); + match( '\Bblah\B', 'ablahc', 'blah', 'match \B at word chars' - ); +); + match( - '\Bblah\B', 'blah', + '\Bblah\B', 'Üblahä', 'blah', - 'match \B at umlaute in latin1' - ); + 'match \B at umlaute' +); + match( - '\w+', ' |#blah ', - 'blah', + '\w+', ' |#Üblahä ', + 'Üblahä', 'include umlaute in \w' - ); +); + match( - '[[:alpha:]]+', ' |#blah ', - 'blah', + '[[:alpha:]]+', ' |#blahä ', + 'blahä', 'include umlaute in [[:alpha:]]' - ); +); + match( - '\W+', ' |#blah ', + '\W+', ' |#Üblahä ', ' |#', 'don\'t include umlaute in \W' - ); +); + match( - '\ba', 'a', + '\ba', 'äa', '', '\b must know umlauts' - ); +); -eval( '$escapedwordregex = "' . it::convertregex( '\w' ) . '";' ); -$escapedwordregex = preg_replace( '|[\\\\/]|', '', $escapedwordregex ); +match( + 'aaa\\\\w+', ' aaa\www ', + 'aaa\www', + 'don\'t parse \w in \\\\w at beginning (match)' +); + +match( + 'aaa\\\\w+', ' aaa\www ', + 'aaa\www', + 'don\'t parse \w in \\\\w after chars (match)' +); + +eval('$escapedwordregex = "' . it::convertregex('\w') . '";'); +$escapedwordregex = preg_replace('|[\\\\/]|', '', $escapedwordregex); match( '\\\\w+', $escapedwordregex, false, 'don\'t parse \w in \\\\w at beginning (no match)' - ); -match( - 'aaa\\\\w+', ' aaa\www ', - 'aaa\www', - 'don\'t parse \w in \\\\w at beginning (match)' - ); +); + match( 'aaa\\\\w+', 'aaa' . $escapedwordregex, false, 'don\'t parse \w in \\\\w after chars (no match)' - ); -match( - 'aaa\\\\w+', ' aaa\www ', - 'aaa\www', - 'don\'t parse \w in \\\\w after chars (match)' - ); +); + match( '\\\\\\\\w+', '\\' . $escapedwordregex, false, 'don\'t parse \w in \\\\\\\w (no match)' - ); +); + match( '\\\\\\\\w+', ' \\\\www ', '\\\\www', 'don\'t parse \\\\\\\\w as \w (match)' - ); +); + match( '[\w]+', '[[[]]]---', false, 'replace \w in [\w] correctly (no match)' - ); +); + match( '[\w]+', ' \\\\aword[[[]]] ', 'aword', 'replace \w in [\w] correctly (match)' - ); +); + match( '[\\\\w]+', ' blabergna ', false, 'don\'t parse \w in [\\\\w] (no match)' - ); +); + match( '[\\\\w]+', ' \\\\worda[[[]', '\\\\w', 'don\'t parse \w in [\\\\w] (match)' - ); +); + match( '[a\W]+', 'bbbbbbb a a%$+ accccc', ' a a%$+ a', '\W in []' - ); +); + match( - '\\\\\\w+', ' \blah ', - '\blah', + '\\\\\\w+', ' \Üblahä ', + '\Üblahä', 'parse \w in \\\\\\w at beginning' - ); +); + match( - 'aaa\\\\\\w+', ' aaa\blah ', - 'aaa\blah', + 'aaa\\\\\\w+', ' aaa\Üblahä ', + 'aaa\Üblahä', 'parse \w in \\\\\\w after chars' - ); -is( - it::replace( - array( - 'regex1' => 'repl1', - 'regex2' => 'repl2', - 'regex3' => 'repl3' ), - 'regex2 regex1 regex3' ), - 'repl2 repl1 repl3', - 'test tr regex function' - ); -is( - it::match( '\w+', 'word1 wrd2 word_3', array('all' => true )), - array( 'word1', 'wrd2', 'word_3' ), - "test match_all function" - ); +); + +match( + '\w+', 'word1 wörd2 word_3', + array('word1', 'wörd2', 'word_3'), + "test match_all function", + array('all' => true) +); + match( 'aBcD', ' aBcD ', 'aBcD', "caseinsensitive is default" - ); +); + match( - '', '', - '', - 'match umlaute in latin1 case insensitive' - ); + '\w+', 'Müller', + 'Müller', + '\w matches umlaut in utf-8 mode' +); -is( - it::match(utf8_encode('aB'), utf8_encode("Ab"), array('utf8' => true)), - utf8_encode('Ab'), - "match utf-8 umlaute in case insensitive" +match( + 'M.ller', 'Müller', + 'Müller', + '. matches umlaut in utf-8 mode' ); -$oldcharset = ini_get('default_charset'); -ini_set('default_charset', 'utf-8'); match( - utf8_encode('aB'), utf8_encode('Ab'), - utf8_encode('Ab'), - "match utf-8 umlaute in case insensitive using default_charset" + utf8_decode('ö'), utf8_decode('Ö'), + utf8_decode('Ö'), + 'match umlaute in de_CH.latin1 case insensitive', + array('utf8' => false) ); -is( - it::match('aB', 'Ab', array('utf8' => false)), - 'Ab', - "non-utf-8 override with default_charset=utf-8" + +match( + utf8_decode('aöBÜ'), utf8_decode('AÖbü'), + utf8_decode('AÖbü'), + "match umlaute with non-utf-8 override in p", + array('utf8' => false) ); + + match( - '\w+', utf8_encode('Mller'), - utf8_encode('Mller'), - '\w matches umlaut in utf-8 mode' + 'abc', "aBc", + false, + "set case sensitivity by parameter", + array('casesensitive' => 1), ); + match( - 'M.ller', utf8_encode('Mller'), - utf8_encode('Mller'), - '. matches umlaut in utf-8 mode' + '\w+', 'word1 wörd2 word_3', + array('word1', 'wörd2', 'word_3'), + "test all => 1 without captures", + array('all' => 1) ); -ini_set('default_charset', $oldcharset); -is( - it::match( 'abc', "aBc", array('casesensitive' => 1 )), - false, - "set case sensitivity by parameter" - ); +match( + '\w+\s+(\d+)', 'word1 12 wörd2 3 word_3 4', + array('12', '3', '4'), + "test all => 1 with one capture", + array('all' => 1) +); +match( + '(\w+)\s+(\d+)', 'word1 12 wörd2 3 word_3 4', + array(array('word1', '12'), array('wörd2', '3'), array('word_3', '4')), + "test all => 1 with captures", + array('all' => 1) +); + +match( + '(\w+)\s+(\d+)', 'word1 12 wörd2 3 word_3 4', + array(array('word1', 'wörd2', 'word_3'), array('12', '3', '4')), + "test all => 1,pattern_order => 1", + array('all' => 1, 'pattern_order' => 1) +); + +ini_set('default_charset', 'iso-8859-1'); +match( + 'aöBÜ', "AÖbü", + 'AÖbü', + "match utf-8 umlaute in case insensitive mode with utf8 override", + array('utf8' => true) +); +ini_set('default_charset', 'utf-8'); + + +# +# tests for it::replace() +# is( - it::match( '\w+', 'word1 wrd2 word_3', array('all' => 1 )), - array( 'word1', 'wrd2', 'word_3' ), - "test all=>1 without captures" - ); -is( - it::match( '\w+\s+(\d+)', 'word1 12 wrd2 3 word_3 4', array('all' => 1 )), - array( '12', '3', '4' ), - "test all=>1 with one capture" - ); -is( - it::match( '(\w+)\s+(\d+)', 'word1 12 wrd2 3 word_3 4', array('all' => 1 )), - array( array( 'word1', '12' ), array( 'wrd2', '3' ), array( 'word_3', '4' ) ), - "test all=>1 with captures" - ); -is( - it::match( '(\w+)\s+(\d+)', 'word1 12 wrd2 3 word_3 4', array('all' => 1, 'pattern_order' => 1 )), - array( array( 'word1', 'wrd2', 'word_3' ), array( '12', '3', '4' ) ), - "test all=>1,pattern_order=>1" - ); + it::replace( + array( + 'regex1' => 'repl1', + 'regex2' => 'repl2', + 'regex3' => 'repl3'), + 'regex2 regex1 regex3'), + 'repl2 repl1 repl3', + 'test tr regex function' +); is(it::replace(array('a' => "1", 'b' => "2"), "ab"), "12"); is(it::replace(array('!' => "x"), "!"), "x"); -is(it::replace(array('\w' => "x"), "o"), "xx"); -is(it::replace(array('[[:alpha:]]' => "x"), ""), "x"); -is(it::replace(array('\w' => "x", '#' => "!"), "#"), "!x"); -is(it::replace(array('#' => "!", '\w' => "x"), "#"), "!x"); -is(it::replace(array('' => "x"), ""), "x"); +is(it::replace(array('\w' => "x"), "oö"), "xx"); +is(it::replace(array('[[:alpha:]]' => "x"), "ö"), "x"); +is(it::replace(array('\w' => "x", '#' => "!"), "#ö"), "!x"); +is(it::replace(array('#' => "!", '\w' => "x"), "#ö"), "!x"); +is(it::replace(array('ö' => "x"), "Ö"), "x"); is(it::replace(array('a' => "1"), "aaa", array('limit' => 1)), "1aa"); +is(it::replace(array('\s' => "x"), it_html::entity_decode(" ")), "x", "match non-breaking space as white-space character"); + +is(it::grep('ismatch', array('ismatch', 'isnomatch')), array('ismatch'), 'grep with simple regex'); +is(it::grep('!', array('ismatch!', 'isnomatch')), array('ismatch!'), '! in regex'); +is(it::grep('lower|UPPER', array('lower', 'LOWER', 'upper', 'UPPER'), 'casesensitive' => 1), array(0 => 'lower', 3 => 'UPPER'), 'set casesensitive'); +is(it::grep('match', array('foo' => 'match', 'bar' => 'gna')), array('foo' => 'match'), 'with keys'); + +setlocale(LC_CTYPE, $oldlocale); +ini_set('default_charset', $oldcharset); # end of tests that must run with specific charset + # it::filter_keys tests @@ -282,4 +347,26 @@ is(it::date('datetime', 1000000.543), it::date('datetime', "1000000"), '... larg is(it::date('time', "10.5"), "10:05", 'interpret string with points with strtotime'); is(it::date('time', "10.05"), "10:05", 'interpret string with points with strtotime'); +# it::uc* +is(it::ucfirst('foo bär über'), 'Foo bär über'); +is(it::ucwords('foo bär über'), 'Foo Bär Über'); + +# it::substr_replace +is(it::substr_replace('abcdefgh', 'xyz', 2, 4), substr_replace('abcdefgh', 'xyz', 2, 4), 'it::substr_replace the same as substr_replace for ascii'); +is(it::substr_replace('✔☯♥', '☃☃', 1, 1), '✔☃☃♥', 'it::substr_replace for utf-8'); + +is(it::any2utf8('Meier'), 'Meier', "it::any2utf8 ascii input"); +is(it::any2utf8('Müller'), 'Müller', "it::any2utf8 utf8 input"); +is(it::any2utf8('Aslı'), 'Aslı', "it::any2utf8 utf8 non-latin1 input"); +is(it::any2utf8(utf8_decode('Müller')), 'Müller', "it::any2utf8 latin1 input"); + +is(it::any2utf8( + ' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~ ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ'), + ' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~ ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ', + "it::any2utf8 utf8 input (exhaustive alphabet)"); +is(it::any2utf8( + utf8_decode(' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~ ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ')), + ' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~ ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ', + "it::any2utf8 latin1 input (exhaustive alphabet)"); + ?> diff --git a/tests/it_html.t b/tests/it_html.t index 0def431..174c487 100755 --- a/tests/it_html.t +++ b/tests/it_html.t @@ -4,6 +4,7 @@ |