summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- it.class 74
-rw-r--r-- it_dbi.class 4
-rw-r--r-- it_html.class 38
-rw-r--r-- it_xml.class 11
-rw-r--r-- itjs.class 23
-rw-r--r-- itjs.php 10
-rw-r--r-- itjs/it.js 18
-rwxr-xr-x tests/exec.t 2
-rwxr-xr-x tests/getopt.t 2
-rwxr-xr-x tests/it.t 335
-rwxr-xr-x tests/it_html.t 59
-rwxr-xr-x tests/it_url.t 24
-rwxr-xr-x tests/it_xml.t 35
13 files changed, 423 insertions, 212 deletions
diff --git a/it.class b/it.class
index 624f601..1be8f7c 100644
--- a/it.class
+++ b/it.class
@@ -123,6 +123,7 @@ static function timerlog($label = '')
* @param $p['graceperiod'] number of seconds within which additional errors are ignored if id is set
* @param $p['timewindow'] number of seconds after graceperiod within which the second error must occur if id is set
* @param $p['backtraceskip'] number of stack levels to drop
+ * @param $p['skipfiles'] files to skip in backtrace
* @param $p['blockmail'] number of seconds to block mails after having sent a mail [3600]
* @param $p['blockmailid'] block mail for $p['blockmail'] seconds with same id. Default: $p['to']
* @param $p['omitdebuginfo'] Do not add stack dump, locals and environment to output [false]
@@ -193,7 +194,7 @@ static function error($p = array(), $body = null, $to = null) # $body and $to de
if ($toscreen || $sendmail)
{
- $trace = it_debug::backtrace($p['backtraceskip']); # moved in here for performance in mass error case
+ $trace = it_debug::backtrace(array('skiplevels' => $p['backtraceskip'], 'skipfiles' => $p['skipfiles'])); # moved in here for performance in mass error case
if (strlen($p['body']) > 500000)
{
@@ -230,7 +231,7 @@ static function error($p = array(), $body = null, $to = null) # $body and $to de
it::mail(array('To' => $p['to'], 'Subject' => substr($p['title'], 0, 80), 'Body' => $body) + (($cc = $GLOBALS['it_defaultconfig']['error_cc']) ? array('Cc' => $cc) : array()));
}
else if ($_SERVER['REMOTE_ADDR']) # toscreen mode: web
- echo "<pre>{$p['title']}\n".rtrim($body)."</pre>";
+ echo "<pre>" . htmlspecialchars($p['title'] . "\n" . rtrim($body), ENT_COMPAT, "iso-8859-1") . "</pre>"; # works with iso-8859-1 or utf-8, UTF8SAFE
else # toscreen mode: shell (outputs to stderr)
error_log($p['title'] . " in " . ($trace ? $trace : "{$p['file']}:{$p['line']} Url: $url") . " " . (EDC('verbose') ? D($p['locals']) : ""));
}
@@ -334,7 +335,6 @@ static function convertregex($pattern, $p = null)
* @param $string String to match
* @param $p['offset_capture'] Set flag preg_offset_capture (returns offsets with the matches).
* @param $p['all'] Return every match as array instead of first match.
- * @param $p['locale'] Use given locale (default: de_CH), mainly affects handling of iso-latin chars
* @param $p contains pattern modifiers, @see convertregex()
* @return Matched string or false
*/
@@ -346,21 +346,18 @@ static function match($pattern, $string, $p = null)
{
$flags = $p['offset_capture'] ? PREG_OFFSET_CAPTURE : 0;
- $oldlocale = setlocale(LC_CTYPE, 0);
- setlocale(LC_CTYPE, $p['locale'] ? $p['locale'] : "de_CH");
-
if ($p['all'])
$r = preg_match_all(it::convertregex($pattern, $p), $string, $m, $flags | PREG_PATTERN_ORDER, $p['offset']);
else
$r = preg_match(it::convertregex($pattern, $p), $string, $m, $flags, $p['offset']);
-
- setlocale(LC_CTYPE, $oldlocale);
}
if (!$r) # no match
{
if (preg_last_error() == PREG_BACKTRACK_LIMIT_ERROR)
it::error("Exceeded pcre.backtrack_limit of " . ini_get('pcre.backtrack_limit') . " bytes");
+ else if (preg_last_error() == PREG_BAD_UTF8_ERROR)
+ it::error("Input to it::match is not valid utf-8");
$result = $p['all'] ? array() : null;
}
@@ -384,23 +381,66 @@ static function match($pattern, $string, $p = null)
*/
static function replace($replacements, $string, $p = array())
{
+ $encoding = ini_get('default_charset') == 'utf-8' ? 'u' : '';
foreach ($replacements as $pattern => $dummy)
- $patterns[] = !preg_match('/\\\\[wb]|[!\x80-\xff]|\[\[:/i', $pattern) && !$p ? "!$pattern!i" : it::convertregex($complex = $pattern, $p);
+ $patterns[] = !preg_match('/\\\\[wb]|[!\x80-\xff]|\[\[:/i', $pattern) && !$p ? "!$pattern!i$encoding" : it::convertregex($pattern, $p);
+
+ $result = preg_replace($patterns, $replacements, $string, isset($p['limit']) ? $p['limit'] : -1);
+
+ if ($result === null && preg_last_error() == PREG_BAD_UTF8_ERROR)
+ it::error("Input to it::replace is not valid utf-8");
- if (!$complex && !$p)
- $result = preg_replace($patterns, $replacements, $string);
+ return $result;
+}
+
+/**
+ * Returns only the array elements matching the given regex
+ * @param $pattern Regex to match against
+ * @param $array array to grep
+ * @return New array
+ */
+static function grep($pattern, $array, $p = array())
+{
+ if (!preg_match('/\\\\[wb]|[!\x80-\xff]|\[\[:/i', $pattern) && !$p)
+ $result = preg_grep('!' . $pattern . '!i' . (ini_get('default_charset') == 'utf-8' ? 'u' : ''), $array); # fast path for simple patterns
else
- {
- $oldlocale = setlocale(LC_CTYPE, 0);
- setlocale(LC_CTYPE, 'de_CH');
- $result = preg_replace($patterns, $replacements, $string, isset($p['limit']) ? $p['limit'] : -1);
- setlocale(LC_CTYPE, $oldlocale);
- }
+ $result = preg_grep(it::convertregex($pattern, $p), $array);
return $result;
}
/**
+ * Convert string to utf-8 unless it already is valid utf-8 (other input is treated as iso-8859-1)
+ * @param $value String to convert
+ * @return Same string in utf-8 encoding
+ */
+static function any2utf8($value)
+{
+ return grapheme_strlen($value) === null ? utf8_encode($value) : $value;
+}
+
+/**
+ * Uppercase first character similar to ucfirst() but for mbstring.internal_encoding
+ */
+static function ucfirst($string)
+{
+ return mb_strtoupper(mb_substr($string, 0, 1)) . mb_substr($string, 1);
+}
+
+/**
+ * Uppercase first character of each word similar to ucwords() but for mbstring.internal_encoding
+ */
+static function ucwords($string)
+{
+ return preg_replace_callback('/\b\w/u', function($m) { return mb_strtoupper($m[0]); }, mb_strtolower($string));
+}
+
+static function substr_replace($string, $replacement, $start, $length)
+{
+ return grapheme_substr($string, 0, $start) . $replacement . grapheme_substr($string, $start + $length);
+}
+
+/**
* Extract key => value pairs from assoc array by key
* @param $array array to filter
* @param $keys array or comma separated list of keys to keep
diff --git a/it_dbi.class b/it_dbi.class
index ba0606c..ced925b 100644
--- a/it_dbi.class
+++ b/it_dbi.class
@@ -247,7 +247,11 @@ function _set($tags, $allfields = false)
if (substr($field, 0, 1) == '-') # Unquoted value (always added)
$r[] = substr($field, 1)."=$value";
else if ($allfields || ($value !== $this->_data[$field]))
+ {
+ if ($this->_p['charset'] == "utf8") # NOTE: Mysql charset is simply utf8, not utf-8
+ $value = it_html::fix_encoding($value);
$r[] = "`$field`=".(isset($value) ? $this->escape_string($value) : 'NULL');
+ }
}
return $r ? 'SET '.implode(', ', $r) : '';
diff --git a/it_html.class b/it_html.class
index 92aa9ba..4c9f3e7 100644
--- a/it_html.class
+++ b/it_html.class
@@ -38,7 +38,7 @@ function it_html($p = array())
{
# Default configuration of html class
$this->p = $p + array(
- 'charset' => 'iso-8859-1',
+ 'charset' => ini_get('default_charset') ?: 'iso-8859-1',
'doctype' => null, # Custom doctype (will usually be calculated from htmltype)
'head' => '', # Code to put into head() section
'htmltype' => 'xhtml', # 'html' (=old-style), 'xhtml' or 'xhtml-mobile'
@@ -48,7 +48,7 @@ function it_html($p = array())
'name' => 'it_html', # Name of global variable $this is assigned to (string), XXX Copy and paste in configure() to keep PHP4 compatibility
'nonewlinetags' => 'a,b,em,img,input,label,span,noscript', # tags that do not like newlines after them
'notexported' => 'configure,sanitize',# Those methods are not exported
- 'prettyprint' => false, # Should output be prettily indented?
+ 'prettyprint' => it::is_devel(), # Should output be prettily indented?
'show_boot_dom' => false, # If true, append invisible <div id="it_boot_dom"> at the end of body
'show_content_type' => true, # If true, add <meta http-equiv="Content-Type" ...> header
'show_favicon' => true, # If true, add <link> tag to /favicon.ico if it exists
@@ -258,6 +258,21 @@ function _parse_args($args)
}
+# internal
+function fix_encoding($string, $silent = false)
+{
+ if (grapheme_strlen($string) === null)
+ list($string, $error) = array(utf8_encode($string), utf8_encode("incorrectly utf8-encoded: " . trim($string)));
+ else if ($string && preg_match('/[\x80-\xff]/', $string) && grapheme_strlen(utf8_decode($string)) !== null && utf8_encode(utf8_decode($string)) === $string)
+ list($string, $error) = array(utf8_decode($string), utf8_encode("doubly utf8-encoded: " . trim($string)));
+
+ if ($error && !$silent)
+ it::error(array('title' => $error, 'skipfiles' => "it_html"));
+
+ return $string;
+}
+
+
/**
* function div($args...)
* Return a <div>...</div> element
@@ -312,10 +327,12 @@ function _tag($name, $args)
else
$result .= " />$newline";
+ if ($GLOBALS['debug_utf8check'] && $GLOBALS['it_html']->p['charset'] == "utf-8")
+ $result = self::fix_encoding($result);
+
return $result;
}
-
/**
* Return a <tag> containing optional data.
* @param $name tag name ('style', etc.)
@@ -414,9 +431,11 @@ function _strip_tags($html)
function sanitize($html)
{
$result = "";
+ $charset = $GLOBALS['it_html']->p['charset'] ? $GLOBALS['it_html']->p['charset'] : 'iso-8859-1';
+ if ($charset == "utf-8")
+ $html = it::any2utf8($html);
$html = it::replace(array('[\0\s]+' => " "), $html); # \s also matches \r and \n
$urlpattern = 'https?://[^">]+';
- $charset = $GLOBALS['it_html']->p['charset'] ? $GLOBALS['it_html']->p['charset'] : 'iso-8859-1';
if ($tag = it::match("(.*)<(div|p|i|b)\b[^>]*>(.*?)</\\2>(.*)", $html))
{
@@ -450,11 +469,11 @@ function sanitize($html)
}
/**
- * Decode all entities, ensure latin-1 encoding
+ * Decode all entities to encoding set for it_html
*/
function entity_decode($string)
{
- $string = preg_replace('/&#(8217|65533);/', "'", html_entity_decode($string));
+ $string = preg_replace('/&#(8217|65533);/', "'", html_entity_decode($string, ENT_COMPAT, $GLOBALS['it_html']->p['charset']));
$string = preg_replace_callback('/&#x0*([0-9a-f]+);/i', function($m) { return hexdec($m[1]) <= 255 ? chr(hexdec($m[1])) : " "; }, $string);
$string = preg_replace_callback('/&#0*([0-9]+);/', function($m) { return $m[1] <= 255 ? chr($m[1]) : " "; }, $string);
@@ -476,8 +495,13 @@ function latinize($string)
*/
function Q($string)
{
- if (preg_match('/[<>&"\x00-\x08\x0a-\x0c\x0e-\x1f\x80-\x9f]/', $string)) # WARNING: copy/pasted to _tag()
+ if (preg_match('/[<>&"\x00-\x08\x0a-\x0c\x0e-\x1f\x80-\xff]/', $string)) # WARNING: copy/pasted to _tag()
+ {
+ if ($GLOBALS['debug_utf8check'] && $GLOBALS['it_html']->p['charset'] == "utf-8")
+ $string = self::fix_encoding($string);
+
$string = htmlspecialchars($GLOBALS['it_html']->p['charset'] == "iso-8859-1" ? it_html::latinize($string) : $string, ENT_COMPAT, $GLOBALS['it_html']->p['charset']);
+ }
return $GLOBALS['debug_q'] && $string ? "<span style='background:#8FF'>$string</span>" : $string;
}
diff --git a/it_xml.class b/it_xml.class
index 0679c69..f854682 100644
--- a/it_xml.class
+++ b/it_xml.class
@@ -29,7 +29,7 @@ class it_xml
* @param $p associative array
* @param $p['forcearray'] xml tags to ALWAYS return as array
* @param $p['safety'] 2 causes program abort with invalid xml, 1 (default) causes error report, 0 just returns false
- * @param $p['encoding'] Output character encoding (e.g. UTF-8, default: ISO-8859-1)
+ * @param $p['encoding'] Output character encoding (utf-8, iso-8859-1 or us-ascii, default: ini_get('default_charset'))
* @param $p['prefix'] Optional prefix for class names
* @param $p['lowercase'] Lowercase all tag and attribute names
* @return XML object tree or null on failure
@@ -49,23 +49,22 @@ function it_xml($xmldata = "", $p = array())
function create($xmldata, $p = array())
{
$xml = new it_xml;
-
return $xml->from_xml($xmldata, array('factory' => true) + $p) ? $xml->_root : null;
}
function from_xml($xmldata, $p)
{
- $this->_p = $p + array('encoding' => "ISO-8859-1", 'safety' => 1);
+ $this->_p = $p + array('encoding' => ini_get('default_charset'), 'safety' => 1);
$this->_arrayforce = array_flip((array)$this->_p['forcearray']);
$this->_stack = array();
unset($this->error);
- $parser = xml_parser_create($this->_p['encoding']);
+ $parser = xml_parser_create();
xml_set_object($parser, $this);
xml_set_element_handler($parser, "start_element", "end_element");
xml_set_character_data_handler($parser, "character_data");
xml_parser_set_option($parser, XML_OPTION_CASE_FOLDING, 0);
xml_parser_set_option($parser, XML_OPTION_TARGET_ENCODING, $this->_p['encoding']);
-
+
$result = true;
if (is_resource($xmldata))
@@ -123,7 +122,7 @@ function _sanitize($xmldata, $isutf8 = null)
if (!preg_match('/^(<\?xml|\xEF\xBB\xBF|\xFE\xFF|\xFF\xFE|\x00\x00\xFE\xFF|\x00\x00\xFF\xFE)/', $xmldata))
$xmldata = '<?xml version="1.0" encoding="' . $this->_p['encoding'] . '"?>' . $xmldata;
- $isutf8 = (!preg_match('/^<\?xml[^>]* encoding=/i', $xmldata) || preg_match('/^<\?xml[^>]* encoding=.UTF-8/i', $xmldata));
+ $isutf8 = (!preg_match('/^<\?xml[^>]* encoding=/i', $xmldata) || preg_match('/^<\?xml[^>]* encoding=.utf-8/i', $xmldata));
}
# Decode illegal entities but protect semantically important ones
diff --git a/itjs.class b/itjs.class
index a7a6a15..e41ffe5 100644
--- a/itjs.class
+++ b/itjs.class
@@ -23,12 +23,18 @@
class itjs
{
+static $charset;
/**
* Send HTTP headers (content-type) to transmit javascript code
*/
-function send_headers($charset = 'iso-8859-1')
+function send_headers($charset = null)
{
+ if (!$charset)
+ $charset = ini_get('default_charset') ?: 'iso-8859-1';
+
+ self::$charset = $charset;
+
if (!preg_match('/Opera/', $_SERVER['HTTP_USER_AGENT']) && !$_REQUEST['itjs_iframe']) # text/plain breaks Opera 8.51/Linux and IFrame fallback
header("Content-Type: text/plain; charset=$charset"); # Berni reported some Firewalls to require this
@@ -67,16 +73,17 @@ function serialize($values, $envelope = false)
*/
function encode($values)
{
- $texts = ($values === array_values($values)) ? "[]0 " : "{}1\n"; # Numerical or associative array
static $jskeyword = array("abstract" => 1, "boolean" => 1, "break" => 1, "byte" => 1, "case" => 1, "catch" => 1, "char" => 1, "class" => 1, "const" => 1, "continue" => 1, "debugger" => 1, "default" => 1, "delete" => 1, "do" => 1, "double" => 1, "each" => 1, "else" => 1, "enum" => 1, "export" => 1, "extends" => 1, "false" => 1, "final" => 1, "finally" => 1, "float" => 1, "for" => 1, "function" => 1, "goto" => 1, "if" => 1, "implements" => 1, "import" => 1, "in" => 1, "instanceof" => 1, "int" => 1, "interface" => 1, "long" => 1, "namespace" => 1, "native" => 1, "new" => 1, "null" => 1, "package" => 1, "private" => 1, "protected" => 1, "public" => 1, "return" => 1, "short" => 1, "static" => 1, "super" => 1, "switch" => 1, "synchronized" => 1, "this" => 1, "throw" => 1, "throws" => 1, "transient" => 1, "true" => 1, "try" => 1, "typeof" => 1, "var" => 1, "void" => 1, "volatile" => 1, "while" => 1, "with" => 1, "xml" => 1);
- $result = $texts{0};
+ $charset = self::$charset ?: ini_get('default_charset');
+ $texts = ($values === array_values($values)) ? "[]0 " : "{}1\n"; # Numerical or associative array
+ $result = $texts[0];
foreach ($values as $key => $value)
{
$result .= $separator;
- if ($texts{2})
+ if ($texts[2])
{
if ($jskeyword[$key] || !preg_match('/^[a-z_]\w*$/i', $key))
$key = "'$key'";
@@ -91,17 +98,19 @@ function encode($values)
else if (!is_array($value))
{
$quote = (strval(intval($value)) === strval($value)) ? "" : '"';
- $string = strtr($value, array("\0" => '\\0', "\x84" => '\\"', "\x93" => '\\"',"\x94" => '\\"', '"' => '\\"', "</"=>"<\\/", "\n" => '\\n', "\r" => '\\r', "\t" => '\\t', "\\" => '\\\\'));
+ if (strtolower($charset) != "utf-8")
+ $value = strtr($value, array("\x84" => '"', "\x93" => '"', "\x94" => '"'));
+ $string = strtr($value, array("\0" => '\\0', '"' => '\\"', "</"=>"<\\/", "\n" => '\\n', "\r" => '\\r', "\t" => '\\t', "\\" => '\\\\'));
$string = $GLOBALS['itjs_defaultconfig']['latin2unicode'] ? preg_replace_callback('/([\xa0-\xff])/', function($m) { return sprintf("\\u%04x", ord($m[1])); }, $string) : $string;
$result .= $quote . $string . $quote;
}
else
$result .= itjs::encode($value);
- $separator = "," . $texts{3};
+ $separator = "," . $texts[3];
}
- $result .= $texts{1};
+ $result .= $texts[1];
return $result;
}
diff --git a/itjs.php b/itjs.php
index 03073e9..f819cab 100644
--- a/itjs.php
+++ b/itjs.php
@@ -33,7 +33,7 @@ foreach ($files as $file)
{
ob_start(); # Needs to capture inside loop to guarantee file order
if (!(it::match('W3C_CSS_Validator', $_SERVER['HTTP_USER_AGENT']) && it::match('jquery-ui\.css', $file)))
- $data .= it::replace(array('^1$' => ""), @include($file));
+ $data .= it::replace(array('^1$' => ""), @include($file), array('utf8' => false));
$data .= ob_get_clean();
}
@@ -50,9 +50,8 @@ else if (it::match('\.css', $_SERVER['PHP_SELF']))
$data .= "\n#it_boot_dom { display:none }\n"; # Append magic style for it_boot
if (!it::match('^devel', $GLOBALS['ULTRASERVERTYPE']))
$data = it::replace(array('[ \t]*([{};])[ \t]*' => '$1', '/\*.*?\*/' => ""), $data);
- if (it::match('W3C_CSS_Validator', $_SERVER['HTTP_USER_AGENT']) || EDC('w3c')) {
+ if (it::match('W3C_CSS_Validator', $_SERVER['HTTP_USER_AGENT']) || EDC('w3c'))
$data = it::replace(array('@-.*' => "", 'background[^;}]*(gradient|rgba)[^;}]*;?' => "", '(filter:\s*(progid|none)|text-overflow:|zoom:|-webkit-|display:-moz-|-moz-|-o-|cursor:|border-radius:|behavior:|\w+:expression)[^;}]*;?' => "", 'html\.ie6.*' => "", '([^/])\*(\w)' => '$1$2'), $data);
- }
}
else if (it::match('\.htc$', $file))
{
@@ -64,7 +63,8 @@ else if (!it::match('\.html$', $file))
if ($_REQUEST['boot'] && !$_REQUEST['retry'])
ob_start('ob_gzhandler');
- header("Content-Type: application/x-javascript; charset=iso-8859-1");
+ $charset = ini_get('default_charset') ?: 'iso-8859-1';
+ header("Content-Type: application/x-javascript; charset=$charset");
}
@header("Etag: $checksum");
@@ -80,7 +80,7 @@ if ($checksum != $_SERVER['HTTP_IF_NONE_MATCH'])
{
$data .= "window.it_boot_init();\n";
if (!$_REQUEST['script'])
- $data = sprintf("/*sln:%d*/\n%s/*eln:%d*/", strlen($data), strtr($data, array('%' => "%25", '.' => "%2e", 'e' => "%65", 'i' => "%69")), strlen($data)); # Protect from Firewalls/Proxies altering Javascript source code
+ $data = sprintf("/*sln:%d*/\n%s/*eln:%d*/", grapheme_strlen($data), strtr($data, array('%' => "%25", '.' => "%2e", 'e' => "%65", 'i' => "%69")), grapheme_strlen($data)); # Protect from Firewalls/Proxies altering Javascript source code
}
echo it_untaint($data);
diff --git a/itjs/it.js b/itjs/it.js
index 54fa5c7..b31423e 100644
--- a/itjs/it.js
+++ b/itjs/it.js
@@ -280,6 +280,24 @@ function it_url_encode(str)
}
/**
+ * Unicode-safe equivalent of unescape()
+ * @param str string URL encoded string to be decoded
+ */
+function it_url_decode(str)
+{
+ str = str.replace(/\+/g, '%20');
+
+ // catch URI malformed errors
+ try {
+ if (window.decodeURIComponent)
+ return decodeURIComponent(str);
+ }
+ catch(e) {}
+
+ return unescape(str);
+}
+
+/**
* Patch PNG transparency for IE 5.5-6 on the given image
*/
function it_pngfix(img, w, h, mode)
diff --git a/tests/exec.t b/tests/exec.t
index 689bd26..9425420 100755
--- a/tests/exec.t
+++ b/tests/exec.t
@@ -11,7 +11,7 @@ is(it::shell_command("echo {arg}", array('arg' => '')), "echo ''", "quote empty
foreach (array("", "C", "de_CH", "de_CH.utf8") as $locale) {
setlocale(LC_ALL, $locale);
- $arg = "prepost";
+ $arg = "preüpost";
if (it::match('utf8', $locale))
$arg = utf8_encode($arg);
is(it::exec("echo " . $arg), $arg . "\n", "exec with umlaut (locale '$locale')");
diff --git a/tests/getopt.t b/tests/getopt.t
index 7a84588..d67738d 100755
--- a/tests/getopt.t
+++ b/tests/getopt.t
@@ -17,7 +17,7 @@ function getopt_ok($argv, $exp, $name)
return is($got['argument'], $exp, $name);
}
-foreach (array("" => "blah gnaber", " (umlaute)" => "pre post") as $variant => $testarg) {
+foreach (array("" => "blah gnaber", " (umlaute)" => "pre üäpost") as $variant => $testarg) {
getopt_ok(array('-a', $testarg), $testarg, "Short version" . $variant);
getopt_ok(array('--argument', $testarg), $testarg, "Long version with space" . $variant);
getopt_ok(array("--argument=$testarg"), $testarg, "Long version with equal" . $variant);
diff --git a/tests/it.t b/tests/it.t
index 1a308ec..8e4a7e2 100755
--- a/tests/it.t
+++ b/tests/it.t
@@ -3,252 +3,317 @@
# Tests for it.class
-function match($regex, $string, $expect, $name)
+
+#
+# tests for it::match()
+#
+$oldcharset = ini_get('default_charset');
+$oldlocale = setlocale(LC_CTYPE, 0);
+
+ini_set('default_charset', 'utf-8');
+setlocale(LC_CTYPE, 'de_CH'); # required because we're checking German umlauts in latin1 mode
+
+
+function match($regex, $string, $expect, $name, $p = array())
{
$GLOBALS['TEST_MORE_LEVEL'] = 1;
- $pass = is (it::match($regex, $string), $expect, $name);
+ $pass = is (it::match($regex, $string, $p), $expect, $name);
if (!$pass) {
- diag(" regex given: $regex");
+ diag(" regex given: $regex" . ($p ? " " .D($p) : ""));
diag(" regex converted: " . it::convertregex($regex));
}
$GLOBALS['TEST_MORE_LEVEL'] = 0;
}
+
match(
'b', 'aaaabaaaa',
'b',
'simple regex'
- );
+);
+
match(
'a/b', ' a/b ',
'a/b',
'regex with /'
);
+
match(
'aa(bb)aa(cc)aa(dd)qq', 'aabbaaccaaddqq',
- array( 'bb', 'cc', 'dd' ),
+ array('bb', 'cc', 'dd'),
'return array of captures'
- );
+);
+
match(
'\bblah\b', ' blah ',
'blah',
'match \b at spaces'
- );
+);
+
match(
'\bblah\b', 'blah',
'blah',
'match \b at end of string'
- );
+);
+
match(
'\bblah\b', 'ablahc',
false,
'don\'t match \b at word chars'
- );
+);
+
match(
- '\bblah\b', 'blah',
+ '\bblah\b', 'Üblahä',
false,
- 'don\'t match \b at umlaute in latin1'
- );
+ 'don\'t match \b at umlaute'
+);
+
match(
'\Bblah\B', ' blah ',
false,
'don\'t match \B at spaces'
- );
+);
+
match(
'\Bblah\B', 'blah',
false,
'don\'t match \B at end of string'
- );
+);
+
match(
'\Bblah\B', 'ablahc',
'blah',
'match \B at word chars'
- );
+);
+
match(
- '\Bblah\B', 'blah',
+ '\Bblah\B', 'Üblahä',
'blah',
- 'match \B at umlaute in latin1'
- );
+ 'match \B at umlaute'
+);
+
match(
- '\w+', ' |#blah ',
- 'blah',
+ '\w+', ' |#Üblahä ',
+ 'Üblahä',
'include umlaute in \w'
- );
+);
+
match(
- '[[:alpha:]]+', ' |#blah ',
- 'blah',
+ '[[:alpha:]]+', ' |#blahä ',
+ 'blahä',
'include umlaute in [[:alpha:]]'
- );
+);
+
match(
- '\W+', ' |#blah ',
+ '\W+', ' |#Üblahä ',
' |#',
'don\'t include umlaute in \W'
- );
+);
+
match(
- '\ba', 'a',
+ '\ba', 'äa',
'',
'\b must know umlauts'
- );
+);
-eval( '$escapedwordregex = "' . it::convertregex( '\w' ) . '";' );
-$escapedwordregex = preg_replace( '|[\\\\/]|', '', $escapedwordregex );
+match(
+ 'aaa\\\\w+', ' aaa\www ',
+ 'aaa\www',
+ 'don\'t parse \w in \\\\w at beginning (match)'
+);
+
+match(
+ 'aaa\\\\w+', ' aaa\www ',
+ 'aaa\www',
+ 'don\'t parse \w in \\\\w after chars (match)'
+);
+
+eval('$escapedwordregex = "' . it::convertregex('\w') . '";');
+$escapedwordregex = preg_replace('|[\\\\/]|', '', $escapedwordregex);
match(
'\\\\w+', $escapedwordregex,
false,
'don\'t parse \w in \\\\w at beginning (no match)'
- );
-match(
- 'aaa\\\\w+', ' aaa\www ',
- 'aaa\www',
- 'don\'t parse \w in \\\\w at beginning (match)'
- );
+);
+
match(
'aaa\\\\w+', 'aaa' . $escapedwordregex,
false,
'don\'t parse \w in \\\\w after chars (no match)'
- );
-match(
- 'aaa\\\\w+', ' aaa\www ',
- 'aaa\www',
- 'don\'t parse \w in \\\\w after chars (match)'
- );
+);
+
match(
'\\\\\\\\w+', '\\' . $escapedwordregex,
false,
'don\'t parse \w in \\\\\\\w (no match)'
- );
+);
+
match(
'\\\\\\\\w+', ' \\\\www ',
'\\\\www',
'don\'t parse \\\\\\\\w as \w (match)'
- );
+);
+
match(
'[\w]+', '[[[]]]---',
false,
'replace \w in [\w] correctly (no match)'
- );
+);
+
match(
'[\w]+', ' \\\\aword[[[]]] ',
'aword',
'replace \w in [\w] correctly (match)'
- );
+);
+
match(
'[\\\\w]+', ' blabergna ',
false,
'don\'t parse \w in [\\\\w] (no match)'
- );
+);
+
match(
'[\\\\w]+', ' \\\\worda[[[]',
'\\\\w',
'don\'t parse \w in [\\\\w] (match)'
- );
+);
+
match(
'[a\W]+', 'bbbbbbb a a%$+ accccc',
' a a%$+ a',
'\W in []'
- );
+);
+
match(
- '\\\\\\w+', ' \blah ',
- '\blah',
+ '\\\\\\w+', ' \Üblahä ',
+ '\Üblahä',
'parse \w in \\\\\\w at beginning'
- );
+);
+
match(
- 'aaa\\\\\\w+', ' aaa\blah ',
- 'aaa\blah',
+ 'aaa\\\\\\w+', ' aaa\Üblahä ',
+ 'aaa\Üblahä',
'parse \w in \\\\\\w after chars'
- );
-is(
- it::replace(
- array(
- 'regex1' => 'repl1',
- 'regex2' => 'repl2',
- 'regex3' => 'repl3' ),
- 'regex2 regex1 regex3' ),
- 'repl2 repl1 repl3',
- 'test tr regex function'
- );
-is(
- it::match( '\w+', 'word1 wrd2 word_3', array('all' => true )),
- array( 'word1', 'wrd2', 'word_3' ),
- "test match_all function"
- );
+);
+
+match(
+ '\w+', 'word1 wörd2 word_3',
+ array('word1', 'wörd2', 'word_3'),
+ "test match_all function",
+ array('all' => true)
+);
+
match(
'aBcD', ' aBcD ',
'aBcD',
"caseinsensitive is default"
- );
+);
+
match(
- '', '',
- '',
- 'match umlaute in latin1 case insensitive'
- );
+ '\w+', 'Müller',
+ 'Müller',
+ '\w matches umlaut in utf-8 mode'
+);
-is(
- it::match(utf8_encode('aB'), utf8_encode("Ab"), array('utf8' => true)),
- utf8_encode('Ab'),
- "match utf-8 umlaute in case insensitive"
+match(
+ 'M.ller', 'Müller',
+ 'Müller',
+ '. matches umlaut in utf-8 mode'
);
-$oldcharset = ini_get('default_charset');
-ini_set('default_charset', 'utf-8');
match(
- utf8_encode('aB'), utf8_encode('Ab'),
- utf8_encode('Ab'),
- "match utf-8 umlaute in case insensitive using default_charset"
+ utf8_decode('ö'), utf8_decode('Ö'),
+ utf8_decode('Ö'),
+ 'match umlaute in de_CH.latin1 case insensitive',
+ array('utf8' => false)
);
-is(
- it::match('aB', 'Ab', array('utf8' => false)),
- 'Ab',
- "non-utf-8 override with default_charset=utf-8"
+
+match(
+ utf8_decode('aöBÜ'), utf8_decode('AÖbü'),
+ utf8_decode('AÖbü'),
+ "match umlaute with non-utf-8 override in p",
+ array('utf8' => false)
);
+
+
match(
- '\w+', utf8_encode('Mller'),
- utf8_encode('Mller'),
- '\w matches umlaut in utf-8 mode'
+ 'abc', "aBc",
+ false,
+ "set case sensitivity by parameter",
+ array('casesensitive' => 1)
);
+
match(
- 'M.ller', utf8_encode('Mller'),
- utf8_encode('Mller'),
- '. matches umlaut in utf-8 mode'
+ '\w+', 'word1 wörd2 word_3',
+ array('word1', 'wörd2', 'word_3'),
+ "test all => 1 without captures",
+ array('all' => 1)
);
-ini_set('default_charset', $oldcharset);
-is(
- it::match( 'abc', "aBc", array('casesensitive' => 1 )),
- false,
- "set case sensitivity by parameter"
- );
+match(
+ '\w+\s+(\d+)', 'word1 12 wörd2 3 word_3 4',
+ array('12', '3', '4'),
+ "test all => 1 with one capture",
+ array('all' => 1)
+);
+match(
+ '(\w+)\s+(\d+)', 'word1 12 wörd2 3 word_3 4',
+ array(array('word1', '12'), array('wörd2', '3'), array('word_3', '4')),
+ "test all => 1 with captures",
+ array('all' => 1)
+);
+
+match(
+ '(\w+)\s+(\d+)', 'word1 12 wörd2 3 word_3 4',
+ array(array('word1', 'wörd2', 'word_3'), array('12', '3', '4')),
+ "test all => 1,pattern_order => 1",
+ array('all' => 1, 'pattern_order' => 1)
+);
+
+ini_set('default_charset', 'iso-8859-1');
+match(
+ 'aöBÜ', "AÖbü",
+ 'AÖbü',
+ "match utf-8 umlaute in case insensitive mode with utf8 override",
+ array('utf8' => true)
+);
+ini_set('default_charset', 'utf-8');
+
+
+#
+# tests for it::replace()
+#
is(
- it::match( '\w+', 'word1 wrd2 word_3', array('all' => 1 )),
- array( 'word1', 'wrd2', 'word_3' ),
- "test all=>1 without captures"
- );
-is(
- it::match( '\w+\s+(\d+)', 'word1 12 wrd2 3 word_3 4', array('all' => 1 )),
- array( '12', '3', '4' ),
- "test all=>1 with one capture"
- );
-is(
- it::match( '(\w+)\s+(\d+)', 'word1 12 wrd2 3 word_3 4', array('all' => 1 )),
- array( array( 'word1', '12' ), array( 'wrd2', '3' ), array( 'word_3', '4' ) ),
- "test all=>1 with captures"
- );
-is(
- it::match( '(\w+)\s+(\d+)', 'word1 12 wrd2 3 word_3 4', array('all' => 1, 'pattern_order' => 1 )),
- array( array( 'word1', 'wrd2', 'word_3' ), array( '12', '3', '4' ) ),
- "test all=>1,pattern_order=>1"
- );
+ it::replace(
+ array(
+ 'regex1' => 'repl1',
+ 'regex2' => 'repl2',
+ 'regex3' => 'repl3'),
+ 'regex2 regex1 regex3'),
+ 'repl2 repl1 repl3',
+ 'test tr regex function'
+);
is(it::replace(array('a' => "1", 'b' => "2"), "ab"), "12");
is(it::replace(array('!' => "x"), "!"), "x");
-is(it::replace(array('\w' => "x"), "o"), "xx");
-is(it::replace(array('[[:alpha:]]' => "x"), ""), "x");
-is(it::replace(array('\w' => "x", '#' => "!"), "#"), "!x");
-is(it::replace(array('#' => "!", '\w' => "x"), "#"), "!x");
-is(it::replace(array('' => "x"), ""), "x");
+is(it::replace(array('\w' => "x"), "oö"), "xx");
+is(it::replace(array('[[:alpha:]]' => "x"), "ö"), "x");
+is(it::replace(array('\w' => "x", '#' => "!"), "#ö"), "!x");
+is(it::replace(array('#' => "!", '\w' => "x"), "#ö"), "!x");
+is(it::replace(array('ö' => "x"), "Ö"), "x");
is(it::replace(array('a' => "1"), "aaa", array('limit' => 1)), "1aa");
+is(it::replace(array('\s' => "x"), it_html::entity_decode("&nbsp;")), "x", "match non-breaking space as white-space character");
+
+is(it::grep('ismatch', array('ismatch', 'isnomatch')), array('ismatch'), 'grep with simple regex');
+is(it::grep('!', array('ismatch!', 'isnomatch')), array('ismatch!'), '! in regex');
+is(it::grep('lower|UPPER', array('lower', 'LOWER', 'upper', 'UPPER'), array('casesensitive' => 1)), array(0 => 'lower', 3 => 'UPPER'), 'set casesensitive');
+is(it::grep('match', array('foo' => 'match', 'bar' => 'gna')), array('foo' => 'match'), 'with keys');
+
+setlocale(LC_CTYPE, $oldlocale);
+ini_set('default_charset', $oldcharset); # end of tests that must run with specific charset
+
# it::filter_keys tests
@@ -282,4 +347,26 @@ is(it::date('datetime', 1000000.543), it::date('datetime', "1000000"), '... larg
is(it::date('time', "10.5"), "10:05", 'interpret string with points with strtotime');
is(it::date('time', "10.05"), "10:05", 'interpret string with points with strtotime');
+# it::uc*
+is(it::ucfirst('foo bär über'), 'Foo bär über');
+is(it::ucwords('foo bär über'), 'Foo Bär Über');
+
+# it::substr_replace
+is(it::substr_replace('abcdefgh', 'xyz', 2, 4), substr_replace('abcdefgh', 'xyz', 2, 4), 'it::substr_replace the same as substr_replace for ascii');
+is(it::substr_replace('✔☯♥', '☃☃', 1, 1), '✔☃☃♥', 'it::substr_replace for utf-8');
+
+is(it::any2utf8('Meier'), 'Meier', "it::any2utf8 ascii input");
+is(it::any2utf8('Müller'), 'Müller', "it::any2utf8 utf8 input");
+is(it::any2utf8('Aslı'), 'Aslı', "it::any2utf8 utf8 non-latin1 input");
+is(it::any2utf8(utf8_decode('Müller')), 'Müller', "it::any2utf8 latin1 input");
+
+is(it::any2utf8(
+ ' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~ ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ'),
+ ' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~ ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ',
+ "it::any2utf8 utf8 input (exhaustive alphabet)");
+is(it::any2utf8(
+ utf8_decode(' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~ ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ')),
+ ' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~ ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ',
+ "it::any2utf8 latin1 input (exhaustive alphabet)");
+
?>
diff --git a/tests/it_html.t b/tests/it_html.t
index 0def431..174c487 100755
--- a/tests/it_html.t
+++ b/