From b1c0b4946572027c8de564730a89ec584c830bf3 Mon Sep 17 00:00:00 2001 From: Christian Schneider Date: Wed, 28 Mar 2012 13:00:39 +0000 Subject: Added it::any2utf8, fixed it::replace fast path to add u modifier, added error reporting for invalid utf-8 input to it::match and it::replace --- it.class | 18 +++++++++++++++++- it_html.class | 4 +++- tests/it_html.t | 1 + 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/it.class b/it.class index f683568..6d516c9 100644 --- a/it.class +++ b/it.class @@ -356,6 +356,8 @@ static function match($pattern, $string, $p = null) { if (preg_last_error() == PREG_BACKTRACK_LIMIT_ERROR) it::error("Exceeded pcre.backtrack_limit of " . ini_get('pcre.backtrack_limit') . " bytes"); + else if (preg_last_error() == PREG_BAD_UTF8_ERROR) + it::error("Input to it::match is not valid utf-8"); $result = $p['all'] ? array() : null; } @@ -379,11 +381,15 @@ static function match($pattern, $string, $p = null) */ static function replace($replacements, $string, $p = array()) { + $encoding = ini_get('default_charset') == 'utf-8' ? 'u' : ''; foreach ($replacements as $pattern => $dummy) - $patterns[] = !preg_match('/\\\\[wb]|[!\x80-\xff]|\[\[:/i', $pattern) && !$p ? "!$pattern!i" : it::convertregex($pattern, $p); + $patterns[] = !preg_match('/\\\\[wb]|[!\x80-\xff]|\[\[:/i', $pattern) && !$p ? "!$pattern!i$encoding" : it::convertregex($pattern, $p); $result = preg_replace($patterns, $replacements, $string, isset($p['limit']) ? $p['limit'] : -1); + if ($result === null && preg_last_error() == PREG_BAD_UTF8_ERROR) + it::error("Input to it::replace is not valid utf-8"); + return $result; } @@ -403,6 +409,16 @@ static function grep($pattern, $array, $p = array()) return $result; } +/** + * Convert string to utf8 if it was not already utf-8 before + * @param $value String to convert + * @return Same string in utf-8 encoding + */ +function any2utf8($value) +{ + return strlen($value) && strlen(htmlspecialchars($value, 0, 'utf-8')) == 0 ? 
utf8_encode($value) : $value; # Use side-effect of htmlspecialchars: Fails if not valid utf-8 encoding +} + /** * Uppercase first character similar to ucfirst() but for mbstring.internal_encoding */ diff --git a/it_html.class b/it_html.class index 71ac965..f20c4be 100644 --- a/it_html.class +++ b/it_html.class @@ -431,9 +431,11 @@ function _strip_tags($html) function sanitize($html) { $result = ""; + $charset = $GLOBALS['it_html']->p['charset'] ? $GLOBALS['it_html']->p['charset'] : 'iso-8859-1'; + if ($charset == "utf-8") + $html = it::any2utf8($html); $html = it::replace(array('[\0\s]+' => " "), $html); # \s also matches \r and \n $urlpattern = 'https?://[^">]+'; - $charset = $GLOBALS['it_html']->p['charset'] ? $GLOBALS['it_html']->p['charset'] : 'iso-8859-1'; if ($tag = it::match("(.*)<(div|p|i|b)\b[^>]*>(.*?)(.*)", $html)) { diff --git a/tests/it_html.t b/tests/it_html.t index b1d271e..770d11a 100755 --- a/tests/it_html.t +++ b/tests/it_html.t @@ -153,6 +153,7 @@ is(it_html::entity_decode("A"), "A"); # it_html::configure(array('charset' => "iso-8859-1")); +ini_set('default_charset', "iso-8859-1"); is( it_html::sanitize('qüx'), -- cgit v1.2.3