summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Christian Schneider 2012-03-28 13:00:39 +0000
committer Christian Schneider 2012-03-28 13:00:39 +0000
commit b1c0b4946572027c8de564730a89ec584c830bf3 (patch)
tree 81859c280f43a5cdb31ddede7300641fa530e0f8
parent 49d2a5ce1b6ad201f051263db7c3a1f5ad6a39ab (diff)
download itools-b1c0b4946572027c8de564730a89ec584c830bf3.tar.gz
itools-b1c0b4946572027c8de564730a89ec584c830bf3.tar.bz2
itools-b1c0b4946572027c8de564730a89ec584c830bf3.zip
Added it::any2utf8, fixed it::replace fast path to add u modifier, added error reporting for invalid utf-8 input to it::match and it::replace
-rw-r--r-- it.class 18
-rw-r--r-- it_html.class 4
-rwxr-xr-x tests/it_html.t 1
3 files changed, 21 insertions, 2 deletions
diff --git a/it.class b/it.class
index f683568..6d516c9 100644
--- a/it.class
+++ b/it.class
@@ -356,6 +356,8 @@ static function match($pattern, $string, $p = null)
{
if (preg_last_error() == PREG_BACKTRACK_LIMIT_ERROR)
it::error("Exceeded pcre.backtrack_limit of " . ini_get('pcre.backtrack_limit') . " bytes");
+ else if (preg_last_error() == PREG_BAD_UTF8_ERROR)
+ it::error("Input to it::match is not valid utf-8");
$result = $p['all'] ? array() : null;
}
@@ -379,11 +381,15 @@ static function match($pattern, $string, $p = null)
*/
static function replace($replacements, $string, $p = array())
{
+ $encoding = ini_get('default_charset') == 'utf-8' ? 'u' : '';
foreach ($replacements as $pattern => $dummy)
- $patterns[] = !preg_match('/\\\\[wb]|[!\x80-\xff]|\[\[:/i', $pattern) && !$p ? "!$pattern!i" : it::convertregex($pattern, $p);
+ $patterns[] = !preg_match('/\\\\[wb]|[!\x80-\xff]|\[\[:/i', $pattern) && !$p ? "!$pattern!i$encoding" : it::convertregex($pattern, $p);
$result = preg_replace($patterns, $replacements, $string, isset($p['limit']) ? $p['limit'] : -1);
+ if ($result === null && preg_last_error() == PREG_BAD_UTF8_ERROR)
+ it::error("Input to it::replace is not valid utf-8");
+
return $result;
}
@@ -404,6 +410,16 @@ static function grep($pattern, $array, $p = array())
}
/**
+ * Convert string to utf8 if it was not already utf-8 before
+ * @param $value String to convert
+ * @return Same string in utf-8 encoding
+ */
+function any2utf8($value)
+{
+ return strlen($value) && strlen(htmlspecialchars($value, 0, 'utf-8')) == 0 ? utf8_encode($value) : $value; # Use side-effect of htmlspecialchars: Fails if not valid utf-8 encoding
+}
+
+/**
* Uppercase first character similar to ucfirst() but for mbstring.internal_encoding
*/
static function ucfirst($string)
diff --git a/it_html.class b/it_html.class
index 71ac965..f20c4be 100644
--- a/it_html.class
+++ b/it_html.class
@@ -431,9 +431,11 @@ function _strip_tags($html)
function sanitize($html)
{
$result = "";
+ $charset = $GLOBALS['it_html']->p['charset'] ? $GLOBALS['it_html']->p['charset'] : 'iso-8859-1';
+ if ($charset == "utf-8")
+ $html = it::any2utf8($html);
$html = it::replace(array('[\0\s]+' => " "), $html); # \s also matches \r and \n
$urlpattern = 'https?://[^">]+';
- $charset = $GLOBALS['it_html']->p['charset'] ? $GLOBALS['it_html']->p['charset'] : 'iso-8859-1';
if ($tag = it::match("(.*)<(div|p|i|b)\b[^>]*>(.*?)</\\2>(.*)", $html))
{
diff --git a/tests/it_html.t b/tests/it_html.t
index b1d271e..770d11a 100755
--- a/tests/it_html.t
+++ b/tests/it_html.t
@@ -153,6 +153,7 @@ is(it_html::entity_decode("&#65;"), "A");
#
it_html::configure(array('charset' => "iso-8859-1"));
+ini_set('default_charset', "iso-8859-1");
is(
it_html::sanitize('q&uuml;x'),