3 files changed, 21 insertions, 2 deletions
diff --git a/it.class b/it.class
index f683568..6d516c9 100644
--- a/it.class
+++ b/it.class
@@ -356,6 +356,8 @@ static function match($pattern, $string, $p = null)
 	{
 		if (preg_last_error() == PREG_BACKTRACK_LIMIT_ERROR)
 			it::error("Exceeded pcre.backtrack_limit of " . ini_get('pcre.backtrack_limit') . " bytes");
+		else if (preg_last_error() == PREG_BAD_UTF8_ERROR)
+			it::error("Input to it::match is not valid utf-8");
 
 		$result = $p['all'] ? array() : null;
 	}
@@ -379,11 +381,15 @@ static function match($pattern, $string, $p = null)
  */
 static function replace($replacements, $string, $p = array())
 {
+	$encoding = strtolower(ini_get('default_charset')) == 'utf-8' ? 'u' : '';	# Charset names are case-insensitive (PHP's built-in default is "UTF-8"); 'u' makes PCRE treat pattern and subject as utf-8
 	foreach ($replacements as $pattern => $dummy)
-		$patterns[] = !preg_match('/\\\\[wb]|[!\x80-\xff]|\[\[:/i', $pattern) && !$p ? "!$pattern!i" : it::convertregex($pattern, $p);
+		$patterns[] = !preg_match('/\\\\[wb]|[!\x80-\xff]|\[\[:/i', $pattern) && !$p ? "!$pattern!i$encoding" : it::convertregex($pattern, $p);
 
 	$result = preg_replace($patterns, $replacements, $string, isset($p['limit']) ? $p['limit'] : -1);
 
+	if ($result === null && preg_last_error() == PREG_BAD_UTF8_ERROR)	# preg_replace returns null on error; report invalid utf-8 input explicitly
+		it::error("Input to it::replace is not valid utf-8");
+
 	return $result;
 }
 
@@ -404,6 +410,16 @@ static function grep($pattern, $array, $p = array())
 }
 
 /**
+ * Convert string to utf-8 unless it already is valid utf-8 (other input is converted with utf8_encode, i.e. treated as iso-8859-1)
+ * @param $value String to convert
+ * @return Same string in utf-8 encoding
+ */
+static function any2utf8($value)	# Must be static: it is invoked as it::any2utf8()
+{
+	return strlen($value) && strlen(htmlspecialchars($value, 0, 'utf-8')) == 0 ? utf8_encode($value) : $value;      # Use side-effect of htmlspecialchars: Returns empty string if input is not valid utf-8 encoding
+}
+
+/**
  * Uppercase first character similar to ucfirst() but for mbstring.internal_encoding
  */
 static function ucfirst($string)
diff --git a/it_html.class b/it_html.class
index 71ac965..f20c4be 100644
--- a/it_html.class
+++ b/it_html.class
@@ -431,9 +431,11 @@ function _strip_tags($html)
 function sanitize($html)
 {
 	$result = "";
+	$charset = $GLOBALS['it_html']->p['charset'] ? $GLOBALS['it_html']->p['charset'] : 'iso-8859-1';
+	if ($charset == "utf-8")
+		$html = it::any2utf8($html);
 	$html = it::replace(array('[\0\s]+' => " "), $html);	# \s also matches \r and \n
 	$urlpattern = 'https?://[^">]+';
-	$charset = $GLOBALS['it_html']->p['charset'] ? $GLOBALS['it_html']->p['charset'] : 'iso-8859-1';
 
 	if ($tag = it::match("(.*)<(div|p|i|b)\b[^>]*>(.*?)</\\2>(.*)", $html))
 	{
diff --git a/tests/it_html.t b/tests/it_html.t
index b1d271e..770d11a 100755
--- a/tests/it_html.t
+++ b/tests/it_html.t
@@ -153,6 +153,7 @@ is(it_html::entity_decode("&#65;"),   "A");
 #
 
 it_html::configure(array('charset' => "iso-8859-1"));
+ini_set('default_charset', "iso-8859-1");
 
 is(
 	it_html::sanitize('q&uuml;x'),