summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Urban Müller, 2012-04-24 14:18:21 +0000
committer: Urban Müller, 2012-04-24 14:18:21 +0000
commit: 9c523156bbc9d34ff2a16ec3b2c345951fc55287 (patch)
tree: b03409e32574aaa76f78aef8158030d6a8cea5d8
parent: 177de5290677a71fe95e5264f1f21fad9cc7c469 (diff)
download: itools-9c523156bbc9d34ff2a16ec3b2c345951fc55287.tar.gz
itools-9c523156bbc9d34ff2a16ec3b2c345951fc55287.tar.bz2
itools-9c523156bbc9d34ff2a16ec3b2c345951fc55287.zip
merged it_html::fix_encoding in it::any2utf8
-rw-r--r-- it.class 14
-rw-r--r-- it_dbi.class 2
-rw-r--r-- it_html.class 19
-rwxr-xr-x tests/it.t 8
-rwxr-xr-x tests/it_html.t 11
5 files changed, 21 insertions, 33 deletions
diff --git a/it.class b/it.class
index b84c740..407e200 100644
--- a/it.class
+++ b/it.class
@@ -406,16 +406,22 @@ static function grep($pattern, $array, $p = array())
}
/**
- * Convert string to utf8 if it was not already utf-8 before
+ * Convert string to utf8 if it was not already utf-8 before. Also handles double encoding
* @param $value String to convert
+ * @param $errmsg Error message to output if anything needed to be done
* @return Same string in utf-8 encoding
*/
-function any2utf8($value)
+function any2utf8($value, $errmsg = "")
{
if (grapheme_strlen($value) === null)
- $value = utf8_encode($value);
+ list($value, $error) = array(utf8_encode($value), utf8_encode("$errmsg: incorrect utf8-encoding. input=" . trim($value)));
+ if (preg_match('/\xc3\x83\xc2([\xbc\xa9\xa4\xb6\xa8\xa2\xa0\xb4\xaa\xa7\x84\xab\xae\x9c\xaf\x96\xb2\xbb\xb9\x9f])/', $value))
+ list($value, $error) = array(preg_replace('/\xc3\x83\xc2([\xbc\xa9\xa4\xb6\xa8\xa2\xa0\xb4\xaa\xa7\x84\xab\xae\x9c\xaf\x96\xb2\xbb\xb9\x9f])/', "\xc3\$1", $value), utf8_encode("$errmsg: double utf8-encoding. input=" . trim($value)));
- return preg_replace('/\xc3\x83\xc2([\xbc\xa9\xa4\xb6\xa8\xa2\xa0\xb4\xaa\xa7\x84\xab\xae\x9c\xaf\x96\xb2\xbb\xb9\x9f])/', "\xc3\$1", $value); # fix most common double encodings, UTF8SAFE
+ if ($error && $errmsg)
+ it::error(array('title' => $error, 'skipfiles' => "it_html"));
+
+ return $value;
}
/**
diff --git a/it_dbi.class b/it_dbi.class
index ced925b..512cd4c 100644
--- a/it_dbi.class
+++ b/it_dbi.class
@@ -249,7 +249,7 @@ function _set($tags, $allfields = false)
else if ($allfields || ($value !== $this->_data[$field]))
{
if ($this->_p['charset'] == "utf8") # NOTE: Mysql charset is simply utf8, not utf-8
- $value = it_html::fix_encoding($value);
+ $value = it::any2utf8($value, "error in db-field $field");
$r[] = "`$field`=".(isset($value) ? $this->escape_string($value) : 'NULL');
}
}
diff --git a/it_html.class b/it_html.class
index ed47abd..ece7070 100644
--- a/it_html.class
+++ b/it_html.class
@@ -258,21 +258,6 @@ function _parse_args($args)
}
-# internal
-function fix_encoding($string, $silent = false)
-{
- if (grapheme_strlen($string) === null)
- list($string, $error) = array(utf8_encode($string), utf8_encode("incorrectly utf8-encoded: " . trim($string)));
- else if (preg_match('/\xc3\x83\xc2[\x84\x9c\xa4\xb6\xbc\xa9\xa0]/', $string)) # Double encoded ÄÖÜäöüéà, UTF8SAFE
- list($string, $error) = array(utf8_decode($string), utf8_encode("doubly utf8-encoded: " . trim($string)));
-
- if ($error && !$silent)
- it::error(array('title' => $error, 'skipfiles' => "it_html"));
-
- return $string;
-}
-
-
/**
* function div($args...)
* Return a <div>...</div> element
@@ -328,7 +313,7 @@ function _tag($name, $args)
$result .= " />$newline";
if ($GLOBALS['debug_utf8check'] && $GLOBALS['it_html']->p['charset'] == "utf-8")
- $result = self::fix_encoding($result);
+ $result = it::any2utf8($result, "error in $name()");
return $result;
}
@@ -498,7 +483,7 @@ function Q($string)
if (preg_match('/[<>&"\x00-\x08\x0a-\x0c\x0e-\x1f\x80-\xff]/', $string)) # WARNING: copy/pasted to _tag()
{
if ($GLOBALS['debug_utf8check'] && $GLOBALS['it_html']->p['charset'] == "utf-8")
- $string = self::fix_encoding($string);
+ $string = it::any2utf8($string, "error in Q()");
$origstring = $string;
$string = @htmlspecialchars($GLOBALS['it_html']->p['charset'] == "iso-8859-1" ? it_html::latinize($string) : $string, ENT_COMPAT, $GLOBALS['it_html']->p['charset']);
diff --git a/tests/it.t b/tests/it.t
index 26a4e25..68ea398 100755
--- a/tests/it.t
+++ b/tests/it.t
@@ -371,4 +371,12 @@ is(it::any2utf8(
is(it::any2utf8(utf8_encode("ü")), "ü", "it::any2utf8 double encoding");
+is(it::any2utf8("Meier"), "Meier", "it::any2utf8 ascii");
+is(it::any2utf8("Müller"), "Müller", "it::any2utf8 utf-8 latin1");
+is(it::any2utf8("Aslı"), "Aslı", "it::any2utf8 utf-8 non-latin1");
+is(it::any2utf8("é»"), "é»", "it::any2utf8 utf-8 latin1 special combination");
+is(it::any2utf8(utf8_encode("Müller")), "Müller", "it::any2utf8 double encoded latin1");
+is(it::any2utf8(utf8_decode("Müller")), "Müller", "it::any2utf8 incorrectly encoded latin1");
+is(it::any2utf8("a💚b"), "a💚b", "it::any2utf8 correctly handles 4-byte utf-8 character GREEN HEART");
+
?>
diff --git a/tests/it_html.t b/tests/it_html.t
index 3ac69f6..307bc7f 100755
--- a/tests/it_html.t
+++ b/tests/it_html.t
@@ -170,16 +170,5 @@ is(it_html::entity_decode("&#8217;"), "'", "it_html::entity_decode numeric decim
is(it_html::entity_decode("&#xfff;"), " ", "it_html::entity_decode invalid numeric hex entity");
is(it_html::entity_decode("&#999;"), " ", "it_html::entity_decode invalid numeric decimal entity");
-is(it_html::fix_encoding("Meier"), "Meier", "it_html::fix_encoding ascii");
-is(it_html::fix_encoding("Müller"), "Müller", "it_html::fix_encoding utf-8 latin1");
-is(it_html::fix_encoding("Aslı"), "Aslı", "it_html::fix_encoding utf-8 non-latin1");
-is(it_html::fix_encoding("é»"), "é»", "it_html::fix_encoding utf-8 latin1 special combination");
-
-is(it_html::fix_encoding(utf8_encode("Müller"), true), "Müller", "it_html::fix_encoding double encoded latin1");
-is(it_html::fix_encoding(utf8_encode("é»"), true), "é»", "it_html::fix_encoding double encoded latin1 special combination");
-
-is(it_html::fix_encoding(utf8_decode("Müller"), true), "Müller", "it_html::fix_encoding incorrectly encoded latin1");
-
-is(it_html::fix_encoding("a💚b"), "a💚b", "it_html::fix_encoding correctly handles 4-byte utf-8 character GREEN HEART");
?>