diff options
-rw-r--r-- | it.class | 14 | ||||
-rw-r--r-- | it_html.class | 2 | ||||
-rw-r--r-- | it_xml.class | 2 | ||||
-rwxr-xr-x | test/exec.t | 2 | ||||
-rwxr-xr-x | test/it.t | 34 | ||||
-rwxr-xr-x | test/it_xml.t | 4 |
6 files changed, 34 insertions, 24 deletions
@@ -587,9 +587,9 @@ static function any2utf8($value, $errprefix = "") else if (is_string($value)) { if (grapheme_strlen($value) === null) - list($value, $error) = array(utf8_encode($value), utf8_encode("incorrect utf8-encoding. input=$value")); + list($value, $error) = array(it::utf8_encode($value), it::utf8_encode("incorrect utf8-encoding. input=$value")); if (preg_match('/\xc3[\x82\x83]\xc2[\x82\x83\xbc\xa9\xa4\xb6\xa8\xa2\xa0\xb4\xaa\xa7\x84\xab\xae\x9c\xaf\x96\xb2\xbb\xb9\x9f]/', $value)) - list($value, $error) = array(it::any2utf8(preg_replace_callback('/\xc3[\x82\x83]\xc2[\x82\x83\xbc\xa9\xa4\xb6\xa8\xa2\xa0\xb4\xaa\xa7\x84\xab\xae\x9c\xaf\x96\xb2\xbb\xb9\x9f]/', function($m) {return utf8_decode($m[0]);}, $value)), $errprefix ? "double utf8-encoding. input=$value" : ""); + list($value, $error) = array(it::any2utf8(preg_replace_callback('/\xc3[\x82\x83]\xc2[\x82\x83\xbc\xa9\xa4\xb6\xa8\xa2\xa0\xb4\xaa\xa7\x84\xab\xae\x9c\xaf\x96\xb2\xbb\xb9\x9f]/', function($m) {return it::utf8_decode($m[0]);}, $value)), $errprefix ? "double utf8-encoding. input=$value" : ""); if (preg_match('/\xef\xb7[\x90-\xaf]|\xef\xbf[\xbe\xbf]/', $value)) list($value, $error) = array(preg_replace('/\xef\xb7[\x90-\xaf]|\xef\xbf[\xbe\xbf]/', " ", $value), "forbidden utf-8 character. input=$value"); $value = preg_replace('/\xc2\xad/', '', $value); # Kill invisible soft hyphens @@ -1369,4 +1369,14 @@ static function request_body() return it::any2utf8(it::file_get_contents('php://input')); } +static function utf8_decode($utf8) +{ + return UConverter::transcode($utf8, 'ISO-8859-1', 'UTF8', ['to_subst' => '?']); +} + +static function utf8_encode($latin1) +{ + return UConverter::transcode($latin1, 'UTF8', 'ISO-8859-1'); +} + } diff --git a/it_html.class b/it_html.class index 8254c22..93057bd 100644 --- a/it_html.class +++ b/it_html.class @@ -511,7 +511,7 @@ static function U(...$args) list($u['path'], $u['query']) = explode("?", $base, 2); } - $u['host'] = preg_match('/[^-_.0-9a-z]/i', $u['host']) && function_exists('idn_to_ascii') && ($idnahost = idn_to_ascii($GLOBALS['it_html']->p['charset'] == "iso-8859-1" ? utf8_encode($u['host']) : $u['host'])) ? $idnahost : $u['host']; # Punycode hostname to include into webpage + $u['host'] = preg_match('/[^-_.0-9a-z]/i', $u['host']) && function_exists('idn_to_ascii') && ($idnahost = idn_to_ascii($GLOBALS['it_html']->p['charset'] == "iso-8859-1" ? it::utf8_encode($u['host']) : $u['host'])) ? $idnahost : $u['host']; # Punycode hostname to include into webpage $u['host'] = preg_replace_callback('/[^-_.0-9a-z\x80-\xff]/i', function($m) { return rawurlencode($m[0]); }, $u['host']); # Encode garbage chars in host # handle scheme, user (urlencoded), password, host diff --git a/it_xml.class b/it_xml.class index 62b4df5..7765054 100644 --- a/it_xml.class +++ b/it_xml.class @@ -141,7 +141,7 @@ function _sanitize($xmldata, $isutf8 = null) # Encode non-utf8 characters in a string, leave utf8 alone static function _utf8_fix($str) { - return preg_match('/^([\xc0-\xdf][\x80-\xbf]|[\xe0-\xef][\x80-\xbf][\x80-\xbf]|[\xf0-\xf7][\x80-\xbf][\x80-\xbf][\x80-\xbf])$/', $str) ? $str : utf8_encode($str); + return preg_match('/^([\xc0-\xdf][\x80-\xbf]|[\xe0-\xef][\x80-\xbf][\x80-\xbf]|[\xf0-\xf7][\x80-\xbf][\x80-\xbf][\x80-\xbf])$/', $str) ? $str : it::utf8_encode($str); } function consume(/* $p */) diff --git a/test/exec.t b/test/exec.t index 09e16da..04850fc 100755 --- a/test/exec.t +++ b/test/exec.t @@ -70,7 +70,7 @@ foreach (["", "C", "de_CH", "de_CH.utf8"] as $locale) setlocale(LC_ALL, $locale); $arg = "preüpost"; if (it::match('utf8', $locale)) - $arg = utf8_encode($arg); + $arg = it::any2utf8($arg); is(it::exec("echo " . $arg), $arg . "\n", "exec with umlaut (locale '$locale')"); is(it::exec("echo {arg}", ['arg' => $arg]), $arg . "\n", "exec with argument and umlaut (locale '$locale')"); } @@ -221,15 +221,15 @@ _match( ); _match( - utf8_decode('ö'), utf8_decode('Ö'), - utf8_decode('Ö'), + it::utf8_decode('ö'), it::utf8_decode('Ö'), + it::utf8_decode('Ö'), 'match umlaute in de_CH.latin1 case insensitive', ['utf8' => false] ); _match( - utf8_decode('aöBÜ'), utf8_decode('AÖbü'), - utf8_decode('AÖbü'), + it::utf8_decode('aöBÜ'), it::utf8_decode('AÖbü'), + it::utf8_decode('AÖbü'), "match umlaute with non-utf-8 override in p", ['utf8' => false] ); @@ -419,32 +419,32 @@ is(grapheme_strlen("\xc1"), null, "need grapheme_strlen side effect for any2utf8 is(it::any2utf8('Meier'), 'Meier', "it::any2utf8 ascii input"); is(it::any2utf8('Müller'), 'Müller', "it::any2utf8 utf8 input"); is(it::any2utf8('Aslı'), 'Aslı', "it::any2utf8 utf8 non-latin1 input"); -is(it::any2utf8(utf8_decode('Müller')), 'Müller', "it::any2utf8 latin1 input"); +is(it::any2utf8(it::utf8_decode('Müller')), 'Müller', "it::any2utf8 latin1 input"); is(it::any2utf8( ' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~ ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ'), # omit soft hyphen cause we filter it ' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~ ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ', "it::any2utf8 utf8 input (exhaustive alphabet)"); is(it::any2utf8( - utf8_decode(' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~ ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ')), + it::utf8_decode(' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~ ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ')), ' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~ ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ', "it::any2utf8 latin1 input (exhaustive alphabet)"); -is(it::any2utf8(utf8_encode("ü")), "ü", "it::any2utf8 double encoding"); +is(it::any2utf8(it::utf8_encode("ü")), "ü", "it::any2utf8 double encoding"); is(it::any2utf8("Meier"), "Meier", "it::any2utf8 ascii"); is(it::any2utf8("Müller"), "Müller", "it::any2utf8 utf-8 latin1"); is(it::any2utf8("Aslı"), "Aslı", "it::any2utf8 utf-8 non-latin1"); is(it::any2utf8("é»"), "é»", "it::any2utf8 utf-8 latin1 special combination"); -is(it::any2utf8(utf8_encode("Müller")), "Müller", "it::any2utf8 doubly encoded utf8"); -is(it::any2utf8(utf8_encode(utf8_encode("Müller"))), "Müller", "it::any2utf8 triply encoded utf8"); -is(it::any2utf8(utf8_decode("Müller")), "Müller", "it::any2utf8 incorrectly encoded latin1"); +is(it::any2utf8(it::utf8_encode("Müller")), "Müller", "it::any2utf8 doubly encoded utf8"); +is(it::any2utf8(it::utf8_encode(it::utf8_encode("Müller"))), "Müller", "it::any2utf8 triply encoded utf8"); +is(it::any2utf8(it::utf8_decode("Müller")), "Müller", "it::any2utf8 incorrectly encoded latin1"); is(it::any2utf8("a💚b"), "a💚b", "it::any2utf8 correctly handles 4-byte utf-8 character GREEN HEART"); -is(it::any2utf8(["foo", utf8_decode("bär")]), ["foo", "bär"], "any2utf8 on arrays"); -is(it::any2utf8(["foo", [utf8_decode("bär")]]), ["foo", ["bär"]], "any2utf8 on recursive arrays"); +is(it::any2utf8(["foo", it::utf8_decode("bär")]), ["foo", "bär"], "any2utf8 on arrays"); +is(it::any2utf8(["foo", [it::utf8_decode("bär")]]), ["foo", ["bär"]], "any2utf8 on recursive arrays"); is(it::any2utf8([1, true, false, null]), [1, true, false, null], "any2utf8 should leave types alone"); -is(it::any2utf8([utf8_decode('Müller') => utf8_decode('Müller')]), ['Müller' => 'Müller'], "it::any2utf8 latin1 keys"); +is(it::any2utf8([it::utf8_decode('Müller') => it::utf8_decode('Müller')]), ['Müller' => 'Müller'], "it::any2utf8 latin1 keys"); is(it::any2utf8("\xc2\xad"), "", "it::any2utf8 remove soft hyphens"); @@ -532,10 +532,10 @@ it::file_put($tmpfile, "bb"); is(it::file_get($tmpfile), "bb"); unlink($tmpfile); -requesturi(utf8_decode("lüönd"), "lüönd"); -requesturi(utf8_decode("ü").utf8_encode("ü"), "üü"); -requesturi(utf8_encode("müller"), "müller"); -requesturi(utf8_encode(utf8_encode("müller")), "müller"); +requesturi(it::utf8_decode("lüönd"), "lüönd"); +requesturi(it::utf8_decode("ü").it::utf8_encode("ü"), "üü"); +requesturi(it::utf8_encode("müller"), "müller"); +requesturi(it::utf8_encode(it::utf8_encode("müller")), "müller"); requesturi("I 💚 Nü York", "I 💚 Nü York"); function requesturi($teststring, $expect) diff --git a/test/it_xml.t b/test/it_xml.t index e21f052..88a5cf4 100755 --- a/test/it_xml.t +++ b/test/it_xml.t @@ -63,7 +63,7 @@ _match( _match( '<foo>x ü y</foo>', - utf8_decode('foo Object ( [val] => x ü y ) '), + it::utf8_decode('foo Object ( [val] => x ü y ) '), 'Manual encoding override', "", ['encoding' => "iso-8859-1"] @@ -77,7 +77,7 @@ _match( _match( '<foo>&amp; <a> &amp; <b> &amp; <c> ü</foo>', - utf8_decode('foo Object ( [val] => & <a> & <b> & <c> ü ) '), + it::utf8_decode('foo Object ( [val] => & <a> & <b> & <c> ü ) '), 'Predecode illegal entities while keeping properly encoded ones (iso-8859-1)', "", ['encoding' => "iso-8859-1"] |