summary refs log tree commit diff
diff options
context:
space:
mode:
author Christian Schneider 2022-12-12 16:55:47 +0100
committer Christian Schneider 2022-12-12 16:55:47 +0100
commit 55bd4e0052b830256ad1d1134bbe5c7231d1427b (patch)
tree 49d4980bb599db87085b75c7780dc6138ad681db
parent d6f07d7e1350c57e59192ba0047a8431fc59cdf3 (diff)
download itools-55bd4e0052b830256ad1d1134bbe5c7231d1427b.tar.gz
itools-55bd4e0052b830256ad1d1134bbe5c7231d1427b.tar.bz2
itools-55bd4e0052b830256ad1d1134bbe5c7231d1427b.zip
Add and use it::utf8_decode and it::utf8_encode for easier migration to PHP 8.2
-rw-r--r-- it.class 14
-rw-r--r-- it_html.class 2
-rw-r--r-- it_xml.class 2
-rwxr-xr-x test/exec.t 2
-rwxr-xr-x test/it.t 34
-rwxr-xr-x test/it_xml.t 4
6 files changed, 34 insertions, 24 deletions
diff --git a/it.class b/it.class
index a0cf883..8375b83 100644
--- a/it.class
+++ b/it.class
@@ -587,9 +587,9 @@ static function any2utf8($value, $errprefix = "")
else if (is_string($value))
{
if (grapheme_strlen($value) === null)
- list($value, $error) = array(utf8_encode($value), utf8_encode("incorrect utf8-encoding. input=$value"));
+ list($value, $error) = array(it::utf8_encode($value), it::utf8_encode("incorrect utf8-encoding. input=$value"));
if (preg_match('/\xc3[\x82\x83]\xc2[\x82\x83\xbc\xa9\xa4\xb6\xa8\xa2\xa0\xb4\xaa\xa7\x84\xab\xae\x9c\xaf\x96\xb2\xbb\xb9\x9f]/', $value))
- list($value, $error) = array(it::any2utf8(preg_replace_callback('/\xc3[\x82\x83]\xc2[\x82\x83\xbc\xa9\xa4\xb6\xa8\xa2\xa0\xb4\xaa\xa7\x84\xab\xae\x9c\xaf\x96\xb2\xbb\xb9\x9f]/', function($m) {return utf8_decode($m[0]);}, $value)), $errprefix ? "double utf8-encoding. input=$value" : "");
+ list($value, $error) = array(it::any2utf8(preg_replace_callback('/\xc3[\x82\x83]\xc2[\x82\x83\xbc\xa9\xa4\xb6\xa8\xa2\xa0\xb4\xaa\xa7\x84\xab\xae\x9c\xaf\x96\xb2\xbb\xb9\x9f]/', function($m) {return it::utf8_decode($m[0]);}, $value)), $errprefix ? "double utf8-encoding. input=$value" : "");
if (preg_match('/\xef\xb7[\x90-\xaf]|\xef\xbf[\xbe\xbf]/', $value))
list($value, $error) = array(preg_replace('/\xef\xb7[\x90-\xaf]|\xef\xbf[\xbe\xbf]/', " ", $value), "forbidden utf-8 character. input=$value");
$value = preg_replace('/\xc2\xad/', '', $value); # Kill invisible soft hyphens
@@ -1369,4 +1369,14 @@ static function request_body()
return it::any2utf8(it::file_get_contents('php://input'));
}
+static function utf8_decode($utf8) # Convert a UTF-8 string to ISO-8859-1; stand-in for PHP's built-in utf8_decode() to ease migration to PHP 8.2
+{
+ return UConverter::transcode($utf8, 'ISO-8859-1', 'UTF8', ['to_subst' => '?']); # to_subst is the substitution character: input characters that cannot be encoded in latin1 come out as a question mark
+}
+
+static function utf8_encode($latin1) # Convert an ISO-8859-1 string to UTF-8; stand-in for PHP's built-in utf8_encode() to ease migration to PHP 8.2
+{
+ return UConverter::transcode($latin1, 'UTF8', 'ISO-8859-1'); # transcode() takes the target encoding before the source encoding
+}
+
}
diff --git a/it_html.class b/it_html.class
index 8254c22..93057bd 100644
--- a/it_html.class
+++ b/it_html.class
@@ -511,7 +511,7 @@ static function U(...$args)
list($u['path'], $u['query']) = explode("?", $base, 2);
}
- $u['host'] = preg_match('/[^-_.0-9a-z]/i', $u['host']) && function_exists('idn_to_ascii') && ($idnahost = idn_to_ascii($GLOBALS['it_html']->p['charset'] == "iso-8859-1" ? utf8_encode($u['host']) : $u['host'])) ? $idnahost : $u['host']; # Punycode hostname to include into webpage
+ $u['host'] = preg_match('/[^-_.0-9a-z]/i', $u['host']) && function_exists('idn_to_ascii') && ($idnahost = idn_to_ascii($GLOBALS['it_html']->p['charset'] == "iso-8859-1" ? it::utf8_encode($u['host']) : $u['host'])) ? $idnahost : $u['host']; # Punycode hostname to include into webpage
$u['host'] = preg_replace_callback('/[^-_.0-9a-z\x80-\xff]/i', function($m) { return rawurlencode($m[0]); }, $u['host']); # Encode garbage chars in host
# handle scheme, user (urlencoded), password, host
diff --git a/it_xml.class b/it_xml.class
index 62b4df5..7765054 100644
--- a/it_xml.class
+++ b/it_xml.class
@@ -141,7 +141,7 @@ function _sanitize($xmldata, $isutf8 = null)
# Encode non-utf8 characters in a string, leave utf8 alone
static function _utf8_fix($str)
{
- return preg_match('/^([\xc0-\xdf][\x80-\xbf]|[\xe0-\xef][\x80-\xbf][\x80-\xbf]|[\xf0-\xf7][\x80-\xbf][\x80-\xbf][\x80-\xbf])$/', $str) ? $str : utf8_encode($str);
+ return preg_match('/^([\xc0-\xdf][\x80-\xbf]|[\xe0-\xef][\x80-\xbf][\x80-\xbf]|[\xf0-\xf7][\x80-\xbf][\x80-\xbf][\x80-\xbf])$/', $str) ? $str : it::utf8_encode($str);
}
function consume(/* $p */)
diff --git a/test/exec.t b/test/exec.t
index 09e16da..04850fc 100755
--- a/test/exec.t
+++ b/test/exec.t
@@ -70,7 +70,7 @@ foreach (["", "C", "de_CH", "de_CH.utf8"] as $locale)
setlocale(LC_ALL, $locale);
$arg = "preüpost";
if (it::match('utf8', $locale))
- $arg = utf8_encode($arg);
+ $arg = it::any2utf8($arg);
is(it::exec("echo " . $arg), $arg . "\n", "exec with umlaut (locale '$locale')");
is(it::exec("echo {arg}", ['arg' => $arg]), $arg . "\n", "exec with argument and umlaut (locale '$locale')");
}
diff --git a/test/it.t b/test/it.t
index 7733c00..41758da 100755
--- a/test/it.t
+++ b/test/it.t
@@ -221,15 +221,15 @@ _match(
);
_match(
- utf8_decode('ö'), utf8_decode('Ö'),
- utf8_decode('Ö'),
+ it::utf8_decode('ö'), it::utf8_decode('Ö'),
+ it::utf8_decode('Ö'),
'match umlaute in de_CH.latin1 case insensitive',
['utf8' => false]
);
_match(
- utf8_decode('aöBÜ'), utf8_decode('AÖbü'),
- utf8_decode('AÖbü'),
+ it::utf8_decode('aöBÜ'), it::utf8_decode('AÖbü'),
+ it::utf8_decode('AÖbü'),
"match umlaute with non-utf-8 override in p",
['utf8' => false]
);
@@ -419,32 +419,32 @@ is(grapheme_strlen("\xc1"), null, "need grapheme_strlen side effect for any2utf8
is(it::any2utf8('Meier'), 'Meier', "it::any2utf8 ascii input");
is(it::any2utf8('Müller'), 'Müller', "it::any2utf8 utf8 input");
is(it::any2utf8('Aslı'), 'Aslı', "it::any2utf8 utf8 non-latin1 input");
-is(it::any2utf8(utf8_decode('Müller')), 'Müller', "it::any2utf8 latin1 input");
+is(it::any2utf8(it::utf8_decode('Müller')), 'Müller', "it::any2utf8 latin1 input");
is(it::any2utf8(
' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~ ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ'), # omit soft hyphen cause we filter it
' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~ ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ',
"it::any2utf8 utf8 input (exhaustive alphabet)");
is(it::any2utf8(
- utf8_decode(' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~ ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ')),
+ it::utf8_decode(' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~ ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ')),
' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~ ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ',
"it::any2utf8 latin1 input (exhaustive alphabet)");
-is(it::any2utf8(utf8_encode("ü")), "ü", "it::any2utf8 double encoding");
+is(it::any2utf8(it::utf8_encode("ü")), "ü", "it::any2utf8 double encoding");
is(it::any2utf8("Meier"), "Meier", "it::any2utf8 ascii");
is(it::any2utf8("Müller"), "Müller", "it::any2utf8 utf-8 latin1");
is(it::any2utf8("Aslı"), "Aslı", "it::any2utf8 utf-8 non-latin1");
is(it::any2utf8("é»"), "é»", "it::any2utf8 utf-8 latin1 special combination");
-is(it::any2utf8(utf8_encode("Müller")), "Müller", "it::any2utf8 doubly encoded utf8");
-is(it::any2utf8(utf8_encode(utf8_encode("Müller"))), "Müller", "it::any2utf8 triply encoded utf8");
-is(it::any2utf8(utf8_decode("Müller")), "Müller", "it::any2utf8 incorrectly encoded latin1");
+is(it::any2utf8(it::utf8_encode("Müller")), "Müller", "it::any2utf8 doubly encoded utf8");
+is(it::any2utf8(it::utf8_encode(it::utf8_encode("Müller"))), "Müller", "it::any2utf8 triply encoded utf8");
+is(it::any2utf8(it::utf8_decode("Müller")), "Müller", "it::any2utf8 incorrectly encoded latin1");
is(it::any2utf8("a💚b"), "a💚b", "it::any2utf8 correctly handles 4-byte utf-8 character GREEN HEART");
-is(it::any2utf8(["foo", utf8_decode("bär")]), ["foo", "bär"], "any2utf8 on arrays");
-is(it::any2utf8(["foo", [utf8_decode("bär")]]), ["foo", ["bär"]], "any2utf8 on recursive arrays");
+is(it::any2utf8(["foo", it::utf8_decode("bär")]), ["foo", "bär"], "any2utf8 on arrays");
+is(it::any2utf8(["foo", [it::utf8_decode("bär")]]), ["foo", ["bär"]], "any2utf8 on recursive arrays");
is(it::any2utf8([1, true, false, null]), [1, true, false, null], "any2utf8 should leave types alone");
-is(it::any2utf8([utf8_decode('Müller') => utf8_decode('Müller')]), ['Müller' => 'Müller'], "it::any2utf8 latin1 keys");
+is(it::any2utf8([it::utf8_decode('Müller') => it::utf8_decode('Müller')]), ['Müller' => 'Müller'], "it::any2utf8 latin1 keys");
is(it::any2utf8("\xc2\xad"), "", "it::any2utf8 remove soft hyphens");
@@ -532,10 +532,10 @@ it::file_put($tmpfile, "bb");
is(it::file_get($tmpfile), "bb");
unlink($tmpfile);
-requesturi(utf8_decode("lüönd"), "lüönd");
-requesturi(utf8_decode("ü").utf8_encode("ü"), "üü");
-requesturi(utf8_encode("müller"), "müller");
-requesturi(utf8_encode(utf8_encode("müller")), "müller");
+requesturi(it::utf8_decode("lüönd"), "lüönd");
+requesturi(it::utf8_decode("ü").it::utf8_encode("ü"), "üü");
+requesturi(it::utf8_encode("müller"), "müller");
+requesturi(it::utf8_encode(it::utf8_encode("müller")), "müller");
requesturi("I 💚 Nü York", "I 💚 Nü York");
function requesturi($teststring, $expect)
diff --git a/test/it_xml.t b/test/it_xml.t
index e21f052..88a5cf4 100755
--- a/test/it_xml.t
+++ b/test/it_xml.t
@@ -63,7 +63,7 @@ _match(
_match(
'<foo>x &uuml; y</foo>',
- utf8_decode('foo Object ( [val] => x ü y ) '),
+ it::utf8_decode('foo Object ( [val] => x ü y ) '),
'Manual encoding override',
"",
['encoding' => "iso-8859-1"]
@@ -77,7 +77,7 @@ _match(
_match(
'<foo>&amp;amp; &lt;a&gt; &#38;amp; &#60;b&#62; &#x26;amp; &#x3C;c&#x3E; &#xFC;</foo>',
- utf8_decode('foo Object ( [val] => &amp; <a> &amp; <b> &amp; <c> ü ) '),
+ it::utf8_decode('foo Object ( [val] => &amp; <a> &amp; <b> &amp; <c> ü ) '),
'Predecode illegal entities while keeping properly encoded ones (iso-8859-1)',
"",
['encoding' => "iso-8859-1"]