summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Christian Schneider 2018-01-18 18:43:40 +0100
committer Christian Schneider 2018-01-18 18:43:40 +0100
commit 960b387c66c0126f862e21d5c2b56fa17e399b75 (patch)
tree 71338d0939dfb4bc742bfcef98e63136636689a9
parent 50485bf9d9243693514ace67e9931a6c24947333 (diff)
download itools-960b387c66c0126f862e21d5c2b56fa17e399b75.tar.gz
itools-960b387c66c0126f862e21d5c2b56fa17e399b75.tar.bz2
itools-960b387c66c0126f862e21d5c2b56fa17e399b75.zip
Normalize combining diaeresis to umlaut in it::any2utf8()
-rw-r--r-- it.class 4
-rwxr-xr-x tests/it.t 10
2 files changed, 13 insertions, 1 deletion
diff --git a/it.class b/it.class
index 4c57b2c..83fcf0e 100644
--- a/it.class
+++ b/it.class
@@ -519,7 +519,9 @@ static function any2utf8($value, $errprefix = "")
list($value, $error) = array(it::any2utf8(preg_replace_callback('/\xc3[\x82\x83]\xc2[\x82\x83\xbc\xa9\xa4\xb6\xa8\xa2\xa0\xb4\xaa\xa7\x84\xab\xae\x9c\xaf\x96\xb2\xbb\xb9\x9f]/', function($m) {return utf8_decode($m[0]);}, $value)), $errprefix ? "$errprefix: double utf8-encoding. input=$value" : "");
if (preg_match('/\xef\xb7[\x90-\xaf]|\xef\xbf[\xbe\xbf]/', $value))
list($value, $error) = array(preg_replace('/\xef\xb7[\x90-\xaf]|\xef\xbf[\xbe\xbf]/', " ", $value), "forbidden utf-8 character. input=$value");
- $value = preg_replace('/\xc2\xad/', '', $value);
+ $value = preg_replace('/\xc2\xad/', '', $value); # Kill invisible soft hyphens
+ if (preg_match('/\xcc\x88/', $value)) # Normalize combining diaeresis to umlaut
+ $value = strtr($value, [ "a\xcc\x88" => 'ä', "A\xcc\x88" => 'Ä', "e\xcc\x88" => 'ë', "E\xcc\x88" => 'Ë', "i\xcc\x88" => 'ï', "I\xcc\x88" => 'Ï', "o\xcc\x88" => 'ö', "O\xcc\x88" => 'Ö', "u\xcc\x88" => 'ü', "U\xcc\x88" => 'Ü' ]);
if ($error && $errprefix)
it::error(array('title' => "$errprefix: " . trim($error)));
}
diff --git a/tests/it.t b/tests/it.t
index 22826a6..0da8768 100755
--- a/tests/it.t
+++ b/tests/it.t
@@ -414,6 +414,16 @@ is(it::any2utf8(array("foo", array(utf8_decode("bär")))), array("foo", array("b
is(it::any2utf8(array(1, true, false, null)), array(1, true, false, null), "any2utf8 should leave types alone");
is(it::any2utf8(array(utf8_decode('Müller') => utf8_decode('Müller'))), array('Müller' => 'Müller'), "it::any2utf8 latin1 keys");
+is(it::any2utf8("\xc2\xad"), "", "it::any2utf8 remove soft hyphens");
+
+foreach ([ 'a' => 'ä', 'e' => 'ë', 'i' => 'ï', 'o' => 'ö', 'u' => 'ü' ] as $src => $dst)
+{
+ is(it::any2utf8("$src\xcc\x88"), $dst, "it::any2utf8 normalize combining diaeresis $dst to umlaut code $dst");
+ $src = mb_strtoupper($src);
+ $dst = mb_strtoupper($dst);
+ is(it::any2utf8("$src\xcc\x88"), $dst, "it::any2utf8 normalize combining diaeresis $dst to umlaut code $dst");
+}
+
foreach (array($dummy, false, true, null, 1, "a", "Ä", "/", array()) as $var)
is(it::json_decode(it::json_encode($var)), $var);