diff options
-rw-r--r-- | it_html.class | 16 | ||||
-rwxr-xr-x | tests/it_html.t | 8 |
2 files changed, 22 insertions, 2 deletions
diff --git a/it_html.class b/it_html.class index fd2930e..675e852 100644 --- a/it_html.class +++ b/it_html.class @@ -423,6 +423,18 @@ function sanitize($html) } /** + * Decode all entities, ensure latin-1 encoding + */ +function entity_decode($string) +{ + $string = preg_replace('/&#(8217|65533);/', "'", html_entity_decode($string)); + $string = preg_replace('/&#x0*([0-9a-f]+);/ei', 'hexdec("\\1") <= 255 ? chr(hexdec("\\1")) : " "', $string); + $string = preg_replace('/&#0*([0-9]+);/e', '\\1 <= 255 ? chr(\\1) : " "', $string); + + return $string; +} + +/** * Replace or remove all illegal characters from a latin-1 string */ function latinize($string) @@ -464,9 +476,9 @@ function U(/* ... */) $u['host'] = preg_match('/[^-_.0-9a-z]/i', $u['host']) && function_exists('idn_to_ascii') && ($idnahost = idn_to_ascii($GLOBALS['it_html']->p['charset'] == "iso-8859-1" ? utf8_encode($u['host']) : $u['host'])) ? $idnahost : $u['host']; # Punycode hostname to include into webpage $u['host'] = preg_replace('/[^-_.0-9a-z\x80-\xff]/ie', "rawurlencode('\$0')", $u['host']); # Encode garbage chars in host - # handle scheme, user, password, host + # handle scheme, user (urlencoded), password, host $hostpart = - ($u['user'] ? $u['user'] . ($u['pass'] ? ":" . $u['pass'] : "") . "@" : "") . + ($u['user'] ? preg_replace('|[^-\w.+!*(),:?@&=/~$%#]|e', 'urlencode(stripslashes("$0"))', $u['user'] . ($u['pass'] ? ":" . $u['pass'] : "") . "@") : "") . ($u['host'] ? $u['host'] : "") . ($u['port'] ? ":" . intval($u['port']) : ""); diff --git a/tests/it_html.t b/tests/it_html.t index 0a3f4eb..0def431 100755 --- a/tests/it_html.t +++ b/tests/it_html.t @@ -148,4 +148,12 @@ is( 'a/b', 'U() converting of \ to /' ); + +is(it_html::entity_decode("ä"), "ä"); +is(it_html::entity_decode("’"), "'"); +is(it_html::entity_decode("J"), "J"); +is(it_html::entity_decode("J"), "J"); +is(it_html::entity_decode("࿿"), " "); +is(it_html::entity_decode("A"), "A"); +is(it_html::entity_decode("ϧ"), " "); ?> |