summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--it_html.class16
-rwxr-xr-xtests/it_html.t8
2 files changed, 22 insertions, 2 deletions
diff --git a/it_html.class b/it_html.class
index fd2930e..675e852 100644
--- a/it_html.class
+++ b/it_html.class
@@ -423,6 +423,18 @@ function sanitize($html)
}
/**
+ * Decode HTML entities in a string into ISO-8859-1 (latin-1) bytes.
+ *
+ * Entities that html_entity_decode() can represent in latin-1 are decoded
+ * by it. &#8217; and &#65533; become an ASCII apostrophe. Any remaining
+ * numeric entity (decimal or hex) becomes the matching byte if its code
+ * point is <= 255, otherwise a single space.
+ *
+ * NOTE(review): named entities are decoded before numeric ones, so input
+ * like "&amp;#65;" is decoded twice and ends up as "A" - confirm callers
+ * rely on or tolerate this.
+ *
+ * @param $string Text containing HTML entities
+ * @return Decoded string
+ */
+function entity_decode($string)
+{
+    # Pass flags and charset explicitly: the defaults of html_entity_decode()
+    # differ between PHP versions (UTF-8 since 5.4), which would break the
+    # latin-1 guarantee of this function.
+    $string = html_entity_decode($string, ENT_COMPAT, 'ISO-8859-1');
+    $string = preg_replace('/&#(8217|65533);/', "'", $string);
+
+    # Callbacks instead of the /e modifier (deprecated in PHP 5.5, removed in PHP 7)
+    $string = preg_replace_callback('/&#x0*([0-9a-f]+);/i', function ($m) {
+        $code = hexdec($m[1]);
+        return $code <= 255 ? chr($code) : " ";
+    }, $string);
+    $string = preg_replace_callback('/&#0*([0-9]+);/', function ($m) {
+        # Compare as numeric string first so oversized values never reach the int cast
+        return $m[1] <= 255 ? chr((int)$m[1]) : " ";
+    }, $string);
+
+    return $string;
+}
+
+/**
* Replace or remove all illegal characters from a latin-1 string
*/
function latinize($string)
@@ -464,9 +476,9 @@ function U(/* ... */)
$u['host'] = preg_match('/[^-_.0-9a-z]/i', $u['host']) && function_exists('idn_to_ascii') && ($idnahost = idn_to_ascii($GLOBALS['it_html']->p['charset'] == "iso-8859-1" ? utf8_encode($u['host']) : $u['host'])) ? $idnahost : $u['host']; # Punycode hostname to include into webpage
$u['host'] = preg_replace('/[^-_.0-9a-z\x80-\xff]/ie', "rawurlencode('\$0')", $u['host']); # Encode garbage chars in host
- # handle scheme, user, password, host
+ # handle scheme, user (urlencoded), password, host
$hostpart =
- ($u['user'] ? $u['user'] . ($u['pass'] ? ":" . $u['pass'] : "") . "@" : "") .
+ ($u['user'] ? preg_replace('|[^-\w.+!*(),:?@&=/~$%#]|e', 'urlencode(stripslashes("$0"))', $u['user'] . ($u['pass'] ? ":" . $u['pass'] : "") . "@") : "") .
($u['host'] ? $u['host'] : "") .
($u['port'] ? ":" . intval($u['port']) : "");
diff --git a/tests/it_html.t b/tests/it_html.t
index 0a3f4eb..0def431 100755
--- a/tests/it_html.t
+++ b/tests/it_html.t
@@ -148,4 +148,12 @@ is(
'a/b',
'U() converting of \ to /'
);
+
+is(it_html::entity_decode("&auml;"), "ä", 'entity_decode() named entity');
+is(it_html::entity_decode("&#8217;"), "'", 'entity_decode() &#8217; to apostrophe');
+is(it_html::entity_decode("&#x4a;"), "J", 'entity_decode() lowercase hex entity');
+is(it_html::entity_decode("&#x4A;"), "J", 'entity_decode() uppercase hex entity');
+is(it_html::entity_decode("&#xfff;"), " ", 'entity_decode() hex entity above 255 to space');
+is(it_html::entity_decode("&#65;"), "A", 'entity_decode() decimal entity');
+is(it_html::entity_decode("&#999;"), " ", 'entity_decode() decimal entity above 255 to space');
?>