diff options
author | Christian Schneider | 2012-07-02 15:30:50 +0000 |
---|---|---|
committer | Christian Schneider | 2012-07-02 15:30:50 +0000 |
commit | 57da96485e0671fc34f2ea862b872af186ff043c (patch) | |
tree | 5a312f9a618f262171d3e9998b83a0293a068b02 | |
parent | 58a0444eabccaba87f3050b7f264e404f0cec6d4 (diff) | |
download | itools-57da96485e0671fc34f2ea862b872af186ff043c.tar.gz itools-57da96485e0671fc34f2ea862b872af186ff043c.tar.bz2 itools-57da96485e0671fc34f2ea862b872af186ff043c.zip |
Improved cleanup handling for illegal characters in HTML; now handles UTF-8 as well
-rw-r--r-- | it_html.class | 30 | ||||
-rw-r--r-- | it_xml.class | 2 |
2 files changed, 18 insertions, 14 deletions
diff --git a/it_html.class b/it_html.class index bf9f8ff..66c751a 100644 --- a/it_html.class +++ b/it_html.class @@ -180,9 +180,6 @@ function head($args = array()) header("Content-Type: " . $p['content-type']); $js = isset($p['jsenv']) ? "var env = " . itjs::serialize($p['jsenv']) . ";\n" : ''; - if ($GLOBALS['it_html']->p['charset'] == "iso-8859-1") - $js = it_html::latinize($js); - $js .= $this->_itjs($p['jsinline'], 'inline'); if ($p['js']) @@ -194,7 +191,7 @@ function head($args = array()) } if ($js) - $data .= $this->js(array($js)); + $data .= $this->js(array(self::_cleanup($js, $p['charset']))); return tag('head', $header, $data); } @@ -286,6 +283,7 @@ function _tag($name, $args) if (($levels = intval($GLOBALS['debug_srclines'])) && !it::match('^(head|meta|title|script|style|link)', $name)) $attr = array('title' => it_debug::backtrace(array('levels' => max(3, $levels), 'skipfiles' => "_html\\.class"))) + $attr; + $charset = $GLOBALS['it_html']->p['charset']; $result .= "<$name"; # add attributes. If $value === true, use key only (<td nowrap> instead of <td nowrap=""> for old html, <td nowrap="nowrap"> for xhtml style) @@ -296,7 +294,7 @@ function _tag($name, $args) else if (isset($value) && $value !== true) # normal case: value { if (preg_match('/[<>&"\x00-\x08\x0a-\x0c\x0e-\x1f\x80-\x9f]/', $value)) # WARNING: copy/pasted from Q() - $result .= " $key=\"" . str_replace("\n", " ", htmlspecialchars($GLOBALS['it_html']->p['charset'] == "iso-8859-1" ? it_html::latinize($value) : $value, ENT_COMPAT, $GLOBALS['it_html']->p['charset'])) . '"'; + $result .= " $key=\"" . str_replace("\n", " ", htmlspecialchars(self::_cleanup($value, $charset), ENT_COMPAT, $charset)) . 
'"'; else $result .= " $key=\"$value\""; } @@ -312,7 +310,7 @@ function _tag($name, $args) else $result .= " />$newline"; - if ($GLOBALS['debug_utf8check'] && $GLOBALS['it_html']->p['charset'] == "utf-8") + if ($GLOBALS['debug_utf8check'] && $charset == "utf-8") $result = it::any2utf8($result, "error in $name()"); return $result; @@ -458,18 +456,23 @@ function sanitize($html) */ function entity_decode($string) { - $string = preg_replace('/&#(8217|65533);/', "'", html_entity_decode($string, ENT_COMPAT, $GLOBALS['it_html']->p['charset'])); + $charset = $GLOBALS['it_html']->p['charset']; + $string = preg_replace('/&#(8217|65533);/', "'", html_entity_decode($string, ENT_COMPAT, $charset)); $string = preg_replace('/&#[^;]*;/i', " ", $string); # remove remaining illegal numeric entities, e.g. 0x80-0x9f - return $string; + return self::_cleanup($string, $charset); } /** - * Replace or remove all illegal characters from a latin-1 string + * Replace or remove all illegal characters from a HTML string (knows utf-8 and latin1) */ -function latinize($string) +function _cleanup($string, $charset) { - return preg_replace('/[\x00-\x08\x0b-\x0c\x0e-\x1f\x80-\x9f]/', ' ', strtr($string, array("\x80" => "EUR", "\x82" => "'", "\x84" => "\"", "\x85" => "...", "\x8a" => "S", "\x8c" => "OE", "\x8e" => "Z", "\x91" => "'", "\x92" => "'", "\x93" => "\"", "\x94" => "\"", "\x96" => "-", "\x97" => "-", "\x9a" => "s", "\x9e" => "z"))); + $result = $charset == "utf-8" + ? 
preg_replace('/\xc2[\x80-\x9f]/', ' ', $string) + : preg_replace('/[\x80-\x9f]/', ' ', strtr($string, array("\x80" => "EUR", "\x82" => "'", "\x84" => "\"", "\x85" => "...", "\x8a" => "S", "\x8c" => "OE", "\x8e" => "Z", "\x91" => "'", "\x92" => "'", "\x93" => "\"", "\x94" => "\"", "\x96" => "-", "\x97" => "-", "\x9a" => "s", "\x9e" => "z"))); + + return preg_replace('/[\x00-\x08\x0b-\x0c\x0e-\x1f]/', ' ', $result); } /** @@ -481,11 +484,12 @@ function Q($string) { if (preg_match('/[<>&"\x00-\x08\x0a-\x0c\x0e-\x1f\x80-\xff]/', $string)) # WARNING: copy/pasted to _tag() { - if ($GLOBALS['debug_utf8check'] && $GLOBALS['it_html']->p['charset'] == "utf-8") + $charset = $GLOBALS['it_html']->p['charset']; + if ($GLOBALS['debug_utf8check'] && $charset == "utf-8") $string = it::any2utf8($string, "error in Q()"); $origstring = $string; - $string = @htmlspecialchars($GLOBALS['it_html']->p['charset'] == "iso-8859-1" ? it_html::latinize($string) : $string, ENT_COMPAT, $GLOBALS['it_html']->p['charset']); + $string = @htmlspecialchars(self::_cleanup($string, $charset), ENT_COMPAT, $charset); if ($string === "" && $origstring) it::error("invalid utf-8 '$origstring'"); } diff --git a/it_xml.class b/it_xml.class index f854682..c98b6eb 100644 --- a/it_xml.class +++ b/it_xml.class @@ -134,7 +134,7 @@ function _sanitize($xmldata, $isutf8 = null) # If not utf-8, remove characters illegal for latin-1 if (!$isutf8 && preg_match('/[\x00-\x08\x0b-\x0c\x0e-\x1f\x80-\x9f]/', $xmldata)) - $xmldata = it_html::latinize($xmldata); + $xmldata = it_html::_cleanup($xmldata, $isutf8 ? "utf-8" : "iso-8859-1"); return array($xmldata, $isutf8); } |