From 6c6828a8f0904110a67fe89031f9d4eaedf29213 Mon Sep 17 00:00:00 2001 From: Christian Weber Date: Fri, 23 Mar 2012 15:28:10 +0000 Subject: it_xml uses correct target encoding by default, adapt tests to utf-8 default --- it_xml.class | 11 +++++------ tests/it_xml.t | 35 +++++++++++++++++++++-------------- 2 files changed, 26 insertions(+), 20 deletions(-) diff --git a/it_xml.class b/it_xml.class index 0679c69..f854682 100644 --- a/it_xml.class +++ b/it_xml.class @@ -29,7 +29,7 @@ class it_xml * @param $p associative array * @param $p['forcearray'] xml tags to ALWAYS return as array * @param $p['safety'] 2 causes program abort with invalid xml, 1 (default) causes error report, 0 just returns false - * @param $p['encoding'] Output character encoding (e.g. UTF-8, default: ISO-8859-1) + * @param $p['encoding'] Output character encoding (utf-8, iso-8859-1 or us-ascii, default: ini_get('default_charset') * @param $p['prefix'] Optional prefix for class names * @param $p['lowercase'] Lowercase all tag and attribute names * @return XML object tree or null on failure @@ -49,23 +49,22 @@ function it_xml($xmldata = "", $p = array()) function create($xmldata, $p = array()) { $xml = new it_xml; - return $xml->from_xml($xmldata, array('factory' => true) + $p) ? 
$xml->_root : null; } function from_xml($xmldata, $p) { - $this->_p = $p + array('encoding' => "ISO-8859-1", 'safety' => 1); + $this->_p = $p + array('encoding' => ini_get('default_charset'), 'safety' => 1); $this->_arrayforce = array_flip((array)$this->_p['forcearray']); $this->_stack = array(); unset($this->error); - $parser = xml_parser_create($this->_p['encoding']); + $parser = xml_parser_create(); xml_set_object($parser, $this); xml_set_element_handler($parser, "start_element", "end_element"); xml_set_character_data_handler($parser, "character_data"); xml_parser_set_option($parser, XML_OPTION_CASE_FOLDING, 0); xml_parser_set_option($parser, XML_OPTION_TARGET_ENCODING, $this->_p['encoding']); - + $result = true; if (is_resource($xmldata)) @@ -123,7 +122,7 @@ function _sanitize($xmldata, $isutf8 = null) if (!preg_match('/^(<\?xml|\xEF\xBB\xBF|\xFE\xFF|\xFF\xFE|\x00\x00\xFE\xFF|\x00\x00\xFF\xFE)/', $xmldata)) $xmldata = '_p['encoding'] . '"?>' . $xmldata; - $isutf8 = (!preg_match('/^<\?xml[^>]* encoding=/i', $xmldata) || preg_match('/^<\?xml[^>]* encoding=.UTF-8/i', $xmldata)); + $isutf8 = (!preg_match('/^<\?xml[^>]* encoding=/i', $xmldata) || preg_match('/^<\?xml[^>]* encoding=.utf-8/i', $xmldata)); } # Decode illegal entities but protect semantically important ones diff --git a/tests/it_xml.t b/tests/it_xml.t index f74c54b..d74fadf 100755 --- a/tests/it_xml.t +++ b/tests/it_xml.t @@ -5,13 +5,14 @@ function match($xmldata, $expected, $name, $prefix = "", $p = array()) { - $classname = $prefix ? ($prefix . "_xml") : "it_xml"; + $classname = ($prefix ?: "it") . "_xml"; $varname = $prefix . "foo"; $xmldata = "$xmldata"; $xml = new $classname($xmldata, $p); + $mod_utf8 = $p['encoding'] != "iso-8859-1" ? "u" : ""; is( - preg_replace('/[#\s]+/', " ", print_r($xml->$varname, true)), + preg_replace('/[#\s]+/' . 
$mod_utf8, " ", print_r($xml->$varname, true)), $expected, "$name (string)" ); @@ -24,11 +25,10 @@ function match($xmldata, $expected, $name, $prefix = "", $p = array()) fclose($tmpfile); is( - preg_replace('/[#\s]+/', " ", print_r($xml->$varname, true)), + preg_replace('/[#\s]+/' . $mod_utf8, " ", print_r($xml->$varname, true)), $expected, "$name (file)" ); - } match( @@ -44,8 +44,8 @@ match( ); match( - 'Stüssihofstadt', - 'foo Object ( [attr] => Array ( [title] => Zürich ) [val] => Stüssihofstadt ) ', + 'Stüssihofstadt', + 'foo Object ( [attr] => Array ( [title] => Zürich ) [val] => Stüssihofstadt ) ', 'simple tag with latin1 content and attribute' ); @@ -62,26 +62,33 @@ match( ); match( - '&amp; <a> &amp; <b> &amp; <c> ü', - 'foo Object ( [val] => & & & ü ) ', - 'Predecode illegal entities while keeping properly encoded ones' + 'x ü y', + utf8_decode('foo Object ( [val] => x ü y ) '), + 'Manual encoding override', + "", + array('encoding' => "iso-8859-1") ); match( '&amp; <a> &amp; <b> &amp; <c> ü', - utf8_encode('foo Object ( [val] => & & & ü ) '), - 'Predecode illegal entities while keeping properly encoded ones (UTF-8)', - "", - array('encoding' => "UTF-8") + 'foo Object ( [val] => & & & ü ) ', + 'Predecode illegal entities while keeping properly encoded ones', ); +match( + '&amp; <a> &amp; <b> &amp; <c> ü', + utf8_decode('foo Object ( [val] => & & & ü ) '), + 'Predecode illegal entities while keeping properly encoded ones (iso-8859-1)', + "", + array('encoding' => "iso-8859-1") +); match( "a\x05b", 'foo Object ( [val] => a b ) ', 'Illegal latin 1 character', "", - array('encoding' => "ISO-8859-1") + array('encoding' => "iso-8859-1") ); # Test inheritance -- cgit v1.2.3