diff options
-rw-r--r-- | it_xml.class | 56 | ||||
-rwxr-xr-x | tests/it_xml.t | 19 |
2 files changed, 52 insertions, 23 deletions
diff --git a/it_xml.class b/it_xml.class index 1686478..29652d8 100644 --- a/it_xml.class +++ b/it_xml.class @@ -70,13 +70,16 @@ function from_xml($xmldata, $p) if (is_resource($xmldata)) { - if (!preg_match('/^<\?xml/', ($head = ltrim(fread($xmldata, 1024 * 1024))))) # Prepend XML header for charset detection in PHP5 - $head = '<?xml version="1.0" encoding="' . $this->_p['encoding'] . '"?>' . $head; - - $result = xml_parse($parser, $head, false); - while ($result && !feof($xmldata)) - $result = xml_parse($parser, fread($xmldata, 1024 * 1024), false); + { + $data = fread($xmldata, 1024 * 1024); + while (!feof($xmldata) && preg_match('/[\x80-\xff]$/', $data)) # Make sure end of chunk is not in the middle of a UTF8 character + $data .= fread($xmldata, 1); + + $xmlorig .= $data; + list($data, $isutf8) = $this->_sanitize($data, $isutf8); + $result = xml_parse($parser, $data); + } if ($result) $result = xml_parse($parser, "", true); @@ -84,21 +87,7 @@ function from_xml($xmldata, $p) else { $xmlorig = $xmldata; - $xmldata = trim($xmldata); - - # Add header for charset detection (PHP5) if no header/BOM - # See http://www.w3.org/TR/2006/REC-xml-20060816/#sec-guessing - if (!preg_match('/^(<\?xml|\xEF\xBB\xBF|\xFE\xFF|\xFF\xFE|\x00\x00\xFE\xFF|\x00\x00\xFF\xFE)/', $xmldata)) - $xmldata = '<?xml version="1.0" encoding="' . $this->_p['encoding'] . '"?>' . $xmldata; - - # Decode illegal entities but protect semantically important ones - $xmldata = html_entity_decode(preg_replace('/&(amp|lt|gt|#38|#60|#62|#x26|#x3C|#x3E);/i', '&$1;', $xmldata), ENT_NOQUOTES, $this->_p['encoding']); - - # If should be utf-8 and can't be decoded as such, fix it, even if mixed between both - if (!preg_match('/^<\?xml[^>]* encoding=/i', $xmldata) || preg_match('/^<\?xml[^>]* encoding=.UTF-8/i', $xmldata)) - if (preg_match('/[^\x80-\xff][\x80-\xff][^\x80-\xff]/', $xmldata)) - $xmldata = preg_replace('/[\x80-\xff]{1,4}/e', "it_xml::_utf8_fix('\\0')", $xmldata); - + list($xmldata, $isutf8) = $this->_sanitize($xmldata); $result = xml_parse($parser, $xmldata); } @@ -122,6 +111,31 @@ function from_xml($xmldata, $p) return empty($this->error); } +# Use various heuristics to fix real-world XML files +function _sanitize($xmldata, $isutf8 = null) +{ + $xmldata = trim($xmldata); + + if (!isset($isutf8)) # Check if we already decided on charset yet + { + # Add header for charset detection (PHP5) if no header/BOM + # See http://www.w3.org/TR/2006/REC-xml-20060816/#sec-guessing + if (!preg_match('/^(<\?xml|\xEF\xBB\xBF|\xFE\xFF|\xFF\xFE|\x00\x00\xFE\xFF|\x00\x00\xFF\xFE)/', $xmldata)) + $xmldata = '<?xml version="1.0" encoding="' . $this->_p['encoding'] . '"?>' . $xmldata; + + $isutf8 = (!preg_match('/^<\?xml[^>]* encoding=/i', $xmldata) || preg_match('/^<\?xml[^>]* encoding=.UTF-8/i', $xmldata)); + } + + # Decode illegal entities but protect semantically important ones + $xmldata = html_entity_decode(preg_replace('/&(amp|lt|gt|#38|#60|#62|#x26|#x3C|#x3E);/i', '&$1;', $xmldata), ENT_NOQUOTES, $this->_p['encoding']); + + # If should be utf-8 and can't be decoded as such, fix it, even if mixed between both + if ($isutf8 && preg_match('/[^\x80-\xff][\x80-\xff][^\x80-\xff]/', $xmldata)) + $xmldata = preg_replace('/[\x80-\xff]{1,4}/e', "it_xml::_utf8_fix('\\0')", $xmldata); + + return array($xmldata, $isutf8); +} + # Encode non-utf8 characters in a string, leave utf8 alone function _utf8_fix($str) { diff --git a/tests/it_xml.t b/tests/it_xml.t index 8c2c084..7e3a3cf 100755 --- a/tests/it_xml.t +++ b/tests/it_xml.t @@ -7,13 +7,28 @@ function match($xmldata, $expected, $name, $prefix = "", $p = array()) { $classname = $prefix ? ($prefix . "_xml") : "it_xml"; $varname = $prefix . "foo"; - $xml = new $classname("<root>$xmldata</root>", $p); + $xmldata = "<root>$xmldata</root>"; + $xml = new $classname($xmldata, $p); is( preg_replace('/[#\s]+/', " ", print_r($xml->$varname, true)), $expected, - $name + "$name (string)" ); + + $tmpfile = tmpfile(); + fwrite($tmpfile, $xmldata); + rewind($tmpfile); + + $xml = new $classname($tmpfile, $p); + fclose($tmpfile); + + is( + preg_replace('/[#\s]+/', " ", print_r($xml->$varname, true)), + $expected, + "$name (file)" + ); + } match( |