summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- it_xml.class | 56
-rwxr-xr-x tests/it_xml.t | 19
2 files changed, 52 insertions, 23 deletions
diff --git a/it_xml.class b/it_xml.class
index 1686478..29652d8 100644
--- a/it_xml.class
+++ b/it_xml.class
@@ -70,13 +70,16 @@ function from_xml($xmldata, $p)
if (is_resource($xmldata))
{
- if (!preg_match('/^<\?xml/', ($head = ltrim(fread($xmldata, 1024 * 1024))))) # Prepend XML header for charset detection in PHP5
- $head = '<?xml version="1.0" encoding="' . $this->_p['encoding'] . '"?>' . $head;
-
- $result = xml_parse($parser, $head, false);
-
while ($result && !feof($xmldata))
- $result = xml_parse($parser, fread($xmldata, 1024 * 1024), false);
+ {
+ $data = fread($xmldata, 1024 * 1024);
+ while (!feof($xmldata) && preg_match('/[\x80-\xff]$/', $data)) # Make sure end of chunk is not in the middle of a UTF8 character
+ $data .= fread($xmldata, 1);
+
+ $xmlorig .= $data;
+ list($data, $isutf8) = $this->_sanitize($data, $isutf8);
+ $result = xml_parse($parser, $data);
+ }
if ($result)
$result = xml_parse($parser, "", true);
@@ -84,21 +87,7 @@ function from_xml($xmldata, $p)
else
{
$xmlorig = $xmldata;
- $xmldata = trim($xmldata);
-
- # Add header for charset detection (PHP5) if no header/BOM
- # See http://www.w3.org/TR/2006/REC-xml-20060816/#sec-guessing
- if (!preg_match('/^(<\?xml|\xEF\xBB\xBF|\xFE\xFF|\xFF\xFE|\x00\x00\xFE\xFF|\x00\x00\xFF\xFE)/', $xmldata))
- $xmldata = '<?xml version="1.0" encoding="' . $this->_p['encoding'] . '"?>' . $xmldata;
-
- # Decode illegal entities but protect semantically important ones
- $xmldata = html_entity_decode(preg_replace('/&(amp|lt|gt|#38|#60|#62|#x26|#x3C|#x3E);/i', '&amp;$1;', $xmldata), ENT_NOQUOTES, $this->_p['encoding']);
-
- # If should be utf-8 and can't be decoded as such, fix it, even if mixed between both
- if (!preg_match('/^<\?xml[^>]* encoding=/i', $xmldata) || preg_match('/^<\?xml[^>]* encoding=.UTF-8/i', $xmldata))
- if (preg_match('/[^\x80-\xff][\x80-\xff][^\x80-\xff]/', $xmldata))
- $xmldata = preg_replace('/[\x80-\xff]{1,4}/e', "it_xml::_utf8_fix('\\0')", $xmldata);
-
+ list($xmldata, $isutf8) = $this->_sanitize($xmldata);
$result = xml_parse($parser, $xmldata);
}
@@ -122,6 +111,31 @@ function from_xml($xmldata, $p)
return empty($this->error);
}
+# Use various heuristics to fix real-world XML files. $xmldata is a whole document or one chunk of a stream;
+# $isutf8 is null at document start, else the value returned for the previous chunk. Returns array($xmldata, $isutf8)
+function _sanitize($xmldata, $isutf8 = null)
+{
+	if (!isset($isutf8)) # Check if we already decided on charset yet
+	{
+		$xmldata = trim($xmldata); # Document start only: edge whitespace of later chunks is content (first chunk's tail is still trimmed)
+
+		# Add header for charset detection (PHP5) if no header/BOM
+		# See http://www.w3.org/TR/2006/REC-xml-20060816/#sec-guessing
+		if (!preg_match('/^(<\?xml|\xEF\xBB\xBF|\xFE\xFF|\xFF\xFE|\x00\x00\xFE\xFF|\x00\x00\xFF\xFE)/', $xmldata))
+			$xmldata = '<?xml version="1.0" encoding="' . $this->_p['encoding'] . '"?>' . $xmldata;
+
+		$isutf8 = (!preg_match('/^<\?xml[^>]* encoding=/i', $xmldata) || preg_match('/^<\?xml[^>]* encoding=.UTF-8/i', $xmldata));
+	}
+
+	# Decode illegal entities but protect semantically important ones
+	$xmldata = html_entity_decode(preg_replace('/&(amp|lt|gt|#38|#60|#62|#x26|#x3C|#x3E);/i', '&amp;$1;', $xmldata), ENT_NOQUOTES, $this->_p['encoding']);
+
+	# If should be utf-8 and can't be decoded as such, fix it, even if mixed between both
+	if ($isutf8 && preg_match('/[^\x80-\xff][\x80-\xff][^\x80-\xff]/', $xmldata))
+		$xmldata = preg_replace('/[\x80-\xff]{1,4}/e', "it_xml::_utf8_fix('\\0')", $xmldata);
+	return array($xmldata, $isutf8);
+}
+
# Encode non-utf8 characters in a string, leave utf8 alone
function _utf8_fix($str)
{
diff --git a/tests/it_xml.t b/tests/it_xml.t
index 8c2c084..7e3a3cf 100755
--- a/tests/it_xml.t
+++ b/tests/it_xml.t
@@ -7,13 +7,28 @@ function match($xmldata, $expected, $name, $prefix = "", $p = array())
{
$classname = $prefix ? ($prefix . "_xml") : "it_xml";
$varname = $prefix . "foo";
- $xml = new $classname("<root>$xmldata</root>", $p);
+ $xmldata = "<root>$xmldata</root>";
+ $xml = new $classname($xmldata, $p);
is(
preg_replace('/[#\s]+/', " ", print_r($xml->$varname, true)),
$expected,
- $name
+ "$name (string)"
);
+ # Second pass: same document, but handed over as an open file handle instead of a string
+ $tmpfile = tmpfile();
+ fwrite($tmpfile, $xmldata);
+ rewind($tmpfile);
+
+ $xml = new $classname($tmpfile, $p);
+ fclose($tmpfile);
+ # Must yield the same expected dump as the string pass
+ is(
+ preg_replace('/[#\s]+/', " ", print_r($xml->$varname, true)),
+ $expected,
+ "$name (file)"
+ );
+
}
match(