1 files changed, 35 insertions, 21 deletions
diff --git a/it_xml.class b/it_xml.class
index 1686478..29652d8 100644
--- a/it_xml.class
+++ b/it_xml.class
@@ -70,13 +70,16 @@ function from_xml($xmldata, $p)
 
 	if (is_resource($xmldata))
 	{
-		if (!preg_match('/^<\?xml/', ($head = ltrim(fread($xmldata, 1024 * 1024)))))	# Prepend XML header for charset detection in PHP5
-			$head = '<?xml version="1.0" encoding="' . $this->_p['encoding'] . '"?>' . $head;
-
-		$result = xml_parse($parser, $head, false);
-
 		while ($result && !feof($xmldata))
-			$result = xml_parse($parser, fread($xmldata, 1024 * 1024), false);
+		{
+			$data = fread($xmldata, 1024 * 1024);
+			while (!feof($xmldata) && preg_match('/[\x80-\xff]$/', $data))	# Make sure end of chunk is not in the middle of a UTF8 character
+				$data .= fread($xmldata, 1);
+
+			$xmlorig .= $data;
+			list($data, $isutf8) = $this->_sanitize($data, $isutf8);
+			$result = xml_parse($parser, $data);
+		}
 
 		if ($result)
 			$result = xml_parse($parser, "", true);
@@ -84,21 +87,7 @@ function from_xml($xmldata, $p)
 	else
 	{
 		$xmlorig = $xmldata;
-		$xmldata = trim($xmldata);
-
-		# Add header for charset detection (PHP5) if no header/BOM
-		# See http://www.w3.org/TR/2006/REC-xml-20060816/#sec-guessing
-		if (!preg_match('/^(<\?xml|\xEF\xBB\xBF|\xFE\xFF|\xFF\xFE|\x00\x00\xFE\xFF|\x00\x00\xFF\xFE)/', $xmldata))
-			$xmldata = '<?xml version="1.0" encoding="' . $this->_p['encoding'] . '"?>' . $xmldata;
-
-		# Decode illegal entities but protect semantically important ones
-		$xmldata = html_entity_decode(preg_replace('/&(amp|lt|gt|#38|#60|#62|#x26|#x3C|#x3E);/i', '&amp;$1;', $xmldata), ENT_NOQUOTES, $this->_p['encoding']);
-
-		# If should be utf-8 and can't be decoded as such, fix it, even if mixed between both
-		if (!preg_match('/^<\?xml[^>]* encoding=/i', $xmldata) || preg_match('/^<\?xml[^>]* encoding=.UTF-8/i', $xmldata))
-			if (preg_match('/[^\x80-\xff][\x80-\xff][^\x80-\xff]/', $xmldata))
-				$xmldata = preg_replace('/[\x80-\xff]{1,4}/e', "it_xml::_utf8_fix('\\0')", $xmldata);
-
+		list($xmldata, $isutf8) = $this->_sanitize($xmldata);
 		$result = xml_parse($parser, $xmldata);
 	}
 
@@ -122,6 +111,31 @@ function from_xml($xmldata, $p)
 	return empty($this->error);
 }
 
+# Use heuristics to fix real-world XML; returns array(data, isutf8). For chunked input pass back the previous $isutf8
+function _sanitize($xmldata, $isutf8 = null)
+{
+	if (!isset($isutf8))	# Charset not decided yet, so this call holds the start of the document
+	{
+		$xmldata = ltrim($xmldata);	# Only strip here: trimming later chunks would eat whitespace at chunk boundaries
+
+		# Add header for charset detection (PHP5) if no header/BOM
+		# See http://www.w3.org/TR/2006/REC-xml-20060816/#sec-guessing
+		if (!preg_match('/^(<\?xml|\xEF\xBB\xBF|\xFE\xFF|\xFF\xFE|\x00\x00\xFE\xFF|\x00\x00\xFF\xFE)/', $xmldata))
+			$xmldata = '<?xml version="1.0" encoding="' . $this->_p['encoding'] . '"?>' . $xmldata;
+
+		$isutf8 = (!preg_match('/^<\?xml[^>]* encoding=/i', $xmldata) || preg_match('/^<\?xml[^>]* encoding=.UTF-8/i', $xmldata));
+	}
+
+	# Decode illegal entities but protect semantically important ones
+	$xmldata = html_entity_decode(preg_replace('/&(amp|lt|gt|#38|#60|#62|#x26|#x3C|#x3E);/i', '&amp;$1;', $xmldata), ENT_NOQUOTES, $this->_p['encoding']);
+
+	# If should be utf-8 and can't be decoded as such, fix it, even if mixed between both
+	if ($isutf8 && preg_match('/[^\x80-\xff][\x80-\xff][^\x80-\xff]/', $xmldata))
+		$xmldata = preg_replace('/[\x80-\xff]{1,4}/e', "it_xml::_utf8_fix('\\0')", $xmldata);
+
+	return array($xmldata, $isutf8);
+}
+
 # Encode non-utf8 characters in a string, leave utf8 alone
 function _utf8_fix($str)
 {