summaryrefslogtreecommitdiff
path: root/it_xml.class
diff options
context:
space:
mode:
author Urban Müller 2009-10-07 15:31:29 +0000
committer Urban Müller 2009-10-07 15:31:29 +0000
commit f9789a730edac3133cd2ec7117c8a66465027fdd (patch)
tree 3fe65bcfc4ef803f69d07e14ee37be25b9fcafee /it_xml.class
parent db8cdb82a16a4782ee6fbfe41d1cbdebe64b4c7b (diff)
download itools-f9789a730edac3133cd2ec7117c8a66465027fdd.tar.gz
itools-f9789a730edac3133cd2ec7117c8a66465027fdd.tar.bz2
itools-f9789a730edac3133cd2ec7117c8a66465027fdd.zip
fix mixed utf8 and latin in input
Diffstat (limited to 'it_xml.class')
-rw-r--r-- it_xml.class 12
1 files changed, 11 insertions, 1 deletions
diff --git a/it_xml.class b/it_xml.class
index ff594fd..bacae95 100644
--- a/it_xml.class
+++ b/it_xml.class
@@ -88,9 +88,13 @@ function from_xml($xmldata, $p)
if (!preg_match('/^(<\?xml|\xEF\xBB\xBF|\xFE\xFF|\xFF\xFE|\x00\x00\xFE\xFF|\x00\x00\xFF\xFE)/', $xmldata))
$xmldata = '<?xml version="1.0" encoding="' . $this->_p['encoding'] . '"?>' . $xmldata;
- # decode illegal entities but protect semantically important ones
+ # Decode illegal entities but protect semantically important ones
$xmldata = html_entity_decode(preg_replace('/&(amp|lt|gt|#38|#60|#62|#x26|#x3C|#x3E);/i', '&amp;$1;', $xmldata), ENT_QUOTES, $this->_p['encoding']);
+ # If the input should be UTF-8 but cannot be decoded as such, fix it, even if it mixes UTF-8 and Latin-1
+ if (!preg_match('/^<\?xml[^>]* encoding=/', $xmldata) && preg_match('/[^\x80-\xff][\x80-\xff][^\x80-\xff]/', $xmldata))
+ $xmldata = preg_replace('/[\x80-\xff]{1,4}/e', "it_xml::_utf8_fix('\\0')", $xmldata);
+
$result = xml_parse($parser, $xmldata);
}
@@ -114,6 +118,12 @@ function from_xml($xmldata, $p)
return empty($this->error);
}
+# Return $str unchanged when it is exactly one 2-, 3- or 4-byte UTF-8 sequence (checked by lead/continuation byte ranges only); otherwise return utf8_encode($str)
+function _utf8_fix($str)
+{
+ return preg_match('/^([\xc0-\xdf][\x80-\xbf]|[\xe0-\xef][\x80-\xbf][\x80-\xbf]|[\xf0-\xf7][\x80-\xbf][\x80-\xbf][\x80-\xbf])$/', $str) ? $str : utf8_encode($str); # NOTE(review): pattern is anchored to ONE sequence, so a run holding several valid UTF-8 characters is also passed to utf8_encode() - confirm this is intended
+}
+
function consume(/* $p */)
{
return false;