diff options
author | Urban Müller | 2009-10-07 15:31:29 +0000 |
---|---|---|
committer | Urban Müller | 2009-10-07 15:31:29 +0000 |
commit | f9789a730edac3133cd2ec7117c8a66465027fdd (patch) | |
tree | 3fe65bcfc4ef803f69d07e14ee37be25b9fcafee /it_xml.class | |
parent | db8cdb82a16a4782ee6fbfe41d1cbdebe64b4c7b (diff) | |
download | itools-f9789a730edac3133cd2ec7117c8a66465027fdd.tar.gz itools-f9789a730edac3133cd2ec7117c8a66465027fdd.tar.bz2 itools-f9789a730edac3133cd2ec7117c8a66465027fdd.zip |
fix mixed utf8 and latin in input
Diffstat (limited to 'it_xml.class')
-rw-r--r-- | it_xml.class | 12 |
1 files changed, 11 insertions, 1 deletions
```diff
diff --git a/it_xml.class b/it_xml.class
index ff594fd..bacae95 100644
--- a/it_xml.class
+++ b/it_xml.class
@@ -88,9 +88,13 @@ function from_xml($xmldata, $p)
 		if (!preg_match('/^(<\?xml|\xEF\xBB\xBF|\xFE\xFF|\xFF\xFE|\x00\x00\xFE\xFF|\x00\x00\xFF\xFE)/', $xmldata))
 			$xmldata = '<?xml version="1.0" encoding="' . $this->_p['encoding'] . '"?>' . $xmldata;
 
-		# decode illegal entities but protect semantically important ones
+		# Decode illegal entities but protect semantically important ones
 		$xmldata = html_entity_decode(preg_replace('/&(amp|lt|gt|#38|#60|#62|#x26|#x3C|#x3E);/i', '&$1;', $xmldata), ENT_QUOTES, $this->_p['encoding']);
 
+		# If should be utf-8 and can't be decoded as such, fix it, even if mixed between both
+		if (!preg_match('/^<\?xml[^>]* encoding=/', $xmldata) && preg_match('/[^\x80-\xff][\x80-\xff][^\x80-\xff]/', $xmldata))
+			$xmldata = preg_replace('/[\x80-\xff]{1,4}/e', "it_xml::_utf8_fix('\\0')", $xmldata);
+
 		$result = xml_parse($parser, $xmldata);
 	}
 
@@ -114,6 +118,12 @@ function from_xml($xmldata, $p)
 	return empty($this->error);
 }
 
+# Encode non-utf8 characters in a string, leave utf8 alone
+function _utf8_fix($str)
+{
+	return preg_match('/^([\xc0-\xdf][\x80-\xbf]|[\xe0-\xef][\x80-\xbf][\x80-\xbf]|[\xf0-\xf7][\x80-\xbf][\x80-\xbf][\x80-\xbf])$/', $str) ? $str : utf8_encode($str);
+}
+
 function consume(/* $p */)
 {
 	return false;
```