Class it_html:
/**
* Return HTML with all evil things stripped. Allowed are a couple of simple
* tags like div, p, ol, ul, li, i, b, strong, h1 - h6, br without attributes, a with absolute href,
* img with absolute src url. Also ensures that tags are balanced.
* @param $html HTML string to be sanitized
* @return Sanitized HTML
*/
static function sanitize($html)
{
    # Whitelist sanitizer: locate one allowed tag, re-emit it in a canonical form and
    # recurse into the text before, inside and after it. Once no allowed tag is left,
    # all remaining markup is stripped and the text is escaped.
    $result = "";
    $charset = $GLOBALS['it_html']->p['charset'] ? $GLOBALS['it_html']->p['charset'] : 'iso-8859-1';

    # Charset names are case-insensitive, so "UTF-8" must be handled like "utf-8"
    if (strtolower($charset) == "utf-8")
        $html = it::any2utf8($html);

    $html = it::replace(array('[\0\s]+' => " "), $html); # \s also matches \r and \n
    $urlpattern = 'https?://[^">]+'; # only absolute http(s) urls are accepted for href/src

    if ($tag = it::match("(.*?)<(div|p|ol|ul|li|i|b|strong|h[1-6])\b[^>]*>(.*?)</\\2>(.*)", $html))
    {
        # Simple tags with content, no attributes kept
        list($head, $tagname, $content, $tail) = $tag;
        $tagname = strtolower($tagname);
        $result .= it_html::sanitize($head) . "<$tagname>" . it_html::sanitize($content) . "</$tagname>" . it_html::sanitize($tail);
    }
    else if ($tag = it::match('(.*)<a\b[^>]+?\bhref\s*=\s*"(' . $urlpattern . ')"[^>]*?>(.*?)</a>(.*)', $html))
    {
        # Link tags, keeps only href attribute
        list($head, $href, $content, $tail) = $tag;
        $result .= it_html::sanitize($head) . '<a href="' . it_html::Q(it_html::U(html_entity_decode($href, ENT_COMPAT, $charset))) . '">' . it_html::sanitize($content) . "</a>" . it_html::sanitize($tail);
    }
    else if ($tag = it::match('(.*)<img\b[^>]+?\bsrc\s*=\s*"(' . $urlpattern . ')"[^>]*?>(.*)', $html))
    {
        # Image tags, keeps only src attribute
        list($head, $src, $tail) = $tag;
        $result .= it_html::sanitize($head) . '<img src="' . it_html::Q(it_html::U(html_entity_decode($src, ENT_COMPAT, $charset))) . '" alt="" />' . it_html::sanitize($tail);
    }
    else if ($tag = it::match("(.*)<(br|/tr)\b[^>]*>(.*)", $html))
    {
        # brs and closing table rows are converted to simple line breaks;
        # the captured tag name itself is not needed
        list($head, , $tail) = $tag;
        $result .= it_html::sanitize($head) . "<br />" . it_html::sanitize($tail);
    }
    else
    {
        # No allowed tag left: drop all remaining tags and escape the plain text
        $text = it_html::Q(html_entity_decode(strip_tags($html), ENT_COMPAT, $charset));

        # Numeric entities that html_entity_decode() left untouched get their & escaped
        # again by Q() (assuming Q() escapes & to &amp; like htmlspecialchars() - TODO confirm),
        # so turn &amp;#NNN; back into &#NNN;
        $result = it::replace(array('&amp;(#\d+;)' => '&$1'), $text);
    }

    # Debug mode highlights every sanitized fragment; !empty() avoids a notice when the flag is unset
    if (!empty($GLOBALS['debug_q']))
        return "<span style=\"background:#8FF\">$result</span>";

    return it::replace(array('<(div|p|i|b|a)></\1>' => ""), $result); # remove empty tags
}