From f2225077d7450ea08f40ea45cc348184f1d30b97 Mon Sep 17 00:00:00 2001 From: Christian Schneider Date: Fri, 9 May 2025 14:31:12 +0200 Subject: Handle mailto:-links and tags inside tags in it_html::sanitize() (support request for https://search.ch/tel/biel-bienne/bahnhofstrasse-5/groupe-mutuel-4) --- it_html.class | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'it_html.class') diff --git a/it_html.class b/it_html.class index e3053b6..9bb8e7a 100644 --- a/it_html.class +++ b/it_html.class @@ -393,32 +393,33 @@ static function sanitize($html) if ($charset == "utf-8") $html = it::any2utf8($html); $html = it::replace(array('[\0\s]+' => " "), $html); # \s also matches \r and \n - $urlpattern = 'https?://[^">]+'; + $urlpattern = '(?:https?://|mailto:)[^">]+'; + $placeholder = bin2hex(random_bytes(16)); if ($tag = it::match("(.*?)<(div|p|ol|ul|li|i|b|strong|h[1-6])\b[^>]*>(.*?)(.*)", $html)) { # Simple tags with content, no attributes kept list($head, $tagname, $content, $tail) = $tag; $tagname = strtolower($tagname); - $result .= it_html::sanitize($head) . "<$tagname>" . it_html::sanitize($content) . "" . it_html::sanitize($tail); + $result .= it::replace([$placeholder => "<$tagname>" . it_html::sanitize($content) . ""], it_html::sanitize("$head$placeholder$tail")); } else if ($tag = it::match('(.*)]+?\bhref\s*=\s*"(' . $urlpattern . ')"[^>]*?>(.*?)(.*)', $html)) { # Link tags, keeps only href attribute list($head, $href, $content, $tail) = $tag; - $result .= it_html::sanitize($head) . '' . it_html::sanitize($content) . "" . it_html::sanitize($tail); + $result .= it::replace([$placeholder => '' . it_html::sanitize($content) . ""], it_html::sanitize("$head$placeholder$tail")); } else if ($tag = it::match('(.*)]+?\bsrc\s*=\s*"(' . $urlpattern . ')"[^>]*?>(.*)', $html)) { # Image tags, keeps only src attribute list($head, $src, $tail) = $tag; - $result .= it_html::sanitize($head) . '' . it_html::sanitize($tail); + $result .= it::replace([$placeholder => ''], it_html::sanitize("$head$placeholder$tail")); } else if ($tag = it::match("(.*)<(br|/tr)\b[^>]*>(.*)", $html)) { # brs and table rows are converted so simple line breaks list($head, $tagname, $tail) = $tag; - $result .= it_html::sanitize($head) . "
" . it_html::sanitize($tail); + $result .= it::replace([$placeholder => "
"], it_html::sanitize("$head$placeholder$tail")); } else $result = it::replace(array('&(#\d+;)' => '&$1'), it_html::Q(html_entity_decode(strip_tags($html), ENT_COMPAT, $charset))); -- cgit v1.2.3