From f2225077d7450ea08f40ea45cc348184f1d30b97 Mon Sep 17 00:00:00 2001
From: Christian Schneider
Date: Fri, 9 May 2025 14:31:12 +0200
Subject: Handle mailto:-links and tags inside tags in it_html::sanitize()
(support request for
https://search.ch/tel/biel-bienne/bahnhofstrasse-5/groupe-mutuel-4)
---
it_html.class | 11 ++++++-----
test/it_html.t | 12 ++++++++++++
2 files changed, 18 insertions(+), 5 deletions(-)
diff --git a/it_html.class b/it_html.class
index e3053b6..9bb8e7a 100644
--- a/it_html.class
+++ b/it_html.class
@@ -393,32 +393,33 @@ static function sanitize($html)
if ($charset == "utf-8")
$html = it::any2utf8($html);
$html = it::replace(array('[\0\s]+' => " "), $html); # \s also matches \r and \n
- $urlpattern = 'https?://[^">]+';
+ $urlpattern = '(?:https?://|mailto:)[^">]+';
+ $placeholder = bin2hex(random_bytes(16));
if ($tag = it::match("(.*?)<(div|p|ol|ul|li|i|b|strong|h[1-6])\b[^>]*>(.*?)\\2>(.*)", $html))
{
# Simple tags with content, no attributes kept
list($head, $tagname, $content, $tail) = $tag;
$tagname = strtolower($tagname);
- $result .= it_html::sanitize($head) . "<$tagname>" . it_html::sanitize($content) . "$tagname>" . it_html::sanitize($tail);
+ $result .= it::replace([$placeholder => "<$tagname>" . it_html::sanitize($content) . "$tagname>"], it_html::sanitize("$head$placeholder$tail"));
}
else if ($tag = it::match('(.*)]+?\bhref\s*=\s*"(' . $urlpattern . ')"[^>]*?>(.*?)(.*)', $html))
{
# Link tags, keeps only href attribute
list($head, $href, $content, $tail) = $tag;
- $result .= it_html::sanitize($head) . '' . it_html::sanitize($content) . "" . it_html::sanitize($tail);
+ $result .= it::replace([$placeholder => '' . it_html::sanitize($content) . ""], it_html::sanitize("$head$placeholder$tail"));
}
else if ($tag = it::match('(.*)
]+?\bsrc\s*=\s*"(' . $urlpattern . ')"[^>]*?>(.*)', $html))
{
# Image tags, keeps only src attribute
list($head, $src, $tail) = $tag;
- $result .= it_html::sanitize($head) . '
' . it_html::sanitize($tail);
+ $result .= it::replace([$placeholder => '
'], it_html::sanitize("$head$placeholder$tail"));
}
else if ($tag = it::match("(.*)<(br|/tr)\b[^>]*>(.*)", $html))
{
# brs and table rows are converted to simple line breaks
list($head, $tagname, $tail) = $tag;
- $result .= it_html::sanitize($head) . "
" . it_html::sanitize($tail);
+ $result .= it::replace([$placeholder => "
"], it_html::sanitize("$head$placeholder$tail"));
}
else
$result = it::replace(array('&(#\d+;)' => '&$1'), it_html::Q(html_entity_decode(strip_tags($html), ENT_COMPAT, $charset)));
diff --git a/test/it_html.t b/test/it_html.t
index 15f444d..20ab65f 100755
--- a/test/it_html.t
+++ b/test/it_html.t
@@ -259,6 +259,18 @@ is(
'it_html::sanitize handle anchors with spaces between attribute name and value'
);
+is(
+ it_html::sanitize('foo'),
+ 'foo',
+ 'it_html::sanitize handle nesting of tags inside '
+);
+
+is(
+ it_html::sanitize('foo'),
+ 'foo',
+ 'it_html::sanitize handle mailto links'
+);
+
is(
it_html::sanitize("foo"),
'foo',
--
cgit v1.2.3