From b484fab88a9229f7c87ea053564d0d8d3d2a565d Mon Sep 17 00:00:00 2001
From: David Flatz
Date: Mon, 27 May 2024 13:27:39 +0200
Subject: Improve handling of nested tags in it_html::sanitize

---
 it_html.class  | 10 +++++++---
 test/it_html.t | 12 ++++++++++++
 2 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/it_html.class b/it_html.class
index 9780d5d..effe920 100644
--- a/it_html.class
+++ b/it_html.class
@@ -408,11 +408,15 @@ static function sanitize($html)
 	$html = it::replace(array('[\0\s]+' => " "), $html);	# \s also matches \r and \n
 	$urlpattern = 'https?://[^">]+';
 
-	if ($tag = it::match("(.*?)<(div|p|ol|ul|li|i|b|strong|h[1-6])\b[^>]*>(.*?)</\\2>(.*)", $html))
+	if ($tag = it::match('(<(div|p|ol|ul|li|i|b|strong|h[1-6])\b[^>]*>((?:(?!</?\2\b).|(?R))*?)</\2>)', $html, ['offset_capture' => 1]))
 	{
 		# Simple tags with content, no attributes kept
-		list($head, $tagname, $content, $tail) = $tag;
-		$tagname = strtolower($tagname);
+		$offset = $tag[0][1];
+		$length = strlen($tag[0][0]);
+		$head = substr($html, 0, $offset);
+		$tail = substr($html, $offset + $length);
+		$content = $tag[2][0];
+		$tagname = strtolower($tag[1][0]);
 		$result .= it_html::sanitize($head) . "<$tagname>" . it_html::sanitize($content) . "</$tagname>" . it_html::sanitize($tail);
 	}
 	else if ($tag = it::match('(.*)<a\b[^>]+?\bhref\s*=\s*"(' . $urlpattern . ')"[^>]*?>(.*?)</a>(.*)', $html))
diff --git a/test/it_html.t b/test/it_html.t
index 11e05dd..380a779 100755
--- a/test/it_html.t
+++ b/test/it_html.t
@@ -289,6 +289,18 @@ is(
 	'TODO it_html::sanitize handle anchors with unquoted attribute value in img'
 );
 
+is(	# same-name tags nested inside each other (ul > li > ul > li); the class attribute on the inner <li> must be dropped
+	it_html::sanitize('<ul><li><ul><li class="removeme">foo</li><li>bar</li></ul></li></ul>'),
+	'<ul><li><ul><li>foo</li><li>bar</li></ul></li></ul>',
+	'Nested unordered lists'
+);
+
+is(	# attribute-free markup mixing p/b/i/ul/li at several depths; the expected output is identical to the input
+	it_html::sanitize('<p><b>one</b> one</p><ul><li><p><i>one-one</i></p><ul><li>one-one-one</li><li>one-one-two</li></ul></li><li><p><i>one-two</i></p><ul><li>one-two-one</li><li>one-two-two</li></ul></li></ul>'),
+	'<p><b>one</b> one</p><ul><li><p><i>one-one</i></p><ul><li>one-one-one</li><li>one-one-two</li></ul></li><li><p><i>one-two</i></p><ul><li>one-two-one</li><li>one-two-two</li></ul></li></ul>',
+	'More nested tags'
+);
+
 foreach (json_decode(it::file_get_contents(dirname($argv[0]) . '/U_tests.json'), true) as $test)
 	is(U(...$test['args']), $test['exp'], $test['name']);
 
-- 
cgit v1.2.3