From b484fab88a9229f7c87ea053564d0d8d3d2a565d Mon Sep 17 00:00:00 2001 From: David Flatz Date: Mon, 27 May 2024 13:27:39 +0200 Subject: Improve handling of nested tags in it_html::sanitize --- it_html.class | 10 +++++++--- test/it_html.t | 12 ++++++++++++ 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/it_html.class b/it_html.class index 9780d5d..effe920 100644 --- a/it_html.class +++ b/it_html.class @@ -408,11 +408,15 @@ static function sanitize($html) $html = it::replace(array('[\0\s]+' => " "), $html); # \s also matches \r and \n $urlpattern = 'https?://[^">]+'; - if ($tag = it::match("(.*?)<(div|p|ol|ul|li|i|b|strong|h[1-6])\b[^>]*>(.*?)\\2>(.*)", $html)) + if ($tag = it::match('(<(div|p|ol|ul|li|i|b|strong|h[1-6])\b[^>]*>((?:(?!?\2\b).|(?R))*?)\2>)', $html, ['offset_capture' => 1])) { # Simple tags with content, no attributes kept - list($head, $tagname, $content, $tail) = $tag; - $tagname = strtolower($tagname); + $offset = $tag[0][1]; + $length = strlen($tag[0][0]); + $head = substr($html, 0, $offset); + $tail = substr($html, $offset + $length); + $content = $tag[2][0]; + $tagname = strtolower($tag[1][0]); $result .= it_html::sanitize($head) . "<$tagname>" . it_html::sanitize($content) . "$tagname>" . it_html::sanitize($tail); } else if ($tag = it::match('(.*)]+?\bhref\s*=\s*"(' . $urlpattern . ')"[^>]*?>(.*?)(.*)', $html)) diff --git a/test/it_html.t b/test/it_html.t index 11e05dd..380a779 100755 --- a/test/it_html.t +++ b/test/it_html.t @@ -289,6 +289,18 @@ is( 'TODO it_html::sanitize handle anchors with unquoted attribute value in img' ); +is( + it_html::sanitize('
one one
one-one
one-two
one one
one-one
one-two