summary | refs | log | tree | commit | diff
path: root/it_html.class
diff options
context:
space:
mode:
author: David Flatz, 2024-05-27 13:27:39 +0200
committer: David Flatz, 2024-05-27 13:27:39 +0200
commit: b484fab88a9229f7c87ea053564d0d8d3d2a565d (patch)
tree: f9395fc1b9908cdcf9e54aad1de6c8f97406a03f /it_html.class
parent: 8c86436125d34e1dc322c475f7d460cad682cdd7 (diff)
download: itools-b484fab88a9229f7c87ea053564d0d8d3d2a565d.tar.gz
itools-b484fab88a9229f7c87ea053564d0d8d3d2a565d.tar.bz2
itools-b484fab88a9229f7c87ea053564d0d8d3d2a565d.zip
Improve handling of nested tags in it_html::sanitize
Diffstat (limited to 'it_html.class')
-rw-r--r-- it_html.class 10
1 files changed, 7 insertions, 3 deletions
diff --git a/it_html.class b/it_html.class
index 9780d5d..effe920 100644
--- a/it_html.class
+++ b/it_html.class
@@ -408,11 +408,15 @@ static function sanitize($html)
$html = it::replace(array('[\0\s]+' => " "), $html); # \s also matches \r and \n
$urlpattern = 'https?://[^">]+';
- if ($tag = it::match("(.*?)<(div|p|ol|ul|li|i|b|strong|h[1-6])\b[^>]*>(.*?)</\\2>(.*)", $html))
+ if ($tag = it::match('(<(div|p|ol|ul|li|i|b|strong|h[1-6])\b[^>]*>((?:(?!</?\2\b).|(?R))*?)</\2>)', $html, ['offset_capture' => 1]))
{
# Simple tags with content, no attributes kept
- list($head, $tagname, $content, $tail) = $tag;
- $tagname = strtolower($tagname);
+ $offset = $tag[0][1];
+ $length = strlen($tag[0][0]);
+ $head = substr($html, 0, $offset);
+ $tail = substr($html, $offset + $length);
+ $content = $tag[2][0];
+ $tagname = strtolower($tag[1][0]);
$result .= it_html::sanitize($head) . "<$tagname>" . it_html::sanitize($content) . "</$tagname>" . it_html::sanitize($tail);
}
else if ($tag = it::match('(.*)<a\b[^>]+?\bhref\s*=\s*"(' . $urlpattern . ')"[^>]*?>(.*?)</a>(.*)', $html))