diff options
author | David Flatz | 2024-05-27 13:27:39 +0200 |
---|---|---|
committer | David Flatz | 2024-05-27 13:27:39 +0200 |
commit | b484fab88a9229f7c87ea053564d0d8d3d2a565d (patch) | |
tree | f9395fc1b9908cdcf9e54aad1de6c8f97406a03f | |
parent | 8c86436125d34e1dc322c475f7d460cad682cdd7 (diff) | |
download | itools-b484fab88a9229f7c87ea053564d0d8d3d2a565d.tar.gz itools-b484fab88a9229f7c87ea053564d0d8d3d2a565d.tar.bz2 itools-b484fab88a9229f7c87ea053564d0d8d3d2a565d.zip |
Improve handling of nested tags in it_html::sanitize
-rw-r--r-- | it_html.class | 10 | ||||
-rwxr-xr-x | test/it_html.t | 12 |
2 files changed, 19 insertions, 3 deletions
```diff
diff --git a/it_html.class b/it_html.class
index 9780d5d..effe920 100644
--- a/it_html.class
+++ b/it_html.class
@@ -408,11 +408,15 @@ static function sanitize($html)
 	$html = it::replace(array('[\0\s]+' => " "), $html); # \s also matches \r and \n
 	$urlpattern = 'https?://[^">]+';
 
-	if ($tag = it::match("(.*?)<(div|p|ol|ul|li|i|b|strong|h[1-6])\b[^>]*>(.*?)</\\2>(.*)", $html))
+	if ($tag = it::match('(<(div|p|ol|ul|li|i|b|strong|h[1-6])\b[^>]*>((?:(?!</?\2\b).|(?R))*?)</\2>)', $html, ['offset_capture' => 1]))
 	{
 		# Simple tags with content, no attributes kept
-		list($head, $tagname, $content, $tail) = $tag;
-		$tagname = strtolower($tagname);
+		$offset = $tag[0][1];
+		$length = strlen($tag[0][0]);
+		$head = substr($html, 0, $offset);
+		$tail = substr($html, $offset + $length);
+		$content = $tag[2][0];
+		$tagname = strtolower($tag[1][0]);
 		$result .= it_html::sanitize($head) . "<$tagname>" . it_html::sanitize($content) . "</$tagname>" . it_html::sanitize($tail);
 	}
 	else if ($tag = it::match('(.*)<a\b[^>]+?\bhref\s*=\s*"(' . $urlpattern . ')"[^>]*?>(.*?)</a>(.*)', $html))
diff --git a/test/it_html.t b/test/it_html.t
index 11e05dd..380a779 100755
--- a/test/it_html.t
+++ b/test/it_html.t
@@ -289,6 +289,18 @@ is(
 	'TODO it_html::sanitize handle anchors with unquoted attribute value in img'
 );
 
+is(
+	it_html::sanitize('<ul><li><ul><li class="removeme">foo</li><li>bar</li></ul></li></ul>'),
+	'<ul><li><ul><li>foo</li><li>bar</li></ul></li></ul>',
+	'Nested unordered lists'
+);
+
+is(
+	it_html::sanitize('<p><b>one</b> one</p><ul><li><p><i>one-one</i></p><ul><li>one-one-one</li><li>one-one-two</li></ul></li><li><p><i>one-two</i></p><ul><li>one-two-one</li><li>one-two-two</li></ul></li></ul>'),
+	'<p><b>one</b> one</p><ul><li><p><i>one-one</i></p><ul><li>one-one-one</li><li>one-one-two</li></ul></li><li><p><i>one-two</i></p><ul><li>one-two-one</li><li>one-two-two</li></ul></li></ul>',
+	'More nested tags'
+);
+
 foreach (json_decode(it::file_get_contents(dirname($argv[0]) . '/U_tests.json'), true) as $test)
 	is(U(...$test['args']), $test['exp'], $test['name']);
 
```