summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Urban Müller, 2024-05-28 17:49:09 +0200
committer: Urban Müller, 2024-05-28 17:49:09 +0200
commit: 25a946b8c2bf6638b7adfe2afa387fa26cb97e71 (patch)
tree: aa693ef50ac2531fc5399614e289c9fd8609d2af
parent: 937d04e90f85e332186c6c16fc6b06fa0ca400a7 (diff)
download: itools-25a946b8c2bf6638b7adfe2afa387fa26cb97e71.tar.gz
itools-25a946b8c2bf6638b7adfe2afa387fa26cb97e71.tar.bz2
itools-25a946b8c2bf6638b7adfe2afa387fa26cb97e71.zip
Revert "Improve handling of nested tags in it_html::sanitize": getting "Exceeded pcre.backtrack_limit of 1000000 bytes"
This reverts commit b484fab88a9229f7c87ea053564d0d8d3d2a565d.
-rw-r--r-- it_html.class 10
-rwxr-xr-x test/it_html.t 12
2 files changed, 3 insertions, 19 deletions
diff --git a/it_html.class b/it_html.class
index effe920..9780d5d 100644
--- a/it_html.class
+++ b/it_html.class
@@ -408,15 +408,11 @@ static function sanitize($html)
$html = it::replace(array('[\0\s]+' => " "), $html); # \s also matches \r and \n
$urlpattern = 'https?://[^">]+';
- if ($tag = it::match('(<(div|p|ol|ul|li|i|b|strong|h[1-6])\b[^>]*>((?:(?!</?\2\b).|(?R))*?)</\2>)', $html, ['offset_capture' => 1]))
+ if ($tag = it::match("(.*?)<(div|p|ol|ul|li|i|b|strong|h[1-6])\b[^>]*>(.*?)</\\2>(.*)", $html))
{
# Simple tags with content, no attributes kept
- $offset = $tag[0][1];
- $length = strlen($tag[0][0]);
- $head = substr($html, 0, $offset);
- $tail = substr($html, $offset + $length);
- $content = $tag[2][0];
- $tagname = strtolower($tag[1][0]);
+ list($head, $tagname, $content, $tail) = $tag;
+ $tagname = strtolower($tagname);
$result .= it_html::sanitize($head) . "<$tagname>" . it_html::sanitize($content) . "</$tagname>" . it_html::sanitize($tail);
}
else if ($tag = it::match('(.*)<a\b[^>]+?\bhref\s*=\s*"(' . $urlpattern . ')"[^>]*?>(.*?)</a>(.*)', $html))
diff --git a/test/it_html.t b/test/it_html.t
index 380a779..11e05dd 100755
--- a/test/it_html.t
+++ b/test/it_html.t
@@ -289,18 +289,6 @@ is(
'TODO it_html::sanitize handle anchors with unquoted attribute value in img'
);
-is(
- it_html::sanitize('<ul><li><ul><li class="removeme">foo</li><li>bar</li></ul></li></ul>'),
- '<ul><li><ul><li>foo</li><li>bar</li></ul></li></ul>',
- 'Nested unordered lists'
-);
-
-is(
- it_html::sanitize('<p><b>one</b> one</p><ul><li><p><i>one-one</i></p><ul><li>one-one-one</li><li>one-one-two</li></ul></li><li><p><i>one-two</i></p><ul><li>one-two-one</li><li>one-two-two</li></ul></li></ul>'),
- '<p><b>one</b> one</p><ul><li><p><i>one-one</i></p><ul><li>one-one-one</li><li>one-one-two</li></ul></li><li><p><i>one-two</i></p><ul><li>one-two-one</li><li>one-two-two</li></ul></li></ul>',
- 'More nested tags'
-);
-
foreach (json_decode(it::file_get_contents(dirname($argv[0]) . '/U_tests.json'), true) as $test)
is(U(...$test['args']), $test['exp'], $test['name']);