remove DOCTYPE declarations (#1260)

This commit is contained in:
Athou
2025-01-10 16:09:21 +01:00
parent 74f7c48818
commit 62d3ed16e6
3 changed files with 26 additions and 0 deletions

View File

@@ -1,6 +1,7 @@
package com.commafeed.backend.feed.parser;
import java.util.Collection;
import java.util.regex.Pattern;
import org.ahocorasick.trie.Emit;
import org.ahocorasick.trie.Trie;
@@ -11,6 +12,8 @@ import jakarta.inject.Singleton;
@Singleton
class FeedCleaner {
// Matches "<!DOCTYPE" in any letter case, followed by everything up to and including the first '>'.
// The negated character class also matches line breaks, so a declaration spread over several lines is covered.
// The match always ends at the first '>', wherever that character occurs.
private static final Pattern DOCTYPE_PATTERN = Pattern.compile("<!DOCTYPE[^>]*>", Pattern.CASE_INSENSITIVE);
public String trimInvalidXmlCharacters(String xml) {
if (StringUtils.isBlank(xml)) {
return null;
@@ -60,4 +63,8 @@ class FeedCleaner {
return sb.toString();
}
/**
 * Removes every DOCTYPE declaration from the given XML text.
 *
 * <p>
 * {@code DOCTYPE_PATTERN} ends its match at the first {@code '>'}. When a declaration carries an internal subset
 * ({@code <!DOCTYPE rss [ <!ENTITY nbsp "&#160;"> ]>}), that first {@code '>'} belongs to a markup declaration inside
 * the subset, so removing only the matched text would leave the rest of the subset and its closing {@code "]>"}
 * in front of the root element. In that case the removal is extended up to the {@code ']'} + optional whitespace +
 * {@code '>'} sequence that closes the declaration.
 *
 * @param xml
 *            the XML text to clean, must not be null
 * @return the text without DOCTYPE declarations; all other characters, including line breaks around a removed
 *         declaration, are kept
 */
public String removeDoctypeDeclarations(String xml) {
    var matcher = DOCTYPE_PATTERN.matcher(xml);
    StringBuilder cleaned = new StringBuilder(xml.length());
    int copiedUpTo = 0;
    int searchFrom = 0;
    while (matcher.find(searchFrom)) {
        int start = matcher.start();
        int end = matcher.end();

        int subsetOpen = xml.indexOf('[', start);
        if (subsetOpen >= 0 && subsetOpen < end) {
            // Only extend when the bracket opened inside the match is not closed inside it as well:
            // a balanced pair before the first '>' means the pattern already matched the whole declaration.
            int firstClose = xml.indexOf(']', subsetOpen + 1);
            if (firstClose >= end) {
                int declarationEnd = findInternalSubsetEnd(xml, firstClose);
                if (declarationEnd > 0) {
                    end = declarationEnd;
                }
                // no terminator found: fall back to removing just the matched text
            }
        }

        cleaned.append(xml, copiedUpTo, start);
        copiedUpTo = end;
        // resume after the removed range so nothing inside it is matched again
        searchFrom = end;
    }
    cleaned.append(xml, copiedUpTo, xml.length());
    return cleaned.toString();
}

/**
 * Finds the end of a DOCTYPE declaration whose internal subset is still open at {@code from}.
 *
 * @return the index just after the first {@code ']'} that is followed by optional whitespace and {@code '>'},
 *         or -1 if there is no such sequence
 */
private static int findInternalSubsetEnd(String xml, int from) {
    for (int close = xml.indexOf(']', from); close >= 0; close = xml.indexOf(']', close + 1)) {
        int next = close + 1;
        while (next < xml.length() && Character.isWhitespace(xml.charAt(next))) {
            next++;
        }
        if (next < xml.length() && xml.charAt(next) == '>') {
            return next + 1;
        }
    }
    return -1;
}
}

View File

@@ -64,6 +64,7 @@ public class FeedParser {
throw new FeedException("Input string is null for url " + feedUrl);
}
xmlString = feedCleaner.replaceHtmlEntitiesWithNumericEntities(xmlString);
xmlString = feedCleaner.removeDoctypeDeclarations(xmlString);
InputSource source = new InputSource(new StringReader(xmlString));
SyndFeed feed = new SyndFeedInput().build(source);

View File

@@ -13,4 +13,22 @@ class FeedCleanerTest {
Assertions.assertEquals("<source>T&#180;l&#180;phone &#8242;</source>", feedCleaner.replaceHtmlEntitiesWithNumericEntities(source));
}
@Test
void testRemoveDoctype() {
    // a single-line declaration placed directly in front of the root element
    String expected = "<html><head></head><body></body></html>";
    String actual = feedCleaner.removeDoctypeDeclarations("<!DOCTYPE html><html><head></head><body></body></html>");

    Assertions.assertEquals(expected, actual);
}
@Test
void testRemoveMultilineDoctype() {
    // the declaration is spread over three lines and is followed by a line break
    String source = """
            <!DOCTYPE
            html
            >
            <html><head></head><body></body></html>""";

    // Only the declaration itself is removed: the line break that followed its closing '>' stays in the output.
    // It is written as an explicit escape so the expectation does not hinge on a blank line inside a text block.
    String expected = "\n<html><head></head><body></body></html>";

    Assertions.assertEquals(expected, feedCleaner.removeDoctypeDeclarations(source));
}
}