diff --git a/commafeed-server/src/main/java/com/commafeed/backend/feed/parser/FeedCleaner.java b/commafeed-server/src/main/java/com/commafeed/backend/feed/parser/FeedCleaner.java index 5389fd61..e656121b 100644 --- a/commafeed-server/src/main/java/com/commafeed/backend/feed/parser/FeedCleaner.java +++ b/commafeed-server/src/main/java/com/commafeed/backend/feed/parser/FeedCleaner.java @@ -1,6 +1,7 @@ package com.commafeed.backend.feed.parser; import java.util.Collection; +import java.util.regex.Pattern; import org.ahocorasick.trie.Emit; import org.ahocorasick.trie.Trie; @@ -11,6 +12,8 @@ import jakarta.inject.Singleton; @Singleton class FeedCleaner { + private static final Pattern DOCTYPE_PATTERN = Pattern.compile("<!DOCTYPE[^>]*>", Pattern.CASE_INSENSITIVE); + public String trimInvalidXmlCharacters(String xml) { if (StringUtils.isBlank(xml)) { return null; @@ -60,4 +63,8 @@ class FeedCleaner { return sb.toString(); } + public String removeDoctypeDeclarations(String xml) { + return DOCTYPE_PATTERN.matcher(xml).replaceAll(""); + } + } diff --git a/commafeed-server/src/main/java/com/commafeed/backend/feed/parser/FeedParser.java b/commafeed-server/src/main/java/com/commafeed/backend/feed/parser/FeedParser.java index d25f7c13..1877d250 100644 --- a/commafeed-server/src/main/java/com/commafeed/backend/feed/parser/FeedParser.java +++ b/commafeed-server/src/main/java/com/commafeed/backend/feed/parser/FeedParser.java @@ -64,6 +64,7 @@ public class FeedParser { throw new FeedException("Input string is null for url " + feedUrl); } xmlString = feedCleaner.replaceHtmlEntitiesWithNumericEntities(xmlString); + xmlString = feedCleaner.removeDoctypeDeclarations(xmlString); InputSource source = new InputSource(new StringReader(xmlString)); SyndFeed feed = new SyndFeedInput().build(source); diff --git a/commafeed-server/src/test/java/com/commafeed/backend/feed/parser/FeedCleanerTest.java b/commafeed-server/src/test/java/com/commafeed/backend/feed/parser/FeedCleanerTest.java index 
68a7584c..c324b958 100644 --- a/commafeed-server/src/test/java/com/commafeed/backend/feed/parser/FeedCleanerTest.java +++ b/commafeed-server/src/test/java/com/commafeed/backend/feed/parser/FeedCleanerTest.java @@ -13,4 +13,22 @@ class FeedCleanerTest { Assertions.assertEquals("T´l´phone ′", feedCleaner.replaceHtmlEntitiesWithNumericEntities(source)); } + @Test + void testRemoveDoctype() { + String source = ""; + Assertions.assertEquals("", feedCleaner.removeDoctypeDeclarations(source)); + } + + @Test + void testRemoveMultilineDoctype() { + String source = """ + + """; + Assertions.assertEquals(""" + + """, feedCleaner.removeDoctypeDeclarations(source)); + } + } \ No newline at end of file