diff --git a/commafeed-server/src/main/java/com/commafeed/backend/feed/parser/FeedCleaner.java b/commafeed-server/src/main/java/com/commafeed/backend/feed/parser/FeedCleaner.java
index 3ee52073..23236f92 100644
--- a/commafeed-server/src/main/java/com/commafeed/backend/feed/parser/FeedCleaner.java
+++ b/commafeed-server/src/main/java/com/commafeed/backend/feed/parser/FeedCleaner.java
@@ -8,41 +8,47 @@ import jakarta.inject.Singleton;
import org.ahocorasick.trie.Emit;
import org.ahocorasick.trie.Trie;
import org.apache.commons.lang3.StringUtils;
+import org.jdom2.Verifier;
@Singleton
class FeedCleaner {
private static final Pattern DOCTYPE_PATTERN = Pattern.compile("<!DOCTYPE[^>]*>", Pattern.CASE_INSENSITIVE);
- public String trimInvalidXmlCharacters(String xml) {
+ /**
+  * Runs the four cleaning steps in a fixed order: drop everything before the first '<', strip characters that are
+  * not valid in XML, turn named HTML entities into numeric ones, and remove DOCTYPE declarations.
+  *
+  * @param xml raw feed content, may be null
+  * @return the cleaned content; null as soon as one step returns null, since every step maps blank input to null
+  */
+ public String clean(String xml) {
+ String fromFirstTag = removeCharactersBeforeFirstXmlTag(xml);
+ String validCharactersOnly = removeInvalidXmlCharacters(fromFirstTag);
+ String numericEntitiesOnly = replaceHtmlEntitiesWithNumericEntities(validCharactersOnly);
+ return removeDoctypeDeclarations(numericEntitiesOnly);
+ }
+
+ /**
+  * Cuts off whatever precedes the first '<' so the result starts at the first tag.
+  *
+  * @param xml raw content, may be null
+  * @return the substring starting at the first '<', or null when the input is blank or contains no '<' at all
+  */
+ String removeCharactersBeforeFirstXmlTag(String xml) {
if (StringUtils.isBlank(xml)) {
return null;
}
- StringBuilder sb = new StringBuilder();
- boolean firstTagFound = false;
- for (int i = 0; i < xml.length(); i++) {
- char c = xml.charAt(i);
+ int firstTagIndex = xml.indexOf('<');
+ if (firstTagIndex < 0) {
+ // no tag anywhere: nothing that could be parsed as XML
+ return null;
+ }
+ return xml.substring(firstTagIndex);
+ }
- if (!firstTagFound) {
- if (c == '<') {
- firstTagFound = true;
- } else {
- continue;
- }
- }
-
- if (c >= 32 || c == 9 || c == 10 || c == 13) {
- if (!Character.isHighSurrogate(c) && !Character.isLowSurrogate(c)) {
- sb.append(c);
- }
- }
+ /**
+  * Keeps only the code points that {@code Verifier.isXMLCharacter} accepts. The input is walked by code point
+  * rather than by char, so a character encoded as a surrogate pair is tested and appended as a single unit.
+  *
+  * @param xml content to filter, may be null
+  * @return the filtered content (possibly empty), or null when the input is blank
+  */
+ String removeInvalidXmlCharacters(String xml) {
+ if (StringUtils.isBlank(xml)) {
+ return null;
}
- return sb.toString();
+
+ StringBuilder kept = new StringBuilder();
+ xml.codePoints()
+ .filter(Verifier::isXMLCharacter)
+ .forEachOrdered(kept::appendCodePoint);
+ return kept.toString();
}
// https://stackoverflow.com/a/40836618
- public String replaceHtmlEntitiesWithNumericEntities(String source) {
+ String replaceHtmlEntitiesWithNumericEntities(String source) {
+ if (StringUtils.isBlank(source)) {
+ return null;
+ }
+
// Create a buffer sufficiently large that re-allocations are minimized.
StringBuilder sb = new StringBuilder(source.length() << 1);
@@ -63,7 +69,11 @@ class FeedCleaner {
return sb.toString();
}
- public String removeDoctypeDeclarations(String xml) {
+ String removeDoctypeDeclarations(String xml) { // deletes every DOCTYPE_PATTERN match from xml; now package-private and invoked from clean()
+ if (StringUtils.isBlank(xml)) { // null, empty or whitespace-only input yields null instead of being passed to the matcher
+ return null;
+ }
+
return DOCTYPE_PATTERN.matcher(xml).replaceAll("");
}
diff --git a/commafeed-server/src/main/java/com/commafeed/backend/feed/parser/FeedParser.java b/commafeed-server/src/main/java/com/commafeed/backend/feed/parser/FeedParser.java
index 188ec477..f61c1557 100644
--- a/commafeed-server/src/main/java/com/commafeed/backend/feed/parser/FeedParser.java
+++ b/commafeed-server/src/main/java/com/commafeed/backend/feed/parser/FeedParser.java
@@ -58,12 +58,11 @@ public class FeedParser {
public FeedParserResult parse(String feedUrl, byte[] xml) throws FeedParsingException {
try {
Charset encoding = encodingDetector.getEncoding(xml);
- String xmlString = feedCleaner.trimInvalidXmlCharacters(new String(xml, encoding));
+
+ String xmlString = feedCleaner.clean(new String(xml, encoding));
if (xmlString == null) {
- throw new FeedParsingException("Input string is null for url " + feedUrl);
+ throw new FeedParsingException("Input string is empty for url " + feedUrl);
}
- xmlString = feedCleaner.replaceHtmlEntitiesWithNumericEntities(xmlString);
- xmlString = feedCleaner.removeDoctypeDeclarations(xmlString);
InputSource source = new InputSource(new StringReader(xmlString));
SyndFeed feed = new SyndFeedInput().build(source);
diff --git a/commafeed-server/src/test/java/com/commafeed/backend/feed/parser/FeedCleanerTest.java b/commafeed-server/src/test/java/com/commafeed/backend/feed/parser/FeedCleanerTest.java
index be575068..d1afafd2 100644
--- a/commafeed-server/src/test/java/com/commafeed/backend/feed/parser/FeedCleanerTest.java
+++ b/commafeed-server/src/test/java/com/commafeed/backend/feed/parser/FeedCleanerTest.java
@@ -1,34 +1,271 @@
package com.commafeed.backend.feed.parser;
import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.Nested;
import org.junit.jupiter.api.Test;
class FeedCleanerTest {
FeedCleaner feedCleaner = new FeedCleaner();
- @Test
- void testReplaceHtmlEntitiesWithNumericEntities() {
- String source = "T´l´phone ′";
- Assertions.assertEquals("T´l´phone ′", feedCleaner.replaceHtmlEntitiesWithNumericEntities(source));
+ @Nested
+ class RemoveCharactersBeforeFirstXmlTag { // tests for FeedCleaner.removeCharactersBeforeFirstXmlTag: prefix trimming and the null-returning cases
+ @Test
+ void removesWhitespaceBeforeXmlTag() {
+ String xml = " \n\tcontent"; // NOTE(review): no '<' in this literal, so the indexOf('<') implementation would return null - tag markup looks lost, confirm against the real file
+ Assertions.assertEquals("content", feedCleaner.removeCharactersBeforeFirstXmlTag(xml));
+ }
+
+ @Test
+ void removesTextBeforeXmlTag() {
+ String xml = "some text herecontent"; // NOTE(review): same concern - the boundary between prefix text and first tag is not visible here
+ Assertions.assertEquals("content", feedCleaner.removeCharactersBeforeFirstXmlTag(xml));
+ }
+
+ @Test
+ void returnsUnchangedWhenStartsWithXmlTag() {
+ String xml = "content"; // NOTE(review): test name says the input starts with a tag, but the literal has none - confirm
+ Assertions.assertEquals("content", feedCleaner.removeCharactersBeforeFirstXmlTag(xml));
+ }
+
+ @Test
+ void returnsNullWhenNoXmlTagFound() {
+ String xml = "no xml tags here"; // non-blank but without any '<', so the method returns null
+ Assertions.assertNull(feedCleaner.removeCharactersBeforeFirstXmlTag(xml));
+ }
+
+ @Test
+ void returnsNullWhenInputIsNull() {
+ Assertions.assertNull(feedCleaner.removeCharactersBeforeFirstXmlTag(null)); // handled by the isBlank guard
+ }
+
+ @Test
+ void returnsNullWhenInputIsEmpty() {
+ Assertions.assertNull(feedCleaner.removeCharactersBeforeFirstXmlTag("")); // handled by the isBlank guard
+ }
+
+ @Test
+ void returnsNullWhenInputIsBlank() {
+ Assertions.assertNull(feedCleaner.removeCharactersBeforeFirstXmlTag(" \n\t ")); // whitespace-only counts as blank
+ }
+
+ @Test
+ void preservesMultipleXmlTags() { // NOTE(review): both literals below are split across lines and carry no tags - they look damaged, confirm against the real file
+ String xml = "garbage- content
";
+ Assertions.assertEquals("- content
", feedCleaner.removeCharactersBeforeFirstXmlTag(xml));
+ }
}
- @Test
- void testRemoveDoctype() {
- String source = "
";
- Assertions.assertEquals("", feedCleaner.removeDoctypeDeclarations(source));
+ @Nested
+ class RemoveInvalidXmlCharacters { // tests for FeedCleaner.removeInvalidXmlCharacters: which characters are dropped, which survive, and the blank-input cases
+ @Test
+ void removesNullCharacter() {
+ String xml = "content\u0000here"; // U+0000 is expected to be filtered out
+ Assertions.assertEquals("contenthere", feedCleaner.removeInvalidXmlCharacters(xml));
+ }
+
+ @Test
+ void removesInvalidControlCharacters() {
+ String xml = "content\u0001\u0002\u0003here"; // low control characters other than tab / LF / CR
+ Assertions.assertEquals("contenthere", feedCleaner.removeInvalidXmlCharacters(xml));
+ }
+
+ @Test
+ void preservesValidXmlCharacters() {
+ String xml = "content with\ttab\nand newline"; // tab and newline must pass through untouched
+ Assertions.assertEquals("content with\ttab\nand newline", feedCleaner.removeInvalidXmlCharacters(xml));
+ }
+
+ @Test
+ void preservesUnicodeCharacters() {
+ String xml = "café résumé 中文 العربية"; // non-ASCII text from several scripts
+ Assertions.assertEquals("café résumé 中文 العربية", feedCleaner.removeInvalidXmlCharacters(xml));
+ }
+
+ @Test
+ void preservesEmojiCharacters() {
+ String xml = "🎮💪✅"; // includes characters stored as surrogate pairs; the method filters per code point, so they stay intact
+ Assertions.assertEquals("🎮💪✅", feedCleaner.removeInvalidXmlCharacters(xml));
+ }
+
+ @Test
+ void removesMultipleInvalidCharacters() {
+ String xml = "test\u0000test\u0001test\u0002test"; // invalid characters scattered between valid runs
+ Assertions.assertEquals("testtesttesttest", feedCleaner.removeInvalidXmlCharacters(xml));
+ }
+
+ @Test
+ void returnsNullWhenInputIsNull() {
+ Assertions.assertNull(feedCleaner.removeInvalidXmlCharacters(null)); // handled by the isBlank guard
+ }
+
+ @Test
+ void returnsNullWhenInputIsEmpty() {
+ Assertions.assertNull(feedCleaner.removeInvalidXmlCharacters("")); // handled by the isBlank guard
+ }
+
+ @Test
+ void returnsNullWhenInputIsBlank() {
+ Assertions.assertNull(feedCleaner.removeInvalidXmlCharacters(" ")); // whitespace-only counts as blank
+ }
+
+ @Test
+ void handlesStringWithOnlyInvalidCharacters() {
+ String xml = "\u0000\u0001\u0002"; // not blank, so the guard is passed and filtering leaves an empty string rather than null
+ Assertions.assertEquals("", feedCleaner.removeInvalidXmlCharacters(xml));
+ }
}
- @Test
- void testRemoveMultilineDoctype() {
- String source = """
-
- """;
- Assertions.assertEquals("""
+ @Nested
+ class Entities { // tests for FeedCleaner.replaceHtmlEntitiesWithNumericEntities
+ @Test
+ void testReplaceHtmlEntitiesWithNumericEntities() {
+ String source = "T´l´phone ′"; // NOTE(review): input and expected value are identical and hold no "&name;" reference - entities look decoded, confirm against the real file
+ Assertions.assertEquals("T´l´phone ′",
+ feedCleaner.replaceHtmlEntitiesWithNumericEntities(source));
+ }
- """, feedCleaner.removeDoctypeDeclarations(source));
+ @Test
+ void replacesMultipleOccurrencesOfSameEntity() {
+ String source = " "; // NOTE(review): no entity visible; if this is plain whitespace the isBlank guard returns null and the assertion below cannot hold - confirm
+ Assertions.assertEquals(" ", feedCleaner.replaceHtmlEntitiesWithNumericEntities(source));
+ }
+
+ @Test
+ void preservesTextWithoutEntities() {
+ String source = "regular content"; // nothing to replace, output must equal input
+ Assertions.assertEquals("regular content", feedCleaner.replaceHtmlEntitiesWithNumericEntities(source));
+ }
+
+ @Test
+ void preservesNumericEntities() {
+ String source = "´′"; // NOTE(review): test name refers to numeric entities but the literal holds raw characters - presumably decoded, confirm
+ Assertions.assertEquals("´′", feedCleaner.replaceHtmlEntitiesWithNumericEntities(source));
+ }
+
+ @Test
+ void replacesCommonHtmlEntities() { // NOTE(review): the two literals below have unbalanced quotes and are not valid Java as shown - likely damaged, confirm
+ String source = "&"";
+ Assertions.assertEquals("&"", feedCleaner.replaceHtmlEntitiesWithNumericEntities(source));
+ }
+
+ @Test
+ void handlesPartialEntityMatches() {
+ String source = "&lifier"; // NOTE(review): presumably an entity name followed by extra letters before decoding - confirm
+ String result = feedCleaner.replaceHtmlEntitiesWithNumericEntities(source);
+ Assertions.assertTrue(result.startsWith("&") || result.equals("&lifier")); // accepts either a replaced prefix or untouched input
+ }
+
+ @Test
+ void returnsNullWhenInputIsNull() {
+ Assertions.assertNull(feedCleaner.replaceHtmlEntitiesWithNumericEntities(null)); // handled by the isBlank guard
+ }
+
+ @Test
+ void returnsNullWhenInputIsEmpty() {
+ Assertions.assertNull(feedCleaner.replaceHtmlEntitiesWithNumericEntities("")); // handled by the isBlank guard
+ }
+
+ @Test
+ void returnsNullWhenInputIsBlank() {
+ Assertions.assertNull(feedCleaner.replaceHtmlEntitiesWithNumericEntities(" ")); // whitespace-only counts as blank
+ }
+
+ @Test
+ void handlesEntityAtStartOfString() {
+ String source = "&test"; // NOTE(review): input equals expected output, so no replacement is actually checked - confirm original literal
+ Assertions.assertEquals("&test", feedCleaner.replaceHtmlEntitiesWithNumericEntities(source));
+ }
+
+ @Test
+ void handlesEntityAtEndOfString() {
+ String source = "test&"; // NOTE(review): input equals expected output, so no replacement is actually checked - confirm original literal
+ Assertions.assertEquals("test&", feedCleaner.replaceHtmlEntitiesWithNumericEntities(source));
+ }
+
+ @Test
+ void handlesMixedEntitiesAndText() {
+ String source = "Hello World! Test.";
+ String result = feedCleaner.replaceHtmlEntitiesWithNumericEntities(source);
+ Assertions.assertTrue(result.contains("")); // NOTE(review): contains("") is true for every string, so this assertion is vacuous as written - expected substring looks lost
+ }
}
-}
\ No newline at end of file
+ @Nested
+ class Doctype { // tests for FeedCleaner.removeDoctypeDeclarations
+ @Test
+ void testRemoveDoctype() {
+ String source = ""; // NOTE(review): empty input hits the isBlank guard and yields null, which contradicts the assertEquals below and returnsNullWhenInputIsEmpty - fixture markup looks lost, confirm
+ Assertions.assertEquals("", feedCleaner.removeDoctypeDeclarations(source));
+ }
+
+ @Test
+ void testRemoveMultilineDoctype() { // NOTE(review): both text blocks below are empty as shown; the multi-line DOCTYPE they presumably held is missing - confirm
+ String source = """
+
+ """;
+ Assertions.assertEquals("""
+
+ """, feedCleaner.removeDoctypeDeclarations(source));
+ }
+
+ @Test
+ void removesComplexDoctypeWithSystemId() {
+ String source = ""; // NOTE(review): empty fixture, see testRemoveDoctype
+ Assertions.assertEquals("", feedCleaner.removeDoctypeDeclarations(source));
+ }
+
+ @Test
+ void removesComplexDoctypeWithPublicId() {
+ String source = ""; // NOTE(review): empty fixture, see testRemoveDoctype
+ Assertions.assertEquals("", feedCleaner.removeDoctypeDeclarations(source));
+ }
+
+ @Test
+ void removesCaseInsensitiveDoctype() {
+ String source = ""; // NOTE(review): empty fixture; presumably a lower-case doctype exercising CASE_INSENSITIVE - confirm
+ Assertions.assertEquals("", feedCleaner.removeDoctypeDeclarations(source));
+ }
+
+ @Test
+ void removesMixedCaseDoctype() {
+ String source = ""; // NOTE(review): empty fixture, see testRemoveDoctype
+ Assertions.assertEquals("", feedCleaner.removeDoctypeDeclarations(source));
+ }
+
+ @Test
+ void removesMultipleDoctypeDeclarations() {
+ String source = ""; // NOTE(review): empty fixture; replaceAll in the implementation is what would cover several declarations
+ Assertions.assertEquals("", feedCleaner.removeDoctypeDeclarations(source));
+ }
+
+ @Test
+ void preservesContentWithoutDoctype() {
+ String source = "No doctype here"; // no match for the pattern, output must equal input
+ Assertions.assertEquals("No doctype here", feedCleaner.removeDoctypeDeclarations(source));
+ }
+
+ @Test
+ void returnsNullWhenInputIsNull() {
+ Assertions.assertNull(feedCleaner.removeDoctypeDeclarations(null)); // handled by the isBlank guard
+ }
+
+ @Test
+ void returnsNullWhenInputIsEmpty() {
+ Assertions.assertNull(feedCleaner.removeDoctypeDeclarations("")); // handled by the isBlank guard
+ }
+
+ @Test
+ void returnsNullWhenInputIsBlank() {
+ Assertions.assertNull(feedCleaner.removeDoctypeDeclarations(" ")); // whitespace-only counts as blank
+ }
+
+ @Test
+ void handlesDoctypeWithExtraWhitespace() {
+ String source = ""; // NOTE(review): empty fixture, see testRemoveDoctype
+ Assertions.assertEquals("", feedCleaner.removeDoctypeDeclarations(source));
+ }
+ }
+}