From e5c271ca1c26a1f3f73e0a0bcb0205746edb43b4 Mon Sep 17 00:00:00 2001 From: Athou Date: Sun, 16 Nov 2025 09:04:37 +0100 Subject: [PATCH] add support for more emojis (#1955) --- .../backend/feed/parser/FeedCleaner.java | 52 ++-- .../backend/feed/parser/FeedParser.java | 7 +- .../backend/feed/parser/FeedCleanerTest.java | 273 ++++++++++++++++-- 3 files changed, 289 insertions(+), 43 deletions(-) diff --git a/commafeed-server/src/main/java/com/commafeed/backend/feed/parser/FeedCleaner.java b/commafeed-server/src/main/java/com/commafeed/backend/feed/parser/FeedCleaner.java index 3ee52073..23236f92 100644 --- a/commafeed-server/src/main/java/com/commafeed/backend/feed/parser/FeedCleaner.java +++ b/commafeed-server/src/main/java/com/commafeed/backend/feed/parser/FeedCleaner.java @@ -8,41 +8,47 @@ import jakarta.inject.Singleton; import org.ahocorasick.trie.Emit; import org.ahocorasick.trie.Trie; import org.apache.commons.lang3.StringUtils; +import org.jdom2.Verifier; @Singleton class FeedCleaner { private static final Pattern DOCTYPE_PATTERN = Pattern.compile("]*>", Pattern.CASE_INSENSITIVE); - public String trimInvalidXmlCharacters(String xml) { + public String clean(String xml) { + xml = removeCharactersBeforeFirstXmlTag(xml); + xml = removeInvalidXmlCharacters(xml); + xml = replaceHtmlEntitiesWithNumericEntities(xml); + xml = removeDoctypeDeclarations(xml); + return xml; + } + + String removeCharactersBeforeFirstXmlTag(String xml) { if (StringUtils.isBlank(xml)) { return null; } - StringBuilder sb = new StringBuilder(); - boolean firstTagFound = false; - for (int i = 0; i < xml.length(); i++) { - char c = xml.charAt(i); + int pos = xml.indexOf('<'); + return pos < 0 ? null : xml.substring(pos); + } - if (!firstTagFound) { - if (c == '<') { - firstTagFound = true; - } else { - continue; - } - } - - if (c >= 32 || c == 9 || c == 10 || c == 13) { - if (!Character.isHighSurrogate(c) && !Character.isLowSurrogate(c)) { - sb.append(c); - } - } + String removeInvalidXmlCharacters(String xml) { + if (StringUtils.isBlank(xml)) { + return null; } - return sb.toString(); + + return xml.codePoints() + .filter(Verifier::isXMLCharacter) + .collect(StringBuilder::new, StringBuilder::appendCodePoint, StringBuilder::append) + .toString(); } // https://stackoverflow.com/a/40836618 - public String replaceHtmlEntitiesWithNumericEntities(String source) { + String replaceHtmlEntitiesWithNumericEntities(String source) { + if (StringUtils.isBlank(source)) { + return null; + } + // Create a buffer sufficiently large that re-allocations are minimized. StringBuilder sb = new StringBuilder(source.length() << 1); @@ -63,7 +69,11 @@ class FeedCleaner { return sb.toString(); } - public String removeDoctypeDeclarations(String xml) { + String removeDoctypeDeclarations(String xml) { + if (StringUtils.isBlank(xml)) { + return null; + } + return DOCTYPE_PATTERN.matcher(xml).replaceAll(""); } diff --git a/commafeed-server/src/main/java/com/commafeed/backend/feed/parser/FeedParser.java b/commafeed-server/src/main/java/com/commafeed/backend/feed/parser/FeedParser.java index 188ec477..f61c1557 100644 --- a/commafeed-server/src/main/java/com/commafeed/backend/feed/parser/FeedParser.java +++ b/commafeed-server/src/main/java/com/commafeed/backend/feed/parser/FeedParser.java @@ -58,12 +58,11 @@ public class FeedParser { public FeedParserResult parse(String feedUrl, byte[] xml) throws FeedParsingException { try { Charset encoding = encodingDetector.getEncoding(xml); - String xmlString = feedCleaner.trimInvalidXmlCharacters(new String(xml, encoding)); + + String xmlString = feedCleaner.clean(new String(xml, encoding)); if (xmlString == null) { - throw new FeedParsingException("Input string is null for url " + feedUrl); + throw new FeedParsingException("Input string is empty for url " + feedUrl); } - xmlString = feedCleaner.replaceHtmlEntitiesWithNumericEntities(xmlString); - xmlString = feedCleaner.removeDoctypeDeclarations(xmlString); InputSource source = new InputSource(new StringReader(xmlString)); SyndFeed feed = new SyndFeedInput().build(source); diff --git a/commafeed-server/src/test/java/com/commafeed/backend/feed/parser/FeedCleanerTest.java b/commafeed-server/src/test/java/com/commafeed/backend/feed/parser/FeedCleanerTest.java index be575068..d1afafd2 100644 --- a/commafeed-server/src/test/java/com/commafeed/backend/feed/parser/FeedCleanerTest.java +++ b/commafeed-server/src/test/java/com/commafeed/backend/feed/parser/FeedCleanerTest.java @@ -1,34 +1,271 @@ package com.commafeed.backend.feed.parser; import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Nested; import org.junit.jupiter.api.Test; class FeedCleanerTest { FeedCleaner feedCleaner = new FeedCleaner(); - @Test - void testReplaceHtmlEntitiesWithNumericEntities() { - String source = "T´l´phone ′"; - Assertions.assertEquals("T´l´phone ′", feedCleaner.replaceHtmlEntitiesWithNumericEntities(source)); + @Nested + class RemoveCharactersBeforeFirstXmlTag { + @Test + void removesWhitespaceBeforeXmlTag() { + String xml = " \n\tcontent"; + Assertions.assertEquals("content", feedCleaner.removeCharactersBeforeFirstXmlTag(xml)); + } + + @Test + void removesTextBeforeXmlTag() { + String xml = "some text herecontent"; + Assertions.assertEquals("content", feedCleaner.removeCharactersBeforeFirstXmlTag(xml)); + } + + @Test + void returnsUnchangedWhenStartsWithXmlTag() { + String xml = "content"; + Assertions.assertEquals("content", feedCleaner.removeCharactersBeforeFirstXmlTag(xml)); + } + + @Test + void returnsNullWhenNoXmlTagFound() { + String xml = "no xml tags here"; + Assertions.assertNull(feedCleaner.removeCharactersBeforeFirstXmlTag(xml)); + } + + @Test + void returnsNullWhenInputIsNull() { + Assertions.assertNull(feedCleaner.removeCharactersBeforeFirstXmlTag(null)); + } + + @Test + void returnsNullWhenInputIsEmpty() { + Assertions.assertNull(feedCleaner.removeCharactersBeforeFirstXmlTag("")); + } + + @Test + void returnsNullWhenInputIsBlank() { + Assertions.assertNull(feedCleaner.removeCharactersBeforeFirstXmlTag(" \n\t ")); + } + + @Test + void preservesMultipleXmlTags() { + String xml = "garbagecontent"; + Assertions.assertEquals("content", feedCleaner.removeCharactersBeforeFirstXmlTag(xml)); + } } - @Test - void testRemoveDoctype() { - String source = ""; - Assertions.assertEquals("", feedCleaner.removeDoctypeDeclarations(source)); + @Nested + class RemoveInvalidXmlCharacters { + @Test + void removesNullCharacter() { + String xml = "content\u0000here"; + Assertions.assertEquals("contenthere", feedCleaner.removeInvalidXmlCharacters(xml)); + } + + @Test + void removesInvalidControlCharacters() { + String xml = "content\u0001\u0002\u0003here"; + Assertions.assertEquals("contenthere", feedCleaner.removeInvalidXmlCharacters(xml)); + } + + @Test + void preservesValidXmlCharacters() { + String xml = "content with\ttab\nand newline"; + Assertions.assertEquals("content with\ttab\nand newline", feedCleaner.removeInvalidXmlCharacters(xml)); + } + + @Test + void preservesUnicodeCharacters() { + String xml = "café résumé 中文 العربية"; + Assertions.assertEquals("café résumé 中文 العربية", feedCleaner.removeInvalidXmlCharacters(xml)); + } + + @Test + void preservesEmojiCharacters() { + String xml = "🎮💪✅"; + Assertions.assertEquals("🎮💪✅", feedCleaner.removeInvalidXmlCharacters(xml)); + } + + @Test + void removesMultipleInvalidCharacters() { + String xml = "test\u0000test\u0001test\u0002test"; + Assertions.assertEquals("testtesttesttest", feedCleaner.removeInvalidXmlCharacters(xml)); + } + + @Test + void returnsNullWhenInputIsNull() { + Assertions.assertNull(feedCleaner.removeInvalidXmlCharacters(null)); + } + + @Test + void returnsNullWhenInputIsEmpty() { + Assertions.assertNull(feedCleaner.removeInvalidXmlCharacters("")); + } + + @Test + void returnsNullWhenInputIsBlank() { + Assertions.assertNull(feedCleaner.removeInvalidXmlCharacters(" ")); + } + + @Test + void handlesStringWithOnlyInvalidCharacters() { + String xml = "\u0000\u0001\u0002"; + Assertions.assertEquals("", feedCleaner.removeInvalidXmlCharacters(xml)); + } } - @Test - void testRemoveMultilineDoctype() { - String source = """ - - """; - Assertions.assertEquals(""" + @Nested + class Entities { + @Test + void testReplaceHtmlEntitiesWithNumericEntities() { + String source = "T´l´phone ′"; + Assertions.assertEquals("T´l´phone ′", + feedCleaner.replaceHtmlEntitiesWithNumericEntities(source)); + } - """, feedCleaner.removeDoctypeDeclarations(source)); + @Test + void replacesMultipleOccurrencesOfSameEntity() { + String source = "   "; + Assertions.assertEquals("   ", feedCleaner.replaceHtmlEntitiesWithNumericEntities(source)); + } + + @Test + void preservesTextWithoutEntities() { + String source = "regular content"; + Assertions.assertEquals("regular content", feedCleaner.replaceHtmlEntitiesWithNumericEntities(source)); + } + + @Test + void preservesNumericEntities() { + String source = "´′"; + Assertions.assertEquals("´′", feedCleaner.replaceHtmlEntitiesWithNumericEntities(source)); + } + + @Test + void replacesCommonHtmlEntities() { + String source = "&""; + Assertions.assertEquals("&"", feedCleaner.replaceHtmlEntitiesWithNumericEntities(source)); + } + + @Test + void handlesPartialEntityMatches() { + String source = "&lifier"; + String result = feedCleaner.replaceHtmlEntitiesWithNumericEntities(source); + Assertions.assertTrue(result.startsWith("&") || result.equals("&lifier")); + } + + @Test + void returnsNullWhenInputIsNull() { + Assertions.assertNull(feedCleaner.replaceHtmlEntitiesWithNumericEntities(null)); + } + + @Test + void returnsNullWhenInputIsEmpty() { + Assertions.assertNull(feedCleaner.replaceHtmlEntitiesWithNumericEntities("")); + } + + @Test + void returnsNullWhenInputIsBlank() { + Assertions.assertNull(feedCleaner.replaceHtmlEntitiesWithNumericEntities(" ")); + } + + @Test + void handlesEntityAtStartOfString() { + String source = "&test"; + Assertions.assertEquals("&test", feedCleaner.replaceHtmlEntitiesWithNumericEntities(source)); + } + + @Test + void handlesEntityAtEndOfString() { + String source = "test&"; + Assertions.assertEquals("test&", feedCleaner.replaceHtmlEntitiesWithNumericEntities(source)); + } + + @Test + void handlesMixedEntitiesAndText() { + String source = "Hello World! Test."; + String result = feedCleaner.replaceHtmlEntitiesWithNumericEntities(source); + Assertions.assertTrue(result.contains("&#")); + } } -} \ No newline at end of file + @Nested + class Doctype { + @Test + void testRemoveDoctype() { + String source = ""; + Assertions.assertEquals("", feedCleaner.removeDoctypeDeclarations(source)); + } + + @Test + void testRemoveMultilineDoctype() { + String source = """ + + """; + Assertions.assertEquals(""" + + """, feedCleaner.removeDoctypeDeclarations(source)); + } + + @Test + void removesComplexDoctypeWithSystemId() { + String source = ""; + Assertions.assertEquals("", feedCleaner.removeDoctypeDeclarations(source)); + } + + @Test + void removesComplexDoctypeWithPublicId() { + String source = ""; + Assertions.assertEquals("", feedCleaner.removeDoctypeDeclarations(source)); + } + + @Test + void removesCaseInsensitiveDoctype() { + String source = ""; + Assertions.assertEquals("", feedCleaner.removeDoctypeDeclarations(source)); + } + + @Test + void removesMixedCaseDoctype() { + String source = ""; + Assertions.assertEquals("", feedCleaner.removeDoctypeDeclarations(source)); + } + + @Test + void removesMultipleDoctypeDeclarations() { + String source = ""; + Assertions.assertEquals("", feedCleaner.removeDoctypeDeclarations(source)); + } + + @Test + void preservesContentWithoutDoctype() { + String source = "No doctype here"; + Assertions.assertEquals("No doctype here", feedCleaner.removeDoctypeDeclarations(source)); + } + + @Test + void returnsNullWhenInputIsNull() { + Assertions.assertNull(feedCleaner.removeDoctypeDeclarations(null)); + } + + @Test + void returnsNullWhenInputIsEmpty() { + Assertions.assertNull(feedCleaner.removeDoctypeDeclarations("")); + } + + @Test + void returnsNullWhenInputIsBlank() { + Assertions.assertNull(feedCleaner.removeDoctypeDeclarations(" ")); + } + + @Test + void handlesDoctypeWithExtraWhitespace() { + String source = ""; + Assertions.assertEquals("", feedCleaner.removeDoctypeDeclarations(source)); + } + } +}