add support for more emojis (#1955)

This commit is contained in:
Athou
2025-11-16 09:04:37 +01:00
parent f927247955
commit e5c271ca1c
3 changed files with 289 additions and 43 deletions

View File

@@ -8,41 +8,47 @@ import jakarta.inject.Singleton;
import org.ahocorasick.trie.Emit;
import org.ahocorasick.trie.Trie;
import org.apache.commons.lang3.StringUtils;
import org.jdom2.Verifier;
@Singleton
class FeedCleaner {
private static final Pattern DOCTYPE_PATTERN = Pattern.compile("<!DOCTYPE[^>]*>", Pattern.CASE_INSENSITIVE);
public String trimInvalidXmlCharacters(String xml) {
public String clean(String xml) {
xml = removeCharactersBeforeFirstXmlTag(xml);
xml = removeInvalidXmlCharacters(xml);
xml = replaceHtmlEntitiesWithNumericEntities(xml);
xml = removeDoctypeDeclarations(xml);
return xml;
}
String removeCharactersBeforeFirstXmlTag(String xml) {
if (StringUtils.isBlank(xml)) {
return null;
}
StringBuilder sb = new StringBuilder();
boolean firstTagFound = false;
for (int i = 0; i < xml.length(); i++) {
char c = xml.charAt(i);
int pos = xml.indexOf('<');
return pos < 0 ? null : xml.substring(pos);
}
if (!firstTagFound) {
if (c == '<') {
firstTagFound = true;
} else {
continue;
}
}
if (c >= 32 || c == 9 || c == 10 || c == 13) {
if (!Character.isHighSurrogate(c) && !Character.isLowSurrogate(c)) {
sb.append(c);
}
}
String removeInvalidXmlCharacters(String xml) {
if (StringUtils.isBlank(xml)) {
return null;
}
return sb.toString();
return xml.codePoints()
.filter(Verifier::isXMLCharacter)
.collect(StringBuilder::new, StringBuilder::appendCodePoint, StringBuilder::append)
.toString();
}
// https://stackoverflow.com/a/40836618
public String replaceHtmlEntitiesWithNumericEntities(String source) {
String replaceHtmlEntitiesWithNumericEntities(String source) {
if (StringUtils.isBlank(source)) {
return null;
}
// Create a buffer sufficiently large that re-allocations are minimized.
StringBuilder sb = new StringBuilder(source.length() << 1);
@@ -63,7 +69,11 @@ class FeedCleaner {
return sb.toString();
}
public String removeDoctypeDeclarations(String xml) {
String removeDoctypeDeclarations(String xml) {
if (StringUtils.isBlank(xml)) {
return null;
}
return DOCTYPE_PATTERN.matcher(xml).replaceAll("");
}

View File

@@ -58,12 +58,11 @@ public class FeedParser {
public FeedParserResult parse(String feedUrl, byte[] xml) throws FeedParsingException {
try {
Charset encoding = encodingDetector.getEncoding(xml);
String xmlString = feedCleaner.trimInvalidXmlCharacters(new String(xml, encoding));
String xmlString = feedCleaner.clean(new String(xml, encoding));
if (xmlString == null) {
throw new FeedParsingException("Input string is null for url " + feedUrl);
throw new FeedParsingException("Input string is empty for url " + feedUrl);
}
xmlString = feedCleaner.replaceHtmlEntitiesWithNumericEntities(xmlString);
xmlString = feedCleaner.removeDoctypeDeclarations(xmlString);
InputSource source = new InputSource(new StringReader(xmlString));
SyndFeed feed = new SyndFeedInput().build(source);

View File

@@ -1,34 +1,271 @@
package com.commafeed.backend.feed.parser;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Nested;
import org.junit.jupiter.api.Test;
class FeedCleanerTest {
FeedCleaner feedCleaner = new FeedCleaner();
@Test
void testReplaceHtmlEntitiesWithNumericEntities() {
String source = "<source>T&acute;l&acute;phone &prime;</source>";
Assertions.assertEquals("<source>T&#180;l&#180;phone &#8242;</source>", feedCleaner.replaceHtmlEntitiesWithNumericEntities(source));
@Nested
class RemoveCharactersBeforeFirstXmlTag {
@Test
void removesWhitespaceBeforeXmlTag() {
String xml = " \n\t<feed>content</feed>";
Assertions.assertEquals("<feed>content</feed>", feedCleaner.removeCharactersBeforeFirstXmlTag(xml));
}
@Test
void removesTextBeforeXmlTag() {
String xml = "some text here<feed>content</feed>";
Assertions.assertEquals("<feed>content</feed>", feedCleaner.removeCharactersBeforeFirstXmlTag(xml));
}
@Test
void returnsUnchangedWhenStartsWithXmlTag() {
String xml = "<feed>content</feed>";
Assertions.assertEquals("<feed>content</feed>", feedCleaner.removeCharactersBeforeFirstXmlTag(xml));
}
@Test
void returnsNullWhenNoXmlTagFound() {
String xml = "no xml tags here";
Assertions.assertNull(feedCleaner.removeCharactersBeforeFirstXmlTag(xml));
}
@Test
void returnsNullWhenInputIsNull() {
Assertions.assertNull(feedCleaner.removeCharactersBeforeFirstXmlTag(null));
}
@Test
void returnsNullWhenInputIsEmpty() {
Assertions.assertNull(feedCleaner.removeCharactersBeforeFirstXmlTag(""));
}
@Test
void returnsNullWhenInputIsBlank() {
Assertions.assertNull(feedCleaner.removeCharactersBeforeFirstXmlTag(" \n\t "));
}
@Test
void preservesMultipleXmlTags() {
String xml = "garbage<feed><item>content</item></feed>";
Assertions.assertEquals("<feed><item>content</item></feed>", feedCleaner.removeCharactersBeforeFirstXmlTag(xml));
}
}
@Test
void testRemoveDoctype() {
String source = "<!DOCTYPE html><html><head></head><body></body></html>";
Assertions.assertEquals("<html><head></head><body></body></html>", feedCleaner.removeDoctypeDeclarations(source));
@Nested
class RemoveInvalidXmlCharacters {
@Test
void removesNullCharacter() {
String xml = "<feed>content\u0000here</feed>";
Assertions.assertEquals("<feed>contenthere</feed>", feedCleaner.removeInvalidXmlCharacters(xml));
}
@Test
void removesInvalidControlCharacters() {
String xml = "<feed>content\u0001\u0002\u0003here</feed>";
Assertions.assertEquals("<feed>contenthere</feed>", feedCleaner.removeInvalidXmlCharacters(xml));
}
@Test
void preservesValidXmlCharacters() {
String xml = "<feed>content with\ttab\nand newline</feed>";
Assertions.assertEquals("<feed>content with\ttab\nand newline</feed>", feedCleaner.removeInvalidXmlCharacters(xml));
}
@Test
void preservesUnicodeCharacters() {
String xml = "<feed>café résumé 中文 العربية</feed>";
Assertions.assertEquals("<feed>café résumé 中文 العربية</feed>", feedCleaner.removeInvalidXmlCharacters(xml));
}
@Test
void preservesEmojiCharacters() {
String xml = "<feed>🎮💪✅</feed>";
Assertions.assertEquals("<feed>🎮💪✅</feed>", feedCleaner.removeInvalidXmlCharacters(xml));
}
@Test
void removesMultipleInvalidCharacters() {
String xml = "test\u0000test\u0001test\u0002test";
Assertions.assertEquals("testtesttesttest", feedCleaner.removeInvalidXmlCharacters(xml));
}
@Test
void returnsNullWhenInputIsNull() {
Assertions.assertNull(feedCleaner.removeInvalidXmlCharacters(null));
}
@Test
void returnsNullWhenInputIsEmpty() {
Assertions.assertNull(feedCleaner.removeInvalidXmlCharacters(""));
}
@Test
void returnsNullWhenInputIsBlank() {
Assertions.assertNull(feedCleaner.removeInvalidXmlCharacters(" "));
}
@Test
void handlesStringWithOnlyInvalidCharacters() {
String xml = "\u0000\u0001\u0002";
Assertions.assertEquals("", feedCleaner.removeInvalidXmlCharacters(xml));
}
}
@Test
void testRemoveMultilineDoctype() {
String source = """
<!DOCTYPE
html
>
<html><head></head><body></body></html>""";
Assertions.assertEquals("""
@Nested
class Entities {
@Test
void testReplaceHtmlEntitiesWithNumericEntities() {
String source = "<source>T&acute;l&acute;phone &prime;</source>";
Assertions.assertEquals("<source>T&#180;l&#180;phone &#8242;</source>",
feedCleaner.replaceHtmlEntitiesWithNumericEntities(source));
}
<html><head></head><body></body></html>""", feedCleaner.removeDoctypeDeclarations(source));
@Test
void replacesMultipleOccurrencesOfSameEntity() {
String source = "&nbsp;&nbsp;&nbsp;";
Assertions.assertEquals("&#160;&#160;&#160;", feedCleaner.replaceHtmlEntitiesWithNumericEntities(source));
}
@Test
void preservesTextWithoutEntities() {
String source = "<feed>regular content</feed>";
Assertions.assertEquals("<feed>regular content</feed>", feedCleaner.replaceHtmlEntitiesWithNumericEntities(source));
}
@Test
void preservesNumericEntities() {
String source = "&#180;&#8242;";
Assertions.assertEquals("&#180;&#8242;", feedCleaner.replaceHtmlEntitiesWithNumericEntities(source));
}
@Test
void replacesCommonHtmlEntities() {
String source = "&amp;&quot;";
Assertions.assertEquals("&#38;&#34;", feedCleaner.replaceHtmlEntitiesWithNumericEntities(source));
}
@Test
void handlesPartialEntityMatches() {
String source = "&amplifier";
String result = feedCleaner.replaceHtmlEntitiesWithNumericEntities(source);
Assertions.assertTrue(result.startsWith("&#38;") || result.equals("&amplifier"));
}
@Test
void returnsNullWhenInputIsNull() {
Assertions.assertNull(feedCleaner.replaceHtmlEntitiesWithNumericEntities(null));
}
@Test
void returnsNullWhenInputIsEmpty() {
Assertions.assertNull(feedCleaner.replaceHtmlEntitiesWithNumericEntities(""));
}
@Test
void returnsNullWhenInputIsBlank() {
Assertions.assertNull(feedCleaner.replaceHtmlEntitiesWithNumericEntities(" "));
}
@Test
void handlesEntityAtStartOfString() {
String source = "&amp;test";
Assertions.assertEquals("&#38;test", feedCleaner.replaceHtmlEntitiesWithNumericEntities(source));
}
@Test
void handlesEntityAtEndOfString() {
String source = "test&amp;";
Assertions.assertEquals("test&#38;", feedCleaner.replaceHtmlEntitiesWithNumericEntities(source));
}
@Test
void handlesMixedEntitiesAndText() {
String source = "Hello&nbsp;World&excl;&nbsp;Test&period;";
String result = feedCleaner.replaceHtmlEntitiesWithNumericEntities(source);
Assertions.assertTrue(result.contains("&#"));
}
}
}
@Nested
class Doctype {
@Test
void testRemoveDoctype() {
String source = "<!DOCTYPE html><html><head></head><body></body></html>";
Assertions.assertEquals("<html><head></head><body></body></html>", feedCleaner.removeDoctypeDeclarations(source));
}
@Test
void testRemoveMultilineDoctype() {
String source = """
<!DOCTYPE
html
>
<html><head></head><body></body></html>""";
Assertions.assertEquals("""
<html><head></head><body></body></html>""", feedCleaner.removeDoctypeDeclarations(source));
}
@Test
void removesComplexDoctypeWithSystemId() {
String source = "<!DOCTYPE html SYSTEM \"about:legacy-compat\"><html><body></body></html>";
Assertions.assertEquals("<html><body></body></html>", feedCleaner.removeDoctypeDeclarations(source));
}
@Test
void removesComplexDoctypeWithPublicId() {
String source = "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\"><html></html>";
Assertions.assertEquals("<html></html>", feedCleaner.removeDoctypeDeclarations(source));
}
@Test
void removesCaseInsensitiveDoctype() {
String source = "<!doctype html><html></html>";
Assertions.assertEquals("<html></html>", feedCleaner.removeDoctypeDeclarations(source));
}
@Test
void removesMixedCaseDoctype() {
String source = "<!DoCtYpE html><html></html>";
Assertions.assertEquals("<html></html>", feedCleaner.removeDoctypeDeclarations(source));
}
@Test
void removesMultipleDoctypeDeclarations() {
String source = "<!DOCTYPE html><!DOCTYPE html><html></html>";
Assertions.assertEquals("<html></html>", feedCleaner.removeDoctypeDeclarations(source));
}
@Test
void preservesContentWithoutDoctype() {
String source = "<html><body>No doctype here</body></html>";
Assertions.assertEquals("<html><body>No doctype here</body></html>", feedCleaner.removeDoctypeDeclarations(source));
}
@Test
void returnsNullWhenInputIsNull() {
Assertions.assertNull(feedCleaner.removeDoctypeDeclarations(null));
}
@Test
void returnsNullWhenInputIsEmpty() {
Assertions.assertNull(feedCleaner.removeDoctypeDeclarations(""));
}
@Test
void returnsNullWhenInputIsBlank() {
Assertions.assertNull(feedCleaner.removeDoctypeDeclarations(" "));
}
@Test
void handlesDoctypeWithExtraWhitespace() {
String source = "<!DOCTYPE html ><html></html>";
Assertions.assertEquals("<html></html>", feedCleaner.removeDoctypeDeclarations(source));
}
}
}