mirror of
https://github.com/Athou/commafeed.git
synced 2026-03-21 21:37:29 +00:00
add support for more emojis (#1955)
This commit is contained in:
@@ -8,41 +8,47 @@ import jakarta.inject.Singleton;
|
||||
import org.ahocorasick.trie.Emit;
|
||||
import org.ahocorasick.trie.Trie;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.jdom2.Verifier;
|
||||
|
||||
@Singleton
|
||||
class FeedCleaner {
|
||||
|
||||
private static final Pattern DOCTYPE_PATTERN = Pattern.compile("<!DOCTYPE[^>]*>", Pattern.CASE_INSENSITIVE);
|
||||
|
||||
public String trimInvalidXmlCharacters(String xml) {
|
||||
public String clean(String xml) {
|
||||
xml = removeCharactersBeforeFirstXmlTag(xml);
|
||||
xml = removeInvalidXmlCharacters(xml);
|
||||
xml = replaceHtmlEntitiesWithNumericEntities(xml);
|
||||
xml = removeDoctypeDeclarations(xml);
|
||||
return xml;
|
||||
}
|
||||
|
||||
String removeCharactersBeforeFirstXmlTag(String xml) {
|
||||
if (StringUtils.isBlank(xml)) {
|
||||
return null;
|
||||
}
|
||||
StringBuilder sb = new StringBuilder();
|
||||
|
||||
boolean firstTagFound = false;
|
||||
for (int i = 0; i < xml.length(); i++) {
|
||||
char c = xml.charAt(i);
|
||||
int pos = xml.indexOf('<');
|
||||
return pos < 0 ? null : xml.substring(pos);
|
||||
}
|
||||
|
||||
if (!firstTagFound) {
|
||||
if (c == '<') {
|
||||
firstTagFound = true;
|
||||
} else {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
if (c >= 32 || c == 9 || c == 10 || c == 13) {
|
||||
if (!Character.isHighSurrogate(c) && !Character.isLowSurrogate(c)) {
|
||||
sb.append(c);
|
||||
}
|
||||
}
|
||||
String removeInvalidXmlCharacters(String xml) {
|
||||
if (StringUtils.isBlank(xml)) {
|
||||
return null;
|
||||
}
|
||||
return sb.toString();
|
||||
|
||||
return xml.codePoints()
|
||||
.filter(Verifier::isXMLCharacter)
|
||||
.collect(StringBuilder::new, StringBuilder::appendCodePoint, StringBuilder::append)
|
||||
.toString();
|
||||
}
|
||||
|
||||
// https://stackoverflow.com/a/40836618
|
||||
public String replaceHtmlEntitiesWithNumericEntities(String source) {
|
||||
String replaceHtmlEntitiesWithNumericEntities(String source) {
|
||||
if (StringUtils.isBlank(source)) {
|
||||
return null;
|
||||
}
|
||||
|
||||
// Create a buffer sufficiently large that re-allocations are minimized.
|
||||
StringBuilder sb = new StringBuilder(source.length() << 1);
|
||||
|
||||
@@ -63,7 +69,11 @@ class FeedCleaner {
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
public String removeDoctypeDeclarations(String xml) {
|
||||
String removeDoctypeDeclarations(String xml) {
|
||||
if (StringUtils.isBlank(xml)) {
|
||||
return null;
|
||||
}
|
||||
|
||||
return DOCTYPE_PATTERN.matcher(xml).replaceAll("");
|
||||
}
|
||||
|
||||
|
||||
@@ -58,12 +58,11 @@ public class FeedParser {
|
||||
public FeedParserResult parse(String feedUrl, byte[] xml) throws FeedParsingException {
|
||||
try {
|
||||
Charset encoding = encodingDetector.getEncoding(xml);
|
||||
String xmlString = feedCleaner.trimInvalidXmlCharacters(new String(xml, encoding));
|
||||
|
||||
String xmlString = feedCleaner.clean(new String(xml, encoding));
|
||||
if (xmlString == null) {
|
||||
throw new FeedParsingException("Input string is null for url " + feedUrl);
|
||||
throw new FeedParsingException("Input string is empty for url " + feedUrl);
|
||||
}
|
||||
xmlString = feedCleaner.replaceHtmlEntitiesWithNumericEntities(xmlString);
|
||||
xmlString = feedCleaner.removeDoctypeDeclarations(xmlString);
|
||||
|
||||
InputSource source = new InputSource(new StringReader(xmlString));
|
||||
SyndFeed feed = new SyndFeedInput().build(source);
|
||||
|
||||
@@ -1,34 +1,271 @@
|
||||
package com.commafeed.backend.feed.parser;
|
||||
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.Nested;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
class FeedCleanerTest {
|
||||
|
||||
FeedCleaner feedCleaner = new FeedCleaner();
|
||||
|
||||
@Test
|
||||
void testReplaceHtmlEntitiesWithNumericEntities() {
|
||||
String source = "<source>T´l´phone ′</source>";
|
||||
Assertions.assertEquals("<source>T´l´phone ′</source>", feedCleaner.replaceHtmlEntitiesWithNumericEntities(source));
|
||||
@Nested
|
||||
class RemoveCharactersBeforeFirstXmlTag {
|
||||
@Test
|
||||
void removesWhitespaceBeforeXmlTag() {
|
||||
String xml = " \n\t<feed>content</feed>";
|
||||
Assertions.assertEquals("<feed>content</feed>", feedCleaner.removeCharactersBeforeFirstXmlTag(xml));
|
||||
}
|
||||
|
||||
@Test
|
||||
void removesTextBeforeXmlTag() {
|
||||
String xml = "some text here<feed>content</feed>";
|
||||
Assertions.assertEquals("<feed>content</feed>", feedCleaner.removeCharactersBeforeFirstXmlTag(xml));
|
||||
}
|
||||
|
||||
@Test
|
||||
void returnsUnchangedWhenStartsWithXmlTag() {
|
||||
String xml = "<feed>content</feed>";
|
||||
Assertions.assertEquals("<feed>content</feed>", feedCleaner.removeCharactersBeforeFirstXmlTag(xml));
|
||||
}
|
||||
|
||||
@Test
|
||||
void returnsNullWhenNoXmlTagFound() {
|
||||
String xml = "no xml tags here";
|
||||
Assertions.assertNull(feedCleaner.removeCharactersBeforeFirstXmlTag(xml));
|
||||
}
|
||||
|
||||
@Test
|
||||
void returnsNullWhenInputIsNull() {
|
||||
Assertions.assertNull(feedCleaner.removeCharactersBeforeFirstXmlTag(null));
|
||||
}
|
||||
|
||||
@Test
|
||||
void returnsNullWhenInputIsEmpty() {
|
||||
Assertions.assertNull(feedCleaner.removeCharactersBeforeFirstXmlTag(""));
|
||||
}
|
||||
|
||||
@Test
|
||||
void returnsNullWhenInputIsBlank() {
|
||||
Assertions.assertNull(feedCleaner.removeCharactersBeforeFirstXmlTag(" \n\t "));
|
||||
}
|
||||
|
||||
@Test
|
||||
void preservesMultipleXmlTags() {
|
||||
String xml = "garbage<feed><item>content</item></feed>";
|
||||
Assertions.assertEquals("<feed><item>content</item></feed>", feedCleaner.removeCharactersBeforeFirstXmlTag(xml));
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
void testRemoveDoctype() {
|
||||
String source = "<!DOCTYPE html><html><head></head><body></body></html>";
|
||||
Assertions.assertEquals("<html><head></head><body></body></html>", feedCleaner.removeDoctypeDeclarations(source));
|
||||
@Nested
|
||||
class RemoveInvalidXmlCharacters {
|
||||
@Test
|
||||
void removesNullCharacter() {
|
||||
String xml = "<feed>content\u0000here</feed>";
|
||||
Assertions.assertEquals("<feed>contenthere</feed>", feedCleaner.removeInvalidXmlCharacters(xml));
|
||||
}
|
||||
|
||||
@Test
|
||||
void removesInvalidControlCharacters() {
|
||||
String xml = "<feed>content\u0001\u0002\u0003here</feed>";
|
||||
Assertions.assertEquals("<feed>contenthere</feed>", feedCleaner.removeInvalidXmlCharacters(xml));
|
||||
}
|
||||
|
||||
@Test
|
||||
void preservesValidXmlCharacters() {
|
||||
String xml = "<feed>content with\ttab\nand newline</feed>";
|
||||
Assertions.assertEquals("<feed>content with\ttab\nand newline</feed>", feedCleaner.removeInvalidXmlCharacters(xml));
|
||||
}
|
||||
|
||||
@Test
|
||||
void preservesUnicodeCharacters() {
|
||||
String xml = "<feed>café résumé 中文 العربية</feed>";
|
||||
Assertions.assertEquals("<feed>café résumé 中文 العربية</feed>", feedCleaner.removeInvalidXmlCharacters(xml));
|
||||
}
|
||||
|
||||
@Test
|
||||
void preservesEmojiCharacters() {
|
||||
String xml = "<feed>🎮💪✅</feed>";
|
||||
Assertions.assertEquals("<feed>🎮💪✅</feed>", feedCleaner.removeInvalidXmlCharacters(xml));
|
||||
}
|
||||
|
||||
@Test
|
||||
void removesMultipleInvalidCharacters() {
|
||||
String xml = "test\u0000test\u0001test\u0002test";
|
||||
Assertions.assertEquals("testtesttesttest", feedCleaner.removeInvalidXmlCharacters(xml));
|
||||
}
|
||||
|
||||
@Test
|
||||
void returnsNullWhenInputIsNull() {
|
||||
Assertions.assertNull(feedCleaner.removeInvalidXmlCharacters(null));
|
||||
}
|
||||
|
||||
@Test
|
||||
void returnsNullWhenInputIsEmpty() {
|
||||
Assertions.assertNull(feedCleaner.removeInvalidXmlCharacters(""));
|
||||
}
|
||||
|
||||
@Test
|
||||
void returnsNullWhenInputIsBlank() {
|
||||
Assertions.assertNull(feedCleaner.removeInvalidXmlCharacters(" "));
|
||||
}
|
||||
|
||||
@Test
|
||||
void handlesStringWithOnlyInvalidCharacters() {
|
||||
String xml = "\u0000\u0001\u0002";
|
||||
Assertions.assertEquals("", feedCleaner.removeInvalidXmlCharacters(xml));
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
void testRemoveMultilineDoctype() {
|
||||
String source = """
|
||||
<!DOCTYPE
|
||||
html
|
||||
>
|
||||
<html><head></head><body></body></html>""";
|
||||
Assertions.assertEquals("""
|
||||
@Nested
|
||||
class Entities {
|
||||
@Test
|
||||
void testReplaceHtmlEntitiesWithNumericEntities() {
|
||||
String source = "<source>T´l´phone ′</source>";
|
||||
Assertions.assertEquals("<source>T´l´phone ′</source>",
|
||||
feedCleaner.replaceHtmlEntitiesWithNumericEntities(source));
|
||||
}
|
||||
|
||||
<html><head></head><body></body></html>""", feedCleaner.removeDoctypeDeclarations(source));
|
||||
@Test
|
||||
void replacesMultipleOccurrencesOfSameEntity() {
|
||||
String source = " ";
|
||||
Assertions.assertEquals("   ", feedCleaner.replaceHtmlEntitiesWithNumericEntities(source));
|
||||
}
|
||||
|
||||
@Test
|
||||
void preservesTextWithoutEntities() {
|
||||
String source = "<feed>regular content</feed>";
|
||||
Assertions.assertEquals("<feed>regular content</feed>", feedCleaner.replaceHtmlEntitiesWithNumericEntities(source));
|
||||
}
|
||||
|
||||
@Test
|
||||
void preservesNumericEntities() {
|
||||
String source = "´′";
|
||||
Assertions.assertEquals("´′", feedCleaner.replaceHtmlEntitiesWithNumericEntities(source));
|
||||
}
|
||||
|
||||
@Test
|
||||
void replacesCommonHtmlEntities() {
|
||||
String source = "&"";
|
||||
Assertions.assertEquals("&"", feedCleaner.replaceHtmlEntitiesWithNumericEntities(source));
|
||||
}
|
||||
|
||||
@Test
|
||||
void handlesPartialEntityMatches() {
|
||||
String source = "&lifier";
|
||||
String result = feedCleaner.replaceHtmlEntitiesWithNumericEntities(source);
|
||||
Assertions.assertTrue(result.startsWith("&") || result.equals("&lifier"));
|
||||
}
|
||||
|
||||
@Test
|
||||
void returnsNullWhenInputIsNull() {
|
||||
Assertions.assertNull(feedCleaner.replaceHtmlEntitiesWithNumericEntities(null));
|
||||
}
|
||||
|
||||
@Test
|
||||
void returnsNullWhenInputIsEmpty() {
|
||||
Assertions.assertNull(feedCleaner.replaceHtmlEntitiesWithNumericEntities(""));
|
||||
}
|
||||
|
||||
@Test
|
||||
void returnsNullWhenInputIsBlank() {
|
||||
Assertions.assertNull(feedCleaner.replaceHtmlEntitiesWithNumericEntities(" "));
|
||||
}
|
||||
|
||||
@Test
|
||||
void handlesEntityAtStartOfString() {
|
||||
String source = "&test";
|
||||
Assertions.assertEquals("&test", feedCleaner.replaceHtmlEntitiesWithNumericEntities(source));
|
||||
}
|
||||
|
||||
@Test
|
||||
void handlesEntityAtEndOfString() {
|
||||
String source = "test&";
|
||||
Assertions.assertEquals("test&", feedCleaner.replaceHtmlEntitiesWithNumericEntities(source));
|
||||
}
|
||||
|
||||
@Test
|
||||
void handlesMixedEntitiesAndText() {
|
||||
String source = "Hello World! Test.";
|
||||
String result = feedCleaner.replaceHtmlEntitiesWithNumericEntities(source);
|
||||
Assertions.assertTrue(result.contains("&#"));
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@Nested
|
||||
class Doctype {
|
||||
@Test
|
||||
void testRemoveDoctype() {
|
||||
String source = "<!DOCTYPE html><html><head></head><body></body></html>";
|
||||
Assertions.assertEquals("<html><head></head><body></body></html>", feedCleaner.removeDoctypeDeclarations(source));
|
||||
}
|
||||
|
||||
@Test
|
||||
void testRemoveMultilineDoctype() {
|
||||
String source = """
|
||||
<!DOCTYPE
|
||||
html
|
||||
>
|
||||
<html><head></head><body></body></html>""";
|
||||
Assertions.assertEquals("""
|
||||
|
||||
<html><head></head><body></body></html>""", feedCleaner.removeDoctypeDeclarations(source));
|
||||
}
|
||||
|
||||
@Test
|
||||
void removesComplexDoctypeWithSystemId() {
|
||||
String source = "<!DOCTYPE html SYSTEM \"about:legacy-compat\"><html><body></body></html>";
|
||||
Assertions.assertEquals("<html><body></body></html>", feedCleaner.removeDoctypeDeclarations(source));
|
||||
}
|
||||
|
||||
@Test
|
||||
void removesComplexDoctypeWithPublicId() {
|
||||
String source = "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\"><html></html>";
|
||||
Assertions.assertEquals("<html></html>", feedCleaner.removeDoctypeDeclarations(source));
|
||||
}
|
||||
|
||||
@Test
|
||||
void removesCaseInsensitiveDoctype() {
|
||||
String source = "<!doctype html><html></html>";
|
||||
Assertions.assertEquals("<html></html>", feedCleaner.removeDoctypeDeclarations(source));
|
||||
}
|
||||
|
||||
@Test
|
||||
void removesMixedCaseDoctype() {
|
||||
String source = "<!DoCtYpE html><html></html>";
|
||||
Assertions.assertEquals("<html></html>", feedCleaner.removeDoctypeDeclarations(source));
|
||||
}
|
||||
|
||||
@Test
|
||||
void removesMultipleDoctypeDeclarations() {
|
||||
String source = "<!DOCTYPE html><!DOCTYPE html><html></html>";
|
||||
Assertions.assertEquals("<html></html>", feedCleaner.removeDoctypeDeclarations(source));
|
||||
}
|
||||
|
||||
@Test
|
||||
void preservesContentWithoutDoctype() {
|
||||
String source = "<html><body>No doctype here</body></html>";
|
||||
Assertions.assertEquals("<html><body>No doctype here</body></html>", feedCleaner.removeDoctypeDeclarations(source));
|
||||
}
|
||||
|
||||
@Test
|
||||
void returnsNullWhenInputIsNull() {
|
||||
Assertions.assertNull(feedCleaner.removeDoctypeDeclarations(null));
|
||||
}
|
||||
|
||||
@Test
|
||||
void returnsNullWhenInputIsEmpty() {
|
||||
Assertions.assertNull(feedCleaner.removeDoctypeDeclarations(""));
|
||||
}
|
||||
|
||||
@Test
|
||||
void returnsNullWhenInputIsBlank() {
|
||||
Assertions.assertNull(feedCleaner.removeDoctypeDeclarations(" "));
|
||||
}
|
||||
|
||||
@Test
|
||||
void handlesDoctypeWithExtraWhitespace() {
|
||||
String source = "<!DOCTYPE html ><html></html>";
|
||||
Assertions.assertEquals("<html></html>", feedCleaner.removeDoctypeDeclarations(source));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user