mirror of
https://github.com/Athou/commafeed.git
synced 2026-03-21 21:37:29 +00:00
add support for more emojis (#1955)
This commit is contained in:
@@ -8,41 +8,47 @@ import jakarta.inject.Singleton;
|
|||||||
import org.ahocorasick.trie.Emit;
|
import org.ahocorasick.trie.Emit;
|
||||||
import org.ahocorasick.trie.Trie;
|
import org.ahocorasick.trie.Trie;
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
import org.jdom2.Verifier;
|
||||||
|
|
||||||
@Singleton
|
@Singleton
|
||||||
class FeedCleaner {
|
class FeedCleaner {
|
||||||
|
|
||||||
private static final Pattern DOCTYPE_PATTERN = Pattern.compile("<!DOCTYPE[^>]*>", Pattern.CASE_INSENSITIVE);
|
private static final Pattern DOCTYPE_PATTERN = Pattern.compile("<!DOCTYPE[^>]*>", Pattern.CASE_INSENSITIVE);
|
||||||
|
|
||||||
public String trimInvalidXmlCharacters(String xml) {
|
public String clean(String xml) {
|
||||||
|
xml = removeCharactersBeforeFirstXmlTag(xml);
|
||||||
|
xml = removeInvalidXmlCharacters(xml);
|
||||||
|
xml = replaceHtmlEntitiesWithNumericEntities(xml);
|
||||||
|
xml = removeDoctypeDeclarations(xml);
|
||||||
|
return xml;
|
||||||
|
}
|
||||||
|
|
||||||
|
String removeCharactersBeforeFirstXmlTag(String xml) {
|
||||||
if (StringUtils.isBlank(xml)) {
|
if (StringUtils.isBlank(xml)) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
StringBuilder sb = new StringBuilder();
|
|
||||||
|
|
||||||
boolean firstTagFound = false;
|
int pos = xml.indexOf('<');
|
||||||
for (int i = 0; i < xml.length(); i++) {
|
return pos < 0 ? null : xml.substring(pos);
|
||||||
char c = xml.charAt(i);
|
|
||||||
|
|
||||||
if (!firstTagFound) {
|
|
||||||
if (c == '<') {
|
|
||||||
firstTagFound = true;
|
|
||||||
} else {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (c >= 32 || c == 9 || c == 10 || c == 13) {
|
String removeInvalidXmlCharacters(String xml) {
|
||||||
if (!Character.isHighSurrogate(c) && !Character.isLowSurrogate(c)) {
|
if (StringUtils.isBlank(xml)) {
|
||||||
sb.append(c);
|
return null;
|
||||||
}
|
}
|
||||||
}
|
|
||||||
}
|
return xml.codePoints()
|
||||||
return sb.toString();
|
.filter(Verifier::isXMLCharacter)
|
||||||
|
.collect(StringBuilder::new, StringBuilder::appendCodePoint, StringBuilder::append)
|
||||||
|
.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
// https://stackoverflow.com/a/40836618
|
// https://stackoverflow.com/a/40836618
|
||||||
public String replaceHtmlEntitiesWithNumericEntities(String source) {
|
String replaceHtmlEntitiesWithNumericEntities(String source) {
|
||||||
|
if (StringUtils.isBlank(source)) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
// Create a buffer sufficiently large that re-allocations are minimized.
|
// Create a buffer sufficiently large that re-allocations are minimized.
|
||||||
StringBuilder sb = new StringBuilder(source.length() << 1);
|
StringBuilder sb = new StringBuilder(source.length() << 1);
|
||||||
|
|
||||||
@@ -63,7 +69,11 @@ class FeedCleaner {
|
|||||||
return sb.toString();
|
return sb.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
public String removeDoctypeDeclarations(String xml) {
|
String removeDoctypeDeclarations(String xml) {
|
||||||
|
if (StringUtils.isBlank(xml)) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
return DOCTYPE_PATTERN.matcher(xml).replaceAll("");
|
return DOCTYPE_PATTERN.matcher(xml).replaceAll("");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -58,12 +58,11 @@ public class FeedParser {
|
|||||||
public FeedParserResult parse(String feedUrl, byte[] xml) throws FeedParsingException {
|
public FeedParserResult parse(String feedUrl, byte[] xml) throws FeedParsingException {
|
||||||
try {
|
try {
|
||||||
Charset encoding = encodingDetector.getEncoding(xml);
|
Charset encoding = encodingDetector.getEncoding(xml);
|
||||||
String xmlString = feedCleaner.trimInvalidXmlCharacters(new String(xml, encoding));
|
|
||||||
|
String xmlString = feedCleaner.clean(new String(xml, encoding));
|
||||||
if (xmlString == null) {
|
if (xmlString == null) {
|
||||||
throw new FeedParsingException("Input string is null for url " + feedUrl);
|
throw new FeedParsingException("Input string is empty for url " + feedUrl);
|
||||||
}
|
}
|
||||||
xmlString = feedCleaner.replaceHtmlEntitiesWithNumericEntities(xmlString);
|
|
||||||
xmlString = feedCleaner.removeDoctypeDeclarations(xmlString);
|
|
||||||
|
|
||||||
InputSource source = new InputSource(new StringReader(xmlString));
|
InputSource source = new InputSource(new StringReader(xmlString));
|
||||||
SyndFeed feed = new SyndFeedInput().build(source);
|
SyndFeed feed = new SyndFeedInput().build(source);
|
||||||
|
|||||||
@@ -1,18 +1,198 @@
|
|||||||
package com.commafeed.backend.feed.parser;
|
package com.commafeed.backend.feed.parser;
|
||||||
|
|
||||||
import org.junit.jupiter.api.Assertions;
|
import org.junit.jupiter.api.Assertions;
|
||||||
|
import org.junit.jupiter.api.Nested;
|
||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
class FeedCleanerTest {
|
class FeedCleanerTest {
|
||||||
|
|
||||||
FeedCleaner feedCleaner = new FeedCleaner();
|
FeedCleaner feedCleaner = new FeedCleaner();
|
||||||
|
|
||||||
|
@Nested
|
||||||
|
class RemoveCharactersBeforeFirstXmlTag {
|
||||||
|
@Test
|
||||||
|
void removesWhitespaceBeforeXmlTag() {
|
||||||
|
String xml = " \n\t<feed>content</feed>";
|
||||||
|
Assertions.assertEquals("<feed>content</feed>", feedCleaner.removeCharactersBeforeFirstXmlTag(xml));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void removesTextBeforeXmlTag() {
|
||||||
|
String xml = "some text here<feed>content</feed>";
|
||||||
|
Assertions.assertEquals("<feed>content</feed>", feedCleaner.removeCharactersBeforeFirstXmlTag(xml));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void returnsUnchangedWhenStartsWithXmlTag() {
|
||||||
|
String xml = "<feed>content</feed>";
|
||||||
|
Assertions.assertEquals("<feed>content</feed>", feedCleaner.removeCharactersBeforeFirstXmlTag(xml));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void returnsNullWhenNoXmlTagFound() {
|
||||||
|
String xml = "no xml tags here";
|
||||||
|
Assertions.assertNull(feedCleaner.removeCharactersBeforeFirstXmlTag(xml));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void returnsNullWhenInputIsNull() {
|
||||||
|
Assertions.assertNull(feedCleaner.removeCharactersBeforeFirstXmlTag(null));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void returnsNullWhenInputIsEmpty() {
|
||||||
|
Assertions.assertNull(feedCleaner.removeCharactersBeforeFirstXmlTag(""));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void returnsNullWhenInputIsBlank() {
|
||||||
|
Assertions.assertNull(feedCleaner.removeCharactersBeforeFirstXmlTag(" \n\t "));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void preservesMultipleXmlTags() {
|
||||||
|
String xml = "garbage<feed><item>content</item></feed>";
|
||||||
|
Assertions.assertEquals("<feed><item>content</item></feed>", feedCleaner.removeCharactersBeforeFirstXmlTag(xml));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Nested
|
||||||
|
class RemoveInvalidXmlCharacters {
|
||||||
|
@Test
|
||||||
|
void removesNullCharacter() {
|
||||||
|
String xml = "<feed>content\u0000here</feed>";
|
||||||
|
Assertions.assertEquals("<feed>contenthere</feed>", feedCleaner.removeInvalidXmlCharacters(xml));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void removesInvalidControlCharacters() {
|
||||||
|
String xml = "<feed>content\u0001\u0002\u0003here</feed>";
|
||||||
|
Assertions.assertEquals("<feed>contenthere</feed>", feedCleaner.removeInvalidXmlCharacters(xml));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void preservesValidXmlCharacters() {
|
||||||
|
String xml = "<feed>content with\ttab\nand newline</feed>";
|
||||||
|
Assertions.assertEquals("<feed>content with\ttab\nand newline</feed>", feedCleaner.removeInvalidXmlCharacters(xml));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void preservesUnicodeCharacters() {
|
||||||
|
String xml = "<feed>café résumé 中文 العربية</feed>";
|
||||||
|
Assertions.assertEquals("<feed>café résumé 中文 العربية</feed>", feedCleaner.removeInvalidXmlCharacters(xml));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void preservesEmojiCharacters() {
|
||||||
|
String xml = "<feed>🎮💪✅</feed>";
|
||||||
|
Assertions.assertEquals("<feed>🎮💪✅</feed>", feedCleaner.removeInvalidXmlCharacters(xml));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void removesMultipleInvalidCharacters() {
|
||||||
|
String xml = "test\u0000test\u0001test\u0002test";
|
||||||
|
Assertions.assertEquals("testtesttesttest", feedCleaner.removeInvalidXmlCharacters(xml));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void returnsNullWhenInputIsNull() {
|
||||||
|
Assertions.assertNull(feedCleaner.removeInvalidXmlCharacters(null));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void returnsNullWhenInputIsEmpty() {
|
||||||
|
Assertions.assertNull(feedCleaner.removeInvalidXmlCharacters(""));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void returnsNullWhenInputIsBlank() {
|
||||||
|
Assertions.assertNull(feedCleaner.removeInvalidXmlCharacters(" "));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void handlesStringWithOnlyInvalidCharacters() {
|
||||||
|
String xml = "\u0000\u0001\u0002";
|
||||||
|
Assertions.assertEquals("", feedCleaner.removeInvalidXmlCharacters(xml));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Nested
|
||||||
|
class Entities {
|
||||||
@Test
|
@Test
|
||||||
void testReplaceHtmlEntitiesWithNumericEntities() {
|
void testReplaceHtmlEntitiesWithNumericEntities() {
|
||||||
String source = "<source>T´l´phone ′</source>";
|
String source = "<source>T´l´phone ′</source>";
|
||||||
Assertions.assertEquals("<source>T´l´phone ′</source>", feedCleaner.replaceHtmlEntitiesWithNumericEntities(source));
|
Assertions.assertEquals("<source>T´l´phone ′</source>",
|
||||||
|
feedCleaner.replaceHtmlEntitiesWithNumericEntities(source));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void replacesMultipleOccurrencesOfSameEntity() {
|
||||||
|
String source = " ";
|
||||||
|
Assertions.assertEquals("   ", feedCleaner.replaceHtmlEntitiesWithNumericEntities(source));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void preservesTextWithoutEntities() {
|
||||||
|
String source = "<feed>regular content</feed>";
|
||||||
|
Assertions.assertEquals("<feed>regular content</feed>", feedCleaner.replaceHtmlEntitiesWithNumericEntities(source));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void preservesNumericEntities() {
|
||||||
|
String source = "´′";
|
||||||
|
Assertions.assertEquals("´′", feedCleaner.replaceHtmlEntitiesWithNumericEntities(source));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void replacesCommonHtmlEntities() {
|
||||||
|
String source = "&"";
|
||||||
|
Assertions.assertEquals("&"", feedCleaner.replaceHtmlEntitiesWithNumericEntities(source));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void handlesPartialEntityMatches() {
|
||||||
|
String source = "&lifier";
|
||||||
|
String result = feedCleaner.replaceHtmlEntitiesWithNumericEntities(source);
|
||||||
|
Assertions.assertTrue(result.startsWith("&") || result.equals("&lifier"));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void returnsNullWhenInputIsNull() {
|
||||||
|
Assertions.assertNull(feedCleaner.replaceHtmlEntitiesWithNumericEntities(null));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void returnsNullWhenInputIsEmpty() {
|
||||||
|
Assertions.assertNull(feedCleaner.replaceHtmlEntitiesWithNumericEntities(""));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void returnsNullWhenInputIsBlank() {
|
||||||
|
Assertions.assertNull(feedCleaner.replaceHtmlEntitiesWithNumericEntities(" "));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void handlesEntityAtStartOfString() {
|
||||||
|
String source = "&test";
|
||||||
|
Assertions.assertEquals("&test", feedCleaner.replaceHtmlEntitiesWithNumericEntities(source));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void handlesEntityAtEndOfString() {
|
||||||
|
String source = "test&";
|
||||||
|
Assertions.assertEquals("test&", feedCleaner.replaceHtmlEntitiesWithNumericEntities(source));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void handlesMixedEntitiesAndText() {
|
||||||
|
String source = "Hello World! Test.";
|
||||||
|
String result = feedCleaner.replaceHtmlEntitiesWithNumericEntities(source);
|
||||||
|
Assertions.assertTrue(result.contains("&#"));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Nested
|
||||||
|
class Doctype {
|
||||||
@Test
|
@Test
|
||||||
void testRemoveDoctype() {
|
void testRemoveDoctype() {
|
||||||
String source = "<!DOCTYPE html><html><head></head><body></body></html>";
|
String source = "<!DOCTYPE html><html><head></head><body></body></html>";
|
||||||
@@ -31,4 +211,61 @@ class FeedCleanerTest {
|
|||||||
<html><head></head><body></body></html>""", feedCleaner.removeDoctypeDeclarations(source));
|
<html><head></head><body></body></html>""", feedCleaner.removeDoctypeDeclarations(source));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void removesComplexDoctypeWithSystemId() {
|
||||||
|
String source = "<!DOCTYPE html SYSTEM \"about:legacy-compat\"><html><body></body></html>";
|
||||||
|
Assertions.assertEquals("<html><body></body></html>", feedCleaner.removeDoctypeDeclarations(source));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void removesComplexDoctypeWithPublicId() {
|
||||||
|
String source = "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\"><html></html>";
|
||||||
|
Assertions.assertEquals("<html></html>", feedCleaner.removeDoctypeDeclarations(source));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void removesCaseInsensitiveDoctype() {
|
||||||
|
String source = "<!doctype html><html></html>";
|
||||||
|
Assertions.assertEquals("<html></html>", feedCleaner.removeDoctypeDeclarations(source));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void removesMixedCaseDoctype() {
|
||||||
|
String source = "<!DoCtYpE html><html></html>";
|
||||||
|
Assertions.assertEquals("<html></html>", feedCleaner.removeDoctypeDeclarations(source));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void removesMultipleDoctypeDeclarations() {
|
||||||
|
String source = "<!DOCTYPE html><!DOCTYPE html><html></html>";
|
||||||
|
Assertions.assertEquals("<html></html>", feedCleaner.removeDoctypeDeclarations(source));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void preservesContentWithoutDoctype() {
|
||||||
|
String source = "<html><body>No doctype here</body></html>";
|
||||||
|
Assertions.assertEquals("<html><body>No doctype here</body></html>", feedCleaner.removeDoctypeDeclarations(source));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void returnsNullWhenInputIsNull() {
|
||||||
|
Assertions.assertNull(feedCleaner.removeDoctypeDeclarations(null));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void returnsNullWhenInputIsEmpty() {
|
||||||
|
Assertions.assertNull(feedCleaner.removeDoctypeDeclarations(""));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void returnsNullWhenInputIsBlank() {
|
||||||
|
Assertions.assertNull(feedCleaner.removeDoctypeDeclarations(" "));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void handlesDoctypeWithExtraWhitespace() {
|
||||||
|
String source = "<!DOCTYPE html ><html></html>";
|
||||||
|
Assertions.assertEquals("<html></html>", feedCleaner.removeDoctypeDeclarations(source));
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
Reference in New Issue
Block a user