mirror of
https://github.com/Athou/commafeed.git
synced 2026-03-21 21:37:29 +00:00
add support for more emojis (#1955)
This commit is contained in:
@@ -8,41 +8,47 @@ import jakarta.inject.Singleton;
|
||||
import org.ahocorasick.trie.Emit;
|
||||
import org.ahocorasick.trie.Trie;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.jdom2.Verifier;
|
||||
|
||||
@Singleton
|
||||
class FeedCleaner {
|
||||
|
||||
private static final Pattern DOCTYPE_PATTERN = Pattern.compile("<!DOCTYPE[^>]*>", Pattern.CASE_INSENSITIVE);
|
||||
|
||||
public String trimInvalidXmlCharacters(String xml) {
|
||||
/**
 * Runs the raw feed text through the cleaning steps of this class, in order:
 * {@code removeCharactersBeforeFirstXmlTag}, {@code removeInvalidXmlCharacters},
 * {@code replaceHtmlEntitiesWithNumericEntities} and {@code removeDoctypeDeclarations}.
 * The output of each step is the input of the next one.
 *
 * NOTE(review): each step visible in this file starts with a
 * {@code StringUtils.isBlank} guard that returns {@code null}, so a blank input
 * (or one rejected by an earlier step) presumably yields {@code null} here as
 * well - confirm against the caller, which checks the result for {@code null}.
 *
 * @param xml
 *            the feed content to clean
 * @return the cleaned content as returned by the last step
 */
public String clean(String xml) {
|
||||
xml = removeCharactersBeforeFirstXmlTag(xml);
|
||||
xml = removeInvalidXmlCharacters(xml);
|
||||
xml = replaceHtmlEntitiesWithNumericEntities(xml);
|
||||
xml = removeDoctypeDeclarations(xml);
|
||||
return xml;
|
||||
}
|
||||
|
||||
String removeCharactersBeforeFirstXmlTag(String xml) {
|
||||
if (StringUtils.isBlank(xml)) {
|
||||
return null;
|
||||
}
|
||||
StringBuilder sb = new StringBuilder();
|
||||
|
||||
boolean firstTagFound = false;
|
||||
for (int i = 0; i < xml.length(); i++) {
|
||||
char c = xml.charAt(i);
|
||||
int pos = xml.indexOf('<');
|
||||
return pos < 0 ? null : xml.substring(pos);
|
||||
}
|
||||
|
||||
if (!firstTagFound) {
|
||||
if (c == '<') {
|
||||
firstTagFound = true;
|
||||
} else {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
if (c >= 32 || c == 9 || c == 10 || c == 13) {
|
||||
if (!Character.isHighSurrogate(c) && !Character.isLowSurrogate(c)) {
|
||||
sb.append(c);
|
||||
}
|
||||
}
|
||||
String removeInvalidXmlCharacters(String xml) {
|
||||
if (StringUtils.isBlank(xml)) {
|
||||
return null;
|
||||
}
|
||||
return sb.toString();
|
||||
|
||||
return xml.codePoints()
|
||||
.filter(Verifier::isXMLCharacter)
|
||||
.collect(StringBuilder::new, StringBuilder::appendCodePoint, StringBuilder::append)
|
||||
.toString();
|
||||
}
|
||||
|
||||
// https://stackoverflow.com/a/40836618
|
||||
public String replaceHtmlEntitiesWithNumericEntities(String source) {
|
||||
String replaceHtmlEntitiesWithNumericEntities(String source) {
|
||||
if (StringUtils.isBlank(source)) {
|
||||
return null;
|
||||
}
|
||||
|
||||
// Create a buffer sufficiently large that re-allocations are minimized.
|
||||
StringBuilder sb = new StringBuilder(source.length() << 1);
|
||||
|
||||
@@ -63,7 +69,11 @@ class FeedCleaner {
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
public String removeDoctypeDeclarations(String xml) {
|
||||
/**
 * Removes every match of {@code DOCTYPE_PATTERN} from the given text by replacing it
 * with the empty string.
 *
 * NOTE(review): the pattern is {@code <!DOCTYPE[^>]*>} (case-insensitive), so a match
 * ends at the first {@code >}. A DOCTYPE carrying an internal subset that itself
 * contains {@code >} would only be partially removed - confirm whether such feeds matter.
 *
 * @param xml
 *            the text to strip DOCTYPE declarations from
 * @return the text without the matched declarations, or {@code null} when
 *         {@code StringUtils.isBlank(xml)} is true
 */
String removeDoctypeDeclarations(String xml) {
|
||||
if (StringUtils.isBlank(xml)) {
|
||||
return null;
|
||||
}
|
||||
|
||||
return DOCTYPE_PATTERN.matcher(xml).replaceAll("");
|
||||
}
|
||||
|
||||
|
||||
@@ -58,12 +58,11 @@ public class FeedParser {
|
||||
public FeedParserResult parse(String feedUrl, byte[] xml) throws FeedParsingException {
|
||||
try {
|
||||
Charset encoding = encodingDetector.getEncoding(xml);
|
||||
String xmlString = feedCleaner.trimInvalidXmlCharacters(new String(xml, encoding));
|
||||
|
||||
String xmlString = feedCleaner.clean(new String(xml, encoding));
|
||||
if (xmlString == null) {
|
||||
throw new FeedParsingException("Input string is null for url " + feedUrl);
|
||||
throw new FeedParsingException("Input string is empty for url " + feedUrl);
|
||||
}
|
||||
xmlString = feedCleaner.replaceHtmlEntitiesWithNumericEntities(xmlString);
|
||||
xmlString = feedCleaner.removeDoctypeDeclarations(xmlString);
|
||||
|
||||
InputSource source = new InputSource(new StringReader(xmlString));
|
||||
SyndFeed feed = new SyndFeedInput().build(source);
|
||||
|
||||
Reference in New Issue
Block a user