forked from Archives/Athou_commafeed
remove DOCTYPE declarations (#1260)
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
package com.commafeed.backend.feed.parser;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.ahocorasick.trie.Emit;
|
||||
import org.ahocorasick.trie.Trie;
|
||||
@@ -11,6 +12,8 @@ import jakarta.inject.Singleton;
|
||||
@Singleton
|
||||
class FeedCleaner {
|
||||
|
||||
// Matches one complete DOCTYPE declaration, case-insensitively and across line breaks.
// An optional internal subset ("[ ... ]") is consumed as a unit: the markup declarations inside
// it contain '>' themselves, so stopping at the first '>' would leave a dangling "]>" behind.
// If no well-formed subset is found, the match falls back to "everything up to the first '>'".
// Quantifiers are possessive so unterminated input does not cause extra backtracking.
private static final Pattern DOCTYPE_PATTERN = Pattern.compile("<!DOCTYPE[^>\\[]*+(?:\\[[^\\[\\]]*+\\])?[^>]*+>",
		Pattern.CASE_INSENSITIVE);
|
||||
|
||||
public String trimInvalidXmlCharacters(String xml) {
|
||||
if (StringUtils.isBlank(xml)) {
|
||||
return null;
|
||||
@@ -60,4 +63,8 @@ class FeedCleaner {
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
public String removeDoctypeDeclarations(String xml) {
|
||||
return DOCTYPE_PATTERN.matcher(xml).replaceAll("");
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -64,6 +64,7 @@ public class FeedParser {
|
||||
throw new FeedException("Input string is null for url " + feedUrl);
|
||||
}
|
||||
xmlString = feedCleaner.replaceHtmlEntitiesWithNumericEntities(xmlString);
|
||||
xmlString = feedCleaner.removeDoctypeDeclarations(xmlString);
|
||||
|
||||
InputSource source = new InputSource(new StringReader(xmlString));
|
||||
SyndFeed feed = new SyndFeedInput().build(source);
|
||||
|
||||
@@ -13,4 +13,22 @@ class FeedCleanerTest {
|
||||
Assertions.assertEquals("<source>T´l´phone ′</source>", feedCleaner.replaceHtmlEntitiesWithNumericEntities(source));
|
||||
}
|
||||
|
||||
@Test
|
||||
void testRemoveDoctype() {
|
||||
String source = "<!DOCTYPE html><html><head></head><body></body></html>";
|
||||
Assertions.assertEquals("<html><head></head><body></body></html>", feedCleaner.removeDoctypeDeclarations(source));
|
||||
}
|
||||
|
||||
// Checks that a DOCTYPE declaration whose keyword, name and closing '>' sit on separate lines
// is removed by removeDoctypeDeclarations, leaving the markup that follows it in place.
@Test
|
||||
void testRemoveMultilineDoctype() {
|
||||
String source = """
|
||||
<!DOCTYPE
|
||||
html
|
||||
>
|
||||
<html><head></head><body></body></html>""";
|
||||
Assertions.assertEquals("""
|
||||
|
||||
<html><head></head><body></body></html>""", feedCleaner.removeDoctypeDeclarations(source));
|
||||
}
|
||||
|
||||
}
|
||||
Reference in New Issue
Block a user