forked from Archives/Athou_commafeed
remove DOCTYPE declarations (#1260)
This commit is contained in:
@@ -1,6 +1,7 @@
|
|||||||
package com.commafeed.backend.feed.parser;
|
package com.commafeed.backend.feed.parser;
|
||||||
|
|
||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
import org.ahocorasick.trie.Emit;
|
import org.ahocorasick.trie.Emit;
|
||||||
import org.ahocorasick.trie.Trie;
|
import org.ahocorasick.trie.Trie;
|
||||||
@@ -11,6 +12,8 @@ import jakarta.inject.Singleton;
|
|||||||
@Singleton
|
@Singleton
|
||||||
class FeedCleaner {
|
class FeedCleaner {
|
||||||
|
|
||||||
|
private static final Pattern DOCTYPE_PATTERN = Pattern.compile("<!DOCTYPE[^>]*>", Pattern.CASE_INSENSITIVE);
|
||||||
|
|
||||||
public String trimInvalidXmlCharacters(String xml) {
|
public String trimInvalidXmlCharacters(String xml) {
|
||||||
if (StringUtils.isBlank(xml)) {
|
if (StringUtils.isBlank(xml)) {
|
||||||
return null;
|
return null;
|
||||||
@@ -60,4 +63,8 @@ class FeedCleaner {
|
|||||||
return sb.toString();
|
return sb.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public String removeDoctypeDeclarations(String xml) {
|
||||||
|
return DOCTYPE_PATTERN.matcher(xml).replaceAll("");
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -64,6 +64,7 @@ public class FeedParser {
|
|||||||
throw new FeedException("Input string is null for url " + feedUrl);
|
throw new FeedException("Input string is null for url " + feedUrl);
|
||||||
}
|
}
|
||||||
xmlString = feedCleaner.replaceHtmlEntitiesWithNumericEntities(xmlString);
|
xmlString = feedCleaner.replaceHtmlEntitiesWithNumericEntities(xmlString);
|
||||||
|
xmlString = feedCleaner.removeDoctypeDeclarations(xmlString);
|
||||||
|
|
||||||
InputSource source = new InputSource(new StringReader(xmlString));
|
InputSource source = new InputSource(new StringReader(xmlString));
|
||||||
SyndFeed feed = new SyndFeedInput().build(source);
|
SyndFeed feed = new SyndFeedInput().build(source);
|
||||||
|
|||||||
@@ -13,4 +13,22 @@ class FeedCleanerTest {
|
|||||||
Assertions.assertEquals("<source>T´l´phone ′</source>", feedCleaner.replaceHtmlEntitiesWithNumericEntities(source));
|
Assertions.assertEquals("<source>T´l´phone ′</source>", feedCleaner.replaceHtmlEntitiesWithNumericEntities(source));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testRemoveDoctype() {
|
||||||
|
String source = "<!DOCTYPE html><html><head></head><body></body></html>";
|
||||||
|
Assertions.assertEquals("<html><head></head><body></body></html>", feedCleaner.removeDoctypeDeclarations(source));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testRemoveMultilineDoctype() {
|
||||||
|
String source = """
|
||||||
|
<!DOCTYPE
|
||||||
|
html
|
||||||
|
>
|
||||||
|
<html><head></head><body></body></html>""";
|
||||||
|
Assertions.assertEquals("""
|
||||||
|
|
||||||
|
<html><head></head><body></body></html>""", feedCleaner.removeDoctypeDeclarations(source));
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
Reference in New Issue
Block a user