mirror of
https://github.com/Athou/commafeed.git
synced 2026-03-21 21:37:29 +00:00
transform feeds to well-formed xml before parsing
This commit is contained in:
5
pom.xml
5
pom.xml
@@ -172,6 +172,11 @@
|
|||||||
<artifactId>jsoup</artifactId>
|
<artifactId>jsoup</artifactId>
|
||||||
<version>1.7.2</version>
|
<version>1.7.2</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>net.sourceforge.htmlcleaner</groupId>
|
||||||
|
<artifactId>htmlcleaner</artifactId>
|
||||||
|
<version>2.4</version>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>com.google.code.gson</groupId>
|
<groupId>com.google.code.gson</groupId>
|
||||||
|
|||||||
@@ -8,8 +8,11 @@ import javax.ejb.Stateless;
|
|||||||
|
|
||||||
import org.apache.commons.lang.StringUtils;
|
import org.apache.commons.lang.StringUtils;
|
||||||
import org.apache.commons.lang.SystemUtils;
|
import org.apache.commons.lang.SystemUtils;
|
||||||
|
import org.htmlcleaner.CleanerProperties;
|
||||||
|
import org.htmlcleaner.HtmlCleaner;
|
||||||
|
import org.htmlcleaner.SimpleHtmlSerializer;
|
||||||
|
import org.htmlcleaner.TagNode;
|
||||||
import org.jsoup.Jsoup;
|
import org.jsoup.Jsoup;
|
||||||
import org.jsoup.nodes.Document;
|
|
||||||
|
|
||||||
import com.commafeed.backend.model.Feed;
|
import com.commafeed.backend.model.Feed;
|
||||||
import com.commafeed.backend.model.FeedEntry;
|
import com.commafeed.backend.model.FeedEntry;
|
||||||
@@ -26,12 +29,12 @@ public class FeedParser {
|
|||||||
|
|
||||||
@SuppressWarnings("unchecked")
|
@SuppressWarnings("unchecked")
|
||||||
public Feed parse(String feedUrl, String xml) throws FeedException {
|
public Feed parse(String feedUrl, String xml) throws FeedException {
|
||||||
|
|
||||||
Feed feed = new Feed();
|
Feed feed = new Feed();
|
||||||
feed.setUrl(feedUrl);
|
feed.setUrl(feedUrl);
|
||||||
feed.setLastUpdated(Calendar.getInstance().getTime());
|
feed.setLastUpdated(Calendar.getInstance().getTime());
|
||||||
|
|
||||||
try {
|
try {
|
||||||
|
xml = balanceTags(xml);
|
||||||
SyndFeed rss = new SyndFeedInput().build(new StringReader(xml));
|
SyndFeed rss = new SyndFeedInput().build(new StringReader(xml));
|
||||||
|
|
||||||
List<SyndEntry> items = rss.getEntries();
|
List<SyndEntry> items = rss.getEntries();
|
||||||
@@ -72,8 +75,20 @@ public class FeedParser {
|
|||||||
return content;
|
return content;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private String balanceTags(String xml) throws Exception {
|
||||||
|
HtmlCleaner cleaner = new HtmlCleaner();
|
||||||
|
CleanerProperties props = cleaner.getProperties();
|
||||||
|
props.setOmitXmlDeclaration(true);
|
||||||
|
TagNode node = cleaner.clean(xml);
|
||||||
|
SimpleHtmlSerializer serializer = new SimpleHtmlSerializer(
|
||||||
|
cleaner.getProperties());
|
||||||
|
String result = serializer.getAsString(node);
|
||||||
|
result = StringUtils.trim(xml);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
private String handleContent(String content) {
|
private String handleContent(String content) {
|
||||||
Document doc = Jsoup.parse(content, "UTF-8");
|
org.jsoup.nodes.Document doc = Jsoup.parse(content, "UTF-8");
|
||||||
doc.select("a").attr("target", "_blank");
|
doc.select("a").attr("target", "_blank");
|
||||||
return doc.outerHtml();
|
return doc.outerHtml();
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user