transform feeds to well-formed xml before parsing

This commit is contained in:
Athou
2013-03-26 21:01:53 +01:00
parent 7057c5349b
commit 0e56e7fde7
2 changed files with 23 additions and 3 deletions

View File

@@ -172,6 +172,11 @@
<artifactId>jsoup</artifactId>
<version>1.7.2</version>
</dependency>
<!-- HtmlCleaner: used by FeedParser.balanceTags to clean feed markup before it is parsed -->
<dependency>
<groupId>net.sourceforge.htmlcleaner</groupId>
<artifactId>htmlcleaner</artifactId>
<version>2.4</version>
</dependency>
<dependency>
<groupId>com.google.code.gson</groupId>

View File

@@ -8,8 +8,11 @@ import javax.ejb.Stateless;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang.SystemUtils;
import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.SimpleHtmlSerializer;
import org.htmlcleaner.TagNode;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import com.commafeed.backend.model.Feed;
import com.commafeed.backend.model.FeedEntry;
@@ -26,12 +29,12 @@ public class FeedParser {
@SuppressWarnings("unchecked")
public Feed parse(String feedUrl, String xml) throws FeedException {
Feed feed = new Feed();
feed.setUrl(feedUrl);
feed.setLastUpdated(Calendar.getInstance().getTime());
try {
xml = balanceTags(xml);
SyndFeed rss = new SyndFeedInput().build(new StringReader(xml));
List<SyndEntry> items = rss.getEntries();
@@ -72,8 +75,20 @@ public class FeedParser {
return content;
}
/**
 * Runs the raw feed text through HtmlCleaner and returns the re-serialized,
 * trimmed markup so that the feed parser receives cleaned input.
 *
 * @param xml raw feed content as fetched
 * @return the cleaned markup, serialized without an XML declaration and
 *         with leading/trailing whitespace removed
 * @throws Exception if cleaning or serialization fails
 */
private String balanceTags(String xml) throws Exception {
	HtmlCleaner cleaner = new HtmlCleaner();
	CleanerProperties props = cleaner.getProperties();
	// The serialized output must not start with its own <?xml ...?> declaration.
	props.setOmitXmlDeclaration(true);

	TagNode node = cleaner.clean(xml);
	SimpleHtmlSerializer serializer = new SimpleHtmlSerializer(props);

	// Trim the serialized (cleaned) markup. Trimming the raw input here
	// would throw the cleaner's work away and return the feed unchanged.
	return StringUtils.trim(serializer.getAsString(node));
}
/**
 * Parses an entry's content with jsoup, sets {@code target="_blank"} on
 * every {@code <a>} element, and returns the document's outer HTML.
 *
 * @param content HTML content of a feed entry
 * @return the serialized document with the link targets rewritten
 */
private String handleContent(String content) {
	// Single declaration, fully qualified so it resolves whether or not
	// org.jsoup.nodes.Document is imported by the file.
	// NOTE(review): the second argument of Jsoup.parse(String, String) is the
	// base URI, not a charset; "UTF-8" is kept as-is to preserve behaviour.
	org.jsoup.nodes.Document doc = Jsoup.parse(content, "UTF-8");
	doc.select("a").attr("target", "_blank");
	return doc.outerHtml();
}