From 0e56e7fde7df94d24033f1879fb093b62f3cab06 Mon Sep 17 00:00:00 2001 From: Athou Date: Tue, 26 Mar 2013 21:01:53 +0100 Subject: [PATCH] transform feeds to well-formed xml before parsing --- pom.xml | 5 +++++ .../commafeed/backend/feeds/FeedParser.java | 21 ++++++++++++++++--- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/pom.xml b/pom.xml index f6b8fddc..f3fb1104 100644 --- a/pom.xml +++ b/pom.xml @@ -172,6 +172,11 @@ jsoup 1.7.2 + + net.sourceforge.htmlcleaner + htmlcleaner + 2.4 + com.google.code.gson diff --git a/src/main/java/com/commafeed/backend/feeds/FeedParser.java b/src/main/java/com/commafeed/backend/feeds/FeedParser.java index e1b67d99..d183a26a 100644 --- a/src/main/java/com/commafeed/backend/feeds/FeedParser.java +++ b/src/main/java/com/commafeed/backend/feeds/FeedParser.java @@ -8,8 +8,11 @@ import javax.ejb.Stateless; import org.apache.commons.lang.StringUtils; import org.apache.commons.lang.SystemUtils; +import org.htmlcleaner.CleanerProperties; +import org.htmlcleaner.HtmlCleaner; +import org.htmlcleaner.SimpleHtmlSerializer; +import org.htmlcleaner.TagNode; import org.jsoup.Jsoup; -import org.jsoup.nodes.Document; import com.commafeed.backend.model.Feed; import com.commafeed.backend.model.FeedEntry; @@ -26,12 +29,12 @@ public class FeedParser { @SuppressWarnings("unchecked") public Feed parse(String feedUrl, String xml) throws FeedException { - Feed feed = new Feed(); feed.setUrl(feedUrl); feed.setLastUpdated(Calendar.getInstance().getTime()); try { + xml = balanceTags(xml); SyndFeed rss = new SyndFeedInput().build(new StringReader(xml)); List items = rss.getEntries(); @@ -72,8 +75,20 @@ public class FeedParser { return content; } + private String balanceTags(String xml) throws Exception { + HtmlCleaner cleaner = new HtmlCleaner(); + CleanerProperties props = cleaner.getProperties(); + props.setOmitXmlDeclaration(true); + TagNode node = cleaner.clean(xml); + SimpleHtmlSerializer serializer = new SimpleHtmlSerializer(
cleaner.getProperties()); + String result = serializer.getAsString(node); + /* trim the cleaned markup, not the raw input, so the balanced output is what gets returned */ result = StringUtils.trim(result); + return result; + } + private String handleContent(String content) { - Document doc = Jsoup.parse(content, "UTF-8"); + org.jsoup.nodes.Document doc = Jsoup.parse(content, "UTF-8"); doc.select("a").attr("target", "_blank"); return doc.outerHtml(); }