From b2421eb1ffd5acdfe53e699f13f779b399e2782c Mon Sep 17 00:00:00 2001 From: Athou Date: Sat, 6 Apr 2013 16:54:49 +0200 Subject: [PATCH] fix encoding issues with idiots using Word for writing content --- .../commafeed/backend/feeds/FeedParser.java | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/src/main/java/com/commafeed/backend/feeds/FeedParser.java b/src/main/java/com/commafeed/backend/feeds/FeedParser.java index bc395fde..ac4d8c58 100644 --- a/src/main/java/com/commafeed/backend/feeds/FeedParser.java +++ b/src/main/java/com/commafeed/backend/feeds/FeedParser.java @@ -7,6 +7,8 @@ import java.util.List; import org.apache.commons.lang.StringUtils; import org.apache.commons.lang.SystemUtils; import org.jsoup.Jsoup; +import org.jsoup.nodes.Document.OutputSettings; +import org.jsoup.nodes.Entities.EscapeMode; import org.jsoup.safety.Whitelist; import org.xml.sax.InputSource; @@ -28,8 +30,14 @@ public class FeedParser { feed.setLastUpdated(Calendar.getInstance().getTime()); try { - SyndFeed rss = new SyndFeedInput().build(new InputSource( - new ByteArrayInputStream(xml))); + InputSource source = new InputSource(new ByteArrayInputStream(xml)); + if (new String(xml).split(SystemUtils.LINE_SEPARATOR)[0] + .toUpperCase().contains("ISO-8859-1")) { + // they probably use Word, we need to handle curly quotes and + // other Word special characters + source.setEncoding("windows-1252"); + } + SyndFeed rss = new SyndFeedInput().build(source); feed.setUrl(feedUrl); feed.setTitle(rss.getTitle()); feed.setLink(rss.getLink()); @@ -38,7 +46,7 @@ public class FeedParser { FeedEntry entry = new FeedEntry(); entry.setGuid(item.getUri()); entry.setTitle(handleContent(item.getTitle())); - entry.setContent(getContent(item)); + entry.setContent(handleContent(getContent(item))); entry.setUrl(item.getLink()); entry.setUpdated(item.getUpdatedDate() != null ? 
item .getUpdatedDate() : item.getPublishedDate()); @@ -67,7 +75,6 @@ public class FeedParser { } }), SystemUtils.LINE_SEPARATOR); } - content = handleContent(content); return content; } @@ -81,7 +88,8 @@ public class FeedParser { whitelist.addAttributes("iframe", "src", "height", "width", "allowfullscreen", "frameborder"); - content = Jsoup.clean(content, whitelist); + content = Jsoup.clean(content, "", whitelist, + new OutputSettings().escapeMode(EscapeMode.extended)); } return content; }