diff --git a/pom.xml b/pom.xml index 9b0528fb..18b907dc 100644 --- a/pom.xml +++ b/pom.xml @@ -238,6 +238,11 @@ rome-opml 1.0 + + com.googlecode.juniversalchardet + juniversalchardet + 1.0.3 + com.google.oauth-client diff --git a/src/main/java/com/commafeed/backend/feeds/FeedParser.java b/src/main/java/com/commafeed/backend/feeds/FeedParser.java index e976b2a7..14f4a2c0 100644 --- a/src/main/java/com/commafeed/backend/feeds/FeedParser.java +++ b/src/main/java/com/commafeed/backend/feeds/FeedParser.java @@ -1,6 +1,7 @@ package com.commafeed.backend.feeds; import java.io.ByteArrayInputStream; +import java.io.StringReader; import java.util.Calendar; import java.util.Date; import java.util.List; @@ -41,14 +42,12 @@ public class FeedParser { feed.setLastUpdated(Calendar.getInstance().getTime()); try { - InputSource source = new InputSource(new ByteArrayInputStream(xml)); - if (new String(ArrayUtils.subarray(xml, 0, 100)) - .split(SystemUtils.LINE_SEPARATOR)[0].toUpperCase() - .contains("ISO-8859-1")) { - // they probably use word, we need to handle curly quotes and - // other word special characters - source.setEncoding("windows-1252"); - } + String encoding = FeedUtils.guessEncoding(xml); + String xmlString = FeedUtils.trimInvalidXmlCharacters(new String( + xml, encoding)); + + InputSource source = new InputSource(new StringReader(xmlString)); + SyndFeed rss = new SyndFeedInput().build(source); feed.setUrl(feedUrl); feed.setTitle(rss.getTitle()); diff --git a/src/main/java/com/commafeed/backend/feeds/FeedUtils.java b/src/main/java/com/commafeed/backend/feeds/FeedUtils.java index 40faa799..da81339a 100644 --- a/src/main/java/com/commafeed/backend/feeds/FeedUtils.java +++ b/src/main/java/com/commafeed/backend/feeds/FeedUtils.java @@ -5,9 +5,25 @@ import org.jsoup.Jsoup; import org.jsoup.nodes.Document.OutputSettings; import org.jsoup.nodes.Entities.EscapeMode; import org.jsoup.safety.Whitelist; +import org.mozilla.universalchardet.UniversalDetector; public class 
FeedUtils {
 
+	public static String guessEncoding(byte[] bytes) {
+		String DEFAULT_ENCODING = "UTF-8"; // fallback when the detector reports no charset
+		UniversalDetector detector = new UniversalDetector(null);
+		detector.handleData(bytes, 0, bytes.length);
+		detector.dataEnd();
+		String encoding = detector.getDetectedCharset();
+		detector.reset();
+		if (encoding == null) {
+			encoding = DEFAULT_ENCODING;
+		} else if (encoding.equalsIgnoreCase("ISO-8859-1")) {
+			encoding = "windows-1252"; // also decodes curly quotes and other Word special characters
+		}
+		return encoding;
+	}
+
 	public static String handleContent(String content) {
 		if (StringUtils.isNotBlank(content)) {
 			content = trimUnicodeSurrogateCharacters(content);
@@ -24,6 +40,31 @@ public class FeedUtils {
 		return content;
 	}
 
+	/**
+	 * Drops the control characters that XML 1.0 does not allow, so the text can
+	 * be handed to an XML parser.
+	 * <p>
+	 * Characters below 0x20 are removed, except tab (0x9), line feed (0xA) and
+	 * carriage return (0xD). Characters from 0x20 upwards are kept unchanged.
+	 *
+	 * @param xml the XML text to filter
+	 * @return the filtered text, or {@code null} when {@code xml} is blank
+	 */
+	public static String trimInvalidXmlCharacters(String xml) {
+		if (StringUtils.isBlank(xml)) {
+			return null;
+		}
+		StringBuilder sb = new StringBuilder();
+		for (int i = 0; i < xml.length(); i++) {
+			char c = xml.charAt(i);
+			// threshold is hex 0x20 (space), not decimal 20: U+0014..U+001F are illegal in XML 1.0
+			if (c >= 0x20 || c == 0x9 || c == 0xA || c == 0xD) {
+				sb.append(c);
+			}
+		}
+		return sb.toString();
+	}
+
 	public static String trimUnicodeSurrogateCharacters(String text) {
 		if (StringUtils.isBlank(text)) {
 			return null;