diff --git a/pom.xml b/pom.xml
index 9b0528fb..18b907dc 100644
--- a/pom.xml
+++ b/pom.xml
@@ -238,6 +238,11 @@
rome-opml
1.0
+		<dependency>
+			<groupId>com.googlecode.juniversalchardet</groupId>
+			<artifactId>juniversalchardet</artifactId>
+			<version>1.0.3</version>
+		</dependency>
com.google.oauth-client
diff --git a/src/main/java/com/commafeed/backend/feeds/FeedParser.java b/src/main/java/com/commafeed/backend/feeds/FeedParser.java
index e976b2a7..14f4a2c0 100644
--- a/src/main/java/com/commafeed/backend/feeds/FeedParser.java
+++ b/src/main/java/com/commafeed/backend/feeds/FeedParser.java
@@ -1,6 +1,7 @@
package com.commafeed.backend.feeds;
import java.io.ByteArrayInputStream;
+import java.io.StringReader;
import java.util.Calendar;
import java.util.Date;
import java.util.List;
@@ -41,14 +42,12 @@ public class FeedParser {
feed.setLastUpdated(Calendar.getInstance().getTime());
try {
- InputSource source = new InputSource(new ByteArrayInputStream(xml));
- if (new String(ArrayUtils.subarray(xml, 0, 100))
- .split(SystemUtils.LINE_SEPARATOR)[0].toUpperCase()
- .contains("ISO-8859-1")) {
- // they probably use word, we need to handle curly quotes and
- // other word special characters
- source.setEncoding("windows-1252");
- }
+ String encoding = FeedUtils.guessEncoding(xml);
+ String xmlString = FeedUtils.trimInvalidXmlCharacters(new String(
+ xml, encoding));
+
+ InputSource source = new InputSource(new StringReader(xmlString));
+
SyndFeed rss = new SyndFeedInput().build(source);
feed.setUrl(feedUrl);
feed.setTitle(rss.getTitle());
diff --git a/src/main/java/com/commafeed/backend/feeds/FeedUtils.java b/src/main/java/com/commafeed/backend/feeds/FeedUtils.java
index 40faa799..da81339a 100644
--- a/src/main/java/com/commafeed/backend/feeds/FeedUtils.java
+++ b/src/main/java/com/commafeed/backend/feeds/FeedUtils.java
@@ -5,9 +5,25 @@ import org.jsoup.Jsoup;
import org.jsoup.nodes.Document.OutputSettings;
import org.jsoup.nodes.Entities.EscapeMode;
import org.jsoup.safety.Whitelist;
+import org.mozilla.universalchardet.UniversalDetector;
public class FeedUtils {
+    /**
+     * Guesses the character encoding of a raw document by running the
+     * whole byte array through a {@link UniversalDetector}.
+     *
+     * @param bytes
+     *            the raw document bytes; must not be null, since the array
+     *            length is read unconditionally
+     * @return the charset name reported by the detector, "UTF-8" when the
+     *         detector reports nothing, or "windows-1252" when it reports
+     *         ISO-8859-1 (compared case-insensitively)
+     */
+    public static String guessEncoding(byte[] bytes) {
+        UniversalDetector charsetDetector = new UniversalDetector(null);
+        charsetDetector.handleData(bytes, 0, bytes.length);
+        charsetDetector.dataEnd();
+        String detected = charsetDetector.getDetectedCharset();
+        charsetDetector.reset();
+
+        // Nothing detected: fall back to the default encoding.
+        if (detected == null) {
+            return "UTF-8";
+        }
+        // ISO-8859-1 is always reported as windows-1252 instead.
+        if (detected.equalsIgnoreCase("ISO-8859-1")) {
+            return "windows-1252";
+        }
+        return detected;
+    }
+
public static String handleContent(String content) {
if (StringUtils.isNotBlank(content)) {
content = trimUnicodeSurrogateCharacters(content);
@@ -24,6 +40,20 @@ public class FeedUtils {
return content;
}
+    /**
+     * Removes every character that the XML 1.0 specification does not allow
+     * in a document, so that the result can be handed to an XML parser.
+     * <p>
+     * The input is walked by Unicode code point: valid surrogate pairs
+     * (supplementary characters) are kept, while unpaired surrogates,
+     * control characters below U+0020 other than tab, line feed and
+     * carriage return, and U+FFFE / U+FFFF are dropped.
+     *
+     * @param xml
+     *            the document text, may be null
+     * @return the text without invalid XML characters, or null when the
+     *         input is null, empty or whitespace only
+     */
+    public static String trimInvalidXmlCharacters(String xml) {
+        if (StringUtils.isBlank(xml)) {
+            return null;
+        }
+        StringBuilder sb = new StringBuilder(xml.length());
+        int i = 0;
+        while (i < xml.length()) {
+            // codePointAt returns the bare surrogate value for an unpaired
+            // surrogate, which the validity check below then rejects.
+            int codePoint = xml.codePointAt(i);
+            if (isValidXmlCodePoint(codePoint)) {
+                sb.appendCodePoint(codePoint);
+            }
+            i += Character.charCount(codePoint);
+        }
+        return sb.toString();
+    }
+
+    /**
+     * Tells whether a code point matches the XML 1.0 "Char" production:
+     * #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF].
+     */
+    private static boolean isValidXmlCodePoint(int codePoint) {
+        return codePoint == 0x9 || codePoint == 0xA || codePoint == 0xD
+                || (codePoint >= 0x20 && codePoint <= 0xD7FF)
+                || (codePoint >= 0xE000 && codePoint <= 0xFFFD)
+                || (codePoint >= 0x10000 && codePoint <= 0x10FFFF);
+    }
+
public static String trimUnicodeSurrogateCharacters(String text) {
if (StringUtils.isBlank(text)) {
return null;