detect encoding and trim invalid characters before parsing xml (#60)

This commit is contained in:
Athou
2013-04-23 07:20:21 +02:00
parent 9cb56557be
commit 64747881de
3 changed files with 42 additions and 8 deletions

View File

@@ -1,6 +1,7 @@
package com.commafeed.backend.feeds;
import java.io.ByteArrayInputStream;
import java.io.StringReader;
import java.util.Calendar;
import java.util.Date;
import java.util.List;
@@ -41,14 +42,12 @@ public class FeedParser {
feed.setLastUpdated(Calendar.getInstance().getTime());
try {
InputSource source = new InputSource(new ByteArrayInputStream(xml));
if (new String(ArrayUtils.subarray(xml, 0, 100))
.split(SystemUtils.LINE_SEPARATOR)[0].toUpperCase()
.contains("ISO-8859-1")) {
// they probably use word, we need to handle curly quotes and
// other word special characters
source.setEncoding("windows-1252");
}
String encoding = FeedUtils.guessEncoding(xml);
String xmlString = FeedUtils.trimInvalidXmlCharacters(new String(
xml, encoding));
InputSource source = new InputSource(new StringReader(xmlString));
SyndFeed rss = new SyndFeedInput().build(source);
feed.setUrl(feedUrl);
feed.setTitle(rss.getTitle());

View File

@@ -5,9 +5,25 @@ import org.jsoup.Jsoup;
import org.jsoup.nodes.Document.OutputSettings;
import org.jsoup.nodes.Entities.EscapeMode;
import org.jsoup.safety.Whitelist;
import org.mozilla.universalchardet.UniversalDetector;
public class FeedUtils {
/**
 * Guesses the character encoding of a raw feed payload.
 * 
 * The bytes are handed to a {@link UniversalDetector} in a single pass.
 * When the detector reports no charset, UTF-8 is returned. A detected
 * ISO-8859-1 is reported as windows-1252 instead, so that curly quotes
 * and other Word special characters are handled.
 * 
 * @param bytes
 *            the raw content to inspect
 * @return the name of the encoding to decode the content with, never null
 */
public static String guessEncoding(byte[] bytes) {
	UniversalDetector charsetDetector = new UniversalDetector(null);
	charsetDetector.handleData(bytes, 0, bytes.length);
	charsetDetector.dataEnd();
	String detected = charsetDetector.getDetectedCharset();
	charsetDetector.reset();

	if (detected == null) {
		// the detector could not decide, use the fallback encoding
		return "UTF-8";
	}
	return detected.equalsIgnoreCase("ISO-8859-1") ? "windows-1252"
			: detected;
}
public static String handleContent(String content) {
if (StringUtils.isNotBlank(content)) {
content = trimUnicodeSurrogateCharacters(content);
@@ -24,6 +40,20 @@ public class FeedUtils {
return content;
}
/**
 * Strips the control characters that are illegal in an XML 1.0 document
 * so that the parser does not reject the whole feed because of them.
 * 
 * Only the range below 0x20 (space) is filtered: tab (0x9), line feed
 * (0xA) and carriage return (0xD) are kept, every other character below
 * 0x20 is dropped. Characters at or above 0x20 are copied unchanged.
 * 
 * @param xml
 *            the xml document as text, may be null
 * @return the text without the illegal control characters, or null when
 *         the input is null, empty or whitespace only
 */
public static String trimInvalidXmlCharacters(String xml) {
	if (StringUtils.isBlank(xml)) {
		return null;
	}
	// the result can never be longer than the input
	StringBuilder sb = new StringBuilder(xml.length());
	for (int i = 0; i < xml.length(); i++) {
		char c = xml.charAt(i);
		// the boundary is hexadecimal 0x20 (space), not decimal 20: with
		// the latter, control characters 0x14 to 0x1F slipped through
		if (c >= 0x20 || c == 0x9 || c == 0xA || c == 0xD) {
			sb.append(c);
		}
	}
	return sb.toString();
}
public static String trimUnicodeSurrogateCharacters(String text) {
if (StringUtils.isBlank(text)) {
return null;