mirror of
https://github.com/Athou/commafeed.git
synced 2026-03-21 21:37:29 +00:00
detect encoding and trim invalid characters before parsing xml (#60)
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
package com.commafeed.backend.feeds;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.StringReader;
|
||||
import java.util.Calendar;
|
||||
import java.util.Date;
|
||||
import java.util.List;
|
||||
@@ -41,14 +42,12 @@ public class FeedParser {
|
||||
feed.setLastUpdated(Calendar.getInstance().getTime());
|
||||
|
||||
try {
|
||||
InputSource source = new InputSource(new ByteArrayInputStream(xml));
|
||||
if (new String(ArrayUtils.subarray(xml, 0, 100))
|
||||
.split(SystemUtils.LINE_SEPARATOR)[0].toUpperCase()
|
||||
.contains("ISO-8859-1")) {
|
||||
// they probably use word, we need to handle curly quotes and
|
||||
// other word special characters
|
||||
source.setEncoding("windows-1252");
|
||||
}
|
||||
String encoding = FeedUtils.guessEncoding(xml);
|
||||
String xmlString = FeedUtils.trimInvalidXmlCharacters(new String(
|
||||
xml, encoding));
|
||||
|
||||
InputSource source = new InputSource(new StringReader(xmlString));
|
||||
|
||||
SyndFeed rss = new SyndFeedInput().build(source);
|
||||
feed.setUrl(feedUrl);
|
||||
feed.setTitle(rss.getTitle());
|
||||
|
||||
@@ -5,9 +5,25 @@ import org.jsoup.Jsoup;
|
||||
import org.jsoup.nodes.Document.OutputSettings;
|
||||
import org.jsoup.nodes.Entities.EscapeMode;
|
||||
import org.jsoup.safety.Whitelist;
|
||||
import org.mozilla.universalchardet.UniversalDetector;
|
||||
|
||||
public class FeedUtils {
|
||||
|
||||
public static String guessEncoding(byte[] bytes) {
|
||||
String DEFAULT_ENCODING = "UTF-8";
|
||||
UniversalDetector detector = new UniversalDetector(null);
|
||||
detector.handleData(bytes, 0, bytes.length);
|
||||
detector.dataEnd();
|
||||
String encoding = detector.getDetectedCharset();
|
||||
detector.reset();
|
||||
if (encoding == null) {
|
||||
encoding = DEFAULT_ENCODING;
|
||||
} else if (encoding.equalsIgnoreCase("ISO-8859-1")) {
|
||||
encoding = "windows-1252";
|
||||
}
|
||||
return encoding;
|
||||
}
|
||||
|
||||
public static String handleContent(String content) {
|
||||
if (StringUtils.isNotBlank(content)) {
|
||||
content = trimUnicodeSurrogateCharacters(content);
|
||||
@@ -24,6 +40,20 @@ public class FeedUtils {
|
||||
return content;
|
||||
}
|
||||
|
||||
public static String trimInvalidXmlCharacters(String xml) {
|
||||
if (StringUtils.isBlank(xml)) {
|
||||
return null;
|
||||
}
|
||||
StringBuilder sb = new StringBuilder();
|
||||
for (int i = 0; i < xml.length(); i++) {
|
||||
char c = xml.charAt(i);
|
||||
if (c >= 20 || c == 0x9 || c == 0xA || c == 0xD) {
|
||||
sb.append(c);
|
||||
}
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
public static String trimUnicodeSurrogateCharacters(String text) {
|
||||
if (StringUtils.isBlank(text)) {
|
||||
return null;
|
||||
|
||||
Reference in New Issue
Block a user