forked from Archives/Athou_commafeed
detect encoding and trim invalid characters before parsing xml (#60)
This commit is contained in:
5
pom.xml
5
pom.xml
@@ -238,6 +238,11 @@
|
|||||||
<artifactId>rome-opml</artifactId>
|
<artifactId>rome-opml</artifactId>
|
||||||
<version>1.0</version>
|
<version>1.0</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>com.googlecode.juniversalchardet</groupId>
|
||||||
|
<artifactId>juniversalchardet</artifactId>
|
||||||
|
<version>1.0.3</version>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>com.google.oauth-client</groupId>
|
<groupId>com.google.oauth-client</groupId>
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
package com.commafeed.backend.feeds;
|
package com.commafeed.backend.feeds;
|
||||||
|
|
||||||
import java.io.ByteArrayInputStream;
|
import java.io.ByteArrayInputStream;
|
||||||
|
import java.io.StringReader;
|
||||||
import java.util.Calendar;
|
import java.util.Calendar;
|
||||||
import java.util.Date;
|
import java.util.Date;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
@@ -41,14 +42,12 @@ public class FeedParser {
|
|||||||
feed.setLastUpdated(Calendar.getInstance().getTime());
|
feed.setLastUpdated(Calendar.getInstance().getTime());
|
||||||
|
|
||||||
try {
|
try {
|
||||||
InputSource source = new InputSource(new ByteArrayInputStream(xml));
|
String encoding = FeedUtils.guessEncoding(xml);
|
||||||
if (new String(ArrayUtils.subarray(xml, 0, 100))
|
String xmlString = FeedUtils.trimInvalidXmlCharacters(new String(
|
||||||
.split(SystemUtils.LINE_SEPARATOR)[0].toUpperCase()
|
xml, encoding));
|
||||||
.contains("ISO-8859-1")) {
|
|
||||||
// they probably use word, we need to handle curly quotes and
|
InputSource source = new InputSource(new StringReader(xmlString));
|
||||||
// other word special characters
|
|
||||||
source.setEncoding("windows-1252");
|
|
||||||
}
|
|
||||||
SyndFeed rss = new SyndFeedInput().build(source);
|
SyndFeed rss = new SyndFeedInput().build(source);
|
||||||
feed.setUrl(feedUrl);
|
feed.setUrl(feedUrl);
|
||||||
feed.setTitle(rss.getTitle());
|
feed.setTitle(rss.getTitle());
|
||||||
|
|||||||
@@ -5,9 +5,25 @@ import org.jsoup.Jsoup;
|
|||||||
import org.jsoup.nodes.Document.OutputSettings;
|
import org.jsoup.nodes.Document.OutputSettings;
|
||||||
import org.jsoup.nodes.Entities.EscapeMode;
|
import org.jsoup.nodes.Entities.EscapeMode;
|
||||||
import org.jsoup.safety.Whitelist;
|
import org.jsoup.safety.Whitelist;
|
||||||
|
import org.mozilla.universalchardet.UniversalDetector;
|
||||||
|
|
||||||
public class FeedUtils {
|
public class FeedUtils {
|
||||||
|
|
||||||
|
public static String guessEncoding(byte[] bytes) {
|
||||||
|
String DEFAULT_ENCODING = "UTF-8";
|
||||||
|
UniversalDetector detector = new UniversalDetector(null);
|
||||||
|
detector.handleData(bytes, 0, bytes.length);
|
||||||
|
detector.dataEnd();
|
||||||
|
String encoding = detector.getDetectedCharset();
|
||||||
|
detector.reset();
|
||||||
|
if (encoding == null) {
|
||||||
|
encoding = DEFAULT_ENCODING;
|
||||||
|
} else if (encoding.equalsIgnoreCase("ISO-8859-1")) {
|
||||||
|
encoding = "windows-1252";
|
||||||
|
}
|
||||||
|
return encoding;
|
||||||
|
}
|
||||||
|
|
||||||
public static String handleContent(String content) {
|
public static String handleContent(String content) {
|
||||||
if (StringUtils.isNotBlank(content)) {
|
if (StringUtils.isNotBlank(content)) {
|
||||||
content = trimUnicodeSurrogateCharacters(content);
|
content = trimUnicodeSurrogateCharacters(content);
|
||||||
@@ -24,6 +40,20 @@ public class FeedUtils {
|
|||||||
return content;
|
return content;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static String trimInvalidXmlCharacters(String xml) {
|
||||||
|
if (StringUtils.isBlank(xml)) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
StringBuilder sb = new StringBuilder();
|
||||||
|
for (int i = 0; i < xml.length(); i++) {
|
||||||
|
char c = xml.charAt(i);
|
||||||
|
if (c >= 20 || c == 0x9 || c == 0xA || c == 0xD) {
|
||||||
|
sb.append(c);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return sb.toString();
|
||||||
|
}
|
||||||
|
|
||||||
public static String trimUnicodeSurrogateCharacters(String text) {
|
public static String trimUnicodeSurrogateCharacters(String text) {
|
||||||
if (StringUtils.isBlank(text)) {
|
if (StringUtils.isBlank(text)) {
|
||||||
return null;
|
return null;
|
||||||
|
|||||||
Reference in New Issue
Block a user