2013-04-22 13:20:17 +02:00
|
|
|
package com.commafeed.backend.feeds;
|
|
|
|
|
|
|
|
|
|
import org.apache.commons.lang.StringUtils;
|
|
|
|
|
import org.jsoup.Jsoup;
|
|
|
|
|
import org.jsoup.nodes.Document.OutputSettings;
|
|
|
|
|
import org.jsoup.nodes.Entities.EscapeMode;
|
|
|
|
|
import org.jsoup.safety.Whitelist;
|
2013-04-23 07:20:21 +02:00
|
|
|
import org.mozilla.universalchardet.UniversalDetector;
|
2013-04-22 13:20:17 +02:00
|
|
|
|
|
|
|
|
public class FeedUtils {
|
|
|
|
|
|
2013-04-23 07:20:21 +02:00
|
|
|
public static String guessEncoding(byte[] bytes) {
|
|
|
|
|
String DEFAULT_ENCODING = "UTF-8";
|
|
|
|
|
UniversalDetector detector = new UniversalDetector(null);
|
|
|
|
|
detector.handleData(bytes, 0, bytes.length);
|
|
|
|
|
detector.dataEnd();
|
|
|
|
|
String encoding = detector.getDetectedCharset();
|
|
|
|
|
detector.reset();
|
|
|
|
|
if (encoding == null) {
|
|
|
|
|
encoding = DEFAULT_ENCODING;
|
|
|
|
|
} else if (encoding.equalsIgnoreCase("ISO-8859-1")) {
|
|
|
|
|
encoding = "windows-1252";
|
|
|
|
|
}
|
|
|
|
|
return encoding;
|
|
|
|
|
}
|
|
|
|
|
|
2013-04-22 13:20:17 +02:00
|
|
|
public static String handleContent(String content) {
|
|
|
|
|
if (StringUtils.isNotBlank(content)) {
|
|
|
|
|
Whitelist whitelist = Whitelist.relaxed();
|
|
|
|
|
whitelist.addEnforcedAttribute("a", "target", "_blank");
|
|
|
|
|
|
|
|
|
|
whitelist.addTags("iframe");
|
|
|
|
|
whitelist.addAttributes("iframe", "src", "height", "width",
|
|
|
|
|
"allowfullscreen", "frameborder");
|
|
|
|
|
|
|
|
|
|
content = Jsoup.clean(content, "", whitelist,
|
|
|
|
|
new OutputSettings().escapeMode(EscapeMode.base));
|
|
|
|
|
}
|
|
|
|
|
return content;
|
|
|
|
|
}
|
|
|
|
|
|
2013-04-23 07:20:21 +02:00
|
|
|
public static String trimInvalidXmlCharacters(String xml) {
|
|
|
|
|
if (StringUtils.isBlank(xml)) {
|
|
|
|
|
return null;
|
|
|
|
|
}
|
|
|
|
|
StringBuilder sb = new StringBuilder();
|
|
|
|
|
for (int i = 0; i < xml.length(); i++) {
|
|
|
|
|
char c = xml.charAt(i);
|
|
|
|
|
if (c >= 20 || c == 0x9 || c == 0xA || c == 0xD) {
|
2013-04-23 09:31:02 +02:00
|
|
|
if (!Character.isHighSurrogate(c)
|
|
|
|
|
&& !Character.isLowSurrogate(c)) {
|
|
|
|
|
sb.append(c);
|
|
|
|
|
}
|
2013-04-22 13:20:17 +02:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return sb.toString();
|
|
|
|
|
}
|
|
|
|
|
}
|