better character encoding detection

This commit is contained in:
Athou
2014-11-24 14:55:20 +01:00
parent 84d7a501d4
commit 9020d95b62
2 changed files with 14 additions and 13 deletions

View File

@@ -304,9 +304,9 @@
<version>1.8.1</version>
</dependency>
<dependency>
<groupId>com.googlecode.juniversalchardet</groupId>
<artifactId>juniversalchardet</artifactId>
<version>1.0.3</version>
<groupId>com.ibm.icu</groupId>
<artifactId>icu4j</artifactId>
<version>54.1.1</version>
</dependency>
<dependency>
<groupId>net.sourceforge.cssparser</groupId>

View File

@@ -25,7 +25,6 @@ import org.jsoup.nodes.Entities.EscapeMode;
import org.jsoup.safety.Cleaner;
import org.jsoup.safety.Whitelist;
import org.jsoup.select.Elements;
import org.mozilla.universalchardet.UniversalDetector;
import org.w3c.css.sac.InputSource;
import org.w3c.dom.css.CSSStyleDeclaration;
@@ -34,6 +33,8 @@ import com.commafeed.backend.model.FeedEntry;
import com.commafeed.backend.model.FeedSubscription;
import com.commafeed.frontend.model.Entry;
import com.google.common.collect.Lists;
import com.ibm.icu.text.CharsetDetector;
import com.ibm.icu.text.CharsetMatch;
import com.steadystate.css.parser.CSSOMParser;
import edu.uci.ics.crawler4j.url.URLCanonicalizer;
@@ -114,15 +115,15 @@ public class FeedUtils {
* Detect encoding by analyzing characters in the array
*/
public static String detectEncoding(byte[] bytes) {
String DEFAULT_ENCODING = "UTF-8";
UniversalDetector detector = new UniversalDetector(null);
detector.handleData(bytes, 0, bytes.length);
detector.dataEnd();
String encoding = detector.getDetectedCharset();
detector.reset();
if (encoding == null) {
encoding = DEFAULT_ENCODING;
} else if (encoding.equalsIgnoreCase("ISO-8859-1")) {
String encoding = "UTF-8";
CharsetDetector detector = new CharsetDetector();
detector.setText(bytes);
CharsetMatch match = detector.detect();
if (match != null) {
encoding = match.getName();
}
if (encoding.equalsIgnoreCase("ISO-8859-1")) {
encoding = "windows-1252";
}
return encoding;