forked from Archives/Athou_commafeed
reduce artifact size by using a smaller library for charset detection
This commit is contained in:
@@ -433,9 +433,9 @@
|
||||
<version>1.18.1</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.ibm.icu</groupId>
|
||||
<artifactId>icu4j</artifactId>
|
||||
<version>75.1</version>
|
||||
<groupId>com.github.albfernandez</groupId>
|
||||
<artifactId>juniversalchardet</artifactId>
|
||||
<version>2.5.0</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>net.sourceforge.cssparser</groupId>
|
||||
|
||||
@@ -1,12 +1,11 @@
|
||||
package com.commafeed.backend.feed.parser;
|
||||
|
||||
import java.nio.charset.Charset;
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.commons.lang3.ArrayUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import com.ibm.icu.text.CharsetDetector;
|
||||
import com.ibm.icu.text.CharsetMatch;
|
||||
import org.mozilla.universalchardet.UniversalDetector;
|
||||
|
||||
import jakarta.inject.Singleton;
|
||||
|
||||
@@ -53,14 +52,10 @@ class EncodingDetector {
|
||||
* Detect encoding by analyzing the bytes in the array
|
||||
*/
|
||||
private Charset detectEncoding(byte[] bytes) {
|
||||
String encoding = "UTF-8";
|
||||
|
||||
CharsetDetector detector = new CharsetDetector();
|
||||
detector.setText(bytes);
|
||||
CharsetMatch match = detector.detect();
|
||||
if (match != null) {
|
||||
encoding = match.getName();
|
||||
}
|
||||
UniversalDetector detector = new UniversalDetector();
|
||||
detector.handleData(bytes);
|
||||
detector.dataEnd();
|
||||
String encoding = Optional.ofNullable(detector.getDetectedCharset()).orElse("UTF-8");
|
||||
if (encoding.equalsIgnoreCase("ISO-8859-1")) {
|
||||
encoding = "windows-1252";
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user