Revert "reduce artifact size by using a smaller library for charset detection" because juniversalchardet doesn't support as many charsets as icu4j

This commit is contained in:
Athou
2024-09-13 23:40:13 +02:00
parent f4a43e9950
commit cca2d49cc3
2 changed files with 14 additions and 9 deletions

View File

@@ -433,9 +433,9 @@
<version>1.18.1</version>
</dependency>
<dependency>
<groupId>com.github.albfernandez</groupId>
<artifactId>juniversalchardet</artifactId>
<version>2.5.0</version>
<groupId>com.ibm.icu</groupId>
<artifactId>icu4j</artifactId>
<version>75.1</version>
</dependency>
<dependency>
<groupId>net.sourceforge.cssparser</groupId>

View File

@@ -1,11 +1,12 @@
package com.commafeed.backend.feed.parser;
import java.nio.charset.Charset;
import java.util.Optional;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.StringUtils;
import org.mozilla.universalchardet.UniversalDetector;
import com.ibm.icu.text.CharsetDetector;
import com.ibm.icu.text.CharsetMatch;
import jakarta.inject.Singleton;
@@ -52,10 +53,14 @@ class EncodingDetector {
* Detect encoding by analyzing characters in the array
*/
private Charset detectEncoding(byte[] bytes) {
UniversalDetector detector = new UniversalDetector();
detector.handleData(bytes);
detector.dataEnd();
String encoding = Optional.ofNullable(detector.getDetectedCharset()).orElse("UTF-8");
String encoding = "UTF-8";
CharsetDetector detector = new CharsetDetector();
detector.setText(bytes);
CharsetMatch match = detector.detect();
if (match != null) {
encoding = match.getName();
}
if (encoding.equalsIgnoreCase("ISO-8859-1")) {
encoding = "windows-1252";
}