Revert "reduce artifact size by using a smaller library for charset detection" because juniversalchardet doesn't support as many charsets as icu4j

This commit is contained in:
Athou
2024-09-13 23:40:13 +02:00
parent f4a43e9950
commit cca2d49cc3
2 changed files with 14 additions and 9 deletions

View File

@@ -433,9 +433,9 @@
<version>1.18.1</version> <version>1.18.1</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>com.github.albfernandez</groupId> <groupId>com.ibm.icu</groupId>
<artifactId>juniversalchardet</artifactId> <artifactId>icu4j</artifactId>
<version>2.5.0</version> <version>75.1</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>net.sourceforge.cssparser</groupId> <groupId>net.sourceforge.cssparser</groupId>

View File

@@ -1,11 +1,12 @@
package com.commafeed.backend.feed.parser; package com.commafeed.backend.feed.parser;
import java.nio.charset.Charset; import java.nio.charset.Charset;
import java.util.Optional;
import org.apache.commons.lang3.ArrayUtils; import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.mozilla.universalchardet.UniversalDetector;
import com.ibm.icu.text.CharsetDetector;
import com.ibm.icu.text.CharsetMatch;
import jakarta.inject.Singleton; import jakarta.inject.Singleton;
@@ -52,10 +53,14 @@ class EncodingDetector {
* Detect encoding by analyzing characters in the array * Detect encoding by analyzing characters in the array
*/ */
private Charset detectEncoding(byte[] bytes) { private Charset detectEncoding(byte[] bytes) {
UniversalDetector detector = new UniversalDetector(); String encoding = "UTF-8";
detector.handleData(bytes);
detector.dataEnd(); CharsetDetector detector = new CharsetDetector();
String encoding = Optional.ofNullable(detector.getDetectedCharset()).orElse("UTF-8"); detector.setText(bytes);
CharsetMatch match = detector.detect();
if (match != null) {
encoding = match.getName();
}
if (encoding.equalsIgnoreCase("ISO-8859-1")) { if (encoding.equalsIgnoreCase("ISO-8859-1")) {
encoding = "windows-1252"; encoding = "windows-1252";
} }