Reduce artifact size by using a smaller library for charset detection (replace icu4j with juniversalchardet)

This commit is contained in:
Athou
2024-09-13 14:33:19 +02:00
parent ca803ff7ce
commit 19bcc2c0da
2 changed files with 9 additions and 14 deletions

View File

@@ -433,9 +433,9 @@
<version>1.18.1</version>
</dependency>
<dependency>
<groupId>com.ibm.icu</groupId>
<artifactId>icu4j</artifactId>
<version>75.1</version>
<groupId>com.github.albfernandez</groupId>
<artifactId>juniversalchardet</artifactId>
<version>2.5.0</version>
</dependency>
<dependency>
<groupId>net.sourceforge.cssparser</groupId>

View File

@@ -1,12 +1,11 @@
package com.commafeed.backend.feed.parser;
import java.nio.charset.Charset;
import java.util.Optional;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.StringUtils;
import com.ibm.icu.text.CharsetDetector;
import com.ibm.icu.text.CharsetMatch;
import org.mozilla.universalchardet.UniversalDetector;
import jakarta.inject.Singleton;
@@ -53,14 +52,10 @@ class EncodingDetector {
* Detect encoding by analyzing the bytes in the array
*/
private Charset detectEncoding(byte[] bytes) {
String encoding = "UTF-8";
CharsetDetector detector = new CharsetDetector();
detector.setText(bytes);
CharsetMatch match = detector.detect();
if (match != null) {
encoding = match.getName();
}
UniversalDetector detector = new UniversalDetector();
detector.handleData(bytes);
detector.dataEnd();
String encoding = Optional.ofNullable(detector.getDetectedCharset()).orElse("UTF-8");
if (encoding.equalsIgnoreCase("ISO-8859-1")) {
encoding = "windows-1252";
}