Reduce artifact size by replacing icu4j with the smaller juniversalchardet library for charset detection

This commit is contained in:
Athou
2024-09-13 14:33:19 +02:00
parent ca803ff7ce
commit 19bcc2c0da
2 changed files with 9 additions and 14 deletions

View File

@@ -433,9 +433,9 @@
<version>1.18.1</version> <version>1.18.1</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>com.ibm.icu</groupId> <groupId>com.github.albfernandez</groupId>
<artifactId>icu4j</artifactId> <artifactId>juniversalchardet</artifactId>
<version>75.1</version> <version>2.5.0</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>net.sourceforge.cssparser</groupId> <groupId>net.sourceforge.cssparser</groupId>

View File

@@ -1,12 +1,11 @@
package com.commafeed.backend.feed.parser; package com.commafeed.backend.feed.parser;
import java.nio.charset.Charset; import java.nio.charset.Charset;
import java.util.Optional;
import org.apache.commons.lang3.ArrayUtils; import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.mozilla.universalchardet.UniversalDetector;
import com.ibm.icu.text.CharsetDetector;
import com.ibm.icu.text.CharsetMatch;
import jakarta.inject.Singleton; import jakarta.inject.Singleton;
@@ -53,14 +52,10 @@ class EncodingDetector {
* Detect encoding by analyzing characters in the array * Detect encoding by analyzing characters in the array
*/ */
private Charset detectEncoding(byte[] bytes) { private Charset detectEncoding(byte[] bytes) {
String encoding = "UTF-8"; UniversalDetector detector = new UniversalDetector();
detector.handleData(bytes);
CharsetDetector detector = new CharsetDetector(); detector.dataEnd();
detector.setText(bytes); String encoding = Optional.ofNullable(detector.getDetectedCharset()).orElse("UTF-8");
CharsetMatch match = detector.detect();
if (match != null) {
encoding = match.getName();
}
if (encoding.equalsIgnoreCase("ISO-8859-1")) { if (encoding.equalsIgnoreCase("ISO-8859-1")) {
encoding = "windows-1252"; encoding = "windows-1252";
} }