From 19bcc2c0da30b4b3a86fb622f1942d8911b7b87d Mon Sep 17 00:00:00 2001
From: Athou
Date: Fri, 13 Sep 2024 14:33:19 +0200
Subject: [PATCH] reduce artifact size by using a smaller library for charset detection

---
 commafeed-server/pom.xml                        |  6 +++---
 .../backend/feed/parser/EncodingDetector.java   | 17 ++++++-----------
 2 files changed, 9 insertions(+), 14 deletions(-)

diff --git a/commafeed-server/pom.xml b/commafeed-server/pom.xml
index f9508170..ab861694 100644
--- a/commafeed-server/pom.xml
+++ b/commafeed-server/pom.xml
@@ -433,9 +433,9 @@
 			<version>1.18.1</version>
 		</dependency>
 		<dependency>
-			<groupId>com.ibm.icu</groupId>
-			<artifactId>icu4j</artifactId>
-			<version>75.1</version>
+			<groupId>com.github.albfernandez</groupId>
+			<artifactId>juniversalchardet</artifactId>
+			<version>2.5.0</version>
 		</dependency>
 		<dependency>
 			<groupId>net.sourceforge.cssparser</groupId>
diff --git a/commafeed-server/src/main/java/com/commafeed/backend/feed/parser/EncodingDetector.java b/commafeed-server/src/main/java/com/commafeed/backend/feed/parser/EncodingDetector.java
index 0dc092d7..6932c61b 100644
--- a/commafeed-server/src/main/java/com/commafeed/backend/feed/parser/EncodingDetector.java
+++ b/commafeed-server/src/main/java/com/commafeed/backend/feed/parser/EncodingDetector.java
@@ -1,12 +1,11 @@
 package com.commafeed.backend.feed.parser;
 
 import java.nio.charset.Charset;
+import java.util.Optional;
 
 import org.apache.commons.lang3.ArrayUtils;
 import org.apache.commons.lang3.StringUtils;
-
-import com.ibm.icu.text.CharsetDetector;
-import com.ibm.icu.text.CharsetMatch;
+import org.mozilla.universalchardet.UniversalDetector;
 
 import jakarta.inject.Singleton;
 
@@ -53,14 +52,10 @@ class EncodingDetector {
 	 * Detect encoding by analyzing characters in the array
 	 */
 	private Charset detectEncoding(byte[] bytes) {
-		String encoding = "UTF-8";
-
-		CharsetDetector detector = new CharsetDetector();
-		detector.setText(bytes);
-		CharsetMatch match = detector.detect();
-		if (match != null) {
-			encoding = match.getName();
-		}
+		UniversalDetector detector = new UniversalDetector();
+		detector.handleData(bytes);
+		detector.dataEnd();
+		String encoding = Optional.ofNullable(detector.getDetectedCharset()).orElse("UTF-8");
 		if (encoding.equalsIgnoreCase("ISO-8859-1")) {
 			encoding = "windows-1252";
 		}