From cca2d49cc3877c279a24769712306414b18daa04 Mon Sep 17 00:00:00 2001
From: Athou
Date: Fri, 13 Sep 2024 23:40:13 +0200
Subject: [PATCH] Revert "reduce artifact size by using a smaller library for charset detection" because juniversalchardet doesn't support as many charsets as icu4j

---
 commafeed-server/pom.xml                        |  6 +++---
 .../backend/feed/parser/EncodingDetector.java   | 17 +++++++++++------
 2 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/commafeed-server/pom.xml b/commafeed-server/pom.xml
index 5dae894e..915e4b65 100644
--- a/commafeed-server/pom.xml
+++ b/commafeed-server/pom.xml
@@ -433,9 +433,9 @@
 			<version>1.18.1</version>
 		</dependency>
 		<dependency>
-			<groupId>com.github.albfernandez</groupId>
-			<artifactId>juniversalchardet</artifactId>
-			<version>2.5.0</version>
+			<groupId>com.ibm.icu</groupId>
+			<artifactId>icu4j</artifactId>
+			<version>75.1</version>
 		</dependency>
 		<dependency>
 			<groupId>net.sourceforge.cssparser</groupId>
diff --git a/commafeed-server/src/main/java/com/commafeed/backend/feed/parser/EncodingDetector.java b/commafeed-server/src/main/java/com/commafeed/backend/feed/parser/EncodingDetector.java
index 6932c61b..0dc092d7 100644
--- a/commafeed-server/src/main/java/com/commafeed/backend/feed/parser/EncodingDetector.java
+++ b/commafeed-server/src/main/java/com/commafeed/backend/feed/parser/EncodingDetector.java
@@ -1,11 +1,12 @@
 package com.commafeed.backend.feed.parser;
 
 import java.nio.charset.Charset;
-import java.util.Optional;
 
 import org.apache.commons.lang3.ArrayUtils;
 import org.apache.commons.lang3.StringUtils;
-import org.mozilla.universalchardet.UniversalDetector;
+
+import com.ibm.icu.text.CharsetDetector;
+import com.ibm.icu.text.CharsetMatch;
 
 import jakarta.inject.Singleton;
 
@@ -52,10 +53,14 @@ class EncodingDetector {
 	 * Detect encoding by analyzing characters in the array
 	 */
 	private Charset detectEncoding(byte[] bytes) {
-		UniversalDetector detector = new UniversalDetector();
-		detector.handleData(bytes);
-		detector.dataEnd();
-		String encoding = Optional.ofNullable(detector.getDetectedCharset()).orElse("UTF-8");
+		String encoding = "UTF-8";
+
+		CharsetDetector detector = new CharsetDetector();
+		detector.setText(bytes);
+		CharsetMatch match = detector.detect();
+		if (match != null) {
+			encoding = match.getName();
+		}
 		if (encoding.equalsIgnoreCase("ISO-8859-1")) {
 			encoding = "windows-1252";
 		}