diff --git a/commafeed-server/pom.xml b/commafeed-server/pom.xml index ab861694..5dae894e 100644 --- a/commafeed-server/pom.xml +++ b/commafeed-server/pom.xml @@ -447,11 +447,6 @@ urlcanon 0.4.0 - - org.gwtproject - gwt-servlet - 2.11.0 - org.apache.httpcomponents.client5 httpclient5 diff --git a/commafeed-server/src/main/java/com/commafeed/backend/feed/FeedUtils.java b/commafeed-server/src/main/java/com/commafeed/backend/feed/FeedUtils.java index 3a59eff3..7c23e734 100644 --- a/commafeed-server/src/main/java/com/commafeed/backend/feed/FeedUtils.java +++ b/commafeed-server/src/main/java/com/commafeed/backend/feed/FeedUtils.java @@ -16,11 +16,10 @@ import org.netpreserve.urlcanon.Canonicalizer; import org.netpreserve.urlcanon.ParsedUrl; import com.commafeed.backend.feed.FeedEntryKeyword.Mode; +import com.commafeed.backend.feed.parser.TextDirectionDetector; import com.commafeed.backend.model.FeedEntry; import com.commafeed.backend.model.FeedSubscription; import com.commafeed.frontend.model.Entry; -import com.google.gwt.i18n.client.HasDirection.Direction; -import com.google.gwt.i18n.shared.BidiUtils; import lombok.extern.slf4j.Slf4j; @@ -109,8 +108,8 @@ public class FeedUtils { return false; } - Direction direction = BidiUtils.get().estimateDirection(text); - return direction == Direction.RTL; + TextDirectionDetector.Direction direction = TextDirectionDetector.detect(text); + return direction == TextDirectionDetector.Direction.RIGHT_TO_LEFT; } public static String removeTrailingSlash(String url) { diff --git a/commafeed-server/src/main/java/com/commafeed/backend/feed/parser/TextDirectionDetector.java b/commafeed-server/src/main/java/com/commafeed/backend/feed/parser/TextDirectionDetector.java new file mode 100644 index 00000000..b2a482a9 --- /dev/null +++ b/commafeed-server/src/main/java/com/commafeed/backend/feed/parser/TextDirectionDetector.java @@ -0,0 +1,57 @@ +package com.commafeed.backend.feed.parser; + +import java.text.Bidi; +import java.util.concurrent.atomic.AtomicLong; +import java.util.regex.Pattern; + +import org.apache.commons.lang3.math.NumberUtils; + +public class TextDirectionDetector { + + private static final Pattern WORDS_PATTERN = Pattern.compile("\\s+"); + private static final Pattern URL_PATTERN = Pattern.compile("^https?://.*"); + + private static final double RTL_THRESHOLD = 0.4D; + + public enum Direction { + LEFT_TO_RIGHT, RIGHT_TO_LEFT + } + + public static Direction detect(String input) { + if (input == null || input.isBlank()) { + return Direction.LEFT_TO_RIGHT; + } + + AtomicLong rtl = new AtomicLong(); + AtomicLong total = new AtomicLong(); + for (String token : WORDS_PATTERN.split(input)) { + // skip urls + if (URL_PATTERN.matcher(token).matches()) { + continue; + } + + // skip numbers + if (NumberUtils.isCreatable(token)) { + continue; + } + + boolean requiresBidi = Bidi.requiresBidi(token.toCharArray(), 0, token.length()); + if (requiresBidi) { + Bidi bidi = new Bidi(token, Bidi.DIRECTION_DEFAULT_LEFT_TO_RIGHT); + if (bidi.getBaseLevel() == 1) { + rtl.incrementAndGet(); + } + } + + total.incrementAndGet(); + } + + if (total.longValue() == 0) { + return Direction.LEFT_TO_RIGHT; + } + + double ratio = rtl.doubleValue() / total.doubleValue(); + return ratio > RTL_THRESHOLD ? Direction.RIGHT_TO_LEFT : Direction.LEFT_TO_RIGHT; + } + +} diff --git a/commafeed-server/src/test/java/com/commafeed/backend/feed/FeedFetcherTest.java b/commafeed-server/src/test/java/com/commafeed/backend/feed/FeedFetcherTest.java index 089f6e3c..69f4f395 100644 --- a/commafeed-server/src/test/java/com/commafeed/backend/feed/FeedFetcherTest.java +++ b/commafeed-server/src/test/java/com/commafeed/backend/feed/FeedFetcherTest.java @@ -11,12 +11,12 @@ import org.mockito.Mock; import org.mockito.Mockito; import org.mockito.junit.jupiter.MockitoExtension; +import com.commafeed.backend.Digests; import com.commafeed.backend.HttpGetter; import com.commafeed.backend.HttpGetter.HttpResult; import com.commafeed.backend.HttpGetter.NotModifiedException; import com.commafeed.backend.feed.parser.FeedParser; import com.commafeed.backend.urlprovider.FeedURLProvider; -import com.google.gwt.thirdparty.guava.common.hash.Hashing; @ExtendWith(MockitoExtension.class) class FeedFetcherTest { @@ -43,7 +43,7 @@ class FeedFetcherTest { String lastModified = "last-modified-1"; String etag = "etag-1"; byte[] content = "content".getBytes(); - String lastContentHash = Hashing.sha1().hashBytes(content).toString(); + String lastContentHash = Digests.sha1Hex(content); Mockito.when(getter.getBinary(url, lastModified, etag)) .thenReturn(new HttpResult(content, "content-type", "last-modified-2", "etag-2", null)); diff --git a/commafeed-server/src/test/java/com/commafeed/backend/feed/parser/TextDirectionDetectorTest.java b/commafeed-server/src/test/java/com/commafeed/backend/feed/parser/TextDirectionDetectorTest.java new file mode 100644 index 00000000..8e27a06f --- /dev/null +++ b/commafeed-server/src/test/java/com/commafeed/backend/feed/parser/TextDirectionDetectorTest.java @@ -0,0 +1,53 @@ +package com.commafeed.backend.feed.parser; + +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +class TextDirectionDetectorTest { + + @Test + public void testEstimateDirection() { + Assertions.assertEquals(TextDirectionDetector.Direction.LEFT_TO_RIGHT, TextDirectionDetector.detect("")); + Assertions.assertEquals(TextDirectionDetector.Direction.LEFT_TO_RIGHT, TextDirectionDetector.detect(" ")); + Assertions.assertEquals(TextDirectionDetector.Direction.LEFT_TO_RIGHT, TextDirectionDetector.detect("! (...)")); + Assertions.assertEquals(TextDirectionDetector.Direction.LEFT_TO_RIGHT, TextDirectionDetector.detect("Pure Ascii content")); + Assertions.assertEquals(TextDirectionDetector.Direction.LEFT_TO_RIGHT, TextDirectionDetector.detect("-17.0%")); + Assertions.assertEquals(TextDirectionDetector.Direction.LEFT_TO_RIGHT, TextDirectionDetector.detect("http://foo/bar/")); + Assertions.assertEquals(TextDirectionDetector.Direction.LEFT_TO_RIGHT, + TextDirectionDetector.detect("http://foo/bar/?s=\u05d0\u05d0\u05d0\u05d0\u05d0\u05d0\u05d0\u05d0" + + "\u05d0\u05d0\u05d0\u05d0\u05d0\u05d0\u05d0\u05d0\u05d0\u05d0\u05d0" + "\u05d0\u05d0\u05d0\u05d0\u05d0\u05d0")); + Assertions.assertEquals(TextDirectionDetector.Direction.RIGHT_TO_LEFT, TextDirectionDetector.detect("\u05d0")); + Assertions.assertEquals(TextDirectionDetector.Direction.RIGHT_TO_LEFT, TextDirectionDetector.detect("\u05d0")); + Assertions.assertEquals(TextDirectionDetector.Direction.RIGHT_TO_LEFT, + TextDirectionDetector.detect("http://foo/bar/ \u05d0 http://foo2/bar2/ http://foo3/bar3/")); + Assertions.assertEquals(TextDirectionDetector.Direction.RIGHT_TO_LEFT, + TextDirectionDetector.detect("\u05d0\u05d9\u05df \u05de\u05de\u05e9 " + "\u05de\u05d4 \u05dc\u05e8\u05d0\u05d5\u05ea: " + + "\u05dc\u05d0 \u05e6\u05d9\u05dc\u05de\u05ea\u05d9 " + "\u05d4\u05e8\u05d1\u05d4 \u05d5\u05d2\u05dd \u05d0" + + "\u05dd \u05d4\u05d9\u05d9\u05ea\u05d9 \u05de\u05e6\u05dc" + "\u05dd, \u05d4\u05d9\u05d4 \u05e9\u05dd")); + Assertions.assertEquals(TextDirectionDetector.Direction.RIGHT_TO_LEFT, + TextDirectionDetector.detect("\u05db\u05d0\u05df - http://geek.co.il/gallery/v/2007-06" + + " - \u05d0\u05d9\u05df \u05de\u05de\u05e9 \u05de\u05d4 " + "\u05dc\u05e8\u05d0\u05d5\u05ea: \u05dc\u05d0 \u05e6" + + "\u05d9\u05dc\u05de\u05ea\u05d9 \u05d4\u05e8\u05d1\u05d4 " + + "\u05d5\u05d2\u05dd \u05d0\u05dd \u05d4\u05d9\u05d9\u05ea" + + "\u05d9 \u05de\u05e6\u05dc\u05dd, \u05d4\u05d9\u05d4 " + + "\u05e9\u05dd \u05d1\u05e2\u05d9\u05e7\u05e8 \u05d4\u05e8" + "\u05d1\u05d4 \u05d0\u05e0\u05e9\u05d9\u05dd. \u05de" + + "\u05d4 \u05e9\u05db\u05df - \u05d0\u05e4\u05e9\u05e8 " + "\u05dc\u05e0\u05e6\u05dc \u05d0\u05ea \u05d4\u05d4 " + + "\u05d3\u05d6\u05de\u05e0\u05d5\u05ea \u05dc\u05d4\u05e1" + "\u05ea\u05db\u05dc \u05e2\u05dc \u05db\u05de\u05d4 " + + "\u05ea\u05de\u05d5\u05e0\u05d5\u05ea \u05de\u05e9\u05e2" + + "\u05e9\u05e2\u05d5\u05ea \u05d9\u05e9\u05e0\u05d5\u05ea " + "\u05d9\u05d5\u05ea\u05e8 \u05e9\u05d9\u05e9 \u05dc" + + "\u05d9 \u05d1\u05d0\u05ea\u05e8")); + Assertions.assertEquals(TextDirectionDetector.Direction.RIGHT_TO_LEFT, + TextDirectionDetector.detect("CAPTCHA \u05de\u05e9\u05d5\u05db\u05dc\u05dc " + "\u05de\u05d3\u05d9?")); + Assertions.assertEquals(TextDirectionDetector.Direction.RIGHT_TO_LEFT, + TextDirectionDetector.detect("Yes Prime Minister \u05e2\u05d3\u05db\u05d5\u05df. " + + "\u05e9\u05d0\u05dc\u05d5 \u05d0\u05d5\u05ea\u05d9 " + "\u05de\u05d4 \u05d0\u05e0\u05d9 \u05e8\u05d5\u05e6" + + "\u05d4 \u05de\u05ea\u05e0\u05d4 \u05dc\u05d7\u05d2")); + Assertions.assertEquals(TextDirectionDetector.Direction.RIGHT_TO_LEFT, TextDirectionDetector + .detect("17.4.02 \u05e9\u05e2\u05d4:13-20 .15-00 .\u05dc\u05d0 " + "\u05d4\u05d9\u05d9\u05ea\u05d9 \u05db\u05d0\u05df.")); + Assertions.assertEquals(TextDirectionDetector.Direction.RIGHT_TO_LEFT, + TextDirectionDetector.detect("5710 5720 5730. \u05d4\u05d3\u05dc\u05ea. " + "\u05d4\u05e0\u05e9\u05d9\u05e7\u05d4")); + Assertions.assertEquals(TextDirectionDetector.Direction.RIGHT_TO_LEFT, + TextDirectionDetector.detect("\u05d4\u05d3\u05dc\u05ea http://www.google.com " + "http://www.gmail.com")); + } + +} \ No newline at end of file