/**
 * Estimates whether a piece of text is predominantly right-to-left (RTL).
 *
 * <p>This code is copied and simplified from GWT
 * https://github.com/google-web-toolkit/gwt/blob/master/user/src/com/google/gwt/i18n/shared/BidiUtils.java
 * Released under the Apache 2.0 license; credit for it goes to Google. Please use GWT wherever
 * possible instead of this.
 */
final class EstimateDirection {

    /** Fraction of counted words that must start with an RTL character for the text to be RTL. */
    private static final float RTL_DETECTION_THRESHOLD = 0.40f;

    /** Body of a regex character class listing the ranges treated as LTR characters. */
    private static final String LTR_CHARS =
            "A-Za-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02B8\u0300-\u0590\u0800-\u1FFF"
            + "\u2C00-\uFB1C\uFDFE-\uFE6F\uFEFD-\uFFFF";

    /** Body of a regex character class listing the ranges treated as RTL characters. */
    private static final String RTL_CHARS =
            "\u0591-\u07FF\uFB1D-\uFDFD\uFE70-\uFEFC";

    /** Splits the input into whitespace-separated words. */
    private static final Pattern WORD_SEPARATOR_RE = Pattern.compile("\\s+");

    /** Finds a leading run of non-LTR characters followed by one RTL character. */
    private static final Pattern FIRST_STRONG_IS_RTL_RE =
            Pattern.compile("^[^" + LTR_CHARS + "]*[" + RTL_CHARS + "]");

    /** Words starting with this prefix are left out of the count entirely. */
    private static final Pattern IS_REQUIRED_LTR_RE = Pattern.compile("^http://.*");

    /** Finds a single LTR character anywhere in a word. */
    private static final Pattern HAS_ANY_LTR_RE =
            Pattern.compile("[" + LTR_CHARS + "]");

    private EstimateDirection() {
        // static utility, not meant to be instantiated
    }

    /**
     * Tells whether the first LTR-or-RTL character of {@code str} is an RTL one.
     *
     * <p>Uses {@code find()} rather than {@code matches()}: the pattern only describes a prefix,
     * so requiring the whole word to match would reject RTL words with trailing punctuation.
     */
    private static boolean startsWithRtl(String str) {
        return FIRST_STRONG_IS_RTL_RE.matcher(str).find();
    }

    /**
     * Tells whether {@code str} contains at least one LTR character.
     *
     * <p>Uses {@code find()} rather than {@code matches()}: the pattern is a single character
     * class, so a whole-word match would only ever accept one-character words.
     */
    private static boolean hasAnyLtr(String str) {
        return HAS_ANY_LTR_RE.matcher(str).find();
    }

    /**
     * Estimates the direction of {@code str} by counting its whitespace-separated words.
     *
     * <p>A word whose first LTR-or-RTL character is RTL counts as an RTL word. A word matching
     * {@link #IS_REQUIRED_LTR_RE} is ignored. Any other word containing an LTR character counts
     * as an LTR word. Words with neither kind of character (numbers, punctuation) are ignored.
     *
     * @param str the text to inspect; {@code null} is treated like empty text
     * @return {@code true} when more than {@link #RTL_DETECTION_THRESHOLD} of the counted words
     *         are RTL words, {@code false} otherwise (including when no word was counted)
     */
    static boolean isRTL(String str) {
        if (str == null) {
            return false;
        }

        int rtlCount = 0;
        int total = 0;
        for (String token : WORD_SEPARATOR_RE.split(str)) {
            if (startsWithRtl(token)) {
                rtlCount++;
                total++;
            } else if (!IS_REQUIRED_LTR_RE.matcher(token).matches() && hasAnyLtr(token)) {
                total++;
            }
        }

        // total > 0 guards the division for text without any counted word
        return total > 0 && (float) rtlCount / total > RTL_DETECTION_THRESHOLD;
    }
}
https://github.com/google-web-toolkit/gwt/blob/master/user/test/com/google/gwt/i18n/shared/BidiUtilsTest.java + * Released under Apache 2.0 license, credit of it goes to Google and please use GWT wherever possible instead of this + */ +public class EstimateDirectionTest { + + @Test + public void testEstimateDirection() { + Assert.assertEquals(false, isRTL("")); + Assert.assertEquals(false, isRTL(" ")); + Assert.assertEquals(false, isRTL("! (...)")); + Assert.assertEquals(false, isRTL("Pure Ascii content")); + Assert.assertEquals(false, isRTL("-17.0%")); + Assert.assertEquals(false, isRTL("http://foo/bar/")); + Assert.assertEquals(false, isRTL("http://foo/bar/?s=\u05d0\u05d0\u05d0\u05d0\u05d0\u05d0\u05d0\u05d0" + + "\u05d0\u05d0\u05d0\u05d0\u05d0\u05d0\u05d0\u05d0\u05d0\u05d0\u05d0" + + "\u05d0\u05d0\u05d0\u05d0\u05d0\u05d0")); + Assert.assertEquals(true, isRTL("\u05d0")); + Assert.assertEquals(true, isRTL("\u05d0")); + Assert.assertEquals(true, isRTL("9 \u05d0 -> 17.5, 23, 45, 19")); + Assert.assertEquals(true, isRTL("http://foo/bar/ \u05d0 http://foo2/bar2/ http://foo3/bar3/")); + Assert.assertEquals(true, isRTL("\u05d0\u05d9\u05df \u05de\u05de\u05e9 " + + "\u05de\u05d4 \u05dc\u05e8\u05d0\u05d5\u05ea: " + + "\u05dc\u05d0 \u05e6\u05d9\u05dc\u05de\u05ea\u05d9 " + + "\u05d4\u05e8\u05d1\u05d4 \u05d5\u05d2\u05dd \u05d0" + + "\u05dd \u05d4\u05d9\u05d9\u05ea\u05d9 \u05de\u05e6\u05dc" + + "\u05dd, \u05d4\u05d9\u05d4 \u05e9\u05dd")); + Assert.assertEquals(true, isRTL("\u05db\u05d0\u05df - http://geek.co.il/gallery/v/2007-06" + + " - \u05d0\u05d9\u05df \u05de\u05de\u05e9 \u05de\u05d4 " + + "\u05dc\u05e8\u05d0\u05d5\u05ea: \u05dc\u05d0 \u05e6" + + "\u05d9\u05dc\u05de\u05ea\u05d9 \u05d4\u05e8\u05d1\u05d4 " + + "\u05d5\u05d2\u05dd \u05d0\u05dd \u05d4\u05d9\u05d9\u05ea" + + "\u05d9 \u05de\u05e6\u05dc\u05dd, \u05d4\u05d9\u05d4 " + + "\u05e9\u05dd \u05d1\u05e2\u05d9\u05e7\u05e8 \u05d4\u05e8" + + "\u05d1\u05d4 \u05d0\u05e0\u05e9\u05d9\u05dd. 
\u05de" + + "\u05d4 \u05e9\u05db\u05df - \u05d0\u05e4\u05e9\u05e8 " + + "\u05dc\u05e0\u05e6\u05dc \u05d0\u05ea \u05d4\u05d4 " + + "\u05d3\u05d6\u05de\u05e0\u05d5\u05ea \u05dc\u05d4\u05e1" + + "\u05ea\u05db\u05dc \u05e2\u05dc \u05db\u05de\u05d4 " + + "\u05ea\u05de\u05d5\u05e0\u05d5\u05ea \u05de\u05e9\u05e2" + + "\u05e9\u05e2\u05d5\u05ea \u05d9\u05e9\u05e0\u05d5\u05ea " + + "\u05d9\u05d5\u05ea\u05e8 \u05e9\u05d9\u05e9 \u05dc" + + "\u05d9 \u05d1\u05d0\u05ea\u05e8")); + Assert.assertEquals(true, isRTL("CAPTCHA \u05de\u05e9\u05d5\u05db\u05dc\u05dc " + + "\u05de\u05d3\u05d9?")); + Assert.assertEquals(true, isRTL("Yes Prime Minister \u05e2\u05d3\u05db\u05d5\u05df. " + + "\u05e9\u05d0\u05dc\u05d5 \u05d0\u05d5\u05ea\u05d9 " + + "\u05de\u05d4 \u05d0\u05e0\u05d9 \u05e8\u05d5\u05e6" + + "\u05d4 \u05de\u05ea\u05e0\u05d4 \u05dc\u05d7\u05d2")); + Assert.assertEquals(true, isRTL("17.4.02 \u05e9\u05e2\u05d4:13-20 .15-00 .\u05dc\u05d0 " + + "\u05d4\u05d9\u05d9\u05ea\u05d9 \u05db\u05d0\u05df.")); + Assert.assertEquals(true, isRTL("5710 5720 5730. \u05d4\u05d3\u05dc\u05ea. " + + "\u05d4\u05e0\u05e9\u05d9\u05e7\u05d4")); + Assert.assertEquals(true, isRTL("\u05d4\u05d3\u05dc\u05ea http://www.google.com " + + "http://www.gmail.com")); + } +} \ No newline at end of file