Merge pull request #618 from ebraminio/master

Avoid GWT dependency by bringing in simplified dir estimate logic
This commit is contained in:
Athou
2014-09-11 15:33:53 +02:00
4 changed files with 121 additions and 9 deletions

View File

@@ -264,11 +264,6 @@
<artifactId>juniversalchardet</artifactId>
<version>1.0.3</version>
</dependency>
<dependency>
<groupId>com.google.gwt</groupId>
<artifactId>gwt-servlet</artifactId>
<version>2.6.1</version>
</dependency>
<dependency>
<groupId>net.sourceforge.cssparser</groupId>
<artifactId>cssparser</artifactId>

View File

@@ -0,0 +1,55 @@
package com.commafeed.backend.feed;
import java.util.regex.Pattern;
/**
 * Estimates whether a piece of text should be treated as right-to-left (RTL).
 *
 * <p>This code is copied and simplified from GWT
 * https://github.com/google-web-toolkit/gwt/blob/master/user/src/com/google/gwt/i18n/shared/BidiUtils.java
 * Released under the Apache 2.0 license; credit for it goes to Google. Please use GWT wherever possible
 * instead of this.
 */
class EstimateDirection {

	/** Share of counted words that must start with an RTL character for the text to be reported as RTL. */
	private static final float RTL_DETECTION_THRESHOLD = 0.40f;

	/** Body of a regex character class (without brackets) listing the characters treated as left-to-right. */
	private static final String LTR_CHARS =
			"A-Za-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02B8\u0300-\u0590\u0800-\u1FFF" +
			"\u2C00-\uFB1C\uFDFE-\uFE6F\uFEFD-\uFFFF";

	/** Body of a regex character class (without brackets) listing the characters treated as right-to-left. */
	private static final String RTL_CHARS =
			"\u0591-\u07FF\uFB1D-\uFDFD\uFE70-\uFEFC";

	/** Words are the runs of text between whitespace. */
	private static final Pattern WORD_SEPARATOR_RE = Pattern.compile("\\s+");

	/** Finds a word whose first LTR-or-RTL character is an RTL one. */
	private static final Pattern FIRST_STRONG_IS_RTL_RE =
			Pattern.compile("^[^" + LTR_CHARS + "]*[" + RTL_CHARS + ']');

	/** Finds a word that starts with an http URL prefix; such words are left out of the statistics. */
	private static final Pattern IS_REQUIRED_LTR_RE = Pattern.compile("^http://.*");

	/** Finds any LTR character anywhere in a word. */
	private static final Pattern HAS_ANY_LTR_RE =
			Pattern.compile("[" + LTR_CHARS + ']');

	private EstimateDirection() {
		// static helpers only, never instantiated
	}

	/**
	 * Tells whether the first LTR-or-RTL character of {@code str} is an RTL character.
	 *
	 * <p>Uses {@code find()} because the pattern only describes the beginning of the word; with
	 * {@code matches()} a word followed by punctuation would be rejected.
	 */
	private static boolean startsWithRtl(String str) {
		return FIRST_STRONG_IS_RTL_RE.matcher(str).find();
	}

	/**
	 * Tells whether {@code str} contains at least one LTR character.
	 *
	 * <p>Uses {@code find()}; {@code matches()} would only accept a word that is exactly one LTR character.
	 */
	private static boolean hasAnyLtr(String str) {
		return HAS_ANY_LTR_RE.matcher(str).find();
	}

	/** Tells whether {@code str} begins with the http URL prefix. */
	private static boolean isUrl(String str) {
		return IS_REQUIRED_LTR_RE.matcher(str).find();
	}

	/**
	 * Estimates the direction of {@code str} from its whitespace-separated words.
	 *
	 * <p>A word starting with an RTL character counts as RTL, a word containing any LTR character counts as
	 * LTR, and URLs as well as words without any LTR/RTL character (numbers, punctuation) are ignored.
	 *
	 * @param str the text to inspect; {@code null} is treated like text without any countable word
	 * @return {@code true} when more than {@link #RTL_DETECTION_THRESHOLD} of the counted words are RTL,
	 *         {@code false} otherwise, including when no word could be counted
	 */
	static boolean isRTL(String str) {
		if (str == null) {
			return false;
		}
		int rtlCount = 0;
		int total = 0;
		for (String token : WORD_SEPARATOR_RE.split(str)) {
			if (startsWithRtl(token)) {
				rtlCount++;
				total++;
			} else if (!isUrl(token) && hasAnyLtr(token)) {
				// the URL check comes first so that the letters of a URL never count as an LTR word
				total++;
			}
		}
		return total > 0 && (float) rtlCount / total > RTL_DETECTION_THRESHOLD;
	}
}

View File

@@ -33,8 +33,6 @@ import com.commafeed.backend.model.FeedEntry;
import com.commafeed.backend.model.FeedSubscription;
import com.commafeed.frontend.model.Entry;
import com.google.common.collect.Lists;
import com.google.gwt.i18n.client.HasDirection.Direction;
import com.google.gwt.i18n.shared.BidiUtils;
import com.steadystate.css.parser.CSSOMParser;
import edu.uci.ics.crawler4j.url.URLCanonicalizer;
@@ -291,8 +289,7 @@ public class FeedUtils {
return false;
}
Direction direction = BidiUtils.get().estimateDirection(text);
return direction == Direction.RTL;
return EstimateDirection.isRTL(text);
}
public static String trimInvalidXmlCharacters(String xml) {

View File

@@ -0,0 +1,65 @@
package com.commafeed.backend.feed;
import org.junit.Assert;
import org.junit.Test;
import static com.commafeed.backend.feed.EstimateDirection.isRTL;
/**
* These tests are copied and simplified from GWT
* https://github.com/google-web-toolkit/gwt/blob/master/user/test/com/google/gwt/i18n/shared/BidiUtilsTest.java
* Released under Apache 2.0 license, credit of it goes to Google and please use GWT wherever possible instead of this
*/
public class EstimateDirectionTest {
@Test
public void testEstimateDirection() {
Assert.assertEquals(false, isRTL(""));
Assert.assertEquals(false, isRTL(" "));
Assert.assertEquals(false, isRTL("! (...)"));
Assert.assertEquals(false, isRTL("Pure Ascii content"));
Assert.assertEquals(false, isRTL("-17.0%"));
Assert.assertEquals(false, isRTL("http://foo/bar/"));
Assert.assertEquals(false, isRTL("http://foo/bar/?s=\u05d0\u05d0\u05d0\u05d0\u05d0\u05d0\u05d0\u05d0"
+ "\u05d0\u05d0\u05d0\u05d0\u05d0\u05d0\u05d0\u05d0\u05d0\u05d0\u05d0"
+ "\u05d0\u05d0\u05d0\u05d0\u05d0\u05d0"));
Assert.assertEquals(true, isRTL("\u05d0"));
Assert.assertEquals(true, isRTL("\u05d0"));
Assert.assertEquals(true, isRTL("9 \u05d0 -> 17.5, 23, 45, 19"));
Assert.assertEquals(true, isRTL("http://foo/bar/ \u05d0 http://foo2/bar2/ http://foo3/bar3/"));
Assert.assertEquals(true, isRTL("\u05d0\u05d9\u05df \u05de\u05de\u05e9 "
+ "\u05de\u05d4 \u05dc\u05e8\u05d0\u05d5\u05ea: "
+ "\u05dc\u05d0 \u05e6\u05d9\u05dc\u05de\u05ea\u05d9 "
+ "\u05d4\u05e8\u05d1\u05d4 \u05d5\u05d2\u05dd \u05d0"
+ "\u05dd \u05d4\u05d9\u05d9\u05ea\u05d9 \u05de\u05e6\u05dc"
+ "\u05dd, \u05d4\u05d9\u05d4 \u05e9\u05dd"));
Assert.assertEquals(true, isRTL("\u05db\u05d0\u05df - http://geek.co.il/gallery/v/2007-06"
+ " - \u05d0\u05d9\u05df \u05de\u05de\u05e9 \u05de\u05d4 "
+ "\u05dc\u05e8\u05d0\u05d5\u05ea: \u05dc\u05d0 \u05e6"
+ "\u05d9\u05dc\u05de\u05ea\u05d9 \u05d4\u05e8\u05d1\u05d4 "
+ "\u05d5\u05d2\u05dd \u05d0\u05dd \u05d4\u05d9\u05d9\u05ea"
+ "\u05d9 \u05de\u05e6\u05dc\u05dd, \u05d4\u05d9\u05d4 "
+ "\u05e9\u05dd \u05d1\u05e2\u05d9\u05e7\u05e8 \u05d4\u05e8"
+ "\u05d1\u05d4 \u05d0\u05e0\u05e9\u05d9\u05dd. \u05de"
+ "\u05d4 \u05e9\u05db\u05df - \u05d0\u05e4\u05e9\u05e8 "
+ "\u05dc\u05e0\u05e6\u05dc \u05d0\u05ea \u05d4\u05d4 "
+ "\u05d3\u05d6\u05de\u05e0\u05d5\u05ea \u05dc\u05d4\u05e1"
+ "\u05ea\u05db\u05dc \u05e2\u05dc \u05db\u05de\u05d4 "
+ "\u05ea\u05de\u05d5\u05e0\u05d5\u05ea \u05de\u05e9\u05e2"
+ "\u05e9\u05e2\u05d5\u05ea \u05d9\u05e9\u05e0\u05d5\u05ea "
+ "\u05d9\u05d5\u05ea\u05e8 \u05e9\u05d9\u05e9 \u05dc"
+ "\u05d9 \u05d1\u05d0\u05ea\u05e8"));
Assert.assertEquals(true, isRTL("CAPTCHA \u05de\u05e9\u05d5\u05db\u05dc\u05dc "
+ "\u05de\u05d3\u05d9?"));
Assert.assertEquals(true, isRTL("Yes Prime Minister \u05e2\u05d3\u05db\u05d5\u05df. "
+ "\u05e9\u05d0\u05dc\u05d5 \u05d0\u05d5\u05ea\u05d9 "
+ "\u05de\u05d4 \u05d0\u05e0\u05d9 \u05e8\u05d5\u05e6"
+ "\u05d4 \u05de\u05ea\u05e0\u05d4 \u05dc\u05d7\u05d2"));
Assert.assertEquals(true, isRTL("17.4.02 \u05e9\u05e2\u05d4:13-20 .15-00 .\u05dc\u05d0 "
+ "\u05d4\u05d9\u05d9\u05ea\u05d9 \u05db\u05d0\u05df."));
Assert.assertEquals(true, isRTL("5710 5720 5730. \u05d4\u05d3\u05dc\u05ea. "
+ "\u05d4\u05e0\u05e9\u05d9\u05e7\u05d4"));
Assert.assertEquals(true, isRTL("\u05d4\u05d3\u05dc\u05ea http://www.google.com "
+ "http://www.gmail.com"));
}
}