forked from Archives/Athou_commafeed
Merge pull request #618 from ebraminio/master
Avoid GWT dependency by bringing simplified dir estimate logic
This commit is contained in:
5
pom.xml
5
pom.xml
@@ -264,11 +264,6 @@
|
||||
<artifactId>juniversalchardet</artifactId>
|
||||
<version>1.0.3</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.google.gwt</groupId>
|
||||
<artifactId>gwt-servlet</artifactId>
|
||||
<version>2.6.1</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>net.sourceforge.cssparser</groupId>
|
||||
<artifactId>cssparser</artifactId>
|
||||
|
||||
@@ -0,0 +1,55 @@
|
||||
package com.commafeed.backend.feed;
|
||||
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/**
 * Word-based heuristic that estimates whether a piece of text should be rendered right-to-left.
 *
 * <p>This code is copied and simplified from GWT
 * https://github.com/google-web-toolkit/gwt/blob/master/user/src/com/google/gwt/i18n/shared/BidiUtils.java
 * Released under the Apache 2.0 license; credit for it goes to Google. Please use GWT wherever
 * possible instead of this.
 */
final class EstimateDirection {

    /** Fraction of counted words that must be RTL for the whole text to be considered RTL. */
    private static final float RTL_DETECTION_THRESHOLD = 0.40f;

    /** Character-class body (ranges, without the surrounding brackets) of strongly LTR characters. */
    private static final String LTR_CHARS =
            "A-Za-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02B8\u0300-\u0590\u0800-\u1FFF" +
            "\u2C00-\uFB1C\uFDFE-\uFE6F\uFEFD-\uFFFF";

    /** Character-class body (ranges, without the surrounding brackets) of strongly RTL characters. */
    private static final String RTL_CHARS =
            "\u0591-\u07FF\uFB1D-\uFDFD\uFE70-\uFEFC";

    private static final Pattern WORD_SEPARATOR_RE = Pattern.compile("\\s+");

    /** Matches when the first strongly directional character of the input is RTL. */
    private static final Pattern FIRST_STRONG_IS_RTL_RE =
            Pattern.compile("^[^" + LTR_CHARS + "]*[" + RTL_CHARS + ']');

    /** Tokens matching this pattern are left out of the word count entirely. */
    private static final Pattern IS_REQUIRED_LTR_RE = Pattern.compile("^http://.*");

    /** Matches when the input contains at least one strongly LTR character. */
    private static final Pattern HAS_ANY_LTR_RE =
            Pattern.compile("[" + LTR_CHARS + ']');

    private EstimateDirection() {
        // static utility class, not meant to be instantiated
    }

    /**
     * Tells whether the first strongly directional character of {@code str} is RTL.
     *
     * <p>Uses {@code find()} rather than {@code matches()}: the pattern only describes a prefix,
     * so requiring a whole-string match would reject RTL words followed by punctuation or other
     * characters.
     */
    private static boolean startsWithRtl(String str) {
        return FIRST_STRONG_IS_RTL_RE.matcher(str).find();
    }

    /**
     * Tells whether {@code str} contains any strongly LTR character.
     *
     * <p>Uses {@code find()} rather than {@code matches()}: the pattern is a single character
     * class, so a whole-string match would only succeed for one-character tokens.
     */
    private static boolean hasAnyLtr(String str) {
        return HAS_ANY_LTR_RE.matcher(str).find();
    }

    /**
     * Estimates the direction of {@code str} by splitting it on whitespace and classifying each
     * word.
     *
     * <p>A word whose first strong character is RTL counts as an RTL word. A word starting with
     * {@code http://} is ignored. Any other word containing an LTR character counts as an LTR
     * word. Words with no strong character (numbers, punctuation) are ignored.
     *
     * @param str the text to inspect; may be {@code null}
     * @return {@code true} if more than 40% of the counted words are RTL; {@code false}
     *         otherwise, including when {@code str} is {@code null} or no word was counted
     */
    static boolean isRTL(String str) {
        if (str == null) {
            return false;
        }

        int rtlCount = 0;
        int total = 0;
        for (String token : WORD_SEPARATOR_RE.split(str)) {
            if (startsWithRtl(token)) {
                rtlCount++;
                total++;
            } else if (IS_REQUIRED_LTR_RE.matcher(token).find()) {
                // URLs are neutral here: they must not count as LTR words.
                continue;
            } else if (hasAnyLtr(token)) {
                total++;
            }
            // Tokens with no strong character do not influence the estimate.
        }

        return total != 0 && (float) rtlCount / total > RTL_DETECTION_THRESHOLD;
    }
}
|
||||
@@ -33,8 +33,6 @@ import com.commafeed.backend.model.FeedEntry;
|
||||
import com.commafeed.backend.model.FeedSubscription;
|
||||
import com.commafeed.frontend.model.Entry;
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.gwt.i18n.client.HasDirection.Direction;
|
||||
import com.google.gwt.i18n.shared.BidiUtils;
|
||||
import com.steadystate.css.parser.CSSOMParser;
|
||||
|
||||
import edu.uci.ics.crawler4j.url.URLCanonicalizer;
|
||||
@@ -291,8 +289,7 @@ public class FeedUtils {
|
||||
return false;
|
||||
}
|
||||
|
||||
Direction direction = BidiUtils.get().estimateDirection(text);
|
||||
return direction == Direction.RTL;
|
||||
return EstimateDirection.isRTL(text);
|
||||
}
|
||||
|
||||
public static String trimInvalidXmlCharacters(String xml) {
|
||||
|
||||
@@ -0,0 +1,65 @@
|
||||
package com.commafeed.backend.feed;
|
||||
|
||||
import org.junit.Assert;
|
||||
import org.junit.Test;
|
||||
|
||||
import static com.commafeed.backend.feed.EstimateDirection.isRTL;
|
||||
|
||||
/**
|
||||
* These tests are copied and simplified from GWT
|
||||
* https://github.com/google-web-toolkit/gwt/blob/master/user/test/com/google/gwt/i18n/shared/BidiUtilsTest.java
|
||||
* Released under Apache 2.0 license, credit of it goes to Google and please use GWT wherever possible instead of this
|
||||
*/
|
||||
public class EstimateDirectionTest {
|
||||
|
||||
@Test
|
||||
public void testEstimateDirection() {
|
||||
Assert.assertEquals(false, isRTL(""));
|
||||
Assert.assertEquals(false, isRTL(" "));
|
||||
Assert.assertEquals(false, isRTL("! (...)"));
|
||||
Assert.assertEquals(false, isRTL("Pure Ascii content"));
|
||||
Assert.assertEquals(false, isRTL("-17.0%"));
|
||||
Assert.assertEquals(false, isRTL("http://foo/bar/"));
|
||||
Assert.assertEquals(false, isRTL("http://foo/bar/?s=\u05d0\u05d0\u05d0\u05d0\u05d0\u05d0\u05d0\u05d0"
|
||||
+ "\u05d0\u05d0\u05d0\u05d0\u05d0\u05d0\u05d0\u05d0\u05d0\u05d0\u05d0"
|
||||
+ "\u05d0\u05d0\u05d0\u05d0\u05d0\u05d0"));
|
||||
Assert.assertEquals(true, isRTL("\u05d0"));
|
||||
Assert.assertEquals(true, isRTL("\u05d0"));
|
||||
Assert.assertEquals(true, isRTL("9 \u05d0 -> 17.5, 23, 45, 19"));
|
||||
Assert.assertEquals(true, isRTL("http://foo/bar/ \u05d0 http://foo2/bar2/ http://foo3/bar3/"));
|
||||
Assert.assertEquals(true, isRTL("\u05d0\u05d9\u05df \u05de\u05de\u05e9 "
|
||||
+ "\u05de\u05d4 \u05dc\u05e8\u05d0\u05d5\u05ea: "
|
||||
+ "\u05dc\u05d0 \u05e6\u05d9\u05dc\u05de\u05ea\u05d9 "
|
||||
+ "\u05d4\u05e8\u05d1\u05d4 \u05d5\u05d2\u05dd \u05d0"
|
||||
+ "\u05dd \u05d4\u05d9\u05d9\u05ea\u05d9 \u05de\u05e6\u05dc"
|
||||
+ "\u05dd, \u05d4\u05d9\u05d4 \u05e9\u05dd"));
|
||||
Assert.assertEquals(true, isRTL("\u05db\u05d0\u05df - http://geek.co.il/gallery/v/2007-06"
|
||||
+ " - \u05d0\u05d9\u05df \u05de\u05de\u05e9 \u05de\u05d4 "
|
||||
+ "\u05dc\u05e8\u05d0\u05d5\u05ea: \u05dc\u05d0 \u05e6"
|
||||
+ "\u05d9\u05dc\u05de\u05ea\u05d9 \u05d4\u05e8\u05d1\u05d4 "
|
||||
+ "\u05d5\u05d2\u05dd \u05d0\u05dd \u05d4\u05d9\u05d9\u05ea"
|
||||
+ "\u05d9 \u05de\u05e6\u05dc\u05dd, \u05d4\u05d9\u05d4 "
|
||||
+ "\u05e9\u05dd \u05d1\u05e2\u05d9\u05e7\u05e8 \u05d4\u05e8"
|
||||
+ "\u05d1\u05d4 \u05d0\u05e0\u05e9\u05d9\u05dd. \u05de"
|
||||
+ "\u05d4 \u05e9\u05db\u05df - \u05d0\u05e4\u05e9\u05e8 "
|
||||
+ "\u05dc\u05e0\u05e6\u05dc \u05d0\u05ea \u05d4\u05d4 "
|
||||
+ "\u05d3\u05d6\u05de\u05e0\u05d5\u05ea \u05dc\u05d4\u05e1"
|
||||
+ "\u05ea\u05db\u05dc \u05e2\u05dc \u05db\u05de\u05d4 "
|
||||
+ "\u05ea\u05de\u05d5\u05e0\u05d5\u05ea \u05de\u05e9\u05e2"
|
||||
+ "\u05e9\u05e2\u05d5\u05ea \u05d9\u05e9\u05e0\u05d5\u05ea "
|
||||
+ "\u05d9\u05d5\u05ea\u05e8 \u05e9\u05d9\u05e9 \u05dc"
|
||||
+ "\u05d9 \u05d1\u05d0\u05ea\u05e8"));
|
||||
Assert.assertEquals(true, isRTL("CAPTCHA \u05de\u05e9\u05d5\u05db\u05dc\u05dc "
|
||||
+ "\u05de\u05d3\u05d9?"));
|
||||
Assert.assertEquals(true, isRTL("Yes Prime Minister \u05e2\u05d3\u05db\u05d5\u05df. "
|
||||
+ "\u05e9\u05d0\u05dc\u05d5 \u05d0\u05d5\u05ea\u05d9 "
|
||||
+ "\u05de\u05d4 \u05d0\u05e0\u05d9 \u05e8\u05d5\u05e6"
|
||||
+ "\u05d4 \u05de\u05ea\u05e0\u05d4 \u05dc\u05d7\u05d2"));
|
||||
Assert.assertEquals(true, isRTL("17.4.02 \u05e9\u05e2\u05d4:13-20 .15-00 .\u05dc\u05d0 "
|
||||
+ "\u05d4\u05d9\u05d9\u05ea\u05d9 \u05db\u05d0\u05df."));
|
||||
Assert.assertEquals(true, isRTL("5710 5720 5730. \u05d4\u05d3\u05dc\u05ea. "
|
||||
+ "\u05d4\u05e0\u05e9\u05d9\u05e7\u05d4"));
|
||||
Assert.assertEquals(true, isRTL("\u05d4\u05d3\u05dc\u05ea http://www.google.com "
|
||||
+ "http://www.gmail.com"));
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user