forked from Archives/Athou_commafeed
Use urlcanon instead of crawler4j, because crawler4j was only used for URL canonicalization
This commit is contained in:
@@ -426,15 +426,9 @@
|
|||||||
<version>0.9.30</version>
|
<version>0.9.30</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>edu.uci.ics</groupId>
|
<groupId>org.netpreserve</groupId>
|
||||||
<artifactId>crawler4j</artifactId>
|
<artifactId>urlcanon</artifactId>
|
||||||
<version>3.5</version>
|
<version>0.4.0</version>
|
||||||
<exclusions>
|
|
||||||
<exclusion>
|
|
||||||
<groupId>log4j</groupId>
|
|
||||||
<artifactId>log4j</artifactId>
|
|
||||||
</exclusion>
|
|
||||||
</exclusions>
|
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>com.google.gwt</groupId>
|
<groupId>com.google.gwt</groupId>
|
||||||
|
|||||||
@@ -28,6 +28,8 @@ import org.jsoup.nodes.Entities.EscapeMode;
|
|||||||
import org.jsoup.safety.Cleaner;
|
import org.jsoup.safety.Cleaner;
|
||||||
import org.jsoup.safety.Safelist;
|
import org.jsoup.safety.Safelist;
|
||||||
import org.jsoup.select.Elements;
|
import org.jsoup.select.Elements;
|
||||||
|
import org.netpreserve.urlcanon.Canonicalizer;
|
||||||
|
import org.netpreserve.urlcanon.ParsedUrl;
|
||||||
import org.w3c.css.sac.InputSource;
|
import org.w3c.css.sac.InputSource;
|
||||||
import org.w3c.dom.css.CSSStyleDeclaration;
|
import org.w3c.dom.css.CSSStyleDeclaration;
|
||||||
|
|
||||||
@@ -41,7 +43,6 @@ import com.ibm.icu.text.CharsetDetector;
|
|||||||
import com.ibm.icu.text.CharsetMatch;
|
import com.ibm.icu.text.CharsetMatch;
|
||||||
import com.steadystate.css.parser.CSSOMParser;
|
import com.steadystate.css.parser.CSSOMParser;
|
||||||
|
|
||||||
import edu.uci.ics.crawler4j.url.URLCanonicalizer;
|
|
||||||
import lombok.extern.slf4j.Slf4j;
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -179,7 +180,10 @@ public class FeedUtils {
|
|||||||
if (url == null) {
|
if (url == null) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
String normalized = URLCanonicalizer.getCanonicalURL(url);
|
|
||||||
|
ParsedUrl parsedUrl = ParsedUrl.parseUrl(url);
|
||||||
|
Canonicalizer.AGGRESSIVE.canonicalize(parsedUrl);
|
||||||
|
String normalized = parsedUrl.toString();
|
||||||
if (normalized == null) {
|
if (normalized == null) {
|
||||||
normalized = url;
|
normalized = url;
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user