mirror of
https://github.com/Athou/commafeed.git
synced 2026-03-21 21:37:29 +00:00
use urlcanon instead of crawler4j because we only used crawler4j for URL canonicalization
This commit is contained in:
@@ -426,15 +426,9 @@
|
||||
<version>0.9.30</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>edu.uci.ics</groupId>
|
||||
<artifactId>crawler4j</artifactId>
|
||||
<version>3.5</version>
|
||||
<exclusions>
|
||||
<exclusion>
|
||||
<groupId>log4j</groupId>
|
||||
<artifactId>log4j</artifactId>
|
||||
</exclusion>
|
||||
</exclusions>
|
||||
<groupId>org.netpreserve</groupId>
|
||||
<artifactId>urlcanon</artifactId>
|
||||
<version>0.4.0</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.google.gwt</groupId>
|
||||
|
||||
@@ -28,6 +28,8 @@ import org.jsoup.nodes.Entities.EscapeMode;
|
||||
import org.jsoup.safety.Cleaner;
|
||||
import org.jsoup.safety.Safelist;
|
||||
import org.jsoup.select.Elements;
|
||||
import org.netpreserve.urlcanon.Canonicalizer;
|
||||
import org.netpreserve.urlcanon.ParsedUrl;
|
||||
import org.w3c.css.sac.InputSource;
|
||||
import org.w3c.dom.css.CSSStyleDeclaration;
|
||||
|
||||
@@ -41,7 +43,6 @@ import com.ibm.icu.text.CharsetDetector;
|
||||
import com.ibm.icu.text.CharsetMatch;
|
||||
import com.steadystate.css.parser.CSSOMParser;
|
||||
|
||||
import edu.uci.ics.crawler4j.url.URLCanonicalizer;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
/**
|
||||
@@ -179,7 +180,10 @@ public class FeedUtils {
|
||||
if (url == null) {
|
||||
return null;
|
||||
}
|
||||
String normalized = URLCanonicalizer.getCanonicalURL(url);
|
||||
|
||||
ParsedUrl parsedUrl = ParsedUrl.parseUrl(url);
|
||||
Canonicalizer.AGGRESSIVE.canonicalize(parsedUrl);
|
||||
String normalized = parsedUrl.toString();
|
||||
if (normalized == null) {
|
||||
normalized = url;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user