diff --git a/commafeed-server/pom.xml b/commafeed-server/pom.xml index 25720d2c..82ddda2e 100644 --- a/commafeed-server/pom.xml +++ b/commafeed-server/pom.xml @@ -426,15 +426,9 @@ 0.9.30 - edu.uci.ics - crawler4j - 3.5 - - - log4j - log4j - - + org.netpreserve + urlcanon + 0.4.0 com.google.gwt diff --git a/commafeed-server/src/main/java/com/commafeed/backend/feed/FeedUtils.java b/commafeed-server/src/main/java/com/commafeed/backend/feed/FeedUtils.java index 5de8adf8..2f144d39 100644 --- a/commafeed-server/src/main/java/com/commafeed/backend/feed/FeedUtils.java +++ b/commafeed-server/src/main/java/com/commafeed/backend/feed/FeedUtils.java @@ -28,6 +28,8 @@ import org.jsoup.nodes.Entities.EscapeMode; import org.jsoup.safety.Cleaner; import org.jsoup.safety.Safelist; import org.jsoup.select.Elements; +import org.netpreserve.urlcanon.Canonicalizer; +import org.netpreserve.urlcanon.ParsedUrl; import org.w3c.css.sac.InputSource; import org.w3c.dom.css.CSSStyleDeclaration; @@ -41,7 +43,6 @@ import com.ibm.icu.text.CharsetDetector; import com.ibm.icu.text.CharsetMatch; import com.steadystate.css.parser.CSSOMParser; -import edu.uci.ics.crawler4j.url.URLCanonicalizer; import lombok.extern.slf4j.Slf4j; /** @@ -179,7 +180,10 @@ public class FeedUtils { if (url == null) { return null; } - String normalized = URLCanonicalizer.getCanonicalURL(url); + + ParsedUrl parsedUrl = ParsedUrl.parseUrl(url); + Canonicalizer.AGGRESSIVE.canonicalize(parsedUrl); + String normalized = parsedUrl.toString(); if (normalized == null) { normalized = url; }