Use urlcanon instead of crawler4j because we only used it for URL canonicalization

This commit is contained in:
Athou
2023-04-29 09:04:22 +02:00
parent 00f6c04611
commit 0a99dacb6b
2 changed files with 9 additions and 11 deletions

View File

@@ -426,15 +426,9 @@
<version>0.9.30</version>
</dependency>
<dependency>
<groupId>edu.uci.ics</groupId>
<artifactId>crawler4j</artifactId>
<version>3.5</version>
<exclusions>
<exclusion>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
</exclusion>
</exclusions>
<groupId>org.netpreserve</groupId>
<artifactId>urlcanon</artifactId>
<version>0.4.0</version>
</dependency>
<dependency>
<groupId>com.google.gwt</groupId>

View File

@@ -28,6 +28,8 @@ import org.jsoup.nodes.Entities.EscapeMode;
import org.jsoup.safety.Cleaner;
import org.jsoup.safety.Safelist;
import org.jsoup.select.Elements;
import org.netpreserve.urlcanon.Canonicalizer;
import org.netpreserve.urlcanon.ParsedUrl;
import org.w3c.css.sac.InputSource;
import org.w3c.dom.css.CSSStyleDeclaration;
@@ -41,7 +43,6 @@ import com.ibm.icu.text.CharsetDetector;
import com.ibm.icu.text.CharsetMatch;
import com.steadystate.css.parser.CSSOMParser;
import edu.uci.ics.crawler4j.url.URLCanonicalizer;
import lombok.extern.slf4j.Slf4j;
/**
@@ -179,7 +180,10 @@ public class FeedUtils {
if (url == null) {
return null;
}
String normalized = URLCanonicalizer.getCanonicalURL(url);
ParsedUrl parsedUrl = ParsedUrl.parseUrl(url);
Canonicalizer.AGGRESSIVE.canonicalize(parsedUrl);
String normalized = parsedUrl.toString();
if (normalized == null) {
normalized = url;
}