diff --git a/pom.xml b/pom.xml index 76dc29e7..a822533b 100644 --- a/pom.xml +++ b/pom.xml @@ -46,8 +46,8 @@ maven-compiler-plugin 3.1 - 1.7 - 1.7 + 1.6 + 1.6 -proc:none @@ -270,11 +270,6 @@ - - edu.uci.ics - crawler4j - 3.5 - org.jdom jdom diff --git a/src/main/java/edu/uci/ics/crawler4j/url/URLCanonicalizer.java b/src/main/java/edu/uci/ics/crawler4j/url/URLCanonicalizer.java new file mode 100644 index 00000000..3f5196a8 --- /dev/null +++ b/src/main/java/edu/uci/ics/crawler4j/url/URLCanonicalizer.java @@ -0,0 +1,211 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package edu.uci.ics.crawler4j.url; + +import java.net.MalformedURLException; +import java.net.URI; +import java.net.URISyntaxException; +import java.net.URL; +import java.net.URLDecoder; +import java.net.URLEncoder; +import java.util.HashMap; +import java.util.Map; +import java.util.SortedMap; +import java.util.TreeMap; + +/** + * See http://en.wikipedia.org/wiki/URL_normalization for a reference Note: some + * parts of the code are adapted from: http://stackoverflow.com/a/4057470/405418 + * + * @author Yasser Ganjisaffar + */ +public class URLCanonicalizer { + + public static String getCanonicalURL(String url) { + return getCanonicalURL(url, null); + } + + public static String getCanonicalURL(String href, String context) { + + try { + URL canonicalURL = new URL(UrlResolver.resolveUrl(context == null ? "" : context, href)); + + String host = canonicalURL.getHost().toLowerCase(); + if (host == "") { + // This is an invalid Url. + return null; + } + + String path = canonicalURL.getPath(); + + /* + * Normalize: no empty segments (i.e., "//"), no segments equal to + * ".", and no segments equal to ".." that are preceded by a segment + * not equal to "..". + */ + path = new URI(path).normalize().toString(); + + /* + * Convert '//' -> '/' + */ + int idx = path.indexOf("//"); + while (idx >= 0) { + path = path.replace("//", "/"); + idx = path.indexOf("//"); + } + + /* + * Drop starting '/../' + */ + while (path.startsWith("/../")) { + path = path.substring(3); + } + + /* + * Trim + */ + path = path.trim(); + + final SortedMap params = createParameterMap(canonicalURL.getQuery()); + final String queryString; + + if (params != null && params.size() > 0) { + String canonicalParams = canonicalize(params); + queryString = (canonicalParams.isEmpty() ? "" : "?" 
+ canonicalParams); + } else { + queryString = ""; + } + + /* + * Add starting slash if needed + */ + if (path.length() == 0) { + path = "/" + path; + } + + /* + * Drop default port: example.com:80 -> example.com + */ + int port = canonicalURL.getPort(); + if (port == canonicalURL.getDefaultPort()) { + port = -1; + } + + String protocol = canonicalURL.getProtocol().toLowerCase(); + String pathAndQueryString = normalizePath(path) + queryString; + + URL result = new URL(protocol, host, port, pathAndQueryString); + return result.toExternalForm(); + + } catch (MalformedURLException ex) { + return null; + } catch (URISyntaxException ex) { + return null; + } + } + + /** + * Takes a query string, separates the constituent name-value pairs, and + * stores them in a SortedMap ordered by lexicographical order. + * + * @return Null if there is no query string. + */ + private static SortedMap createParameterMap(final String queryString) { + if (queryString == null || queryString.isEmpty()) { + return null; + } + + final String[] pairs = queryString.split("&"); + final Map params = new HashMap(pairs.length); + + for (final String pair : pairs) { + if (pair.length() == 0) { + continue; + } + + String[] tokens = pair.split("=", 2); + switch (tokens.length) { + case 1: + if (pair.charAt(0) == '=') { + params.put("", tokens[0]); + } else { + params.put(tokens[0], ""); + } + break; + case 2: + params.put(tokens[0], tokens[1]); + break; + } + } + return new TreeMap(params); + } + + /** + * Canonicalize the query string. + * + * @param sortedParamMap + * Parameter name-value pairs in lexicographical order. + * @return Canonical form of query string. 
+ */ + private static String canonicalize(final SortedMap sortedParamMap) { + if (sortedParamMap == null || sortedParamMap.isEmpty()) { + return ""; + } + + final StringBuffer sb = new StringBuffer(100); + for (Map.Entry pair : sortedParamMap.entrySet()) { + final String key = pair.getKey().toLowerCase(); + if (key.equals("jsessionid") || key.equals("phpsessid") || key.equals("aspsessionid")) { + continue; + } + if (sb.length() > 0) { + sb.append('&'); + } + sb.append(percentEncodeRfc3986(pair.getKey())); + if (!pair.getValue().isEmpty()) { + sb.append('='); + sb.append(percentEncodeRfc3986(pair.getValue())); + } + } + return sb.toString(); + } + + /** + * Percent-encode values according the RFC 3986. The built-in Java + * URLEncoder does not encode according to the RFC, so we make the extra + * replacements. + * + * @param string + * Decoded string. + * @return Encoded string per RFC 3986. + */ + private static String percentEncodeRfc3986(String string) { + try { + string = string.replace("+", "%2B"); + string = URLDecoder.decode(string, "UTF-8"); + string = URLEncoder.encode(string, "UTF-8"); + return string.replace("+", "%20").replace("*", "%2A").replace("%7E", "~"); + } catch (Exception e) { + return string; + } + } + + private static String normalizePath(final String path) { + return path.replace("%7E", "~").replace(" ", "%20"); + } +} diff --git a/src/main/java/edu/uci/ics/crawler4j/url/UrlResolver.java b/src/main/java/edu/uci/ics/crawler4j/url/UrlResolver.java new file mode 100644 index 00000000..5a6f65a7 --- /dev/null +++ b/src/main/java/edu/uci/ics/crawler4j/url/UrlResolver.java @@ -0,0 +1,462 @@ +/** + * This class is adopted from Htmlunit with the following copyright: + * + * Copyright (c) 2002-2012 Gargoyle Software Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
/**
 * Resolves relative URLs against a base URL following the algorithm of
 * RFC 1808. This class is adopted from HtmlUnit.
 */
public final class UrlResolver {

    /**
     * Resolves a given relative URL against a base URL. See RFC 1808,
     * Section 4 for more details. Both arguments are trimmed before use.
     *
     * @param baseUrl The base URL in which to resolve the specification.
     * @param relativeUrl The relative URL to resolve against the base URL.
     * @return the resolved specification.
     * @throws IllegalArgumentException if either argument is null.
     */
    public static String resolveUrl(final String baseUrl, final String relativeUrl) {
        if (baseUrl == null) {
            throw new IllegalArgumentException("Base URL must not be null");
        }
        if (relativeUrl == null) {
            throw new IllegalArgumentException("Relative URL must not be null");
        }
        final Url url = resolveUrl(parseUrl(baseUrl.trim()), relativeUrl.trim());

        return url.toString();
    }

    /**
     * Returns the index within the specified string of the first occurrence of
     * the specified search character.
     *
     * @param s the string to search
     * @param searchChar the character to search for
     * @param beginIndex the index at which to start the search (inclusive)
     * @param endIndex the index at which to stop the search (exclusive)
     * @return the index of the first occurrence of the character in the string or -1
     */
    private static int indexOf(final String s, final char searchChar, final int beginIndex, final int endIndex) {
        for (int i = beginIndex; i < endIndex; i++) {
            if (s.charAt(i) == searchChar) {
                return i;
            }
        }
        return -1;
    }

    /**
     * Parses a given specification using the algorithm depicted in
     * RFC 1808, Section 2.4 ("Parsing a URL"). The specification is split,
     * in this order, into fragment, scheme, network location, query,
     * parameters and path. Components that are absent stay null.
     *
     * @param spec The specification to parse.
     * @return the parsed specification.
     */
    private static Url parseUrl(final String spec) {
        final Url url = new Url();
        int startIndex = 0;
        int endIndex = spec.length();

        // Section 2.4.1: Parsing the Fragment Identifier
        //
        // Everything after the first "#" is the <fragment>; it is removed
        // (including the "#") from the parse string before continuing.
        final int crosshatchIndex = indexOf(spec, '#', startIndex, endIndex);

        if (crosshatchIndex >= 0) {
            url.fragment_ = spec.substring(crosshatchIndex + 1, endIndex);
            endIndex = crosshatchIndex;
        }
        // Section 2.4.2: Parsing the Scheme
        //
        // If the parse string contains a ":" after the first character and
        // the characters before it form a valid scheme name, they are the
        // <scheme>; scheme and colon are removed from the parse string.
        final int colonIndex = indexOf(spec, ':', startIndex, endIndex);

        if (colonIndex > 0) {
            final String scheme = spec.substring(startIndex, colonIndex);
            if (isValidScheme(scheme)) {
                url.scheme_ = scheme;
                startIndex = colonIndex + 1;
            }
        }
        // Section 2.4.3: Parsing the Network Location/Login
        //
        // If the parse string begins with "//", the characters after it and
        // up to, but not including, the next "/" are the <net_loc>. If no
        // such "/" is present, the end of <net_loc> is decided further down.
        //
        // Note: We also accept a question mark "?" or a semicolon ";" as
        // delimiters for the <net_loc>.
        final int locationStartIndex;
        int locationEndIndex;

        if (spec.startsWith("//", startIndex)) {
            locationStartIndex = startIndex + 2;
            locationEndIndex = indexOf(spec, '/', locationStartIndex, endIndex);
            if (locationEndIndex >= 0) {
                startIndex = locationEndIndex;
            }
        } else {
            locationStartIndex = -1;
            locationEndIndex = -1;
        }
        // Section 2.4.4: Parsing the Query Information
        //
        // Everything after the first "?" is the <query>; it is removed
        // (including the "?") from the parse string before continuing.
        final int questionMarkIndex = indexOf(spec, '?', startIndex, endIndex);

        if (questionMarkIndex >= 0) {
            if ((locationStartIndex >= 0) && (locationEndIndex < 0)) {
                // No "/" ended the <net_loc>, so the "?" does.
                locationEndIndex = questionMarkIndex;
                startIndex = questionMarkIndex;
            }
            url.query_ = spec.substring(questionMarkIndex + 1, endIndex);
            endIndex = questionMarkIndex;
        }
        // Section 2.4.5: Parsing the Parameters
        //
        // Everything after the first ";" is the <params>; it is removed
        // (including the ";") from the parse string before continuing.
        final int semicolonIndex = indexOf(spec, ';', startIndex, endIndex);

        if (semicolonIndex >= 0) {
            if ((locationStartIndex >= 0) && (locationEndIndex < 0)) {
                // No "/" or "?" ended the <net_loc>, so the ";" does.
                locationEndIndex = semicolonIndex;
                startIndex = semicolonIndex;
            }
            url.parameters_ = spec.substring(semicolonIndex + 1, endIndex);
            endIndex = semicolonIndex;
        }
        // Section 2.4.6: Parsing the Path
        //
        // All that is left of the parse string is the <path>, stored
        // together with its leading "/" (if any) so that relative and
        // absolute paths can be told apart later.
        if ((locationStartIndex >= 0) && (locationEndIndex < 0)) {
            // The entire remaining parse string is the <net_loc>.
            locationEndIndex = endIndex;
        } else if (startIndex < endIndex) {
            url.path_ = spec.substring(startIndex, endIndex);
        }
        // Set the network location/login (<net_loc>) of the URL.
        if ((locationStartIndex >= 0) && (locationEndIndex >= 0)) {
            url.location_ = spec.substring(locationStartIndex, locationEndIndex);
        }
        return url;
    }

    /**
     * Returns true if specified string is a valid scheme name: a letter
     * followed by any number of letters, digits, ".", "+" or "-".
     */
    private static boolean isValidScheme(final String scheme) {
        final int length = scheme.length();
        if (length < 1) {
            return false;
        }
        char c = scheme.charAt(0);
        if (!Character.isLetter(c)) {
            return false;
        }
        for (int i = 1; i < length; i++) {
            c = scheme.charAt(i);
            if (!Character.isLetterOrDigit(c) && c != '.' && c != '+' && c != '-') {
                return false;
            }
        }
        return true;
    }

    /**
     * Resolves a given relative URL against a base URL using the algorithm
     * depicted in RFC 1808, Section 4 ("Resolving Relative URLs").
     *
     * @param baseUrl The base URL in which to resolve the specification.
     * @param relativeUrl The relative URL to resolve against the base URL.
     * @return the resolved specification.
     */
    private static Url resolveUrl(final Url baseUrl, final String relativeUrl) {
        final Url url = parseUrl(relativeUrl);
        // Step 1: If the base URL is unknown, the embedded URL is
        //         interpreted as an absolute URL and we are done.
        if (baseUrl == null) {
            return url;
        }
        // Step 2: a) If the embedded URL is entirely empty, it inherits the
        //            entire base URL and we are done.
        if (relativeUrl.length() == 0) {
            return new Url(baseUrl);
        }
        //         b) If the embedded URL starts with a scheme name, it is
        //            interpreted as an absolute URL and we are done.
        if (url.scheme_ != null) {
            return url;
        }
        //         c) Otherwise, the embedded URL inherits the scheme of
        //            the base URL.
        url.scheme_ = baseUrl.scheme_;
        // Step 3: If the embedded URL's <net_loc> is non-empty, we skip to
        //         Step 7. Otherwise, the embedded URL inherits the <net_loc>
        //         (if any) of the base URL.
        if (url.location_ != null) {
            return url;
        }
        url.location_ = baseUrl.location_;
        // Step 4: If the embedded URL path is preceded by a slash "/", the
        //         path is not relative and we skip to Step 7.
        if ((url.path_ != null) && (url.path_.length() > 0) && ('/' == url.path_.charAt(0))) {
            url.path_ = removeLeadingSlashPoints(url.path_);
            return url;
        }
        // Step 5: If the embedded URL path is empty (and not preceded by a
        //         slash), then the embedded URL inherits the base URL path,
        //         and
        if (url.path_ == null) {
            url.path_ = baseUrl.path_;
            //  a) if the embedded URL's <params> is non-empty, we skip to
            //     step 7; otherwise, it inherits the <params> of the base
            //     URL (if any) and
            if (url.parameters_ != null) {
                return url;
            }
            url.parameters_ = baseUrl.parameters_;
            //  b) if the embedded URL's <query> is non-empty, we skip to
            //     step 7; otherwise, it inherits the <query> of the base
            //     URL (if any) and we skip to step 7.
            if (url.query_ != null) {
                return url;
            }
            url.query_ = baseUrl.query_;
            return url;
        }
        // Step 6: The last segment of the base URL's path (anything
        //         following the rightmost slash "/", or the entire path if no
        //         slash is present) is removed and the embedded URL's path is
        //         appended in its place. The following operations are
        //         then applied, in order, to the new path:
        final String basePath = baseUrl.path_;
        String path = "";

        if (basePath != null) {
            final int lastSlashIndex = basePath.lastIndexOf('/');

            if (lastSlashIndex >= 0) {
                path = basePath.substring(0, lastSlashIndex + 1);
            }
        } else {
            path = "/";
        }
        path = path.concat(url.path_);
        //      a) All occurrences of "./", where "." is a complete path
        //         segment, are removed.
        int pathSegmentIndex;

        while ((pathSegmentIndex = path.indexOf("/./")) >= 0) {
            path = path.substring(0, pathSegmentIndex + 1).concat(path.substring(pathSegmentIndex + 3));
        }
        //      b) If the path ends with "." as a complete path segment,
        //         that "." is removed.
        if (path.endsWith("/.")) {
            path = path.substring(0, path.length() - 1);
        }
        //      c) All occurrences of "<segment>/../", where <segment> is a
        //         complete path segment not equal to "..", are removed.
        path = removeParentSegments(path);
        //      d) If the path ends with "<segment>/..", that
        //         "<segment>/.." is removed.
        if (path.endsWith("/..")) {
            final String pathSegment = path.substring(0, path.length() - 3);
            final int slashIndex = pathSegment.lastIndexOf('/');

            if (slashIndex >= 0) {
                path = path.substring(0, slashIndex + 1);
            }
        }

        path = removeLeadingSlashPoints(path);

        url.path_ = path;
        // Step 7: The resulting URL components, including any inherited from
        //         the base URL, are recombined to give the absolute form of
        //         the embedded URL.
        return url;
    }

    /**
     * Removes every "&lt;segment&gt;/../" from the path, where the segment is
     * a complete path segment other than "..", working on the leftmost
     * removable occurrence first. Processing stops when the leftmost "/../"
     * is at the very start of the path.
     *
     * Every iteration either shortens the path or advances the search
     * position, so the loop always terminates, including for paths whose
     * first segment is not preceded by a slash (e.g. "a/../b").
     */
    private static String removeParentSegments(final String original) {
        String path = original;
        int from = 0;
        while (true) {
            final int dotsIndex = path.indexOf("/../", from);
            if (dotsIndex <= 0) {
                // No occurrence left, or a leading "/../" with nothing to remove.
                break;
            }
            final int segmentStart = path.lastIndexOf('/', dotsIndex - 1) + 1;
            if ("..".equals(path.substring(segmentStart, dotsIndex))) {
                // "../../" cannot be collapsed; look further to the right.
                from = dotsIndex + 1;
            } else {
                path = path.substring(0, segmentStart).concat(path.substring(dotsIndex + 4));
                from = 0;
            }
        }
        return path;
    }

    /**
     * "/.." at the beginning should be removed as browsers do (not in RFC).
     * Only complete ".." segments are removed, so a segment that merely
     * starts with two dots (e.g. "/..name") is left untouched.
     */
    private static String removeLeadingSlashPoints(final String path) {
        int start = 0;
        while (path.startsWith("/..", start)
                && (path.length() == start + 3 || path.charAt(start + 3) == '/')) {
            start += 3;
        }
        return path.substring(start);
    }

    /**
     * Class Url represents a Uniform Resource Locator split into its
     * components. A null component is absent.
     *
     * @author Martin Tamme
     */
    private static class Url {

        String scheme_;
        String location_;
        String path_;
        String parameters_;
        String query_;
        String fragment_;

        /**
         * Creates a Url object with all components absent.
         */
        public Url() {
        }

        /**
         * Creates a Url object as a copy of the specified Url object.
         *
         * @param url a Url object.
         */
        public Url(final Url url) {
            scheme_ = url.scheme_;
            location_ = url.location_;
            path_ = url.path_;
            parameters_ = url.parameters_;
            query_ = url.query_;
            fragment_ = url.fragment_;
        }

        /**
         * Returns a string representation of the Url object, built from the
         * components that are present, each with its delimiter.
         *
         * @return a string representation of the Url object.
         */
        @Override
        public String toString() {
            final StringBuilder sb = new StringBuilder();

            if (scheme_ != null) {
                sb.append(scheme_);
                sb.append(':');
            }
            if (location_ != null) {
                sb.append("//");
                sb.append(location_);
            }
            if (path_ != null) {
                sb.append(path_);
            }
            if (parameters_ != null) {
                sb.append(';');
                sb.append(parameters_);
            }
            if (query_ != null) {
                sb.append('?');
                sb.append(query_);
            }
            if (fragment_ != null) {
                sb.append('#');
                sb.append(fragment_);
            }
            return sb.toString();
        }
    }
}