mirror of
https://github.com/Athou/commafeed.git
synced 2026-03-21 21:37:29 +00:00
Extracted the needed classes and removed the crawler4j dependency and the Java 7 requirement
This commit is contained in:
211
src/main/java/edu/uci/ics/crawler4j/url/URLCanonicalizer.java
Normal file
211
src/main/java/edu/uci/ics/crawler4j/url/URLCanonicalizer.java
Normal file
@@ -0,0 +1,211 @@
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package edu.uci.ics.crawler4j.url;
|
||||
|
||||
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.SortedMap;
import java.util.TreeMap;
|
||||
|
||||
/**
|
||||
* See http://en.wikipedia.org/wiki/URL_normalization for a reference Note: some
|
||||
* parts of the code are adapted from: http://stackoverflow.com/a/4057470/405418
|
||||
*
|
||||
* @author Yasser Ganjisaffar <lastname at gmail dot com>
|
||||
*/
|
||||
public class URLCanonicalizer {
|
||||
|
||||
public static String getCanonicalURL(String url) {
|
||||
return getCanonicalURL(url, null);
|
||||
}
|
||||
|
||||
public static String getCanonicalURL(String href, String context) {
|
||||
|
||||
try {
|
||||
URL canonicalURL = new URL(UrlResolver.resolveUrl(context == null ? "" : context, href));
|
||||
|
||||
String host = canonicalURL.getHost().toLowerCase();
|
||||
if (host == "") {
|
||||
// This is an invalid Url.
|
||||
return null;
|
||||
}
|
||||
|
||||
String path = canonicalURL.getPath();
|
||||
|
||||
/*
|
||||
* Normalize: no empty segments (i.e., "//"), no segments equal to
|
||||
* ".", and no segments equal to ".." that are preceded by a segment
|
||||
* not equal to "..".
|
||||
*/
|
||||
path = new URI(path).normalize().toString();
|
||||
|
||||
/*
|
||||
* Convert '//' -> '/'
|
||||
*/
|
||||
int idx = path.indexOf("//");
|
||||
while (idx >= 0) {
|
||||
path = path.replace("//", "/");
|
||||
idx = path.indexOf("//");
|
||||
}
|
||||
|
||||
/*
|
||||
* Drop starting '/../'
|
||||
*/
|
||||
while (path.startsWith("/../")) {
|
||||
path = path.substring(3);
|
||||
}
|
||||
|
||||
/*
|
||||
* Trim
|
||||
*/
|
||||
path = path.trim();
|
||||
|
||||
final SortedMap<String, String> params = createParameterMap(canonicalURL.getQuery());
|
||||
final String queryString;
|
||||
|
||||
if (params != null && params.size() > 0) {
|
||||
String canonicalParams = canonicalize(params);
|
||||
queryString = (canonicalParams.isEmpty() ? "" : "?" + canonicalParams);
|
||||
} else {
|
||||
queryString = "";
|
||||
}
|
||||
|
||||
/*
|
||||
* Add starting slash if needed
|
||||
*/
|
||||
if (path.length() == 0) {
|
||||
path = "/" + path;
|
||||
}
|
||||
|
||||
/*
|
||||
* Drop default port: example.com:80 -> example.com
|
||||
*/
|
||||
int port = canonicalURL.getPort();
|
||||
if (port == canonicalURL.getDefaultPort()) {
|
||||
port = -1;
|
||||
}
|
||||
|
||||
String protocol = canonicalURL.getProtocol().toLowerCase();
|
||||
String pathAndQueryString = normalizePath(path) + queryString;
|
||||
|
||||
URL result = new URL(protocol, host, port, pathAndQueryString);
|
||||
return result.toExternalForm();
|
||||
|
||||
} catch (MalformedURLException ex) {
|
||||
return null;
|
||||
} catch (URISyntaxException ex) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Takes a query string, separates the constituent name-value pairs, and
|
||||
* stores them in a SortedMap ordered by lexicographical order.
|
||||
*
|
||||
* @return Null if there is no query string.
|
||||
*/
|
||||
private static SortedMap<String, String> createParameterMap(final String queryString) {
|
||||
if (queryString == null || queryString.isEmpty()) {
|
||||
return null;
|
||||
}
|
||||
|
||||
final String[] pairs = queryString.split("&");
|
||||
final Map<String, String> params = new HashMap<String, String>(pairs.length);
|
||||
|
||||
for (final String pair : pairs) {
|
||||
if (pair.length() == 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
String[] tokens = pair.split("=", 2);
|
||||
switch (tokens.length) {
|
||||
case 1:
|
||||
if (pair.charAt(0) == '=') {
|
||||
params.put("", tokens[0]);
|
||||
} else {
|
||||
params.put(tokens[0], "");
|
||||
}
|
||||
break;
|
||||
case 2:
|
||||
params.put(tokens[0], tokens[1]);
|
||||
break;
|
||||
}
|
||||
}
|
||||
return new TreeMap<String, String>(params);
|
||||
}
|
||||
|
||||
/**
|
||||
* Canonicalize the query string.
|
||||
*
|
||||
* @param sortedParamMap
|
||||
* Parameter name-value pairs in lexicographical order.
|
||||
* @return Canonical form of query string.
|
||||
*/
|
||||
private static String canonicalize(final SortedMap<String, String> sortedParamMap) {
|
||||
if (sortedParamMap == null || sortedParamMap.isEmpty()) {
|
||||
return "";
|
||||
}
|
||||
|
||||
final StringBuffer sb = new StringBuffer(100);
|
||||
for (Map.Entry<String, String> pair : sortedParamMap.entrySet()) {
|
||||
final String key = pair.getKey().toLowerCase();
|
||||
if (key.equals("jsessionid") || key.equals("phpsessid") || key.equals("aspsessionid")) {
|
||||
continue;
|
||||
}
|
||||
if (sb.length() > 0) {
|
||||
sb.append('&');
|
||||
}
|
||||
sb.append(percentEncodeRfc3986(pair.getKey()));
|
||||
if (!pair.getValue().isEmpty()) {
|
||||
sb.append('=');
|
||||
sb.append(percentEncodeRfc3986(pair.getValue()));
|
||||
}
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Percent-encode values according the RFC 3986. The built-in Java
|
||||
* URLEncoder does not encode according to the RFC, so we make the extra
|
||||
* replacements.
|
||||
*
|
||||
* @param string
|
||||
* Decoded string.
|
||||
* @return Encoded string per RFC 3986.
|
||||
*/
|
||||
private static String percentEncodeRfc3986(String string) {
|
||||
try {
|
||||
string = string.replace("+", "%2B");
|
||||
string = URLDecoder.decode(string, "UTF-8");
|
||||
string = URLEncoder.encode(string, "UTF-8");
|
||||
return string.replace("+", "%20").replace("*", "%2A").replace("%7E", "~");
|
||||
} catch (Exception e) {
|
||||
return string;
|
||||
}
|
||||
}
|
||||
|
||||
private static String normalizePath(final String path) {
|
||||
return path.replace("%7E", "~").replace(" ", "%20");
|
||||
}
|
||||
}
|
||||
462
src/main/java/edu/uci/ics/crawler4j/url/UrlResolver.java
Normal file
462
src/main/java/edu/uci/ics/crawler4j/url/UrlResolver.java
Normal file
@@ -0,0 +1,462 @@
|
||||
/**
|
||||
* This class is adopted from Htmlunit with the following copyright:
|
||||
*
|
||||
* Copyright (c) 2002-2012 Gargoyle Software Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package edu.uci.ics.crawler4j.url;
|
||||
|
||||
|
||||
/**
 * Resolves relative URLs against a base URL following the algorithm of
 * <a href="http://www.faqs.org/rfcs/rfc1808.html">RFC1808</a>, with a few
 * browser-like deviations that are pointed out where they occur.
 */
public final class UrlResolver {

    /**
     * Resolves a given relative URL against a base URL. See
     * <a href="http://www.faqs.org/rfcs/rfc1808.html">RFC1808</a>
     * Section 4 for more details. Both arguments are trimmed before use.
     *
     * @param baseUrl The base URL in which to resolve the specification.
     * @param relativeUrl The relative URL to resolve against the base URL.
     * @return the resolved specification.
     * @throws IllegalArgumentException if either argument is <tt>null</tt>.
     */
    public static String resolveUrl(final String baseUrl, final String relativeUrl) {
        if (baseUrl == null) {
            throw new IllegalArgumentException("Base URL must not be null");
        }
        if (relativeUrl == null) {
            throw new IllegalArgumentException("Relative URL must not be null");
        }
        final Url url = resolveUrl(parseUrl(baseUrl.trim()), relativeUrl.trim());

        return url.toString();
    }

    /**
     * Returns the index within the specified string of the first occurrence of
     * the specified search character.
     *
     * @param s the string to search
     * @param searchChar the character to search for
     * @param beginIndex the index at which to start the search
     * @param endIndex the index at which to stop the search (exclusive)
     * @return the index of the first occurrence of the character in the string or <tt>-1</tt>
     */
    private static int indexOf(final String s, final char searchChar, final int beginIndex, final int endIndex) {
        for (int i = beginIndex; i < endIndex; i++) {
            if (s.charAt(i) == searchChar) {
                return i;
            }
        }
        return -1;
    }

    /**
     * Parses a given specification using the algorithm depicted in
     * <a href="http://www.faqs.org/rfcs/rfc1808.html">RFC1808</a>:
     *
     * Section 2.4: Parsing a URL
     *
     * An accepted method for parsing URLs is useful to clarify the
     * generic-RL syntax of Section 2.2 and to describe the algorithm for
     * resolving relative URLs presented in Section 4. This section
     * describes the parsing rules for breaking down a URL (relative or
     * absolute) into the component parts described in Section 2.1. The
     * rules assume that the URL has already been separated from any
     * surrounding text and copied to a "parse string". The rules are
     * listed in the order in which they would be applied by the parser.
     *
     * @param spec The specification to parse.
     * @return the parsed specification.
     */
    private static Url parseUrl(final String spec) {
        final Url url = new Url();
        // [startIndex, endIndex) is the part of spec that is still unparsed.
        int startIndex = 0;
        int endIndex = spec.length();

        // Section 2.4.1: Parsing the Fragment Identifier
        //
        // If the parse string contains a crosshatch "#" character, then the
        // substring after the first (left-most) crosshatch "#" and up to the
        // end of the parse string is the <fragment> identifier. If the
        // crosshatch is the last character, or no crosshatch is present, then
        // the fragment identifier is empty. The matched substring, including
        // the crosshatch character, is removed from the parse string before
        // continuing.
        //
        // Note that the fragment identifier is not considered part of the URL.
        // However, since it is often attached to the URL, parsers must be able
        // to recognize and set aside fragment identifiers as part of the
        // process.
        final int crosshatchIndex = indexOf(spec, '#', startIndex, endIndex);

        if (crosshatchIndex >= 0) {
            url.fragment_ = spec.substring(crosshatchIndex + 1, endIndex);
            endIndex = crosshatchIndex;
        }
        // Section 2.4.2: Parsing the Scheme
        //
        // If the parse string contains a colon ":" after the first character
        // and before any characters not allowed as part of a scheme name (i.e.,
        // any not an alphanumeric, plus "+", period ".", or hyphen "-"), the
        // <scheme> of the URL is the substring of characters up to but not
        // including the first colon. These characters and the colon are then
        // removed from the parse string before continuing.
        final int colonIndex = indexOf(spec, ':', startIndex, endIndex);

        if (colonIndex > 0) {
            final String scheme = spec.substring(startIndex, colonIndex);
            if (isValidScheme(scheme)) {
                url.scheme_ = scheme;
                startIndex = colonIndex + 1;
            }
        }
        // Section 2.4.3: Parsing the Network Location/Login
        //
        // If the parse string begins with a double-slash "//", then the
        // substring of characters after the double-slash and up to, but not
        // including, the next slash "/" character is the network location/login
        // (<net_loc>) of the URL. If no trailing slash "/" is present, the
        // entire remaining parse string is assigned to <net_loc>. The double-
        // slash and <net_loc> are removed from the parse string before
        // continuing.
        //
        // Note: We also accept a question mark "?" or a semicolon ";" character as
        // delimiters for the network location/login (<net_loc>) of the URL.
        final int locationStartIndex;
        int locationEndIndex;

        if (spec.startsWith("//", startIndex)) {
            locationStartIndex = startIndex + 2;
            locationEndIndex = indexOf(spec, '/', locationStartIndex, endIndex);
            if (locationEndIndex >= 0) {
                startIndex = locationEndIndex;
            }
        }
        else {
            locationStartIndex = -1;
            locationEndIndex = -1;
        }
        // Section 2.4.4: Parsing the Query Information
        //
        // If the parse string contains a question mark "?" character, then the
        // substring after the first (left-most) question mark "?" and up to the
        // end of the parse string is the <query> information. If the question
        // mark is the last character, or no question mark is present, then the
        // query information is empty. The matched substring, including the
        // question mark character, is removed from the parse string before
        // continuing.
        final int questionMarkIndex = indexOf(spec, '?', startIndex, endIndex);

        if (questionMarkIndex >= 0) {
            if ((locationStartIndex >= 0) && (locationEndIndex < 0)) {
                // The substring of characters after the double-slash and up to, but not
                // including, the question mark "?" character is the network location/login
                // (<net_loc>) of the URL.
                locationEndIndex = questionMarkIndex;
                startIndex = questionMarkIndex;
            }
            url.query_ = spec.substring(questionMarkIndex + 1, endIndex);
            endIndex = questionMarkIndex;
        }
        // Section 2.4.5: Parsing the Parameters
        //
        // If the parse string contains a semicolon ";" character, then the
        // substring after the first (left-most) semicolon ";" and up to the end
        // of the parse string is the parameters (<params>). If the semicolon
        // is the last character, or no semicolon is present, then <params> is
        // empty. The matched substring, including the semicolon character, is
        // removed from the parse string before continuing.
        final int semicolonIndex = indexOf(spec, ';', startIndex, endIndex);

        if (semicolonIndex >= 0) {
            if ((locationStartIndex >= 0) && (locationEndIndex < 0)) {
                // The substring of characters after the double-slash and up to, but not
                // including, the semicolon ";" character is the network location/login
                // (<net_loc>) of the URL.
                locationEndIndex = semicolonIndex;
                startIndex = semicolonIndex;
            }
            url.parameters_ = spec.substring(semicolonIndex + 1, endIndex);
            endIndex = semicolonIndex;
        }
        // Section 2.4.6: Parsing the Path
        //
        // After the above steps, all that is left of the parse string is the
        // URL <path> and the slash "/" that may precede it. Even though the
        // initial slash is not part of the URL path, the parser must remember
        // whether or not it was present so that later processes can
        // differentiate between relative and absolute paths. Often this is
        // done by simply storing the preceding slash along with the path.
        if ((locationStartIndex >= 0) && (locationEndIndex < 0)) {
            // The entire remaining parse string is assigned to the network
            // location/login (<net_loc>) of the URL.
            locationEndIndex = endIndex;
        }
        else if (startIndex < endIndex) {
            url.path_ = spec.substring(startIndex, endIndex);
        }
        // Set the network location/login (<net_loc>) of the URL.
        if ((locationStartIndex >= 0) && (locationEndIndex >= 0)) {
            url.location_ = spec.substring(locationStartIndex, locationEndIndex);
        }
        return url;
    }

    /*
     * Returns true if specified string is a valid scheme name: a letter
     * followed by any number of letters, digits, '.', '+' or '-'.
     */
    private static boolean isValidScheme(final String scheme) {
        final int length = scheme.length();
        if (length < 1) {
            return false;
        }
        char c = scheme.charAt(0);
        if (!Character.isLetter(c)) {
            return false;
        }
        for (int i = 1; i < length; i++) {
            c = scheme.charAt(i);
            if (!Character.isLetterOrDigit(c) && c != '.' && c != '+' && c != '-') {
                return false;
            }
        }
        return true;
    }

    /**
     * Resolves a given relative URL against a base URL using the algorithm
     * depicted in <a href="http://www.faqs.org/rfcs/rfc1808.html">RFC1808</a>:
     *
     * Section 4: Resolving Relative URLs
     *
     * This section describes an example algorithm for resolving URLs within
     * a context in which the URLs may be relative, such that the result is
     * always a URL in absolute form. Although this algorithm cannot
     * guarantee that the resulting URL will equal that intended by the
     * original author, it does guarantee that any valid URL (relative or
     * absolute) can be consistently transformed to an absolute form given a
     * valid base URL.
     *
     * @param baseUrl The base URL in which to resolve the specification.
     * @param relativeUrl The relative URL to resolve against the base URL.
     * @return the resolved specification.
     */
    private static Url resolveUrl(final Url baseUrl, final String relativeUrl) {
        final Url url = parseUrl(relativeUrl);
        // Step 1: The base URL is established according to the rules of
        //         Section 3. If the base URL is the empty string (unknown),
        //         the embedded URL is interpreted as an absolute URL and
        //         we are done.
        if (baseUrl == null) {
            return url;
        }
        // Step 2: Both the base and embedded URLs are parsed into their
        //         component parts as described in Section 2.4.
        //      a) If the embedded URL is entirely empty, it inherits the
        //         entire base URL (i.e., is set equal to the base URL)
        //         and we are done.
        if (relativeUrl.length() == 0) {
            return new Url(baseUrl);
        }
        //      b) If the embedded URL starts with a scheme name, it is
        //         interpreted as an absolute URL and we are done.
        if (url.scheme_ != null) {
            return url;
        }
        //      c) Otherwise, the embedded URL inherits the scheme of
        //         the base URL.
        url.scheme_ = baseUrl.scheme_;
        // Step 3: If the embedded URL's <net_loc> is non-empty, we skip to
        //         Step 7. Otherwise, the embedded URL inherits the <net_loc>
        //         (if any) of the base URL.
        if (url.location_ != null) {
            return url;
        }
        url.location_ = baseUrl.location_;
        // Step 4: If the embedded URL path is preceded by a slash "/", the
        //         path is not relative and we skip to Step 7.
        if ((url.path_ != null) && ((url.path_.length() > 0) && ('/' == url.path_.charAt(0)))) {
            url.path_ = removeLeadingSlashPoints(url.path_);
            return url;
        }
        // Step 5: If the embedded URL path is empty (and not preceded by a
        //         slash), then the embedded URL inherits the base URL path,
        //         and
        if (url.path_ == null) {
            url.path_ = baseUrl.path_;
            //  a) if the embedded URL's <params> is non-empty, we skip to
            //     step 7; otherwise, it inherits the <params> of the base
            //     URL (if any) and
            if (url.parameters_ != null) {
                return url;
            }
            url.parameters_ = baseUrl.parameters_;
            //  b) if the embedded URL's <query> is non-empty, we skip to
            //     step 7; otherwise, it inherits the <query> of the base
            //     URL (if any) and we skip to step 7.
            if (url.query_ != null) {
                return url;
            }
            url.query_ = baseUrl.query_;
            return url;
        }
        // Step 6: The last segment of the base URL's path (anything
        //         following the rightmost slash "/", or the entire path if no
        //         slash is present) is removed and the embedded URL's path is
        //         appended in its place. The following operations are
        //         then applied, in order, to the new path:
        final String basePath = baseUrl.path_;
        String path = "";

        if (basePath != null) {
            final int lastSlashIndex = basePath.lastIndexOf('/');

            if (lastSlashIndex >= 0) {
                path = basePath.substring(0, lastSlashIndex + 1);
            }
        }
        else {
            path = "/";
        }
        path = path.concat(url.path_);
        //      a) All occurrences of "./", where "." is a complete path
        //         segment, are removed.
        int pathSegmentIndex;

        while ((pathSegmentIndex = path.indexOf("/./")) >= 0) {
            path = path.substring(0, pathSegmentIndex + 1).concat(path.substring(pathSegmentIndex + 3));
        }
        //      b) If the path ends with "." as a complete path segment,
        //         that "." is removed.
        if (path.endsWith("/.")) {
            path = path.substring(0, path.length() - 1);
        }
        //      c) All occurrences of "<segment>/../", where <segment> is a
        //         complete path segment not equal to "..", are removed.
        //         Removal of these path segments is performed iteratively,
        //         removing the leftmost matching pattern on each iteration,
        //         until no matching pattern remains.
        while ((pathSegmentIndex = path.indexOf("/../")) > 0) {
            // Start of the segment that precedes the leftmost "/../".
            final int slashIndex = path.lastIndexOf('/', pathSegmentIndex - 1);

            if (slashIndex >= 0) {
                // The preceding segment cannot be ".." here: that would put
                // another "/../" further to the left, and we always pick the
                // leftmost one.
                path = path.substring(0, slashIndex + 1).concat(path.substring(pathSegmentIndex + 4));
            }
            else {
                // The path does not start with a slash (e.g. "a/../b"): the
                // pattern is the very beginning of the path, so drop it.
                // Without this branch the loop would never terminate.
                path = path.substring(pathSegmentIndex + 4);
            }
        }
        //      d) If the path ends with "<segment>/..", where <segment> is a
        //         complete path segment not equal to "..", that
        //         "<segment>/.." is removed.
        if (path.endsWith("/..")) {
            final String pathSegment = path.substring(0, path.length() - 3);
            final int slashIndex = pathSegment.lastIndexOf('/');

            if (slashIndex >= 0) {
                path = path.substring(0, slashIndex + 1);
            }
        }

        path = removeLeadingSlashPoints(path);

        url.path_ = path;
        // Step 7: The resulting URL components, including any inherited from
        //         the base URL, are recombined to give the absolute form of
        //         the embedded URL.
        return url;
    }

    /**
     * "/.." at the beginning should be removed as browsers do (not in RFC).
     * Only complete ".." segments are removed, so a path such as "/..foo"
     * is left untouched.
     */
    private static String removeLeadingSlashPoints(final String path) {
        int offset = 0;
        while (path.startsWith("/..", offset)
                && ((offset + 3 == path.length()) || (path.charAt(offset + 3) == '/'))) {
            offset += 3;
        }
        return (offset == 0) ? path : path.substring(offset);
    }

    /**
     * Class <tt>Url</tt> represents a Uniform Resource Locator. A component
     * that is <tt>null</tt> is absent and is skipped by {@link #toString()}.
     *
     * @author Martin Tamme
     */
    private static class Url {

        String scheme_;
        String location_;
        String path_;
        String parameters_;
        String query_;
        String fragment_;

        /**
         * Creates a <tt>Url</tt> object.
         */
        public Url() {
        }

        /**
         * Creates a <tt>Url</tt> object from the specified
         * <tt>Url</tt> object.
         *
         * @param url a <tt>Url</tt> object.
         */
        public Url(final Url url) {
            scheme_ = url.scheme_;
            location_ = url.location_;
            path_ = url.path_;
            parameters_ = url.parameters_;
            query_ = url.query_;
            fragment_ = url.fragment_;
        }

        /**
         * Returns a string representation of the <tt>Url</tt> object.
         *
         * @return a string representation of the <tt>Url</tt> object.
         */
        @Override
        public String toString() {
            final StringBuilder sb = new StringBuilder();

            if (scheme_ != null) {
                sb.append(scheme_);
                sb.append(':');
            }
            if (location_ != null) {
                sb.append("//");
                sb.append(location_);
            }
            if (path_ != null) {
                sb.append(path_);
            }
            if (parameters_ != null) {
                sb.append(';');
                sb.append(parameters_);
            }
            if (query_ != null) {
                sb.append('?');
                sb.append(query_);
            }
            if (fragment_ != null) {
                sb.append('#');
                sb.append(fragment_);
            }
            return sb.toString();
        }
    }
}
|
||||
Reference in New Issue
Block a user