extract url utils from FeedUtils

This commit is contained in:
Athou
2025-07-20 07:23:24 +02:00
parent 0b0a964a90
commit f894fdf564
8 changed files with 130 additions and 118 deletions

View File

@@ -0,0 +1,95 @@
package com.commafeed.backend;
import java.net.URI;
import java.util.Locale;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringUtils;
import org.netpreserve.urlcanon.Canonicalizer;
import org.netpreserve.urlcanon.ParsedUrl;
import lombok.experimental.UtilityClass;
/**
 * Static helpers for inspecting, resolving and normalizing urls.
 */
@UtilityClass
public class Urls {

    private static final String HTTP_PREFIX = "http://";
    private static final String HTTPS_PREFIX = "https://";
    private static final String QUERY_SEPARATOR = "?";

    /**
     * @param url
     *            the url to test, may be null
     * @return true if the url starts with {@code http://} (case-sensitive), false otherwise or if the url is null
     */
    public static boolean isHttp(String url) {
        return url != null && url.startsWith(HTTP_PREFIX);
    }

    /**
     * @param url
     *            the url to test, may be null
     * @return true if the url starts with {@code https://} (case-sensitive), false otherwise or if the url is null
     */
    public static boolean isHttps(String url) {
        return url != null && url.startsWith(HTTPS_PREFIX);
    }

    /**
     * @param url
     *            the url to test, may be null
     * @return true if the url is an http or https url, see {@link #isHttp(String)} and {@link #isHttps(String)}
     */
    public static boolean isAbsolute(String url) {
        return isHttp(url) || isHttps(url);
    }

    /**
     * Resolves an entry url against the url of its feed.
     *
     * @param relativeUrl
     *            the url of the entry
     * @param feedLink
     *            the url of the feed as described in the feed
     * @param feedUrl
     *            the url of the feed that we used to fetch the feed
     * @return an absolute url pointing to the entry, or null if neither an absolute feedLink nor a feedUrl is available
     * @throws IllegalArgumentException
     *             if the base url or relativeUrl cannot be parsed as a URI
     */
    public static String toAbsolute(String relativeUrl, String feedLink, String feedUrl) {
        // the link advertised by the feed is only usable as a base if it is absolute itself,
        // otherwise fall back to the url the feed was fetched from
        String baseUrl = isAbsolute(feedLink) ? feedLink : feedUrl;
        if (baseUrl == null) {
            return null;
        }

        return URI.create(baseUrl).resolve(relativeUrl).toString();
    }

    /**
     * Removes a single trailing slash, if any.
     *
     * @param url
     *            the url to process, may be null
     * @return the url without its last character if that character is a slash, the url unchanged otherwise, null if the url is null
     */
    public static String removeTrailingSlash(String url) {
        // removeEnd is null-safe and strips the suffix at most once
        return StringUtils.removeEnd(url, "/");
    }

    /**
     * Normalize the url. The resulting url is not meant to be fetched but rather used as a means to identify a feed and avoid duplicates
     *
     * @param url
     *            the url to normalize, may be null
     * @return the normalized url, or null if the url is null
     */
    public static String normalize(String url) {
        if (url == null) {
            return null;
        }

        ParsedUrl parsedUrl = ParsedUrl.parseUrl(url);
        Canonicalizer.AGGRESSIVE.canonicalize(parsedUrl);
        String normalized = parsedUrl.toString();
        if (normalized == null) {
            normalized = url;
        }

        // convert to lower case, the url probably won't work in some cases
        // after that but we don't care we just want to compare urls to avoid
        // duplicates
        // Locale.ROOT keeps the result independent of the JVM default locale
        normalized = normalized.toLowerCase(Locale.ROOT);

        // store all urls as http
        if (normalized.startsWith("https")) {
            normalized = "http" + normalized.substring(5);
        }

        // remove the www. part
        normalized = normalized.replace("//www.", "//");

        // feedproxy redirects to feedburner
        normalized = normalized.replace("feedproxy.google.com", "feeds.feedburner.com");

        // feedburner feeds have a special treatment: the query string and the trailing slash are dropped.
        // only the part before the query string is inspected so that urls merely passing a feedburner url
        // as a parameter are left alone.
        // substringBefore never fails, unlike split(...)[0] which throws when the url only contains '?' characters
        String withoutQuery = StringUtils.substringBefore(normalized, QUERY_SEPARATOR);
        if (withoutQuery.contains("feedburner.com")) {
            withoutQuery = withoutQuery.replace("feeds2.feedburner.com", "feeds.feedburner.com");
            normalized = StringUtils.removeEnd(withoutQuery, "/");
        }

        return normalized;
    }
}

View File

@@ -10,7 +10,7 @@ import org.jsoup.select.Elements;
import com.commafeed.backend.HttpGetter;
import com.commafeed.backend.HttpGetter.HttpResult;
import com.commafeed.backend.feed.FeedUtils;
import com.commafeed.backend.Urls;
import com.commafeed.backend.model.Feed;
import lombok.RequiredArgsConstructor;
@@ -68,7 +68,7 @@ public class DefaultFaviconFetcher extends AbstractFaviconFetcher {
String contentType = null;
try {
url = FeedUtils.removeTrailingSlash(url) + "/favicon.ico";
url = Urls.removeTrailingSlash(url) + "/favicon.ico";
log.debug("getting root icon at {}", url);
HttpResult result = getter.get(url);
bytes = result.getContent();

View File

@@ -1,8 +1,6 @@
package com.commafeed.backend.feed;
import java.net.URI;
import java.util.List;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringUtils;
import org.apache.hc.client5.http.utils.Base64;
@@ -10,8 +8,6 @@ import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.netpreserve.urlcanon.Canonicalizer;
import org.netpreserve.urlcanon.ParsedUrl;
import com.commafeed.backend.feed.FeedEntryKeyword.Mode;
import com.commafeed.backend.feed.parser.TextDirectionDetector;
@@ -29,8 +25,6 @@ import lombok.extern.slf4j.Slf4j;
@Slf4j
public class FeedUtils {
private static final String ESCAPED_QUESTION_MARK = Pattern.quote("?");
public static String truncate(String string, int length) {
if (string != null) {
string = string.substring(0, Math.min(length, string.length()));
@@ -38,59 +32,6 @@ public class FeedUtils {
return string;
}
public static boolean isHttp(String url) {
return url.startsWith("http://");
}
public static boolean isHttps(String url) {
return url.startsWith("https://");
}
public static boolean isAbsoluteUrl(String url) {
return isHttp(url) || isHttps(url);
}
/**
* Normalize the url. The resulting url is not meant to be fetched but rather used as a mean to identify a feed and avoid duplicates
*/
public static String normalizeURL(String url) {
if (url == null) {
return null;
}
ParsedUrl parsedUrl = ParsedUrl.parseUrl(url);
Canonicalizer.AGGRESSIVE.canonicalize(parsedUrl);
String normalized = parsedUrl.toString();
if (normalized == null) {
normalized = url;
}
// convert to lower case, the url probably won't work in some cases
// after that but we don't care we just want to compare urls to avoid
// duplicates
normalized = normalized.toLowerCase();
// store all urls as http
if (normalized.startsWith("https")) {
normalized = "http" + normalized.substring(5);
}
// remove the www. part
normalized = normalized.replace("//www.", "//");
// feedproxy redirects to feedburner
normalized = normalized.replace("feedproxy.google.com", "feeds.feedburner.com");
// feedburner feeds have a special treatment
if (normalized.split(ESCAPED_QUESTION_MARK)[0].contains("feedburner.com")) {
normalized = normalized.replace("feeds2.feedburner.com", "feeds.feedburner.com");
normalized = normalized.split(ESCAPED_QUESTION_MARK)[0];
normalized = StringUtils.removeEnd(normalized, "/");
}
return normalized;
}
public static boolean isRTL(String title, String content) {
String text = StringUtils.isNotBlank(content) ? content : title;
if (StringUtils.isBlank(text)) {
@@ -105,32 +46,6 @@ public class FeedUtils {
return TextDirectionDetector.detect(stripped) == TextDirectionDetector.Direction.RIGHT_TO_LEFT;
}
public static String removeTrailingSlash(String url) {
if (url.endsWith("/")) {
url = url.substring(0, url.length() - 1);
}
return url;
}
/**
*
* @param relativeUrl
* the url of the entry
* @param feedLink
* the url of the feed as described in the feed
* @param feedUrl
* the url of the feed that we used to fetch the feed
* @return an absolute url pointing to the entry
*/
public static String toAbsoluteUrl(String relativeUrl, String feedLink, String feedUrl) {
String baseUrl = (feedLink != null && isAbsoluteUrl(feedLink)) ? feedLink : feedUrl;
if (baseUrl == null) {
return null;
}
return URI.create(baseUrl).resolve(relativeUrl).toString();
}
/**
 * Builds the favicon path of a subscription: the fixed prefix {@code rest/feed/favicon/} followed by the subscription id.
 *
 * @param subscription
 *            the subscription whose id is appended, must not be null
 * @return the resulting path
 */
public static String getFaviconUrl(FeedSubscription subscription) {
    final String faviconPathPrefix = "rest/feed/favicon/";
    return faviconPathPrefix + subscription.getId();
}

View File

@@ -19,7 +19,7 @@ import org.jdom2.Element;
import org.jdom2.Namespace;
import org.xml.sax.InputSource;
import com.commafeed.backend.feed.FeedUtils;
import com.commafeed.backend.Urls;
import com.commafeed.backend.feed.parser.FeedParserResult.Content;
import com.commafeed.backend.feed.parser.FeedParserResult.Enclosure;
import com.commafeed.backend.feed.parser.FeedParserResult.Entry;
@@ -119,7 +119,7 @@ public class FeedParser {
}
String url = buildEntryUrl(feed, feedUrl, item);
if (StringUtils.isBlank(url) && FeedUtils.isAbsoluteUrl(guid)) {
if (StringUtils.isBlank(url) && Urls.isAbsolute(guid)) {
// if link is empty but guid is used as url, use guid
url = guid;
}
@@ -165,14 +165,14 @@ public class FeedParser {
private String buildEntryUrl(SyndFeed feed, String feedUrl, SyndEntry item) {
String url = StringUtils.trimToNull(StringUtils.normalizeSpace(item.getLink()));
if (url == null || FeedUtils.isAbsoluteUrl(url)) {
if (url == null || Urls.isAbsolute(url)) {
// url is absolute, nothing to do
return url;
}
// url is relative, trying to resolve it
String feedLink = StringUtils.trimToNull(StringUtils.normalizeSpace(feed.getLink()));
return FeedUtils.toAbsoluteUrl(url, feedLink, feedUrl);
return Urls.toAbsolute(url, feedLink, feedUrl);
}
private Instant toValidInstant(Date date, boolean nullToNow) {

View File

@@ -8,6 +8,7 @@ import java.util.Objects;
import jakarta.inject.Singleton;
import com.commafeed.backend.Digests;
import com.commafeed.backend.Urls;
import com.commafeed.backend.dao.FeedDAO;
import com.commafeed.backend.favicon.AbstractFaviconFetcher;
import com.commafeed.backend.favicon.Favicon;
@@ -33,7 +34,7 @@ public class FeedService {
}
public synchronized Feed findOrCreate(String url) {
String normalizedUrl = FeedUtils.normalizeURL(url);
String normalizedUrl = Urls.normalize(url);
String normalizedUrlHash = Digests.sha1Hex(normalizedUrl);
Feed feed = feedDAO.findByUrl(normalizedUrl, normalizedUrlHash);
if (feed == null) {
@@ -48,7 +49,7 @@ public class FeedService {
}
public void update(Feed feed) {
String normalized = FeedUtils.normalizeURL(feed.getUrl());
String normalized = Urls.normalize(feed.getUrl());
feed.setNormalizedUrl(normalized);
feed.setNormalizedUrlHash(Digests.sha1Hex(normalized));
feed.setLastUpdated(Instant.now());

View File

@@ -8,6 +8,7 @@ import java.util.stream.Collectors;
import jakarta.inject.Singleton;
import com.commafeed.CommaFeedConfiguration;
import com.commafeed.backend.Urls;
import com.commafeed.backend.dao.FeedEntryStatusDAO;
import com.commafeed.backend.dao.FeedSubscriptionDAO;
import com.commafeed.backend.feed.FeedRefreshEngine;
@@ -67,7 +68,7 @@ public class FeedSubscriptionService {
Feed feed = feedService.findOrCreate(url);
// upgrade feed to https if it was using http
if (FeedUtils.isHttp(feed.getUrl()) && FeedUtils.isHttps(url)) {
if (Urls.isHttp(feed.getUrl()) && Urls.isHttps(url)) {
feed.setUrl(url);
}

View File

@@ -32,10 +32,10 @@ import org.apache.hc.core5.net.URIBuilder;
import com.commafeed.CommaFeedConfiguration;
import com.commafeed.CommaFeedConstants;
import com.commafeed.backend.Digests;
import com.commafeed.backend.Urls;
import com.commafeed.backend.dao.UserDAO;
import com.commafeed.backend.dao.UserRoleDAO;
import com.commafeed.backend.dao.UserSettingsDAO;
import com.commafeed.backend.feed.FeedUtils;
import com.commafeed.backend.model.User;
import com.commafeed.backend.model.UserRole;
import com.commafeed.backend.model.UserRole.Role;
@@ -309,7 +309,7 @@ public class UserREST {
}
private String buildEmailContent(User user) throws URISyntaxException, MalformedURLException {
String publicUrl = FeedUtils.removeTrailingSlash(uri.getBaseUri().toString());
String publicUrl = Urls.removeTrailingSlash(uri.getBaseUri().toString());
publicUrl += "/rest/user/passwordResetCallback";
return String.format(
"You asked for password recovery for account '%s', <a href='%s'>follow this link</a> to change your password. Ignore this if you didn't request a password recovery.",

View File

@@ -1,9 +1,9 @@
package com.commafeed.backend.feed;
package com.commafeed.backend;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
class FeedUtilsTest {
class UrlsTest {
@Test
void testNormalization() {
@@ -25,19 +25,19 @@ class FeedUtilsTest {
String urld1 = "http://fivefilters.org/content-only/makefulltextfeed.php?url=http://feeds.feedburner.com/Frandroid";
String urld2 = "http://fivefilters.org/content-only/makefulltextfeed.php?url=http://feeds2.feedburner.com/Frandroid";
Assertions.assertEquals(FeedUtils.normalizeURL(urla1), FeedUtils.normalizeURL(urla2));
Assertions.assertEquals(FeedUtils.normalizeURL(urla1), FeedUtils.normalizeURL(urla3));
Assertions.assertEquals(FeedUtils.normalizeURL(urla1), FeedUtils.normalizeURL(urla4));
Assertions.assertEquals(FeedUtils.normalizeURL(urla1), FeedUtils.normalizeURL(urla5));
Assertions.assertEquals(Urls.normalize(urla1), Urls.normalize(urla2));
Assertions.assertEquals(Urls.normalize(urla1), Urls.normalize(urla3));
Assertions.assertEquals(Urls.normalize(urla1), Urls.normalize(urla4));
Assertions.assertEquals(Urls.normalize(urla1), Urls.normalize(urla5));
Assertions.assertEquals(FeedUtils.normalizeURL(urlb1), FeedUtils.normalizeURL(urlb2));
Assertions.assertEquals(Urls.normalize(urlb1), Urls.normalize(urlb2));
Assertions.assertEquals(FeedUtils.normalizeURL(urlc1), FeedUtils.normalizeURL(urlc2));
Assertions.assertEquals(FeedUtils.normalizeURL(urlc1), FeedUtils.normalizeURL(urlc3));
Assertions.assertEquals(FeedUtils.normalizeURL(urlc1), FeedUtils.normalizeURL(urlc4));
Assertions.assertEquals(FeedUtils.normalizeURL(urlc1), FeedUtils.normalizeURL(urlc5));
Assertions.assertEquals(Urls.normalize(urlc1), Urls.normalize(urlc2));
Assertions.assertEquals(Urls.normalize(urlc1), Urls.normalize(urlc3));
Assertions.assertEquals(Urls.normalize(urlc1), Urls.normalize(urlc4));
Assertions.assertEquals(Urls.normalize(urlc1), Urls.normalize(urlc5));
Assertions.assertNotEquals(FeedUtils.normalizeURL(urld1), FeedUtils.normalizeURL(urld2));
Assertions.assertNotEquals(Urls.normalize(urld1), Urls.normalize(urld2));
}
@@ -46,36 +46,36 @@ class FeedUtilsTest {
String expected = "http://a.com/blog/entry/1";
// usual cases
Assertions.assertEquals(expected, FeedUtils.toAbsoluteUrl("http://a.com/blog/entry/1", "http://a.com/feed/", "http://a.com/feed/"));
Assertions.assertEquals(expected, FeedUtils.toAbsoluteUrl("http://a.com/blog/entry/1", "http://a.com/feed", "http://a.com/feed"));
Assertions.assertEquals(expected, Urls.toAbsolute("http://a.com/blog/entry/1", "http://a.com/feed/", "http://a.com/feed/"));
Assertions.assertEquals(expected, Urls.toAbsolute("http://a.com/blog/entry/1", "http://a.com/feed", "http://a.com/feed"));
// relative links
Assertions.assertEquals(expected, FeedUtils.toAbsoluteUrl("../blog/entry/1", "http://a.com/feed/", "http://a.com/feed/"));
Assertions.assertEquals(expected, FeedUtils.toAbsoluteUrl("../blog/entry/1", "feed.xml", "http://a.com/feed/feed.xml"));
Assertions.assertEquals(expected, Urls.toAbsolute("../blog/entry/1", "http://a.com/feed/", "http://a.com/feed/"));
Assertions.assertEquals(expected, Urls.toAbsolute("../blog/entry/1", "feed.xml", "http://a.com/feed/feed.xml"));
// root-relative links
Assertions.assertEquals(expected, FeedUtils.toAbsoluteUrl("/blog/entry/1", "/feed", "http://a.com/feed"));
Assertions.assertEquals(expected, Urls.toAbsolute("/blog/entry/1", "/feed", "http://a.com/feed"));
// real cases
Assertions.assertEquals("https://github.com/erusev/parsedown/releases/tag/1.3.0", FeedUtils.toAbsoluteUrl(
Assertions.assertEquals("https://github.com/erusev/parsedown/releases/tag/1.3.0", Urls.toAbsolute(
"/erusev/parsedown/releases/tag/1.3.0", "/erusev/parsedown/releases", "https://github.com/erusev/parsedown/tags.atom"));
Assertions.assertEquals("http://ergoemacs.org/emacs/elisp_all_about_lines.html",
FeedUtils.toAbsoluteUrl("elisp_all_about_lines.html", "blog.xml", "http://ergoemacs.org/emacs/blog.xml"));
Urls.toAbsolute("elisp_all_about_lines.html", "blog.xml", "http://ergoemacs.org/emacs/blog.xml"));
}
@Test
void testRemoveTrailingSlash() {
final String url = "http://localhost/";
final String result = FeedUtils.removeTrailingSlash(url);
final String result = Urls.removeTrailingSlash(url);
Assertions.assertEquals("http://localhost", result);
}
@Test
void testRemoveTrailingSlashLastSlashOnly() {
final String url = "http://localhost//";
final String result = FeedUtils.removeTrailingSlash(url);
final String result = Urls.removeTrailingSlash(url);
Assertions.assertEquals("http://localhost/", result);
}
}
}