extract url utils from FeedUtils

This commit is contained in:
Athou
2025-07-20 07:23:24 +02:00
parent 0b0a964a90
commit f894fdf564
8 changed files with 130 additions and 118 deletions

View File

@@ -0,0 +1,95 @@
package com.commafeed.backend;
import java.net.URI;
import java.util.Locale;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringUtils;
import org.netpreserve.urlcanon.Canonicalizer;
import org.netpreserve.urlcanon.ParsedUrl;
import lombok.experimental.UtilityClass;
/**
 * Static helpers for inspecting, resolving and normalizing feed and entry urls.
 */
@UtilityClass
public class Urls {

	private static final String ESCAPED_QUESTION_MARK = Pattern.quote("?");

	/**
	 * @param url
	 *            the url to test, may be null
	 * @return true if the url starts with {@code http://}, false otherwise (including when the url is null)
	 */
	public static boolean isHttp(String url) {
		return url != null && url.startsWith("http://");
	}

	/**
	 * @param url
	 *            the url to test, may be null
	 * @return true if the url starts with {@code https://}, false otherwise (including when the url is null)
	 */
	public static boolean isHttps(String url) {
		return url != null && url.startsWith("https://");
	}

	/**
	 * @param url
	 *            the url to test, may be null
	 * @return true if the url starts with either {@code http://} or {@code https://}
	 */
	public static boolean isAbsolute(String url) {
		return isHttp(url) || isHttps(url);
	}

	/**
	 * Resolves an entry url against the url of its feed.
	 *
	 * @param relativeUrl
	 *            the url of the entry
	 * @param feedLink
	 *            the url of the feed as described in the feed
	 * @param feedUrl
	 *            the url of the feed that we used to fetch the feed
	 * @return an absolute url pointing to the entry, or null if there is no base url to resolve against
	 * @throws IllegalArgumentException
	 *             if the base url or relativeUrl is not a syntactically valid URI
	 * @throws NullPointerException
	 *             if a base url is available and relativeUrl is null
	 */
	public static String toAbsolute(String relativeUrl, String feedLink, String feedUrl) {
		// the link declared inside the feed is only usable as a base when it is itself absolute,
		// otherwise fall back to the url the feed was fetched from
		String baseUrl = isAbsolute(feedLink) ? feedLink : feedUrl;
		if (baseUrl == null) {
			return null;
		}

		return URI.create(baseUrl).resolve(relativeUrl).toString();
	}

	/**
	 * Removes a single trailing slash, if any.
	 *
	 * @param url
	 *            the url to trim, may be null
	 * @return the url without its last {@code /} (only one slash is removed), or null if the url is null
	 */
	public static String removeTrailingSlash(String url) {
		return StringUtils.removeEnd(url, "/");
	}

	/**
	 * Normalize the url. The resulting url is not meant to be fetched but rather used as a means to identify a feed and avoid
	 * duplicates.
	 *
	 * @param url
	 *            the url to normalize, may be null
	 * @return the normalized url, or null if the url is null
	 */
	public static String normalize(String url) {
		if (url == null) {
			return null;
		}

		ParsedUrl parsedUrl = ParsedUrl.parseUrl(url);
		Canonicalizer.AGGRESSIVE.canonicalize(parsedUrl);
		String normalized = parsedUrl.toString();
		if (normalized == null) {
			normalized = url;
		}

		// convert to lower case, the url probably won't work in some cases
		// after that but we don't care we just want to compare urls to avoid
		// duplicates.
		// Locale.ROOT keeps the result independent of the JVM default locale
		// (e.g. the Turkish locale lower-cases 'I' to a dotless 'ı'), so the same
		// url always yields the same normalized value
		normalized = normalized.toLowerCase(Locale.ROOT);

		// store all urls as http
		if (normalized.startsWith("https")) {
			normalized = "http" + normalized.substring("https".length());
		}

		// remove the www. part
		normalized = normalized.replace("//www.", "//");

		// feedproxy redirects to feedburner
		normalized = normalized.replace("feedproxy.google.com", "feeds.feedburner.com");

		// feedburner feeds have a special treatment: host alias, query string and trailing slash are dropped.
		// Only the part before the first '?' is inspected so that feedburner urls passed as a query parameter
		// to another service are left alone.
		// The limit of 2 guarantees a non-empty array: without it, split() drops trailing empty strings and
		// returns an empty array for a string made only of '?' characters.
		String withoutQuery = normalized.split(ESCAPED_QUESTION_MARK, 2)[0];
		if (withoutQuery.contains("feedburner.com")) {
			normalized = withoutQuery.replace("feeds2.feedburner.com", "feeds.feedburner.com");
			normalized = StringUtils.removeEnd(normalized, "/");
		}

		return normalized;
	}
}

View File

@@ -10,7 +10,7 @@ import org.jsoup.select.Elements;
import com.commafeed.backend.HttpGetter; import com.commafeed.backend.HttpGetter;
import com.commafeed.backend.HttpGetter.HttpResult; import com.commafeed.backend.HttpGetter.HttpResult;
import com.commafeed.backend.feed.FeedUtils; import com.commafeed.backend.Urls;
import com.commafeed.backend.model.Feed; import com.commafeed.backend.model.Feed;
import lombok.RequiredArgsConstructor; import lombok.RequiredArgsConstructor;
@@ -68,7 +68,7 @@ public class DefaultFaviconFetcher extends AbstractFaviconFetcher {
String contentType = null; String contentType = null;
try { try {
url = FeedUtils.removeTrailingSlash(url) + "/favicon.ico"; url = Urls.removeTrailingSlash(url) + "/favicon.ico";
log.debug("getting root icon at {}", url); log.debug("getting root icon at {}", url);
HttpResult result = getter.get(url); HttpResult result = getter.get(url);
bytes = result.getContent(); bytes = result.getContent();

View File

@@ -1,8 +1,6 @@
package com.commafeed.backend.feed; package com.commafeed.backend.feed;
import java.net.URI;
import java.util.List; import java.util.List;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.apache.hc.client5.http.utils.Base64; import org.apache.hc.client5.http.utils.Base64;
@@ -10,8 +8,6 @@ import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element; import org.jsoup.nodes.Element;
import org.jsoup.select.Elements; import org.jsoup.select.Elements;
import org.netpreserve.urlcanon.Canonicalizer;
import org.netpreserve.urlcanon.ParsedUrl;
import com.commafeed.backend.feed.FeedEntryKeyword.Mode; import com.commafeed.backend.feed.FeedEntryKeyword.Mode;
import com.commafeed.backend.feed.parser.TextDirectionDetector; import com.commafeed.backend.feed.parser.TextDirectionDetector;
@@ -29,8 +25,6 @@ import lombok.extern.slf4j.Slf4j;
@Slf4j @Slf4j
public class FeedUtils { public class FeedUtils {
private static final String ESCAPED_QUESTION_MARK = Pattern.quote("?");
public static String truncate(String string, int length) { public static String truncate(String string, int length) {
if (string != null) { if (string != null) {
string = string.substring(0, Math.min(length, string.length())); string = string.substring(0, Math.min(length, string.length()));
@@ -38,59 +32,6 @@ public class FeedUtils {
return string; return string;
} }
public static boolean isHttp(String url) {
return url.startsWith("http://");
}
public static boolean isHttps(String url) {
return url.startsWith("https://");
}
public static boolean isAbsoluteUrl(String url) {
return isHttp(url) || isHttps(url);
}
/**
* Normalize the url. The resulting url is not meant to be fetched but rather used as a mean to identify a feed and avoid duplicates
*/
public static String normalizeURL(String url) {
if (url == null) {
return null;
}
ParsedUrl parsedUrl = ParsedUrl.parseUrl(url);
Canonicalizer.AGGRESSIVE.canonicalize(parsedUrl);
String normalized = parsedUrl.toString();
if (normalized == null) {
normalized = url;
}
// convert to lower case, the url probably won't work in some cases
// after that but we don't care we just want to compare urls to avoid
// duplicates
normalized = normalized.toLowerCase();
// store all urls as http
if (normalized.startsWith("https")) {
normalized = "http" + normalized.substring(5);
}
// remove the www. part
normalized = normalized.replace("//www.", "//");
// feedproxy redirects to feedburner
normalized = normalized.replace("feedproxy.google.com", "feeds.feedburner.com");
// feedburner feeds have a special treatment
if (normalized.split(ESCAPED_QUESTION_MARK)[0].contains("feedburner.com")) {
normalized = normalized.replace("feeds2.feedburner.com", "feeds.feedburner.com");
normalized = normalized.split(ESCAPED_QUESTION_MARK)[0];
normalized = StringUtils.removeEnd(normalized, "/");
}
return normalized;
}
public static boolean isRTL(String title, String content) { public static boolean isRTL(String title, String content) {
String text = StringUtils.isNotBlank(content) ? content : title; String text = StringUtils.isNotBlank(content) ? content : title;
if (StringUtils.isBlank(text)) { if (StringUtils.isBlank(text)) {
@@ -105,32 +46,6 @@ public class FeedUtils {
return TextDirectionDetector.detect(stripped) == TextDirectionDetector.Direction.RIGHT_TO_LEFT; return TextDirectionDetector.detect(stripped) == TextDirectionDetector.Direction.RIGHT_TO_LEFT;
} }
public static String removeTrailingSlash(String url) {
if (url.endsWith("/")) {
url = url.substring(0, url.length() - 1);
}
return url;
}
/**
*
* @param relativeUrl
* the url of the entry
* @param feedLink
* the url of the feed as described in the feed
* @param feedUrl
* the url of the feed that we used to fetch the feed
* @return an absolute url pointing to the entry
*/
public static String toAbsoluteUrl(String relativeUrl, String feedLink, String feedUrl) {
String baseUrl = (feedLink != null && isAbsoluteUrl(feedLink)) ? feedLink : feedUrl;
if (baseUrl == null) {
return null;
}
return URI.create(baseUrl).resolve(relativeUrl).toString();
}
public static String getFaviconUrl(FeedSubscription subscription) { public static String getFaviconUrl(FeedSubscription subscription) {
return "rest/feed/favicon/" + subscription.getId(); return "rest/feed/favicon/" + subscription.getId();
} }

View File

@@ -19,7 +19,7 @@ import org.jdom2.Element;
import org.jdom2.Namespace; import org.jdom2.Namespace;
import org.xml.sax.InputSource; import org.xml.sax.InputSource;
import com.commafeed.backend.feed.FeedUtils; import com.commafeed.backend.Urls;
import com.commafeed.backend.feed.parser.FeedParserResult.Content; import com.commafeed.backend.feed.parser.FeedParserResult.Content;
import com.commafeed.backend.feed.parser.FeedParserResult.Enclosure; import com.commafeed.backend.feed.parser.FeedParserResult.Enclosure;
import com.commafeed.backend.feed.parser.FeedParserResult.Entry; import com.commafeed.backend.feed.parser.FeedParserResult.Entry;
@@ -119,7 +119,7 @@ public class FeedParser {
} }
String url = buildEntryUrl(feed, feedUrl, item); String url = buildEntryUrl(feed, feedUrl, item);
if (StringUtils.isBlank(url) && FeedUtils.isAbsoluteUrl(guid)) { if (StringUtils.isBlank(url) && Urls.isAbsolute(guid)) {
// if link is empty but guid is used as url, use guid // if link is empty but guid is used as url, use guid
url = guid; url = guid;
} }
@@ -165,14 +165,14 @@ public class FeedParser {
private String buildEntryUrl(SyndFeed feed, String feedUrl, SyndEntry item) { private String buildEntryUrl(SyndFeed feed, String feedUrl, SyndEntry item) {
String url = StringUtils.trimToNull(StringUtils.normalizeSpace(item.getLink())); String url = StringUtils.trimToNull(StringUtils.normalizeSpace(item.getLink()));
if (url == null || FeedUtils.isAbsoluteUrl(url)) { if (url == null || Urls.isAbsolute(url)) {
// url is absolute, nothing to do // url is absolute, nothing to do
return url; return url;
} }
// url is relative, trying to resolve it // url is relative, trying to resolve it
String feedLink = StringUtils.trimToNull(StringUtils.normalizeSpace(feed.getLink())); String feedLink = StringUtils.trimToNull(StringUtils.normalizeSpace(feed.getLink()));
return FeedUtils.toAbsoluteUrl(url, feedLink, feedUrl); return Urls.toAbsolute(url, feedLink, feedUrl);
} }
private Instant toValidInstant(Date date, boolean nullToNow) { private Instant toValidInstant(Date date, boolean nullToNow) {

View File

@@ -8,6 +8,7 @@ import java.util.Objects;
import jakarta.inject.Singleton; import jakarta.inject.Singleton;
import com.commafeed.backend.Digests; import com.commafeed.backend.Digests;
import com.commafeed.backend.Urls;
import com.commafeed.backend.dao.FeedDAO; import com.commafeed.backend.dao.FeedDAO;
import com.commafeed.backend.favicon.AbstractFaviconFetcher; import com.commafeed.backend.favicon.AbstractFaviconFetcher;
import com.commafeed.backend.favicon.Favicon; import com.commafeed.backend.favicon.Favicon;
@@ -33,7 +34,7 @@ public class FeedService {
} }
public synchronized Feed findOrCreate(String url) { public synchronized Feed findOrCreate(String url) {
String normalizedUrl = FeedUtils.normalizeURL(url); String normalizedUrl = Urls.normalize(url);
String normalizedUrlHash = Digests.sha1Hex(normalizedUrl); String normalizedUrlHash = Digests.sha1Hex(normalizedUrl);
Feed feed = feedDAO.findByUrl(normalizedUrl, normalizedUrlHash); Feed feed = feedDAO.findByUrl(normalizedUrl, normalizedUrlHash);
if (feed == null) { if (feed == null) {
@@ -48,7 +49,7 @@ public class FeedService {
} }
public void update(Feed feed) { public void update(Feed feed) {
String normalized = FeedUtils.normalizeURL(feed.getUrl()); String normalized = Urls.normalize(feed.getUrl());
feed.setNormalizedUrl(normalized); feed.setNormalizedUrl(normalized);
feed.setNormalizedUrlHash(Digests.sha1Hex(normalized)); feed.setNormalizedUrlHash(Digests.sha1Hex(normalized));
feed.setLastUpdated(Instant.now()); feed.setLastUpdated(Instant.now());

View File

@@ -8,6 +8,7 @@ import java.util.stream.Collectors;
import jakarta.inject.Singleton; import jakarta.inject.Singleton;
import com.commafeed.CommaFeedConfiguration; import com.commafeed.CommaFeedConfiguration;
import com.commafeed.backend.Urls;
import com.commafeed.backend.dao.FeedEntryStatusDAO; import com.commafeed.backend.dao.FeedEntryStatusDAO;
import com.commafeed.backend.dao.FeedSubscriptionDAO; import com.commafeed.backend.dao.FeedSubscriptionDAO;
import com.commafeed.backend.feed.FeedRefreshEngine; import com.commafeed.backend.feed.FeedRefreshEngine;
@@ -67,7 +68,7 @@ public class FeedSubscriptionService {
Feed feed = feedService.findOrCreate(url); Feed feed = feedService.findOrCreate(url);
// upgrade feed to https if it was using http // upgrade feed to https if it was using http
if (FeedUtils.isHttp(feed.getUrl()) && FeedUtils.isHttps(url)) { if (Urls.isHttp(feed.getUrl()) && Urls.isHttps(url)) {
feed.setUrl(url); feed.setUrl(url);
} }

View File

@@ -32,10 +32,10 @@ import org.apache.hc.core5.net.URIBuilder;
import com.commafeed.CommaFeedConfiguration; import com.commafeed.CommaFeedConfiguration;
import com.commafeed.CommaFeedConstants; import com.commafeed.CommaFeedConstants;
import com.commafeed.backend.Digests; import com.commafeed.backend.Digests;
import com.commafeed.backend.Urls;
import com.commafeed.backend.dao.UserDAO; import com.commafeed.backend.dao.UserDAO;
import com.commafeed.backend.dao.UserRoleDAO; import com.commafeed.backend.dao.UserRoleDAO;
import com.commafeed.backend.dao.UserSettingsDAO; import com.commafeed.backend.dao.UserSettingsDAO;
import com.commafeed.backend.feed.FeedUtils;
import com.commafeed.backend.model.User; import com.commafeed.backend.model.User;
import com.commafeed.backend.model.UserRole; import com.commafeed.backend.model.UserRole;
import com.commafeed.backend.model.UserRole.Role; import com.commafeed.backend.model.UserRole.Role;
@@ -309,7 +309,7 @@ public class UserREST {
} }
private String buildEmailContent(User user) throws URISyntaxException, MalformedURLException { private String buildEmailContent(User user) throws URISyntaxException, MalformedURLException {
String publicUrl = FeedUtils.removeTrailingSlash(uri.getBaseUri().toString()); String publicUrl = Urls.removeTrailingSlash(uri.getBaseUri().toString());
publicUrl += "/rest/user/passwordResetCallback"; publicUrl += "/rest/user/passwordResetCallback";
return String.format( return String.format(
"You asked for password recovery for account '%s', <a href='%s'>follow this link</a> to change your password. Ignore this if you didn't request a password recovery.", "You asked for password recovery for account '%s', <a href='%s'>follow this link</a> to change your password. Ignore this if you didn't request a password recovery.",

View File

@@ -1,9 +1,9 @@
package com.commafeed.backend.feed; package com.commafeed.backend;
import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
class FeedUtilsTest { class UrlsTest {
@Test @Test
void testNormalization() { void testNormalization() {
@@ -25,19 +25,19 @@ class FeedUtilsTest {
String urld1 = "http://fivefilters.org/content-only/makefulltextfeed.php?url=http://feeds.feedburner.com/Frandroid"; String urld1 = "http://fivefilters.org/content-only/makefulltextfeed.php?url=http://feeds.feedburner.com/Frandroid";
String urld2 = "http://fivefilters.org/content-only/makefulltextfeed.php?url=http://feeds2.feedburner.com/Frandroid"; String urld2 = "http://fivefilters.org/content-only/makefulltextfeed.php?url=http://feeds2.feedburner.com/Frandroid";
Assertions.assertEquals(FeedUtils.normalizeURL(urla1), FeedUtils.normalizeURL(urla2)); Assertions.assertEquals(Urls.normalize(urla1), Urls.normalize(urla2));
Assertions.assertEquals(FeedUtils.normalizeURL(urla1), FeedUtils.normalizeURL(urla3)); Assertions.assertEquals(Urls.normalize(urla1), Urls.normalize(urla3));
Assertions.assertEquals(FeedUtils.normalizeURL(urla1), FeedUtils.normalizeURL(urla4)); Assertions.assertEquals(Urls.normalize(urla1), Urls.normalize(urla4));
Assertions.assertEquals(FeedUtils.normalizeURL(urla1), FeedUtils.normalizeURL(urla5)); Assertions.assertEquals(Urls.normalize(urla1), Urls.normalize(urla5));
Assertions.assertEquals(FeedUtils.normalizeURL(urlb1), FeedUtils.normalizeURL(urlb2)); Assertions.assertEquals(Urls.normalize(urlb1), Urls.normalize(urlb2));
Assertions.assertEquals(FeedUtils.normalizeURL(urlc1), FeedUtils.normalizeURL(urlc2)); Assertions.assertEquals(Urls.normalize(urlc1), Urls.normalize(urlc2));
Assertions.assertEquals(FeedUtils.normalizeURL(urlc1), FeedUtils.normalizeURL(urlc3)); Assertions.assertEquals(Urls.normalize(urlc1), Urls.normalize(urlc3));
Assertions.assertEquals(FeedUtils.normalizeURL(urlc1), FeedUtils.normalizeURL(urlc4)); Assertions.assertEquals(Urls.normalize(urlc1), Urls.normalize(urlc4));
Assertions.assertEquals(FeedUtils.normalizeURL(urlc1), FeedUtils.normalizeURL(urlc5)); Assertions.assertEquals(Urls.normalize(urlc1), Urls.normalize(urlc5));
Assertions.assertNotEquals(FeedUtils.normalizeURL(urld1), FeedUtils.normalizeURL(urld2)); Assertions.assertNotEquals(Urls.normalize(urld1), Urls.normalize(urld2));
} }
@@ -46,36 +46,36 @@ class FeedUtilsTest {
String expected = "http://a.com/blog/entry/1"; String expected = "http://a.com/blog/entry/1";
// usual cases // usual cases
Assertions.assertEquals(expected, FeedUtils.toAbsoluteUrl("http://a.com/blog/entry/1", "http://a.com/feed/", "http://a.com/feed/")); Assertions.assertEquals(expected, Urls.toAbsolute("http://a.com/blog/entry/1", "http://a.com/feed/", "http://a.com/feed/"));
Assertions.assertEquals(expected, FeedUtils.toAbsoluteUrl("http://a.com/blog/entry/1", "http://a.com/feed", "http://a.com/feed")); Assertions.assertEquals(expected, Urls.toAbsolute("http://a.com/blog/entry/1", "http://a.com/feed", "http://a.com/feed"));
// relative links // relative links
Assertions.assertEquals(expected, FeedUtils.toAbsoluteUrl("../blog/entry/1", "http://a.com/feed/", "http://a.com/feed/")); Assertions.assertEquals(expected, Urls.toAbsolute("../blog/entry/1", "http://a.com/feed/", "http://a.com/feed/"));
Assertions.assertEquals(expected, FeedUtils.toAbsoluteUrl("../blog/entry/1", "feed.xml", "http://a.com/feed/feed.xml")); Assertions.assertEquals(expected, Urls.toAbsolute("../blog/entry/1", "feed.xml", "http://a.com/feed/feed.xml"));
// root-relative links // root-relative links
Assertions.assertEquals(expected, FeedUtils.toAbsoluteUrl("/blog/entry/1", "/feed", "http://a.com/feed")); Assertions.assertEquals(expected, Urls.toAbsolute("/blog/entry/1", "/feed", "http://a.com/feed"));
// real cases // real cases
Assertions.assertEquals("https://github.com/erusev/parsedown/releases/tag/1.3.0", FeedUtils.toAbsoluteUrl( Assertions.assertEquals("https://github.com/erusev/parsedown/releases/tag/1.3.0", Urls.toAbsolute(
"/erusev/parsedown/releases/tag/1.3.0", "/erusev/parsedown/releases", "https://github.com/erusev/parsedown/tags.atom")); "/erusev/parsedown/releases/tag/1.3.0", "/erusev/parsedown/releases", "https://github.com/erusev/parsedown/tags.atom"));
Assertions.assertEquals("http://ergoemacs.org/emacs/elisp_all_about_lines.html", Assertions.assertEquals("http://ergoemacs.org/emacs/elisp_all_about_lines.html",
FeedUtils.toAbsoluteUrl("elisp_all_about_lines.html", "blog.xml", "http://ergoemacs.org/emacs/blog.xml")); Urls.toAbsolute("elisp_all_about_lines.html", "blog.xml", "http://ergoemacs.org/emacs/blog.xml"));
} }
@Test @Test
void testRemoveTrailingSlash() { void testRemoveTrailingSlash() {
final String url = "http://localhost/"; final String url = "http://localhost/";
final String result = FeedUtils.removeTrailingSlash(url); final String result = Urls.removeTrailingSlash(url);
Assertions.assertEquals("http://localhost", result); Assertions.assertEquals("http://localhost", result);
} }
@Test @Test
void testRemoveTrailingSlashLastSlashOnly() { void testRemoveTrailingSlashLastSlashOnly() {
final String url = "http://localhost//"; final String url = "http://localhost//";
final String result = FeedUtils.removeTrailingSlash(url); final String result = Urls.removeTrailingSlash(url);
Assertions.assertEquals("http://localhost/", result); Assertions.assertEquals("http://localhost/", result);
} }
} }