providers can now return multiple urls

This commit is contained in:
Athou
2025-07-21 16:46:40 +02:00
parent 085a3cbb50
commit 23a91aab12
6 changed files with 36 additions and 77 deletions

View File

@@ -98,14 +98,11 @@ public class FeedFetcher {
}
/**
* Asks the given providers for candidate feed urls and returns the first usable one.
*
* @param urlProviders providers to query, in the order given
* @param url url passed through unchanged to every provider
* @param urlContent content passed through unchanged to every provider
* @return the first non-blank candidate, or {@code null} when no provider yields one
*/
private static String extractFeedUrl(List<FeedURLProvider> urlProviders, String url, String urlContent) {
// NOTE(review): this is a diff view without +/- markers. The loop below appears to be the
// pre-commit body (each provider returned a single String, first non-null wins).
for (FeedURLProvider urlProvider : urlProviders) {
String feedUrl = urlProvider.get(url, urlContent);
if (feedUrl != null) {
return feedUrl;
}
}
return null;
// NOTE(review): the pipeline below appears to be the post-commit body. Each provider's list is
// flattened in provider order, blank entries are dropped, and the first remaining entry wins.
// A provider returning null instead of a list would fail on the .stream() call, so providers
// are expected to return an empty list when they have nothing to offer.
return urlProviders.stream()
.flatMap(provider -> provider.get(url, urlContent).stream())
.filter(StringUtils::isNotBlank)
.findFirst()
.orElse(null);
}
public record FeedFetcherResult(FeedParserResult feed, String urlAfterRedirect, String lastModifiedHeader, String lastETagHeader,

View File

@@ -1,10 +1,12 @@
package com.commafeed.backend.urlprovider;
import java.util.List;
/**
* Tries to find feed urls given the url and page content.
* <p>
* NOTE(review): this is a diff view without +/- markers, so both the old single-url signature and
* the new list-returning signature are shown below; only the list-returning one matches the
* implementations changed in this commit.
*/
public interface FeedURLProvider {
// presumably the pre-commit signature (one url, null when nothing was found)
String get(String url, String urlContent);
/**
* @param url the url being inspected
* @param urlContent the page content found at that url
* @return candidate feed urls; the implementations visible in this change return an empty list,
* not null, when they find nothing
*/
List<String> get(String url, String urlContent);
}

View File

@@ -1,31 +1,25 @@
package com.commafeed.backend.urlprovider;
import java.util.List;
import java.util.stream.Stream;
import jakarta.inject.Singleton;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
/**
* Finds feed urls advertised inside a page through {@code <link>} elements whose type is
* {@code application/atom+xml} or {@code application/rss+xml}.
* <p>
* NOTE(review): this is a diff view without +/- markers, so lines of the old single-url
* implementation and the new list-returning implementation are interleaved below.
*/
@Singleton
public class InPageReferenceFeedURLProvider implements FeedURLProvider {
@Override
// presumably the pre-commit signature and local (first atom link, else first rss link)
public String get(String url, String urlContent) {
String foundUrl = null;
// presumably the post-commit signature
public List<String> get(String url, String urlContent) {
// url is handed to jsoup as the base uri, which is what lets "abs:href" resolve relative links
Document doc = Jsoup.parse(urlContent, url);
String root = doc.children().get(0).tagName();
if ("html".equals(root)) {
Elements atom = doc.select("link[type=application/atom+xml]");
Elements rss = doc.select("link[type=application/rss+xml]");
if (!atom.isEmpty()) {
foundUrl = atom.get(0).attr("abs:href");
} else if (!rss.isEmpty()) {
foundUrl = rss.get(0).attr("abs:href");
}
// Guard: only documents whose first child element is <html> are searched for links.
// NOTE(review): Jsoup.parse(String, String) uses the HTML parser, which may wrap any input in
// an <html> root; confirm this guard can actually trigger for xml content.
if (!"html".equals(doc.children().get(0).tagName())) {
return List.of();
}
return foundUrl;
// All atom links come first, then all rss links, each in the order select() returns them.
// NOTE(review): "abs:href" may yield an empty string when a link has no resolvable href
// (see jsoup docs); such entries are not filtered here.
return Stream.concat(doc.select("link[type=application/atom+xml]").stream(), doc.select("link[type=application/rss+xml]").stream())
.map(node -> node.attr("abs:href"))
.toList();
}
}

View File

@@ -1,5 +1,7 @@
package com.commafeed.backend.urlprovider;
import java.util.List;
import jakarta.inject.Singleton;
import org.apache.commons.lang3.StringUtils;
@@ -17,12 +19,12 @@ public class YoutubeFeedURLProvider implements FeedURLProvider {
private static final String REPLACEMENT_PREFIX = "https://www.youtube.com/feeds/videos.xml?channel_id=";
/**
* Rewrites a url starting with PREFIX (compared case-insensitively) into a feed url made of
* REPLACEMENT_PREFIX followed by whatever came after PREFIX. The page content is not used.
* <p>
* NOTE(review): this is a diff view without +/- markers, so the old String-returning lines and
* the new List-returning lines are interleaved below. PREFIX is declared outside this hunk;
* presumably it is the youtube channel url prefix - confirm against the class constants.
*
* @param url the url to inspect
* @param urlContent ignored by this provider
* @return a single-element list with the rewritten url, or an empty list when url does not start
* with PREFIX
*/
@Override
public String get(String url, String urlContent) {
public List<String> get(String url, String urlContent) {
if (!StringUtils.startsWithIgnoreCase(url, PREFIX)) {
return null;
return List.of();
}
return REPLACEMENT_PREFIX + url.substring(PREFIX.length());
return List.of(REPLACEMENT_PREFIX + url.substring(PREFIX.length()));
}
}

View File

@@ -1,5 +1,7 @@
package com.commafeed.backend.urlprovider;
import java.util.List;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
@@ -8,41 +10,7 @@ class InPageReferenceFeedURLProviderTest {
private final InPageReferenceFeedURLProvider provider = new InPageReferenceFeedURLProvider();
@Test
void extractsAtomFeedURL() {
String url = "http://example.com";
String html = """
<html>
<head>
<link type="application/atom+xml" href="/feed.atom">
</head>
<body>
</body>
</html>""";
String result = provider.get(url, html);
Assertions.assertEquals("http://example.com/feed.atom", result);
}
@Test
void extractsRSSFeedURL() {
String url = "http://example.com";
String html = """
<html>
<head>
<link type="application/rss+xml" href="/feed.rss">
</head>
<body>
</body>
</html>""";
String result = provider.get(url, html);
Assertions.assertEquals("http://example.com/feed.rss", result);
}
@Test
void prefersAtomOverRSS() {
void extractUrls() {
String url = "http://example.com";
String html = """
<html>
@@ -54,26 +22,22 @@ class InPageReferenceFeedURLProviderTest {
</body>
</html>""";
String result = provider.get(url, html);
Assertions.assertEquals("http://example.com/feed.atom", result);
Assertions.assertIterableEquals(List.of("http://example.com/feed.atom", "http://example.com/feed.rss"), provider.get(url, html));
}
// Feeds the provider an xml-looking document and expects no feed urls back.
// NOTE(review): this is a diff view without +/- markers; the "returnsNull..." name, the "content"
// variable and the assertNull lines appear to be the pre-commit version of this same test.
// NOTE(review): the fixture ends with a stray "</xml>" closing tag, so it is not well-formed xml;
// the assertion only requires that the result is empty.
@Test
void returnsNullForNonHtmlContent() {
void returnsEmptyListForNonHtmlContent() {
String url = "http://example.com";
String content = """
String html = """
<?xml version="1.0"?>
<feed></feed>
</xml>""";
String result = provider.get(url, content);
Assertions.assertNull(result);
Assertions.assertTrue(provider.get(url, html).isEmpty());
}
@Test
void returnsNullForHtmlWithoutFeedLinks() {
void returnsEmptyListForHtmlWithoutFeedLinks() {
String url = "http://example.com";
String html = """
<html>
@@ -84,8 +48,6 @@ class InPageReferenceFeedURLProviderTest {
</body>
</html>""";
String result = provider.get(url, html);
Assertions.assertNull(result);
Assertions.assertTrue(provider.get(url, html).isEmpty());
}
}

View File

@@ -1,5 +1,7 @@
package com.commafeed.backend.urlprovider;
import java.util.List;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
@@ -9,14 +11,14 @@ class YoutubeFeedURLProviderTest {
// A youtube channel url must be rewritten to the videos.xml feed url for the same channel id.
// Page content is passed as null, so the provider must not depend on it.
// NOTE(review): this is a diff view without +/- markers; the assertEquals line appears to be the
// pre-commit assertion and the assertIterableEquals line its replacement.
@Test
void matchesYoutubeChannelURL() {
Assertions.assertEquals("https://www.youtube.com/feeds/videos.xml?channel_id=abc",
Assertions.assertIterableEquals(List.of("https://www.youtube.com/feeds/videos.xml?channel_id=abc"),
provider.get("https://www.youtube.com/channel/abc", null));
}
// Urls on another host, or youtube urls that are not channel urls, must produce no feed url.
// NOTE(review): this is a diff view without +/- markers; the assertNull lines appear to be the
// pre-commit assertions and the isEmpty() lines their replacements.
@Test
void doesNotmatchYoutubeChannelURL() {
Assertions.assertNull(provider.get("https://www.anothersite.com/channel/abc", null));
Assertions.assertNull(provider.get("https://www.youtube.com/user/abc", null));
Assertions.assertTrue(provider.get("https://www.anothersite.com/channel/abc", null).isEmpty());
Assertions.assertTrue(provider.get("https://www.youtube.com/user/abc", null).isEmpty());
}
}