extensible mechanism for feed url building

2026-03-21 21:37:29 +00:00 · 2019-03-17 06:44:09 +01:00
parent b64115dcbd
commit 2f70f654f7
5 changed files with 78 additions and 37 deletions
--- a/src/main/java/com/commafeed/backend/feed/FeedFetcher.java
+++ b/src/main/java/com/commafeed/backend/feed/FeedFetcher.java
@@ -2,27 +2,24 @@ package com.commafeed.backend.feed;

 import java.io.IOException;
 import java.util.Date;
-import java.util.regex.*;
+import java.util.Set;

 import javax.inject.Inject;
 import javax.inject.Singleton;

-import lombok.RequiredArgsConstructor;
-import lombok.extern.slf4j.Slf4j;
-
 import org.apache.commons.codec.binary.StringUtils;
 import org.apache.commons.codec.digest.DigestUtils;
-import org.apache.http.client.ClientProtocolException;
-import org.jsoup.Jsoup;
-import org.jsoup.nodes.Document;
-import org.jsoup.select.Elements;

 import com.commafeed.backend.HttpGetter;
 import com.commafeed.backend.HttpGetter.HttpResult;
 import com.commafeed.backend.HttpGetter.NotModifiedException;
 import com.commafeed.backend.model.Feed;
+import com.commafeed.backend.urlprovider.FeedURLProvider;
 import com.rometools.rome.io.FeedException;

+import lombok.RequiredArgsConstructor;
+import lombok.extern.slf4j.Slf4j;
+
@Slf4j
@RequiredArgsConstructor(onConstructor = @__({ @Inject }))
@Singleton
@@ -30,9 +27,10 @@ public class FeedFetcher {

 	private final FeedParser parser;
 	private final HttpGetter getter;
+	private final Set<FeedURLProvider> urlProviders;

 	public FetchedFeed fetch(String feedUrl, boolean extractFeedUrlFromHtml, String lastModified, String eTag, Date lastPublishedDate,
-			String lastContentHash) throws FeedException, ClientProtocolException, IOException, NotModifiedException {
+			String lastContentHash) throws FeedException, IOException, NotModifiedException {
 		log.debug("Fetching feed {}", feedUrl);
 		FetchedFeed fetchedFeed = null;

@@ -45,7 +43,7 @@ public class FeedFetcher {
 			fetchedFeed = parser.parse(result.getUrlAfterRedirect(), content);
 		} catch (FeedException e) {
 			if (extractFeedUrlFromHtml) {
-				String extractedUrl = extractFeedUrl(StringUtils.newStringUtf8(result.getContent()), feedUrl);
+				String extractedUrl = extractFeedUrl(urlProviders, StringUtils.newStringUtf8(result.getContent()), feedUrl);
 				if (org.apache.commons.lang3.StringUtils.isNotBlank(extractedUrl)) {
 					feedUrl = extractedUrl;

@@ -85,34 +83,13 @@ public class FeedFetcher {
 		return fetchedFeed;
 	}

-	private String extractFeedUrl(String html, String baseUri) {
-		String foundUrl = null;
-
-		Document doc = Jsoup.parse(html, baseUri);
-		String root = doc.children().get(0).tagName();
-		if ("html".equals(root)) {
-			Elements atom = doc.select("link[type=application/atom+xml]");
-			Elements rss = doc.select("link[type=application/rss+xml]");
-			if (!atom.isEmpty()) {
-				foundUrl = atom.get(0).attr("abs:href");
-			} else if (!rss.isEmpty()) {
-				foundUrl = rss.get(0).attr("abs:href");
-			} else {
-				foundUrl = extractYoutubeFeedUrl(baseUri);
-			}
+	private static String extractFeedUrl(Set<FeedURLProvider> urlProviders, String html, String baseUri) { // first usable provider URL, or null
+		for (FeedURLProvider urlProvider : urlProviders) {
+			String url = urlProvider.get(html, baseUri);
+			if (org.apache.commons.lang3.StringUtils.isNotBlank(url)) // caller discards blank URLs, so keep asking the remaining providers
+				return url;
 		}
-		return foundUrl;
-	}

-	/*
-	* Workaround for Youtube channels:
-	* convert the channel URL to the valid feed URL
-	* https://www.youtube.com/channel/CHANNEL_ID
-	* https://www.youtube.com/feeds/videos.xml?channel_id=CHANNEL_ID
-	*/
-	private String extractYoutubeFeedUrl(String url) {
-		Pattern regexp = Pattern.compile("(.*\\byoutube\\.com)\\/channel\\/([^\\/]+)", Pattern.CASE_INSENSITIVE);
-		Matcher matcher = regexp.matcher(url);
-		return matcher.find() ? matcher.group(1) + "/feeds/videos.xml?channel_id=" + matcher.group(2) : null;
+		return null; // no provider produced a non-blank URL
 	}
 }