From c9c71d85826c7e6dc3f9b557842e943253bb58f9 Mon Sep 17 00:00:00 2001 From: ildar-shaimordanov Date: Tue, 12 Mar 2019 02:13:41 +0400 Subject: [PATCH 1/3] workaround for youtube channels --- .../com/commafeed/backend/feed/FeedFetcher.java | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/main/java/com/commafeed/backend/feed/FeedFetcher.java b/src/main/java/com/commafeed/backend/feed/FeedFetcher.java index 8ba02971..de7c417d 100644 --- a/src/main/java/com/commafeed/backend/feed/FeedFetcher.java +++ b/src/main/java/com/commafeed/backend/feed/FeedFetcher.java @@ -2,6 +2,7 @@ package com.commafeed.backend.feed; import java.io.IOException; import java.util.Date; +import java.util.regex.*; import javax.inject.Inject; import javax.inject.Singleton; @@ -96,8 +97,24 @@ public class FeedFetcher { foundUrl = atom.get(0).attr("abs:href"); } else if (!rss.isEmpty()) { foundUrl = rss.get(0).attr("abs:href"); + } else { + foundUrl = extractYoutubeFeedUrl(baseUri); } } return foundUrl; } + + /* + * Workaround for Youtube channels: + * convert the channel URL to the valid feed URL + * https://www.youtube.com/channel/CHANNEL_ID + * https://www.youtube.com/feeds/videos.xml?channel_id=CHANNEL_ID + */ + private String extractYoutubeFeedUrl(String url) { + Pattern regexp = Pattern.compile("(.*\\byoutube\\.com)\\/channel\\/([^\\/]+)", Pattern.CASE_INSENSITIVE); + Matcher matcher = regexp.matcher(url); + if ( matcher.find() ) { + return matcher.group(1) + "/feeds/videos.xml?channel_id=" + matcher.group(2); + } + } } From b64115dcbdf3a77632af83303f49cdaa9d785d92 Mon Sep 17 00:00:00 2001 From: ildar-shaimordanov Date: Tue, 12 Mar 2019 05:52:00 +0400 Subject: [PATCH 2/3] improve youtube feed URL getter --- src/main/java/com/commafeed/backend/feed/FeedFetcher.java | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/main/java/com/commafeed/backend/feed/FeedFetcher.java b/src/main/java/com/commafeed/backend/feed/FeedFetcher.java index de7c417d..bc69e11d 100644 --- a/src/main/java/com/commafeed/backend/feed/FeedFetcher.java +++ b/src/main/java/com/commafeed/backend/feed/FeedFetcher.java @@ -113,8 +113,6 @@ public class FeedFetcher { private String extractYoutubeFeedUrl(String url) { Pattern regexp = Pattern.compile("(.*\\byoutube\\.com)\\/channel\\/([^\\/]+)", Pattern.CASE_INSENSITIVE); Matcher matcher = regexp.matcher(url); - if ( matcher.find() ) { - return matcher.group(1) + "/feeds/videos.xml?channel_id=" + matcher.group(2); - } + return matcher.find() ? matcher.group(1) + "/feeds/videos.xml?channel_id=" + matcher.group(2) : null; } } From 2f70f654f79f24c5c72e26ba743719e20c88933f Mon Sep 17 00:00:00 2001 From: Athou Date: Sun, 17 Mar 2019 06:44:09 +0100 Subject: [PATCH 3/3] extensible mechanism for feed url building --- .../java/com/commafeed/CommaFeedModule.java | 7 +++ .../commafeed/backend/feed/FeedFetcher.java | 51 +++++-------------- .../backend/urlprovider/FeedURLProvider.java | 7 +++ .../InPageReferenceFeedURLProvider.java | 28 ++++++++++ .../urlprovider/YoutubeFeedURLProvider.java | 22 ++++++++ 5 files changed, 78 insertions(+), 37 deletions(-) create mode 100644 src/main/java/com/commafeed/backend/urlprovider/FeedURLProvider.java create mode 100644 src/main/java/com/commafeed/backend/urlprovider/InPageReferenceFeedURLProvider.java create mode 100644 src/main/java/com/commafeed/backend/urlprovider/YoutubeFeedURLProvider.java diff --git a/src/main/java/com/commafeed/CommaFeedModule.java b/src/main/java/com/commafeed/CommaFeedModule.java index a9a1b524..01ac5d12 100644 --- a/src/main/java/com/commafeed/CommaFeedModule.java +++ b/src/main/java/com/commafeed/CommaFeedModule.java @@ -26,6 +26,9 @@ import com.commafeed.backend.task.OldStatusesCleanupTask; import com.commafeed.backend.task.OrphanedContentsCleanupTask; import com.commafeed.backend.task.OrphanedFeedsCleanupTask; import com.commafeed.backend.task.ScheduledTask; +import com.commafeed.backend.urlprovider.FeedURLProvider; +import com.commafeed.backend.urlprovider.InPageReferenceFeedURLProvider; +import com.commafeed.backend.urlprovider.YoutubeFeedURLProvider; import com.google.inject.AbstractModule; import com.google.inject.Provides; import com.google.inject.multibindings.Multibinder; @@ -55,6 +58,10 @@ public class CommaFeedModule extends AbstractModule { faviconMultibinder.addBinding().to(FacebookFaviconFetcher.class); faviconMultibinder.addBinding().to(DefaultFaviconFetcher.class); + Multibinder urlProviderMultibinder = Multibinder.newSetBinder(binder(), FeedURLProvider.class); + urlProviderMultibinder.addBinding().to(InPageReferenceFeedURLProvider.class); + urlProviderMultibinder.addBinding().to(YoutubeFeedURLProvider.class); + Multibinder taskMultibinder = Multibinder.newSetBinder(binder(), ScheduledTask.class); taskMultibinder.addBinding().to(OldStatusesCleanupTask.class); taskMultibinder.addBinding().to(OldEntriesCleanupTask.class); diff --git a/src/main/java/com/commafeed/backend/feed/FeedFetcher.java b/src/main/java/com/commafeed/backend/feed/FeedFetcher.java index bc69e11d..0cc546c7 100644 --- a/src/main/java/com/commafeed/backend/feed/FeedFetcher.java +++ b/src/main/java/com/commafeed/backend/feed/FeedFetcher.java @@ -2,27 +2,24 @@ package com.commafeed.backend.feed; import java.io.IOException; import java.util.Date; -import java.util.regex.*; +import java.util.Set; import javax.inject.Inject; import javax.inject.Singleton; -import lombok.RequiredArgsConstructor; -import lombok.extern.slf4j.Slf4j; - import org.apache.commons.codec.binary.StringUtils; import org.apache.commons.codec.digest.DigestUtils; -import org.apache.http.client.ClientProtocolException; -import org.jsoup.Jsoup; -import org.jsoup.nodes.Document; -import org.jsoup.select.Elements; import com.commafeed.backend.HttpGetter; import com.commafeed.backend.HttpGetter.HttpResult; import com.commafeed.backend.HttpGetter.NotModifiedException; import com.commafeed.backend.model.Feed; +import com.commafeed.backend.urlprovider.FeedURLProvider; import com.rometools.rome.io.FeedException; +import lombok.RequiredArgsConstructor; +import lombok.extern.slf4j.Slf4j; + @Slf4j @RequiredArgsConstructor(onConstructor = @__({ @Inject })) @Singleton @@ -30,9 +27,10 @@ public class FeedFetcher { private final FeedParser parser; private final HttpGetter getter; + private final Set urlProviders; public FetchedFeed fetch(String feedUrl, boolean extractFeedUrlFromHtml, String lastModified, String eTag, Date lastPublishedDate, - String lastContentHash) throws FeedException, ClientProtocolException, IOException, NotModifiedException { + String lastContentHash) throws FeedException, IOException, NotModifiedException { log.debug("Fetching feed {}", feedUrl); FetchedFeed fetchedFeed = null; @@ -45,7 +43,7 @@ public class FeedFetcher { fetchedFeed = parser.parse(result.getUrlAfterRedirect(), content); } catch (FeedException e) { if (extractFeedUrlFromHtml) { - String extractedUrl = extractFeedUrl(StringUtils.newStringUtf8(result.getContent()), feedUrl); + String extractedUrl = extractFeedUrl(urlProviders, StringUtils.newStringUtf8(result.getContent()), feedUrl); if (org.apache.commons.lang3.StringUtils.isNotBlank(extractedUrl)) { feedUrl = extractedUrl; @@ -85,34 +83,13 @@ public class FeedFetcher { return fetchedFeed; } - private String extractFeedUrl(String html, String baseUri) { - String foundUrl = null; - - Document doc = Jsoup.parse(html, baseUri); - String root = doc.children().get(0).tagName(); - if ("html".equals(root)) { - Elements atom = doc.select("link[type=application/atom+xml]"); - Elements rss = doc.select("link[type=application/rss+xml]"); - if (!atom.isEmpty()) { - foundUrl = atom.get(0).attr("abs:href"); - } else if (!rss.isEmpty()) { - foundUrl = rss.get(0).attr("abs:href"); - } else { - foundUrl = extractYoutubeFeedUrl(baseUri); - } + private static String extractFeedUrl(Set urlProviders, String html, String baseUri) { + for (FeedURLProvider urlProvider : urlProviders) { + String url = urlProvider.get(html, baseUri); + if (url != null) + return url; } - return foundUrl; - } - /* - * Workaround for Youtube channels: - * convert the channel URL to the valid feed URL - * https://www.youtube.com/channel/CHANNEL_ID - * https://www.youtube.com/feeds/videos.xml?channel_id=CHANNEL_ID - */ - private String extractYoutubeFeedUrl(String url) { - Pattern regexp = Pattern.compile("(.*\\byoutube\\.com)\\/channel\\/([^\\/]+)", Pattern.CASE_INSENSITIVE); - Matcher matcher = regexp.matcher(url); - return matcher.find() ? matcher.group(1) + "/feeds/videos.xml?channel_id=" + matcher.group(2) : null; + return null; } } diff --git a/src/main/java/com/commafeed/backend/urlprovider/FeedURLProvider.java b/src/main/java/com/commafeed/backend/urlprovider/FeedURLProvider.java new file mode 100644 index 00000000..87547c2c --- /dev/null +++ b/src/main/java/com/commafeed/backend/urlprovider/FeedURLProvider.java @@ -0,0 +1,7 @@ +package com.commafeed.backend.urlprovider; + +public interface FeedURLProvider { + + String get(String html, String url); + +} diff --git a/src/main/java/com/commafeed/backend/urlprovider/InPageReferenceFeedURLProvider.java b/src/main/java/com/commafeed/backend/urlprovider/InPageReferenceFeedURLProvider.java new file mode 100644 index 00000000..11c72261 --- /dev/null +++ b/src/main/java/com/commafeed/backend/urlprovider/InPageReferenceFeedURLProvider.java @@ -0,0 +1,28 @@ +package com.commafeed.backend.urlprovider; + +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.jsoup.select.Elements; + +public class InPageReferenceFeedURLProvider implements FeedURLProvider { + + @Override + public String get(String html, String url) { + String foundUrl = null; + + Document doc = Jsoup.parse(html, url); + String root = doc.children().get(0).tagName(); + if ("html".equals(root)) { + Elements atom = doc.select("link[type=application/atom+xml]"); + Elements rss = doc.select("link[type=application/rss+xml]"); + if (!atom.isEmpty()) { + foundUrl = atom.get(0).attr("abs:href"); + } else if (!rss.isEmpty()) { + foundUrl = rss.get(0).attr("abs:href"); + } + } + + return foundUrl; + } + +} diff --git a/src/main/java/com/commafeed/backend/urlprovider/YoutubeFeedURLProvider.java b/src/main/java/com/commafeed/backend/urlprovider/YoutubeFeedURLProvider.java new file mode 100644 index 00000000..4fc60e57 --- /dev/null +++ b/src/main/java/com/commafeed/backend/urlprovider/YoutubeFeedURLProvider.java @@ -0,0 +1,22 @@ +package com.commafeed.backend.urlprovider; + +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * Workaround for Youtube channels + * + * converts the channel URL https://www.youtube.com/channel/CHANNEL_ID to the valid feed URL + * https://www.youtube.com/feeds/videos.xml?channel_id=CHANNEL_ID + */ +public class YoutubeFeedURLProvider implements FeedURLProvider { + + private static final Pattern REGEXP = Pattern.compile("(.*\\byoutube\\.com)\\/channel\\/([^\\/]+)", Pattern.CASE_INSENSITIVE); + + @Override + public String get(String html, String url) { + Matcher matcher = REGEXP.matcher(url); + return matcher.find() ? matcher.group(1) + "/feeds/videos.xml?channel_id=" + matcher.group(2) : null; + } + +}