Introduce pluggable FeedURLProvider implementations used by FeedFetcher to discover a feed URL from an HTML page.

Note: the "index" lines below carry the blob ids of the original commit; they were not recomputed for the edited hunks.

diff --git a/src/main/java/com/commafeed/CommaFeedModule.java b/src/main/java/com/commafeed/CommaFeedModule.java
index a9a1b524..01ac5d12 100644
--- a/src/main/java/com/commafeed/CommaFeedModule.java
+++ b/src/main/java/com/commafeed/CommaFeedModule.java
@@ -26,6 +26,9 @@ import com.commafeed.backend.task.OldStatusesCleanupTask;
 import com.commafeed.backend.task.OrphanedContentsCleanupTask;
 import com.commafeed.backend.task.OrphanedFeedsCleanupTask;
 import com.commafeed.backend.task.ScheduledTask;
+import com.commafeed.backend.urlprovider.FeedURLProvider;
+import com.commafeed.backend.urlprovider.InPageReferenceFeedURLProvider;
+import com.commafeed.backend.urlprovider.YoutubeFeedURLProvider;
 import com.google.inject.AbstractModule;
 import com.google.inject.Provides;
 import com.google.inject.multibindings.Multibinder;
@@ -55,6 +58,10 @@ public class CommaFeedModule extends AbstractModule {
 		faviconMultibinder.addBinding().to(FacebookFaviconFetcher.class);
 		faviconMultibinder.addBinding().to(DefaultFaviconFetcher.class);
 
+		Multibinder<FeedURLProvider> urlProviderMultibinder = Multibinder.newSetBinder(binder(), FeedURLProvider.class);
+		urlProviderMultibinder.addBinding().to(InPageReferenceFeedURLProvider.class);
+		urlProviderMultibinder.addBinding().to(YoutubeFeedURLProvider.class);
+
 		Multibinder<ScheduledTask> taskMultibinder = Multibinder.newSetBinder(binder(), ScheduledTask.class);
 		taskMultibinder.addBinding().to(OldStatusesCleanupTask.class);
 		taskMultibinder.addBinding().to(OldEntriesCleanupTask.class);
diff --git a/src/main/java/com/commafeed/backend/feed/FeedFetcher.java b/src/main/java/com/commafeed/backend/feed/FeedFetcher.java
index 8ba02971..0cc546c7 100644
--- a/src/main/java/com/commafeed/backend/feed/FeedFetcher.java
+++ b/src/main/java/com/commafeed/backend/feed/FeedFetcher.java
@@ -2,26 +2,24 @@ package com.commafeed.backend.feed;
 
 import java.io.IOException;
 import java.util.Date;
+import java.util.Set;
 
 import javax.inject.Inject;
 import javax.inject.Singleton;
 
-import lombok.RequiredArgsConstructor;
-import lombok.extern.slf4j.Slf4j;
-
 import org.apache.commons.codec.binary.StringUtils;
 import org.apache.commons.codec.digest.DigestUtils;
-import org.apache.http.client.ClientProtocolException;
-import org.jsoup.Jsoup;
-import org.jsoup.nodes.Document;
-import org.jsoup.select.Elements;
 
 import com.commafeed.backend.HttpGetter;
 import com.commafeed.backend.HttpGetter.HttpResult;
 import com.commafeed.backend.HttpGetter.NotModifiedException;
 import com.commafeed.backend.model.Feed;
+import com.commafeed.backend.urlprovider.FeedURLProvider;
 import com.rometools.rome.io.FeedException;
 
+import lombok.RequiredArgsConstructor;
+import lombok.extern.slf4j.Slf4j;
+
 @Slf4j
 @RequiredArgsConstructor(onConstructor = @__({ @Inject }))
 @Singleton
@@ -29,9 +27,10 @@ public class FeedFetcher {
 
 	private final FeedParser parser;
 	private final HttpGetter getter;
+	private final Set<FeedURLProvider> urlProviders;
 
 	public FetchedFeed fetch(String feedUrl, boolean extractFeedUrlFromHtml, String lastModified, String eTag, Date lastPublishedDate,
-			String lastContentHash) throws FeedException, ClientProtocolException, IOException, NotModifiedException {
+			String lastContentHash) throws FeedException, IOException, NotModifiedException {
 		log.debug("Fetching feed {}", feedUrl);
 		FetchedFeed fetchedFeed = null;
 
@@ -44,7 +43,7 @@ public class FeedFetcher {
 			fetchedFeed = parser.parse(result.getUrlAfterRedirect(), content);
 		} catch (FeedException e) {
 			if (extractFeedUrlFromHtml) {
-				String extractedUrl = extractFeedUrl(StringUtils.newStringUtf8(result.getContent()), feedUrl);
+				String extractedUrl = extractFeedUrl(urlProviders, StringUtils.newStringUtf8(result.getContent()), feedUrl);
 				if (org.apache.commons.lang3.StringUtils.isNotBlank(extractedUrl)) {
 					feedUrl = extractedUrl;
 
@@ -84,20 +83,23 @@ public class FeedFetcher {
 		return fetchedFeed;
 	}
 
-	private String extractFeedUrl(String html, String baseUri) {
-		String foundUrl = null;
-
-		Document doc = Jsoup.parse(html, baseUri);
-		String root = doc.children().get(0).tagName();
-		if ("html".equals(root)) {
-			Elements atom = doc.select("link[type=application/atom+xml]");
-			Elements rss = doc.select("link[type=application/rss+xml]");
-			if (!atom.isEmpty()) {
-				foundUrl = atom.get(0).attr("abs:href");
-			} else if (!rss.isEmpty()) {
-				foundUrl = rss.get(0).attr("abs:href");
-			}
+	/**
+	 * Asks each registered {@link FeedURLProvider} in turn for a feed URL and returns the first usable answer.
+	 *
+	 * @param urlProviders providers to consult, in the iteration order of the set
+	 * @param html content of the page that could not be parsed as a feed
+	 * @param baseUri URL the page was fetched from
+	 * @return the first non-blank URL returned by a provider, or null if no provider returned one
+	 */
+	private static String extractFeedUrl(Set<FeedURLProvider> urlProviders, String html, String baseUri) {
+		for (FeedURLProvider urlProvider : urlProviders) {
+			String url = urlProvider.get(html, baseUri);
+			// a provider may answer with an empty string (e.g. a link element without a resolvable href): keep looking
+			if (org.apache.commons.lang3.StringUtils.isNotBlank(url)) {
+				return url;
+			}
 		}
-		return foundUrl;
+
+		return null;
 	}
 }
diff --git a/src/main/java/com/commafeed/backend/urlprovider/FeedURLProvider.java b/src/main/java/com/commafeed/backend/urlprovider/FeedURLProvider.java
new file mode 100644
index 00000000..87547c2c
--- /dev/null
+++ b/src/main/java/com/commafeed/backend/urlprovider/FeedURLProvider.java
@@ -0,0 +1,15 @@
+package com.commafeed.backend.urlprovider;
+
+/**
+ * Tries to determine the URL of a feed from the URL and the HTML content of a page that is not itself a feed.
+ */
+public interface FeedURLProvider {
+
+	/**
+	 * @param html content of the page
+	 * @param url URL of the page
+	 * @return the feed URL; null (or a blank string) when this provider cannot supply one
+	 */
+	String get(String html, String url);
+
+}
diff --git a/src/main/java/com/commafeed/backend/urlprovider/InPageReferenceFeedURLProvider.java b/src/main/java/com/commafeed/backend/urlprovider/InPageReferenceFeedURLProvider.java
new file mode 100644
index 00000000..11c72261
--- /dev/null
+++ b/src/main/java/com/commafeed/backend/urlprovider/InPageReferenceFeedURLProvider.java
@@ -0,0 +1,32 @@
+package com.commafeed.backend.urlprovider;
+
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.select.Elements;
+
+/**
+ * Looks for a feed advertised by the page itself through a {@code link} element of type application/atom+xml or
+ * application/rss+xml. The first Atom link is used when there is one, otherwise the first RSS link.
+ */
+public class InPageReferenceFeedURLProvider implements FeedURLProvider {
+
+	@Override
+	public String get(String html, String url) {
+		String foundUrl = null;
+
+		Document doc = Jsoup.parse(html, url);
+		String root = doc.children().get(0).tagName();
+		if ("html".equals(root)) {
+			Elements atom = doc.select("link[type=application/atom+xml]");
+			Elements rss = doc.select("link[type=application/rss+xml]");
+			if (!atom.isEmpty()) {
+				foundUrl = atom.get(0).attr("abs:href");
+			} else if (!rss.isEmpty()) {
+				foundUrl = rss.get(0).attr("abs:href");
+			}
+		}
+
+		return foundUrl;
+	}
+
+}
diff --git a/src/main/java/com/commafeed/backend/urlprovider/YoutubeFeedURLProvider.java b/src/main/java/com/commafeed/backend/urlprovider/YoutubeFeedURLProvider.java
new file mode 100644
index 00000000..4fc60e57
--- /dev/null
+++ b/src/main/java/com/commafeed/backend/urlprovider/YoutubeFeedURLProvider.java
@@ -0,0 +1,24 @@
+package com.commafeed.backend.urlprovider;
+
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Workaround for Youtube channels
+ *
+ * converts the channel URL https://www.youtube.com/channel/CHANNEL_ID to the valid feed URL
+ * https://www.youtube.com/feeds/videos.xml?channel_id=CHANNEL_ID
+ *
+ * anything that follows the channel id in the page URL (sub-path, query string, fragment) is dropped
+ */
+public class YoutubeFeedURLProvider implements FeedURLProvider {
+
+	private static final Pattern REGEXP = Pattern.compile("(.*\\byoutube\\.com)\\/channel\\/([^\\/?#]+)", Pattern.CASE_INSENSITIVE);
+
+	@Override
+	public String get(String html, String url) {
+		Matcher matcher = REGEXP.matcher(url);
+		return matcher.find() ? matcher.group(1) + "/feeds/videos.xml?channel_id=" + matcher.group(2) : null;
+	}
+
+}