extensible mechanism for feed url building

This commit is contained in:
Athou
2019-03-17 06:44:09 +01:00
parent b64115dcbd
commit 2f70f654f7
5 changed files with 78 additions and 37 deletions

View File

@@ -26,6 +26,9 @@ import com.commafeed.backend.task.OldStatusesCleanupTask;
import com.commafeed.backend.task.OrphanedContentsCleanupTask; import com.commafeed.backend.task.OrphanedContentsCleanupTask;
import com.commafeed.backend.task.OrphanedFeedsCleanupTask; import com.commafeed.backend.task.OrphanedFeedsCleanupTask;
import com.commafeed.backend.task.ScheduledTask; import com.commafeed.backend.task.ScheduledTask;
import com.commafeed.backend.urlprovider.FeedURLProvider;
import com.commafeed.backend.urlprovider.InPageReferenceFeedURLProvider;
import com.commafeed.backend.urlprovider.YoutubeFeedURLProvider;
import com.google.inject.AbstractModule; import com.google.inject.AbstractModule;
import com.google.inject.Provides; import com.google.inject.Provides;
import com.google.inject.multibindings.Multibinder; import com.google.inject.multibindings.Multibinder;
@@ -55,6 +58,10 @@ public class CommaFeedModule extends AbstractModule {
faviconMultibinder.addBinding().to(FacebookFaviconFetcher.class); faviconMultibinder.addBinding().to(FacebookFaviconFetcher.class);
faviconMultibinder.addBinding().to(DefaultFaviconFetcher.class); faviconMultibinder.addBinding().to(DefaultFaviconFetcher.class);
Multibinder<FeedURLProvider> urlProviderMultibinder = Multibinder.newSetBinder(binder(), FeedURLProvider.class);
urlProviderMultibinder.addBinding().to(InPageReferenceFeedURLProvider.class);
urlProviderMultibinder.addBinding().to(YoutubeFeedURLProvider.class);
Multibinder<ScheduledTask> taskMultibinder = Multibinder.newSetBinder(binder(), ScheduledTask.class); Multibinder<ScheduledTask> taskMultibinder = Multibinder.newSetBinder(binder(), ScheduledTask.class);
taskMultibinder.addBinding().to(OldStatusesCleanupTask.class); taskMultibinder.addBinding().to(OldStatusesCleanupTask.class);
taskMultibinder.addBinding().to(OldEntriesCleanupTask.class); taskMultibinder.addBinding().to(OldEntriesCleanupTask.class);

View File

@@ -2,27 +2,24 @@ package com.commafeed.backend.feed;
import java.io.IOException; import java.io.IOException;
import java.util.Date; import java.util.Date;
import java.util.regex.*; import java.util.Set;
import javax.inject.Inject; import javax.inject.Inject;
import javax.inject.Singleton; import javax.inject.Singleton;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.codec.binary.StringUtils; import org.apache.commons.codec.binary.StringUtils;
import org.apache.commons.codec.digest.DigestUtils; import org.apache.commons.codec.digest.DigestUtils;
import org.apache.http.client.ClientProtocolException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import com.commafeed.backend.HttpGetter; import com.commafeed.backend.HttpGetter;
import com.commafeed.backend.HttpGetter.HttpResult; import com.commafeed.backend.HttpGetter.HttpResult;
import com.commafeed.backend.HttpGetter.NotModifiedException; import com.commafeed.backend.HttpGetter.NotModifiedException;
import com.commafeed.backend.model.Feed; import com.commafeed.backend.model.Feed;
import com.commafeed.backend.urlprovider.FeedURLProvider;
import com.rometools.rome.io.FeedException; import com.rometools.rome.io.FeedException;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
@Slf4j @Slf4j
@RequiredArgsConstructor(onConstructor = @__({ @Inject })) @RequiredArgsConstructor(onConstructor = @__({ @Inject }))
@Singleton @Singleton
@@ -30,9 +27,10 @@ public class FeedFetcher {
private final FeedParser parser; private final FeedParser parser;
private final HttpGetter getter; private final HttpGetter getter;
private final Set<FeedURLProvider> urlProviders;
public FetchedFeed fetch(String feedUrl, boolean extractFeedUrlFromHtml, String lastModified, String eTag, Date lastPublishedDate, public FetchedFeed fetch(String feedUrl, boolean extractFeedUrlFromHtml, String lastModified, String eTag, Date lastPublishedDate,
String lastContentHash) throws FeedException, ClientProtocolException, IOException, NotModifiedException { String lastContentHash) throws FeedException, IOException, NotModifiedException {
log.debug("Fetching feed {}", feedUrl); log.debug("Fetching feed {}", feedUrl);
FetchedFeed fetchedFeed = null; FetchedFeed fetchedFeed = null;
@@ -45,7 +43,7 @@ public class FeedFetcher {
fetchedFeed = parser.parse(result.getUrlAfterRedirect(), content); fetchedFeed = parser.parse(result.getUrlAfterRedirect(), content);
} catch (FeedException e) { } catch (FeedException e) {
if (extractFeedUrlFromHtml) { if (extractFeedUrlFromHtml) {
String extractedUrl = extractFeedUrl(StringUtils.newStringUtf8(result.getContent()), feedUrl); String extractedUrl = extractFeedUrl(urlProviders, StringUtils.newStringUtf8(result.getContent()), feedUrl);
if (org.apache.commons.lang3.StringUtils.isNotBlank(extractedUrl)) { if (org.apache.commons.lang3.StringUtils.isNotBlank(extractedUrl)) {
feedUrl = extractedUrl; feedUrl = extractedUrl;
@@ -85,34 +83,13 @@ public class FeedFetcher {
return fetchedFeed; return fetchedFeed;
} }
private String extractFeedUrl(String html, String baseUri) { private static String extractFeedUrl(Set<FeedURLProvider> urlProviders, String html, String baseUri) {
String foundUrl = null; for (FeedURLProvider urlProvider : urlProviders) {
String url = urlProvider.get(html, baseUri);
Document doc = Jsoup.parse(html, baseUri); if (url != null)
String root = doc.children().get(0).tagName(); return url;
if ("html".equals(root)) {
Elements atom = doc.select("link[type=application/atom+xml]");
Elements rss = doc.select("link[type=application/rss+xml]");
if (!atom.isEmpty()) {
foundUrl = atom.get(0).attr("abs:href");
} else if (!rss.isEmpty()) {
foundUrl = rss.get(0).attr("abs:href");
} else {
foundUrl = extractYoutubeFeedUrl(baseUri);
}
}
return foundUrl;
} }
/* return null;
* Workaround for Youtube channels:
* convert the channel URL to the valid feed URL
* https://www.youtube.com/channel/CHANNEL_ID
* https://www.youtube.com/feeds/videos.xml?channel_id=CHANNEL_ID
*/
private String extractYoutubeFeedUrl(String url) {
Pattern regexp = Pattern.compile("(.*\\byoutube\\.com)\\/channel\\/([^\\/]+)", Pattern.CASE_INSENSITIVE);
Matcher matcher = regexp.matcher(url);
return matcher.find() ? matcher.group(1) + "/feeds/videos.xml?channel_id=" + matcher.group(2) : null;
} }
} }

View File

@@ -0,0 +1,7 @@
package com.commafeed.backend.urlprovider;
public interface FeedURLProvider {
String get(String html, String url);
}

View File

@@ -0,0 +1,28 @@
package com.commafeed.backend.urlprovider;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
public class InPageReferenceFeedURLProvider implements FeedURLProvider {
@Override
public String get(String html, String url) {
String foundUrl = null;
Document doc = Jsoup.parse(html, url);
String root = doc.children().get(0).tagName();
if ("html".equals(root)) {
Elements atom = doc.select("link[type=application/atom+xml]");
Elements rss = doc.select("link[type=application/rss+xml]");
if (!atom.isEmpty()) {
foundUrl = atom.get(0).attr("abs:href");
} else if (!rss.isEmpty()) {
foundUrl = rss.get(0).attr("abs:href");
}
}
return foundUrl;
}
}

View File

@@ -0,0 +1,22 @@
package com.commafeed.backend.urlprovider;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Workaround for Youtube channels
*
* converts the channel URL https://www.youtube.com/channel/CHANNEL_ID to the valid feed URL
* https://www.youtube.com/feeds/videos.xml?channel_id=CHANNEL_ID
*/
public class YoutubeFeedURLProvider implements FeedURLProvider {
private static final Pattern REGEXP = Pattern.compile("(.*\\byoutube\\.com)\\/channel\\/([^\\/]+)", Pattern.CASE_INSENSITIVE);
@Override
public String get(String html, String url) {
Matcher matcher = REGEXP.matcher(url);
return matcher.find() ? matcher.group(1) + "/feeds/videos.xml?channel_id=" + matcher.group(2) : null;
}
}