Merge branch 'ildar-shaimordanov-master'

This commit is contained in:
Athou
2019-03-17 06:46:51 +01:00
5 changed files with 79 additions and 23 deletions

View File

@@ -26,6 +26,9 @@ import com.commafeed.backend.task.OldStatusesCleanupTask;
import com.commafeed.backend.task.OrphanedContentsCleanupTask; import com.commafeed.backend.task.OrphanedContentsCleanupTask;
import com.commafeed.backend.task.OrphanedFeedsCleanupTask; import com.commafeed.backend.task.OrphanedFeedsCleanupTask;
import com.commafeed.backend.task.ScheduledTask; import com.commafeed.backend.task.ScheduledTask;
import com.commafeed.backend.urlprovider.FeedURLProvider;
import com.commafeed.backend.urlprovider.InPageReferenceFeedURLProvider;
import com.commafeed.backend.urlprovider.YoutubeFeedURLProvider;
import com.google.inject.AbstractModule; import com.google.inject.AbstractModule;
import com.google.inject.Provides; import com.google.inject.Provides;
import com.google.inject.multibindings.Multibinder; import com.google.inject.multibindings.Multibinder;
@@ -55,6 +58,10 @@ public class CommaFeedModule extends AbstractModule {
faviconMultibinder.addBinding().to(FacebookFaviconFetcher.class); faviconMultibinder.addBinding().to(FacebookFaviconFetcher.class);
faviconMultibinder.addBinding().to(DefaultFaviconFetcher.class); faviconMultibinder.addBinding().to(DefaultFaviconFetcher.class);
Multibinder<FeedURLProvider> urlProviderMultibinder = Multibinder.newSetBinder(binder(), FeedURLProvider.class);
urlProviderMultibinder.addBinding().to(InPageReferenceFeedURLProvider.class);
urlProviderMultibinder.addBinding().to(YoutubeFeedURLProvider.class);
Multibinder<ScheduledTask> taskMultibinder = Multibinder.newSetBinder(binder(), ScheduledTask.class); Multibinder<ScheduledTask> taskMultibinder = Multibinder.newSetBinder(binder(), ScheduledTask.class);
taskMultibinder.addBinding().to(OldStatusesCleanupTask.class); taskMultibinder.addBinding().to(OldStatusesCleanupTask.class);
taskMultibinder.addBinding().to(OldEntriesCleanupTask.class); taskMultibinder.addBinding().to(OldEntriesCleanupTask.class);

View File

@@ -2,26 +2,24 @@ package com.commafeed.backend.feed;
import java.io.IOException; import java.io.IOException;
import java.util.Date; import java.util.Date;
import java.util.Set;
import javax.inject.Inject; import javax.inject.Inject;
import javax.inject.Singleton; import javax.inject.Singleton;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.codec.binary.StringUtils; import org.apache.commons.codec.binary.StringUtils;
import org.apache.commons.codec.digest.DigestUtils; import org.apache.commons.codec.digest.DigestUtils;
import org.apache.http.client.ClientProtocolException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import com.commafeed.backend.HttpGetter; import com.commafeed.backend.HttpGetter;
import com.commafeed.backend.HttpGetter.HttpResult; import com.commafeed.backend.HttpGetter.HttpResult;
import com.commafeed.backend.HttpGetter.NotModifiedException; import com.commafeed.backend.HttpGetter.NotModifiedException;
import com.commafeed.backend.model.Feed; import com.commafeed.backend.model.Feed;
import com.commafeed.backend.urlprovider.FeedURLProvider;
import com.rometools.rome.io.FeedException; import com.rometools.rome.io.FeedException;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
@Slf4j @Slf4j
@RequiredArgsConstructor(onConstructor = @__({ @Inject })) @RequiredArgsConstructor(onConstructor = @__({ @Inject }))
@Singleton @Singleton
@@ -29,9 +27,10 @@ public class FeedFetcher {
private final FeedParser parser; private final FeedParser parser;
private final HttpGetter getter; private final HttpGetter getter;
private final Set<FeedURLProvider> urlProviders;
public FetchedFeed fetch(String feedUrl, boolean extractFeedUrlFromHtml, String lastModified, String eTag, Date lastPublishedDate, public FetchedFeed fetch(String feedUrl, boolean extractFeedUrlFromHtml, String lastModified, String eTag, Date lastPublishedDate,
String lastContentHash) throws FeedException, ClientProtocolException, IOException, NotModifiedException { String lastContentHash) throws FeedException, IOException, NotModifiedException {
log.debug("Fetching feed {}", feedUrl); log.debug("Fetching feed {}", feedUrl);
FetchedFeed fetchedFeed = null; FetchedFeed fetchedFeed = null;
@@ -44,7 +43,7 @@ public class FeedFetcher {
fetchedFeed = parser.parse(result.getUrlAfterRedirect(), content); fetchedFeed = parser.parse(result.getUrlAfterRedirect(), content);
} catch (FeedException e) { } catch (FeedException e) {
if (extractFeedUrlFromHtml) { if (extractFeedUrlFromHtml) {
String extractedUrl = extractFeedUrl(StringUtils.newStringUtf8(result.getContent()), feedUrl); String extractedUrl = extractFeedUrl(urlProviders, StringUtils.newStringUtf8(result.getContent()), feedUrl);
if (org.apache.commons.lang3.StringUtils.isNotBlank(extractedUrl)) { if (org.apache.commons.lang3.StringUtils.isNotBlank(extractedUrl)) {
feedUrl = extractedUrl; feedUrl = extractedUrl;
@@ -84,20 +83,13 @@ public class FeedFetcher {
return fetchedFeed; return fetchedFeed;
} }
private String extractFeedUrl(String html, String baseUri) { private static String extractFeedUrl(Set<FeedURLProvider> urlProviders, String html, String baseUri) {
String foundUrl = null; for (FeedURLProvider urlProvider : urlProviders) {
String url = urlProvider.get(html, baseUri);
Document doc = Jsoup.parse(html, baseUri); if (url != null)
String root = doc.children().get(0).tagName(); return url;
if ("html".equals(root)) {
Elements atom = doc.select("link[type=application/atom+xml]");
Elements rss = doc.select("link[type=application/rss+xml]");
if (!atom.isEmpty()) {
foundUrl = atom.get(0).attr("abs:href");
} else if (!rss.isEmpty()) {
foundUrl = rss.get(0).attr("abs:href");
}
} }
return foundUrl;
return null;
} }
} }

View File

@@ -0,0 +1,7 @@
package com.commafeed.backend.urlprovider;
public interface FeedURLProvider {
String get(String html, String url);
}

View File

@@ -0,0 +1,28 @@
package com.commafeed.backend.urlprovider;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
public class InPageReferenceFeedURLProvider implements FeedURLProvider {
@Override
public String get(String html, String url) {
String foundUrl = null;
Document doc = Jsoup.parse(html, url);
String root = doc.children().get(0).tagName();
if ("html".equals(root)) {
Elements atom = doc.select("link[type=application/atom+xml]");
Elements rss = doc.select("link[type=application/rss+xml]");
if (!atom.isEmpty()) {
foundUrl = atom.get(0).attr("abs:href");
} else if (!rss.isEmpty()) {
foundUrl = rss.get(0).attr("abs:href");
}
}
return foundUrl;
}
}

View File

@@ -0,0 +1,22 @@
package com.commafeed.backend.urlprovider;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Workaround for Youtube channels
*
* converts the channel URL https://www.youtube.com/channel/CHANNEL_ID to the valid feed URL
* https://www.youtube.com/feeds/videos.xml?channel_id=CHANNEL_ID
*/
public class YoutubeFeedURLProvider implements FeedURLProvider {
private static final Pattern REGEXP = Pattern.compile("(.*\\byoutube\\.com)\\/channel\\/([^\\/]+)", Pattern.CASE_INSENSITIVE);
@Override
public String get(String html, String url) {
Matcher matcher = REGEXP.matcher(url);
return matcher.find() ? matcher.group(1) + "/feeds/videos.xml?channel_id=" + matcher.group(2) : null;
}
}