Merge branch 'ildar-shaimordanov-master'

This commit is contained in:
Athou
2019-03-17 06:46:51 +01:00
5 changed files with 79 additions and 23 deletions

View File

@@ -26,6 +26,9 @@ import com.commafeed.backend.task.OldStatusesCleanupTask;
import com.commafeed.backend.task.OrphanedContentsCleanupTask;
import com.commafeed.backend.task.OrphanedFeedsCleanupTask;
import com.commafeed.backend.task.ScheduledTask;
import com.commafeed.backend.urlprovider.FeedURLProvider;
import com.commafeed.backend.urlprovider.InPageReferenceFeedURLProvider;
import com.commafeed.backend.urlprovider.YoutubeFeedURLProvider;
import com.google.inject.AbstractModule;
import com.google.inject.Provides;
import com.google.inject.multibindings.Multibinder;
@@ -55,6 +58,10 @@ public class CommaFeedModule extends AbstractModule {
faviconMultibinder.addBinding().to(FacebookFaviconFetcher.class);
faviconMultibinder.addBinding().to(DefaultFaviconFetcher.class);
Multibinder<FeedURLProvider> urlProviderMultibinder = Multibinder.newSetBinder(binder(), FeedURLProvider.class);
urlProviderMultibinder.addBinding().to(InPageReferenceFeedURLProvider.class);
urlProviderMultibinder.addBinding().to(YoutubeFeedURLProvider.class);
Multibinder<ScheduledTask> taskMultibinder = Multibinder.newSetBinder(binder(), ScheduledTask.class);
taskMultibinder.addBinding().to(OldStatusesCleanupTask.class);
taskMultibinder.addBinding().to(OldEntriesCleanupTask.class);

View File

@@ -2,26 +2,24 @@ package com.commafeed.backend.feed;
import java.io.IOException;
import java.util.Date;
import java.util.Set;
import javax.inject.Inject;
import javax.inject.Singleton;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.codec.binary.StringUtils;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.http.client.ClientProtocolException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import com.commafeed.backend.HttpGetter;
import com.commafeed.backend.HttpGetter.HttpResult;
import com.commafeed.backend.HttpGetter.NotModifiedException;
import com.commafeed.backend.model.Feed;
import com.commafeed.backend.urlprovider.FeedURLProvider;
import com.rometools.rome.io.FeedException;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@RequiredArgsConstructor(onConstructor = @__({ @Inject }))
@Singleton
@@ -29,9 +27,10 @@ public class FeedFetcher {
private final FeedParser parser;
private final HttpGetter getter;
private final Set<FeedURLProvider> urlProviders;
public FetchedFeed fetch(String feedUrl, boolean extractFeedUrlFromHtml, String lastModified, String eTag, Date lastPublishedDate,
String lastContentHash) throws FeedException, ClientProtocolException, IOException, NotModifiedException {
String lastContentHash) throws FeedException, IOException, NotModifiedException {
log.debug("Fetching feed {}", feedUrl);
FetchedFeed fetchedFeed = null;
@@ -44,7 +43,7 @@ public class FeedFetcher {
fetchedFeed = parser.parse(result.getUrlAfterRedirect(), content);
} catch (FeedException e) {
if (extractFeedUrlFromHtml) {
String extractedUrl = extractFeedUrl(StringUtils.newStringUtf8(result.getContent()), feedUrl);
String extractedUrl = extractFeedUrl(urlProviders, StringUtils.newStringUtf8(result.getContent()), feedUrl);
if (org.apache.commons.lang3.StringUtils.isNotBlank(extractedUrl)) {
feedUrl = extractedUrl;
@@ -84,20 +83,13 @@ public class FeedFetcher {
return fetchedFeed;
}
private String extractFeedUrl(String html, String baseUri) {
String foundUrl = null;
Document doc = Jsoup.parse(html, baseUri);
String root = doc.children().get(0).tagName();
if ("html".equals(root)) {
Elements atom = doc.select("link[type=application/atom+xml]");
Elements rss = doc.select("link[type=application/rss+xml]");
if (!atom.isEmpty()) {
foundUrl = atom.get(0).attr("abs:href");
} else if (!rss.isEmpty()) {
foundUrl = rss.get(0).attr("abs:href");
}
private static String extractFeedUrl(Set<FeedURLProvider> urlProviders, String html, String baseUri) {
for (FeedURLProvider urlProvider : urlProviders) {
String url = urlProvider.get(html, baseUri);
if (url != null)
return url;
}
return foundUrl;
return null;
}
}

View File

@@ -0,0 +1,7 @@
package com.commafeed.backend.urlprovider;
public interface FeedURLProvider {
String get(String html, String url);
}

View File

@@ -0,0 +1,28 @@
package com.commafeed.backend.urlprovider;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
public class InPageReferenceFeedURLProvider implements FeedURLProvider {
@Override
public String get(String html, String url) {
String foundUrl = null;
Document doc = Jsoup.parse(html, url);
String root = doc.children().get(0).tagName();
if ("html".equals(root)) {
Elements atom = doc.select("link[type=application/atom+xml]");
Elements rss = doc.select("link[type=application/rss+xml]");
if (!atom.isEmpty()) {
foundUrl = atom.get(0).attr("abs:href");
} else if (!rss.isEmpty()) {
foundUrl = rss.get(0).attr("abs:href");
}
}
return foundUrl;
}
}

View File

@@ -0,0 +1,22 @@
package com.commafeed.backend.urlprovider;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Workaround for Youtube channels
*
* converts the channel URL https://www.youtube.com/channel/CHANNEL_ID to the valid feed URL
* https://www.youtube.com/feeds/videos.xml?channel_id=CHANNEL_ID
*/
public class YoutubeFeedURLProvider implements FeedURLProvider {
private static final Pattern REGEXP = Pattern.compile("(.*\\byoutube\\.com)\\/channel\\/([^\\/]+)", Pattern.CASE_INSENSITIVE);
@Override
public String get(String html, String url) {
Matcher matcher = REGEXP.matcher(url);
return matcher.find() ? matcher.group(1) + "/feeds/videos.xml?channel_id=" + matcher.group(2) : null;
}
}