2014-08-08 16:49:02 +02:00
|
|
|
package com.commafeed.backend.feed;
|
2013-03-20 20:33:42 +01:00
|
|
|
|
2013-04-11 12:49:54 +02:00
|
|
|
import java.io.IOException;
|
2013-06-08 21:47:19 +02:00
|
|
|
import java.util.Date;
|
2019-03-17 06:44:09 +01:00
|
|
|
import java.util.Set;
|
2013-04-11 12:49:54 +02:00
|
|
|
|
2014-08-17 14:16:30 +02:00
|
|
|
import javax.inject.Inject;
|
|
|
|
|
import javax.inject.Singleton;
|
|
|
|
|
|
2013-04-11 12:49:54 +02:00
|
|
|
import org.apache.commons.codec.binary.StringUtils;
|
2013-06-09 16:22:38 +02:00
|
|
|
import org.apache.commons.codec.digest.DigestUtils;
|
2013-03-20 20:33:42 +01:00
|
|
|
|
2013-04-03 15:53:57 +02:00
|
|
|
import com.commafeed.backend.HttpGetter;
|
2013-04-17 12:49:03 +02:00
|
|
|
import com.commafeed.backend.HttpGetter.HttpResult;
|
|
|
|
|
import com.commafeed.backend.HttpGetter.NotModifiedException;
|
2013-03-23 16:17:19 +01:00
|
|
|
import com.commafeed.backend.model.Feed;
|
2019-03-17 06:44:09 +01:00
|
|
|
import com.commafeed.backend.urlprovider.FeedURLProvider;
|
2014-08-15 13:51:13 +02:00
|
|
|
import com.rometools.rome.io.FeedException;
|
2013-03-20 20:33:42 +01:00
|
|
|
|
2019-03-17 06:44:09 +01:00
|
|
|
import lombok.RequiredArgsConstructor;
|
|
|
|
|
import lombok.extern.slf4j.Slf4j;
|
|
|
|
|
|
2013-08-11 11:45:32 +02:00
|
|
|
@Slf4j
|
2014-08-17 14:16:30 +02:00
|
|
|
@RequiredArgsConstructor(onConstructor = @__({ @Inject }))
|
|
|
|
|
@Singleton
|
2013-03-20 20:33:42 +01:00
|
|
|
public class FeedFetcher {
|
|
|
|
|
|
2014-08-08 16:49:02 +02:00
|
|
|
private final FeedParser parser;
|
|
|
|
|
private final HttpGetter getter;
|
2019-03-17 06:44:09 +01:00
|
|
|
private final Set<FeedURLProvider> urlProviders;
|
2013-04-03 15:53:57 +02:00
|
|
|
|
2013-07-25 09:17:33 +02:00
|
|
|
public FetchedFeed fetch(String feedUrl, boolean extractFeedUrlFromHtml, String lastModified, String eTag, Date lastPublishedDate,
|
2019-03-17 06:44:09 +01:00
|
|
|
String lastContentHash) throws FeedException, IOException, NotModifiedException {
|
2013-03-20 20:33:42 +01:00
|
|
|
log.debug("Fetching feed {}", feedUrl);
|
2013-04-25 12:30:21 +02:00
|
|
|
FetchedFeed fetchedFeed = null;
|
2013-03-20 20:33:42 +01:00
|
|
|
|
2013-06-29 12:05:22 +02:00
|
|
|
int timeout = 20000;
|
2013-08-06 13:49:03 +02:00
|
|
|
|
2013-06-29 12:05:22 +02:00
|
|
|
HttpResult result = getter.getBinary(feedUrl, lastModified, eTag, timeout);
|
2013-08-06 13:49:03 +02:00
|
|
|
byte[] content = result.getContent();
|
|
|
|
|
|
|
|
|
|
try {
|
2015-01-12 09:57:30 +01:00
|
|
|
fetchedFeed = parser.parse(result.getUrlAfterRedirect(), content);
|
2013-08-06 13:49:03 +02:00
|
|
|
} catch (FeedException e) {
|
|
|
|
|
if (extractFeedUrlFromHtml) {
|
2019-03-17 07:05:29 +01:00
|
|
|
String extractedUrl = extractFeedUrl(urlProviders, feedUrl, StringUtils.newStringUtf8(result.getContent()));
|
2014-10-28 16:36:09 +01:00
|
|
|
if (org.apache.commons.lang3.StringUtils.isNotBlank(extractedUrl)) {
|
2013-08-06 13:49:03 +02:00
|
|
|
feedUrl = extractedUrl;
|
|
|
|
|
|
|
|
|
|
result = getter.getBinary(extractedUrl, lastModified, eTag, timeout);
|
|
|
|
|
content = result.getContent();
|
2015-01-12 09:57:30 +01:00
|
|
|
fetchedFeed = parser.parse(result.getUrlAfterRedirect(), content);
|
2014-02-02 12:30:41 +01:00
|
|
|
} else {
|
|
|
|
|
throw e;
|
2013-08-06 13:49:03 +02:00
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
throw e;
|
2013-04-14 18:51:12 +02:00
|
|
|
}
|
2013-03-20 20:33:42 +01:00
|
|
|
}
|
2013-06-09 16:22:38 +02:00
|
|
|
|
|
|
|
|
if (content == null) {
|
2013-04-27 11:03:47 +02:00
|
|
|
throw new IOException("Feed content is empty.");
|
|
|
|
|
}
|
|
|
|
|
|
2013-06-09 16:22:38 +02:00
|
|
|
String hash = DigestUtils.sha1Hex(content);
|
2013-07-25 09:17:33 +02:00
|
|
|
if (lastContentHash != null && hash != null && lastContentHash.equals(hash)) {
|
2013-06-09 16:22:38 +02:00
|
|
|
log.debug("content hash not modified: {}", feedUrl);
|
2013-07-03 07:56:52 +02:00
|
|
|
throw new NotModifiedException("content hash not modified");
|
2013-06-09 16:22:38 +02:00
|
|
|
}
|
|
|
|
|
|
2013-07-25 09:17:33 +02:00
|
|
|
if (lastPublishedDate != null && fetchedFeed.getFeed().getLastPublishedDate() != null
|
|
|
|
|
&& lastPublishedDate.getTime() == fetchedFeed.getFeed().getLastPublishedDate().getTime()) {
|
2013-06-09 16:22:38 +02:00
|
|
|
log.debug("publishedDate not modified: {}", feedUrl);
|
2013-07-03 07:56:52 +02:00
|
|
|
throw new NotModifiedException("publishedDate not modified");
|
2013-06-08 21:47:19 +02:00
|
|
|
}
|
|
|
|
|
|
2013-04-25 12:30:21 +02:00
|
|
|
Feed feed = fetchedFeed.getFeed();
|
2013-04-17 12:49:03 +02:00
|
|
|
feed.setLastModifiedHeader(result.getLastModifiedSince());
|
2014-08-19 00:56:21 +02:00
|
|
|
feed.setEtagHeader(FeedUtils.truncate(result.getETag(), 255));
|
2013-06-09 16:22:38 +02:00
|
|
|
feed.setLastContentHash(hash);
|
2013-04-25 12:30:21 +02:00
|
|
|
fetchedFeed.setFetchDuration(result.getDuration());
|
2013-08-22 15:35:34 +02:00
|
|
|
fetchedFeed.setUrlAfterRedirect(result.getUrlAfterRedirect());
|
2013-04-25 12:30:21 +02:00
|
|
|
return fetchedFeed;
|
2013-03-20 20:33:42 +01:00
|
|
|
}
|
|
|
|
|
|
2019-03-17 07:05:29 +01:00
|
|
|
private static String extractFeedUrl(Set<FeedURLProvider> urlProviders, String url, String urlContent) {
|
2019-03-17 06:44:09 +01:00
|
|
|
for (FeedURLProvider urlProvider : urlProviders) {
|
2019-03-17 07:05:29 +01:00
|
|
|
String feedUrl = urlProvider.get(url, urlContent);
|
|
|
|
|
if (feedUrl != null)
|
|
|
|
|
return feedUrl;
|
2013-03-31 09:53:19 +02:00
|
|
|
}
|
2019-03-12 02:13:41 +04:00
|
|
|
|
2019-03-17 06:44:09 +01:00
|
|
|
return null;
|
2019-03-12 02:13:41 +04:00
|
|
|
}
|
2013-03-20 20:33:42 +01:00
|
|
|
}
|