2013-03-20 20:33:42 +01:00
|
|
|
package com.commafeed.backend.feeds;
|
|
|
|
|
|
2013-04-11 12:49:54 +02:00
|
|
|
import java.io.IOException;
|
2013-06-08 21:47:19 +02:00
|
|
|
import java.util.Date;
|
2013-04-11 12:49:54 +02:00
|
|
|
|
2013-03-20 20:33:42 +01:00
|
|
|
import javax.inject.Inject;
|
|
|
|
|
|
2013-04-11 12:49:54 +02:00
|
|
|
import org.apache.commons.codec.binary.StringUtils;
|
2013-06-09 16:22:38 +02:00
|
|
|
import org.apache.commons.codec.digest.DigestUtils;
|
2013-04-11 12:49:54 +02:00
|
|
|
import org.apache.http.client.ClientProtocolException;
|
2013-03-31 09:53:19 +02:00
|
|
|
import org.jsoup.Jsoup;
|
|
|
|
|
import org.jsoup.nodes.Document;
|
|
|
|
|
import org.jsoup.select.Elements;
|
2013-03-20 20:33:42 +01:00
|
|
|
import org.slf4j.Logger;
|
|
|
|
|
import org.slf4j.LoggerFactory;
|
|
|
|
|
|
2013-04-03 15:53:57 +02:00
|
|
|
import com.commafeed.backend.HttpGetter;
|
2013-04-17 12:49:03 +02:00
|
|
|
import com.commafeed.backend.HttpGetter.HttpResult;
|
|
|
|
|
import com.commafeed.backend.HttpGetter.NotModifiedException;
|
2013-03-23 16:17:19 +01:00
|
|
|
import com.commafeed.backend.model.Feed;
|
2013-03-27 09:37:15 +01:00
|
|
|
import com.sun.syndication.io.FeedException;
|
2013-03-20 20:33:42 +01:00
|
|
|
|
|
|
|
|
public class FeedFetcher {
|
|
|
|
|
|
|
|
|
|
private static Logger log = LoggerFactory.getLogger(FeedFetcher.class);
|
|
|
|
|
|
|
|
|
|
@Inject
|
|
|
|
|
FeedParser parser;
|
|
|
|
|
|
2013-04-03 15:53:57 +02:00
|
|
|
@Inject
|
|
|
|
|
HttpGetter getter;
|
|
|
|
|
|
2013-04-25 12:30:21 +02:00
|
|
|
public FetchedFeed fetch(String feedUrl, boolean extractFeedUrlFromHtml,
|
2013-06-09 16:22:38 +02:00
|
|
|
String lastModified, String eTag, Date lastPublishedDate,
|
|
|
|
|
String lastContentHash) throws FeedException,
|
|
|
|
|
ClientProtocolException, IOException, NotModifiedException {
|
2013-03-20 20:33:42 +01:00
|
|
|
log.debug("Fetching feed {}", feedUrl);
|
2013-04-25 12:30:21 +02:00
|
|
|
FetchedFeed fetchedFeed = null;
|
2013-03-20 20:33:42 +01:00
|
|
|
|
2013-04-17 12:49:03 +02:00
|
|
|
HttpResult result = getter.getBinary(feedUrl, lastModified, eTag);
|
2013-04-14 18:51:12 +02:00
|
|
|
if (extractFeedUrlFromHtml) {
|
2013-04-19 11:51:40 +02:00
|
|
|
String extractedUrl = extractFeedUrl(
|
|
|
|
|
StringUtils.newStringUtf8(result.getContent()), feedUrl);
|
2013-04-17 12:49:03 +02:00
|
|
|
if (org.apache.commons.lang.StringUtils.isNotBlank(extractedUrl)) {
|
|
|
|
|
result = getter.getBinary(extractedUrl, lastModified, eTag);
|
2013-04-14 18:51:12 +02:00
|
|
|
feedUrl = extractedUrl;
|
|
|
|
|
}
|
2013-03-20 20:33:42 +01:00
|
|
|
}
|
2013-06-09 16:22:38 +02:00
|
|
|
byte[] content = result.getContent();
|
|
|
|
|
|
|
|
|
|
if (content == null) {
|
2013-04-27 11:03:47 +02:00
|
|
|
throw new IOException("Feed content is empty.");
|
|
|
|
|
}
|
|
|
|
|
|
2013-06-09 16:22:38 +02:00
|
|
|
String hash = DigestUtils.sha1Hex(content);
|
|
|
|
|
if (lastContentHash != null && hash != null
|
|
|
|
|
&& lastContentHash.equals(hash)) {
|
|
|
|
|
log.debug("content hash not modified: {}", feedUrl);
|
|
|
|
|
throw new NotModifiedException();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
fetchedFeed = parser.parse(feedUrl, content);
|
2013-06-08 21:47:19 +02:00
|
|
|
|
|
|
|
|
if (lastPublishedDate != null
|
|
|
|
|
&& fetchedFeed.getFeed().getLastPublishedDate() != null
|
|
|
|
|
&& lastPublishedDate.getTime() == fetchedFeed.getFeed()
|
|
|
|
|
.getLastPublishedDate().getTime()) {
|
2013-06-09 16:22:38 +02:00
|
|
|
log.debug("publishedDate not modified: {}", feedUrl);
|
2013-06-08 21:47:19 +02:00
|
|
|
throw new NotModifiedException();
|
|
|
|
|
}
|
|
|
|
|
|
2013-04-25 12:30:21 +02:00
|
|
|
Feed feed = fetchedFeed.getFeed();
|
2013-04-17 12:49:03 +02:00
|
|
|
feed.setLastModifiedHeader(result.getLastModifiedSince());
|
2013-05-23 10:03:15 +02:00
|
|
|
feed.setEtagHeader(FeedUtils.truncate(result.geteTag(), 255));
|
2013-06-09 16:22:38 +02:00
|
|
|
feed.setLastContentHash(hash);
|
2013-04-25 12:30:21 +02:00
|
|
|
fetchedFeed.setFetchDuration(result.getDuration());
|
|
|
|
|
return fetchedFeed;
|
2013-03-20 20:33:42 +01:00
|
|
|
}
|
|
|
|
|
|
2013-04-17 12:49:03 +02:00
|
|
|
private String extractFeedUrl(String html, String baseUri) {
|
2013-03-31 14:30:44 +02:00
|
|
|
String foundUrl = null;
|
2013-03-31 09:53:19 +02:00
|
|
|
|
2013-04-17 12:49:03 +02:00
|
|
|
Document doc = Jsoup.parse(html, baseUri);
|
2013-03-31 14:30:44 +02:00
|
|
|
String root = doc.children().get(0).tagName();
|
|
|
|
|
if ("html".equals(root)) {
|
|
|
|
|
Elements atom = doc.select("link[type=application/atom+xml]");
|
2013-05-20 14:06:09 +02:00
|
|
|
Elements rss = doc.select("link[type=application/rss+xml]");
|
|
|
|
|
if (!atom.isEmpty()) {
|
2013-03-31 14:30:44 +02:00
|
|
|
foundUrl = atom.get(0).attr("abs:href").toString();
|
2013-05-20 14:06:09 +02:00
|
|
|
} else if (!rss.isEmpty()) {
|
|
|
|
|
foundUrl = rss.get(0).attr("abs:href").toString();
|
2013-03-31 14:30:44 +02:00
|
|
|
}
|
2013-03-31 09:53:19 +02:00
|
|
|
}
|
2013-03-31 14:30:44 +02:00
|
|
|
return foundUrl;
|
2013-03-31 09:53:19 +02:00
|
|
|
}
|
2013-03-20 20:33:42 +01:00
|
|
|
}
|