Files
Athou_commafeed/src/main/java/com/commafeed/backend/feeds/FeedFetcher.java

101 lines
3.3 KiB
Java
Raw Normal View History

2013-03-20 20:33:42 +01:00
package com.commafeed.backend.feeds;
2013-04-11 12:49:54 +02:00
import java.io.IOException;
import java.util.Date;
2013-04-11 12:49:54 +02:00
2013-03-20 20:33:42 +01:00
import javax.inject.Inject;
2013-08-11 11:45:32 +02:00
import lombok.extern.slf4j.Slf4j;
2013-04-11 12:49:54 +02:00
import org.apache.commons.codec.binary.StringUtils;
2013-06-09 16:22:38 +02:00
import org.apache.commons.codec.digest.DigestUtils;
2013-04-11 12:49:54 +02:00
import org.apache.http.client.ClientProtocolException;
2013-03-31 09:53:19 +02:00
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
2013-03-20 20:33:42 +01:00
2013-04-03 15:53:57 +02:00
import com.commafeed.backend.HttpGetter;
import com.commafeed.backend.HttpGetter.HttpResult;
import com.commafeed.backend.HttpGetter.NotModifiedException;
2013-03-23 16:17:19 +01:00
import com.commafeed.backend.model.Feed;
2013-03-27 09:37:15 +01:00
import com.sun.syndication.io.FeedException;
2013-03-20 20:33:42 +01:00
2013-08-11 11:45:32 +02:00
@Slf4j
2013-03-20 20:33:42 +01:00
public class FeedFetcher {
@Inject
FeedParser parser;
2013-04-03 15:53:57 +02:00
@Inject
HttpGetter getter;
2013-07-25 09:17:33 +02:00
public FetchedFeed fetch(String feedUrl, boolean extractFeedUrlFromHtml, String lastModified, String eTag, Date lastPublishedDate,
String lastContentHash) throws FeedException, ClientProtocolException, IOException, NotModifiedException {
2013-03-20 20:33:42 +01:00
log.debug("Fetching feed {}", feedUrl);
2013-04-25 12:30:21 +02:00
FetchedFeed fetchedFeed = null;
2013-03-20 20:33:42 +01:00
int timeout = 20000;
HttpResult result = getter.getBinary(feedUrl, lastModified, eTag, timeout);
byte[] content = result.getContent();
try {
fetchedFeed = parser.parse(feedUrl, content);
} catch (FeedException e) {
if (extractFeedUrlFromHtml) {
String extractedUrl = extractFeedUrl(StringUtils.newStringUtf8(result.getContent()), feedUrl);
if (org.apache.commons.lang.StringUtils.isNotBlank(extractedUrl)) {
feedUrl = extractedUrl;
result = getter.getBinary(extractedUrl, lastModified, eTag, timeout);
content = result.getContent();
fetchedFeed = parser.parse(feedUrl, content);
}
} else {
throw e;
2013-04-14 18:51:12 +02:00
}
2013-03-20 20:33:42 +01:00
}
2013-06-09 16:22:38 +02:00
if (content == null) {
2013-04-27 11:03:47 +02:00
throw new IOException("Feed content is empty.");
}
2013-06-09 16:22:38 +02:00
String hash = DigestUtils.sha1Hex(content);
2013-07-25 09:17:33 +02:00
if (lastContentHash != null && hash != null && lastContentHash.equals(hash)) {
2013-06-09 16:22:38 +02:00
log.debug("content hash not modified: {}", feedUrl);
2013-07-03 07:56:52 +02:00
throw new NotModifiedException("content hash not modified");
2013-06-09 16:22:38 +02:00
}
2013-07-25 09:17:33 +02:00
if (lastPublishedDate != null && fetchedFeed.getFeed().getLastPublishedDate() != null
&& lastPublishedDate.getTime() == fetchedFeed.getFeed().getLastPublishedDate().getTime()) {
2013-06-09 16:22:38 +02:00
log.debug("publishedDate not modified: {}", feedUrl);
2013-07-03 07:56:52 +02:00
throw new NotModifiedException("publishedDate not modified");
}
2013-04-25 12:30:21 +02:00
Feed feed = fetchedFeed.getFeed();
feed.setLastModifiedHeader(result.getLastModifiedSince());
feed.setEtagHeader(FeedUtils.truncate(result.geteTag(), 255));
2013-06-09 16:22:38 +02:00
feed.setLastContentHash(hash);
2013-04-25 12:30:21 +02:00
fetchedFeed.setFetchDuration(result.getDuration());
fetchedFeed.setUrlAfterRedirect(result.getUrlAfterRedirect());
2013-04-25 12:30:21 +02:00
return fetchedFeed;
2013-03-20 20:33:42 +01:00
}
private String extractFeedUrl(String html, String baseUri) {
2013-03-31 14:30:44 +02:00
String foundUrl = null;
2013-03-31 09:53:19 +02:00
Document doc = Jsoup.parse(html, baseUri);
2013-03-31 14:30:44 +02:00
String root = doc.children().get(0).tagName();
if ("html".equals(root)) {
Elements atom = doc.select("link[type=application/atom+xml]");
2013-05-20 14:06:09 +02:00
Elements rss = doc.select("link[type=application/rss+xml]");
if (!atom.isEmpty()) {
2013-03-31 14:30:44 +02:00
foundUrl = atom.get(0).attr("abs:href").toString();
2013-05-20 14:06:09 +02:00
} else if (!rss.isEmpty()) {
foundUrl = rss.get(0).attr("abs:href").toString();
2013-03-31 14:30:44 +02:00
}
2013-03-31 09:53:19 +02:00
}
2013-03-31 14:30:44 +02:00
return foundUrl;
2013-03-31 09:53:19 +02:00
}
2013-03-20 20:33:42 +01:00
}