Files
Athou_commafeed/src/main/java/com/commafeed/backend/feeds/FeedFetcher.java

76 lines
2.4 KiB
Java

package com.commafeed.backend.feeds;
import java.io.IOException;
import javax.inject.Inject;
import org.apache.commons.codec.binary.StringUtils;
import org.apache.http.client.ClientProtocolException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.commafeed.backend.HttpGetter;
import com.commafeed.backend.HttpGetter.HttpResult;
import com.commafeed.backend.HttpGetter.NotModifiedException;
import com.commafeed.backend.model.Feed;
import com.sun.syndication.io.FeedException;
public class FeedFetcher {
private static Logger log = LoggerFactory.getLogger(FeedFetcher.class);
@Inject
FeedParser parser;
@Inject
HttpGetter getter;
public FetchedFeed fetch(String feedUrl, boolean extractFeedUrlFromHtml,
String lastModified, String eTag) throws FeedException,
ClientProtocolException, IOException, NotModifiedException {
log.debug("Fetching feed {}", feedUrl);
FetchedFeed fetchedFeed = null;
HttpResult result = getter.getBinary(feedUrl, lastModified, eTag);
if (extractFeedUrlFromHtml) {
String extractedUrl = extractFeedUrl(
StringUtils.newStringUtf8(result.getContent()), feedUrl);
if (org.apache.commons.lang.StringUtils.isNotBlank(extractedUrl)) {
result = getter.getBinary(extractedUrl, lastModified, eTag);
feedUrl = extractedUrl;
}
}
if (result.getContent() == null) {
throw new IOException("Feed content is empty.");
}
fetchedFeed = parser.parse(feedUrl, result.getContent());
Feed feed = fetchedFeed.getFeed();
feed.setLastModifiedHeader(result.getLastModifiedSince());
feed.setEtagHeader(org.apache.commons.lang.StringUtils.substring(
result.geteTag(), 0, 255));
fetchedFeed.setFetchDuration(result.getDuration());
return fetchedFeed;
}
private String extractFeedUrl(String html, String baseUri) {
String foundUrl = null;
Document doc = Jsoup.parse(html, baseUri);
String root = doc.children().get(0).tagName();
if ("html".equals(root)) {
Elements atom = doc.select("link[type=application/atom+xml]");
Elements rss = doc.select("link[type=application/rss+xml]");
if (!atom.isEmpty()) {
foundUrl = atom.get(0).attr("abs:href").toString();
} else if (!rss.isEmpty()) {
foundUrl = rss.get(0).attr("abs:href").toString();
}
}
return foundUrl;
}
}