mirror of
https://github.com/Athou/commafeed.git
synced 2026-03-21 21:37:29 +00:00
removed wicket and tomee, use dropwizard instead. remove wro4j, use gulp instead
This commit is contained in:
99
src/main/java/com/commafeed/backend/feed/FeedFetcher.java
Normal file
99
src/main/java/com/commafeed/backend/feed/FeedFetcher.java
Normal file
@@ -0,0 +1,99 @@
|
||||
package com.commafeed.backend.feed;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Date;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
import org.apache.commons.codec.binary.StringUtils;
|
||||
import org.apache.commons.codec.digest.DigestUtils;
|
||||
import org.apache.http.client.ClientProtocolException;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.jsoup.select.Elements;
|
||||
|
||||
import com.commafeed.backend.HttpGetter;
|
||||
import com.commafeed.backend.HttpGetter.HttpResult;
|
||||
import com.commafeed.backend.HttpGetter.NotModifiedException;
|
||||
import com.commafeed.backend.model.Feed;
|
||||
import com.sun.syndication.io.FeedException;
|
||||
|
||||
@Slf4j
|
||||
@AllArgsConstructor
|
||||
public class FeedFetcher {
|
||||
|
||||
private final FeedParser parser;
|
||||
private final HttpGetter getter;
|
||||
|
||||
public FetchedFeed fetch(String feedUrl, boolean extractFeedUrlFromHtml, String lastModified, String eTag, Date lastPublishedDate,
|
||||
String lastContentHash) throws FeedException, ClientProtocolException, IOException, NotModifiedException {
|
||||
log.debug("Fetching feed {}", feedUrl);
|
||||
FetchedFeed fetchedFeed = null;
|
||||
|
||||
int timeout = 20000;
|
||||
|
||||
HttpResult result = getter.getBinary(feedUrl, lastModified, eTag, timeout);
|
||||
byte[] content = result.getContent();
|
||||
|
||||
try {
|
||||
fetchedFeed = parser.parse(feedUrl, content);
|
||||
} catch (FeedException e) {
|
||||
if (extractFeedUrlFromHtml) {
|
||||
String extractedUrl = extractFeedUrl(StringUtils.newStringUtf8(result.getContent()), feedUrl);
|
||||
if (org.apache.commons.lang.StringUtils.isNotBlank(extractedUrl)) {
|
||||
feedUrl = extractedUrl;
|
||||
|
||||
result = getter.getBinary(extractedUrl, lastModified, eTag, timeout);
|
||||
content = result.getContent();
|
||||
fetchedFeed = parser.parse(feedUrl, content);
|
||||
} else {
|
||||
throw e;
|
||||
}
|
||||
} else {
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
|
||||
if (content == null) {
|
||||
throw new IOException("Feed content is empty.");
|
||||
}
|
||||
|
||||
String hash = DigestUtils.sha1Hex(content);
|
||||
if (lastContentHash != null && hash != null && lastContentHash.equals(hash)) {
|
||||
log.debug("content hash not modified: {}", feedUrl);
|
||||
throw new NotModifiedException("content hash not modified");
|
||||
}
|
||||
|
||||
if (lastPublishedDate != null && fetchedFeed.getFeed().getLastPublishedDate() != null
|
||||
&& lastPublishedDate.getTime() == fetchedFeed.getFeed().getLastPublishedDate().getTime()) {
|
||||
log.debug("publishedDate not modified: {}", feedUrl);
|
||||
throw new NotModifiedException("publishedDate not modified");
|
||||
}
|
||||
|
||||
Feed feed = fetchedFeed.getFeed();
|
||||
feed.setLastModifiedHeader(result.getLastModifiedSince());
|
||||
feed.setEtagHeader(FeedUtils.truncate(result.geteTag(), 255));
|
||||
feed.setLastContentHash(hash);
|
||||
fetchedFeed.setFetchDuration(result.getDuration());
|
||||
fetchedFeed.setUrlAfterRedirect(result.getUrlAfterRedirect());
|
||||
return fetchedFeed;
|
||||
}
|
||||
|
||||
private String extractFeedUrl(String html, String baseUri) {
|
||||
String foundUrl = null;
|
||||
|
||||
Document doc = Jsoup.parse(html, baseUri);
|
||||
String root = doc.children().get(0).tagName();
|
||||
if ("html".equals(root)) {
|
||||
Elements atom = doc.select("link[type=application/atom+xml]");
|
||||
Elements rss = doc.select("link[type=application/rss+xml]");
|
||||
if (!atom.isEmpty()) {
|
||||
foundUrl = atom.get(0).attr("abs:href").toString();
|
||||
} else if (!rss.isEmpty()) {
|
||||
foundUrl = rss.get(0).attr("abs:href").toString();
|
||||
}
|
||||
}
|
||||
return foundUrl;
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user