store feed content hash

This commit is contained in:
Athou
2013-06-09 16:22:38 +02:00
parent d212cf66c1
commit d855455b54
4 changed files with 34 additions and 11 deletions

View File

@@ -6,6 +6,7 @@ import java.util.Date;
import javax.inject.Inject; import javax.inject.Inject;
import org.apache.commons.codec.binary.StringUtils; import org.apache.commons.codec.binary.StringUtils;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.http.client.ClientProtocolException; import org.apache.http.client.ClientProtocolException;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
@@ -30,9 +31,9 @@ public class FeedFetcher {
HttpGetter getter; HttpGetter getter;
public FetchedFeed fetch(String feedUrl, boolean extractFeedUrlFromHtml, public FetchedFeed fetch(String feedUrl, boolean extractFeedUrlFromHtml,
String lastModified, String eTag, Date lastPublishedDate) String lastModified, String eTag, Date lastPublishedDate,
throws FeedException, ClientProtocolException, IOException, String lastContentHash) throws FeedException,
NotModifiedException { ClientProtocolException, IOException, NotModifiedException {
log.debug("Fetching feed {}", feedUrl); log.debug("Fetching feed {}", feedUrl);
FetchedFeed fetchedFeed = null; FetchedFeed fetchedFeed = null;
@@ -45,24 +46,33 @@ public class FeedFetcher {
feedUrl = extractedUrl; feedUrl = extractedUrl;
} }
} }
if (result.getContent() == null) { byte[] content = result.getContent();
if (content == null) {
throw new IOException("Feed content is empty."); throw new IOException("Feed content is empty.");
} }
fetchedFeed = parser.parse(feedUrl, result.getContent()); String hash = DigestUtils.sha1Hex(content);
if (lastContentHash != null && hash != null
&& lastContentHash.equals(hash)) {
log.debug("content hash not modified: {}", feedUrl);
throw new NotModifiedException();
}
fetchedFeed = parser.parse(feedUrl, content);
if (lastPublishedDate != null if (lastPublishedDate != null
&& fetchedFeed.getFeed().getLastPublishedDate() != null && fetchedFeed.getFeed().getLastPublishedDate() != null
&& lastPublishedDate.getTime() == fetchedFeed.getFeed() && lastPublishedDate.getTime() == fetchedFeed.getFeed()
.getLastPublishedDate().getTime()) { .getLastPublishedDate().getTime()) {
log.debug("publishedDate not modified: {}", fetchedFeed.getFeed() log.debug("publishedDate not modified: {}", feedUrl);
.getUrl());
throw new NotModifiedException(); throw new NotModifiedException();
} }
Feed feed = fetchedFeed.getFeed(); Feed feed = fetchedFeed.getFeed();
feed.setLastModifiedHeader(result.getLastModifiedSince()); feed.setLastModifiedHeader(result.getLastModifiedSince());
feed.setEtagHeader(FeedUtils.truncate(result.geteTag(), 255)); feed.setEtagHeader(FeedUtils.truncate(result.geteTag(), 255));
feed.setLastContentHash(hash);
fetchedFeed.setFetchDuration(result.getDuration()); fetchedFeed.setFetchDuration(result.getDuration());
return fetchedFeed; return fetchedFeed;
} }

View File

@@ -83,7 +83,7 @@ public class FeedRefreshWorker {
try { try {
FetchedFeed fetchedFeed = fetcher.fetch(feed.getUrl(), false, FetchedFeed fetchedFeed = fetcher.fetch(feed.getUrl(), false,
feed.getLastModifiedHeader(), feed.getEtagHeader(), feed.getLastModifiedHeader(), feed.getEtagHeader(),
feed.getLastPublishedDate()); feed.getLastPublishedDate(), feed.getLastContentHash());
// stops here if NotModifiedException or any other exception is // stops here if NotModifiedException or any other exception is
// thrown // thrown
List<FeedEntry> entries = fetchedFeed.getEntries(); List<FeedEntry> entries = fetchedFeed.getEntries();
@@ -99,6 +99,7 @@ public class FeedRefreshWorker {
feed.setLastModifiedHeader(fetchedFeed.getFeed() feed.setLastModifiedHeader(fetchedFeed.getFeed()
.getLastModifiedHeader()); .getLastModifiedHeader());
feed.setEtagHeader(fetchedFeed.getFeed().getEtagHeader()); feed.setEtagHeader(fetchedFeed.getFeed().getEtagHeader());
feed.setLastContentHash(fetchedFeed.getFeed().getLastContentHash());
feed.setLastPublishedDate(fetchedFeed.getFeed() feed.setLastPublishedDate(fetchedFeed.getFeed()
.getLastPublishedDate()); .getLastPublishedDate());
@@ -144,9 +145,9 @@ public class FeedRefreshWorker {
String message = "Unable to refresh feed " + feed.getUrl() + " : " String message = "Unable to refresh feed " + feed.getUrl() + " : "
+ e.getMessage(); + e.getMessage();
if (e instanceof FeedException) { if (e instanceof FeedException) {
log.debug(e.getClass().getName() + " " + message); log.debug(e.getClass().getName() + " " + message, e);
} else { } else {
log.debug(e.getClass().getName() + " " + message); log.debug(e.getClass().getName() + " " + message, e);
} }
feed.setErrorCount(feed.getErrorCount() + 1); feed.setErrorCount(feed.getErrorCount() + 1);

View File

@@ -72,6 +72,9 @@ public class Feed extends AbstractModel {
@Column(length = 255) @Column(length = 255)
private String etagHeader; private String etagHeader;
@Column(length = 40)
private String lastContentHash;
@ManyToMany(mappedBy = "feeds") @ManyToMany(mappedBy = "feeds")
private Set<FeedEntry> entries = Sets.newHashSet(); private Set<FeedEntry> entries = Sets.newHashSet();
@@ -224,4 +227,12 @@ public class Feed extends AbstractModel {
this.lastPublishedDate = lastPublishedDate; this.lastPublishedDate = lastPublishedDate;
} }
/**
 * Returns the content hash most recently stored on this feed.
 *
 * @return the stored hash string, or {@code null} if none has been set
 */
public String getLastContentHash() {
return lastContentHash;
}
/**
 * Stores the content hash for this feed.
 *
 * @param lastContentHash the hash string to remember; the value is stored
 *        as given, with no validation
 */
public void setLastContentHash(String lastContentHash) {
this.lastContentHash = lastContentHash;
}
} }

View File

@@ -163,7 +163,8 @@ public class FeedREST extends AbstractResourceREST {
url = StringUtils.trimToEmpty(url); url = StringUtils.trimToEmpty(url);
url = prependHttp(url); url = prependHttp(url);
try { try {
FetchedFeed feed = feedFetcher.fetch(url, true, null, null, null); FetchedFeed feed = feedFetcher.fetch(url, true, null, null, null,
null);
info = new FeedInfo(); info = new FeedInfo();
info.setUrl(feed.getFeed().getUrl()); info.setUrl(feed.getFeed().getUrl());
info.setTitle(feed.getTitle()); info.setTitle(feed.getTitle());