make each step of feed fetching return its own model

This commit is contained in:
Athou
2023-05-01 09:25:44 +02:00
parent 4a40f2b8f7
commit 9c628a8f53
11 changed files with 89 additions and 99 deletions

View File

@@ -14,7 +14,6 @@ import org.apache.http.HttpHeaders;
import org.apache.http.HttpHost;
import org.apache.http.HttpResponseInterceptor;
import org.apache.http.HttpStatus;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpResponseException;
import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig;
@@ -59,7 +58,7 @@ public class HttpGetter {
}
}
public HttpResult getBinary(String url, int timeout) throws ClientProtocolException, IOException, NotModifiedException {
public HttpResult getBinary(String url, int timeout) throws IOException, NotModifiedException {
return getBinary(url, null, null, timeout);
}
@@ -71,14 +70,10 @@ public class HttpGetter {
* header we got last time we queried that url, or null
* @param eTag
* header we got last time we queried that url, or null
* @return
* @throws ClientProtocolException
* @throws IOException
* @throws NotModifiedException
* if the url hasn't changed since we asked for it last time
*/
public HttpResult getBinary(String url, String lastModified, String eTag, int timeout)
throws ClientProtocolException, IOException, NotModifiedException {
public HttpResult getBinary(String url, String lastModified, String eTag, int timeout) throws IOException, NotModifiedException {
HttpResult result = null;
long start = System.currentTimeMillis();
@@ -175,13 +170,6 @@ public class HttpGetter {
return builder.build();
}
/**
 * Ad-hoc manual check: builds an {@link HttpGetter} from a default-constructed {@link CommaFeedConfiguration}, downloads one hard-coded
 * feed url and prints the response body to standard output.
 */
public static void main(String[] args) throws Exception {
CommaFeedConfiguration config = new CommaFeedConfiguration();
HttpGetter getter = new HttpGetter(config);
// 30000 is the timeout argument - presumably milliseconds, TODO confirm against getBinary
HttpResult result = getter.getBinary("https://sourceforge.net/projects/mpv-player-windows/rss", 30000);
// NOTE(review): new String(byte[]) without a charset decodes with the platform default charset
System.out.println(new String(result.content));
}
@Getter
public static class NotModifiedException extends Exception {
private static final long serialVersionUID = 1L;
@@ -189,12 +177,12 @@ public class HttpGetter {
/**
* if the value of this header changed, this is its new value
*/
private String newLastModifiedHeader;
private final String newLastModifiedHeader;
/**
* if the value of this header changed, this is its new value
*/
private String newEtagHeader;
private final String newEtagHeader;
public NotModifiedException(String message) {
this(message, null, null);

View File

@@ -1,14 +0,0 @@
package com.commafeed.backend.feed;
import java.util.List;
import com.commafeed.backend.model.Feed;
import com.commafeed.backend.model.FeedEntry;
import lombok.Value;
/**
 * Pairs a {@link Feed} with a list of {@link FeedEntry}. Lombok's {@code @Value} generates the all-args constructor and the getters.
 */
@Value
public class FeedAndEntries {
// the feed the entries belong to
Feed feed;
// entries associated with the feed
List<FeedEntry> entries;
}

View File

@@ -2,6 +2,7 @@ package com.commafeed.backend.feed;
import java.io.IOException;
import java.util.Date;
import java.util.List;
import java.util.Set;
import javax.inject.Inject;
@@ -13,13 +14,19 @@ import org.apache.commons.codec.digest.DigestUtils;
import com.commafeed.backend.HttpGetter;
import com.commafeed.backend.HttpGetter.HttpResult;
import com.commafeed.backend.HttpGetter.NotModifiedException;
import com.commafeed.backend.feed.FeedParser.FeedParserResult;
import com.commafeed.backend.model.Feed;
import com.commafeed.backend.model.FeedEntry;
import com.commafeed.backend.urlprovider.FeedURLProvider;
import com.rometools.rome.io.FeedException;
import lombok.RequiredArgsConstructor;
import lombok.Value;
import lombok.extern.slf4j.Slf4j;
/**
* Fetches a feed then parses it
*/
@Slf4j
@RequiredArgsConstructor(onConstructor = @__({ @Inject }))
@Singleton
@@ -29,18 +36,18 @@ public class FeedFetcher {
private final HttpGetter getter;
private final Set<FeedURLProvider> urlProviders;
public FetchedFeed fetch(String feedUrl, boolean extractFeedUrlFromHtml, String lastModified, String eTag, Date lastPublishedDate,
public FeedFetcherResult fetch(String feedUrl, boolean extractFeedUrlFromHtml, String lastModified, String eTag, Date lastPublishedDate,
String lastContentHash) throws FeedException, IOException, NotModifiedException {
log.debug("Fetching feed {}", feedUrl);
FetchedFeed fetchedFeed = null;
int timeout = 20000;
HttpResult result = getter.getBinary(feedUrl, lastModified, eTag, timeout);
byte[] content = result.getContent();
FeedParserResult parserResult;
try {
fetchedFeed = parser.parse(result.getUrlAfterRedirect(), content);
parserResult = parser.parse(result.getUrlAfterRedirect(), content);
} catch (FeedException e) {
if (extractFeedUrlFromHtml) {
String extractedUrl = extractFeedUrl(urlProviders, feedUrl, StringUtils.newStringUtf8(result.getContent()));
@@ -49,7 +56,7 @@ public class FeedFetcher {
result = getter.getBinary(extractedUrl, lastModified, eTag, timeout);
content = result.getContent();
fetchedFeed = parser.parse(result.getUrlAfterRedirect(), content);
parserResult = parser.parse(result.getUrlAfterRedirect(), content);
} else {
throw e;
}
@@ -73,21 +80,20 @@ public class FeedFetcher {
etagHeaderValueChanged ? result.getETag() : null);
}
if (lastPublishedDate != null && fetchedFeed.getFeed().getLastPublishedDate() != null
&& lastPublishedDate.getTime() == fetchedFeed.getFeed().getLastPublishedDate().getTime()) {
if (lastPublishedDate != null && parserResult.getFeed().getLastPublishedDate() != null
&& lastPublishedDate.getTime() == parserResult.getFeed().getLastPublishedDate().getTime()) {
log.debug("publishedDate not modified: {}", feedUrl);
throw new NotModifiedException("publishedDate not modified",
lastModifiedHeaderValueChanged ? result.getLastModifiedSince() : null,
etagHeaderValueChanged ? result.getETag() : null);
}
Feed feed = fetchedFeed.getFeed();
Feed feed = parserResult.getFeed();
feed.setLastModifiedHeader(result.getLastModifiedSince());
feed.setEtagHeader(FeedUtils.truncate(result.getETag(), 255));
feed.setLastContentHash(hash);
fetchedFeed.setFetchDuration(result.getDuration());
fetchedFeed.setUrlAfterRedirect(result.getUrlAfterRedirect());
return fetchedFeed;
return new FeedFetcherResult(parserResult.getFeed(), parserResult.getEntries(), parserResult.getTitle(),
result.getUrlAfterRedirect(), result.getDuration());
}
private static String extractFeedUrl(Set<FeedURLProvider> urlProviders, String url, String urlContent) {
@@ -100,4 +106,14 @@ public class FeedFetcher {
return null;
}
/**
 * Value returned by {@code fetch}: what the parser produced (feed, entries, title) together with details of the http request that
 * retrieved the content. Lombok's {@code @Value} generates the all-args constructor and the getters.
 */
@Value
public static class FeedFetcherResult {
// feed built by the parser, with last-modified header, etag header and content hash set by fetch
Feed feed;
// entries returned by the parser
List<FeedEntry> entries;
// title returned by the parser
String title;
// value of HttpResult.getUrlAfterRedirect() for the request that returned the content
String urlAfterRedirect;
// value of HttpResult.getDuration() - presumably milliseconds, TODO confirm against HttpGetter
long fetchDuration;
}
}

View File

@@ -3,6 +3,7 @@ package com.commafeed.backend.feed;
import java.io.StringReader;
import java.nio.charset.Charset;
import java.text.DateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.stream.Collectors;
@@ -37,8 +38,12 @@ import com.rometools.rome.io.SyndFeedInput;
import lombok.Data;
import lombok.RequiredArgsConstructor;
import lombok.Value;
import lombok.extern.slf4j.Slf4j;
/**
* Parses raw xml as a Feed object
*/
@Slf4j
@RequiredArgsConstructor(onConstructor = @__({ @Inject }))
@Singleton
@@ -50,10 +55,7 @@ public class FeedParser {
private static final Date START = new Date(86400000);
private static final Date END = new Date(1000L * Integer.MAX_VALUE - 86400000);
public FetchedFeed parse(String feedUrl, byte[] xml) throws FeedException {
FetchedFeed fetchedFeed = new FetchedFeed();
Feed feed = fetchedFeed.getFeed();
List<FeedEntry> entries = fetchedFeed.getEntries();
public FeedParserResult parse(String feedUrl, byte[] xml) throws FeedException {
try {
Charset encoding = FeedUtils.guessEncoding(xml);
@@ -63,17 +65,19 @@ public class FeedParser {
}
xmlString = FeedUtils.replaceHtmlEntitiesWithNumericEntities(xmlString);
InputSource source = new InputSource(new StringReader(xmlString));
SyndFeed rss = new SyndFeedInput().build(source);
handleForeignMarkup(rss);
fetchedFeed.setTitle(rss.getTitle());
String title = rss.getTitle();
Feed feed = new Feed();
feed.setPushHub(findHub(rss));
feed.setPushTopic(findSelf(rss));
feed.setUrl(feedUrl);
feed.setLink(rss.getLink());
List<SyndEntry> items = rss.getEntries();
for (SyndEntry item : items) {
List<FeedEntry> entries = new ArrayList<>();
for (SyndEntry item : rss.getEntries()) {
FeedEntry entry = new FeedEntry();
String guid = item.getUri();
@@ -121,6 +125,7 @@ public class FeedParser {
entries.add(entry);
}
Date lastEntryDate = null;
Date publishedDate = validateDate(rss.getPublishedDate(), false);
if (!entries.isEmpty()) {
@@ -133,10 +138,10 @@ public class FeedParser {
feed.setAverageEntryInterval(FeedUtils.averageTimeBetweenEntries(entries));
feed.setLastEntryDate(lastEntryDate);
return new FeedParserResult(feed, entries, title);
} catch (Exception e) {
throw new FeedException(String.format("Could not parse feed from %s : %s", feedUrl, e.getMessage()), e);
}
return fetchedFeed;
}
/**
@@ -273,4 +278,11 @@ public class FeedParser {
}
}
/**
 * Value returned by {@code parse}: the feed, its entries and its title. Lombok's {@code @Value} generates the all-args constructor and
 * the getters.
 */
@Value
public static class FeedParserResult {
// feed populated from the parsed document (url, link, push hub/topic, entry dates)
Feed feed;
// one FeedEntry per entry of the parsed document
List<FeedEntry> entries;
// title of the parsed document, kept separately from the Feed object
String title;
}
}

View File

@@ -22,9 +22,9 @@ public class FeedRefreshIntervalCalculator {
this.refreshIntervalMinutes = config.getApplicationSettings().getRefreshIntervalMinutes();
}
public Date onFetchSuccess(FetchedFeed fetchedFeed) {
public Date onFetchSuccess(Feed feed) {
Date defaultRefreshInterval = getDefaultRefreshInterval();
return heavyLoad ? computeRefreshIntervalForHeavyLoad(fetchedFeed.getFeed(), defaultRefreshInterval) : defaultRefreshInterval;
return heavyLoad ? computeRefreshIntervalForHeavyLoad(feed, defaultRefreshInterval) : defaultRefreshInterval;
}
public Date onFeedNotModified(Feed feed) {

View File

@@ -39,6 +39,9 @@ import io.dropwizard.lifecycle.Managed;
import lombok.AllArgsConstructor;
import lombok.extern.slf4j.Slf4j;
/**
* Updates the feed in the database and inserts new entries
*/
@Slf4j
@Singleton
public class FeedRefreshUpdater implements Managed {

View File

@@ -15,13 +15,15 @@ import com.codahale.metrics.Meter;
import com.codahale.metrics.MetricRegistry;
import com.commafeed.CommaFeedConfiguration;
import com.commafeed.backend.HttpGetter.NotModifiedException;
import com.commafeed.backend.feed.FeedFetcher.FeedFetcherResult;
import com.commafeed.backend.model.Feed;
import com.commafeed.backend.model.FeedEntry;
import lombok.Value;
import lombok.extern.slf4j.Slf4j;
/**
* Calls {@link FeedFetcher} and handles its outcome
* Calls {@link FeedFetcher} and updates the Feed object, but does not update the database ({@link FeedRefreshUpdater} does that)
*/
@Slf4j
@Singleton
@@ -42,39 +44,39 @@ public class FeedRefreshWorker {
}
public FeedAndEntries update(Feed feed) {
public FeedRefreshWorkerResult update(Feed feed) {
try {
String url = Optional.ofNullable(feed.getUrlAfterRedirect()).orElse(feed.getUrl());
FetchedFeed fetchedFeed = fetcher.fetch(url, false, feed.getLastModifiedHeader(), feed.getEtagHeader(),
FeedFetcherResult feedFetcherResult = fetcher.fetch(url, false, feed.getLastModifiedHeader(), feed.getEtagHeader(),
feed.getLastPublishedDate(), feed.getLastContentHash());
// stops here if NotModifiedException or any other exception is thrown
List<FeedEntry> entries = fetchedFeed.getEntries();
List<FeedEntry> entries = feedFetcherResult.getEntries();
Integer maxFeedCapacity = config.getApplicationSettings().getMaxFeedCapacity();
if (maxFeedCapacity > 0) {
entries = entries.stream().limit(maxFeedCapacity).collect(Collectors.toList());
}
String urlAfterRedirect = fetchedFeed.getUrlAfterRedirect();
String urlAfterRedirect = feedFetcherResult.getUrlAfterRedirect();
if (StringUtils.equals(url, urlAfterRedirect)) {
urlAfterRedirect = null;
}
feed.setUrlAfterRedirect(urlAfterRedirect);
feed.setLink(fetchedFeed.getFeed().getLink());
feed.setLastModifiedHeader(fetchedFeed.getFeed().getLastModifiedHeader());
feed.setEtagHeader(fetchedFeed.getFeed().getEtagHeader());
feed.setLastContentHash(fetchedFeed.getFeed().getLastContentHash());
feed.setLastPublishedDate(fetchedFeed.getFeed().getLastPublishedDate());
feed.setAverageEntryInterval(fetchedFeed.getFeed().getAverageEntryInterval());
feed.setLastEntryDate(fetchedFeed.getFeed().getLastEntryDate());
feed.setLink(feedFetcherResult.getFeed().getLink());
feed.setLastModifiedHeader(feedFetcherResult.getFeed().getLastModifiedHeader());
feed.setEtagHeader(feedFetcherResult.getFeed().getEtagHeader());
feed.setLastContentHash(feedFetcherResult.getFeed().getLastContentHash());
feed.setLastPublishedDate(feedFetcherResult.getFeed().getLastPublishedDate());
feed.setAverageEntryInterval(feedFetcherResult.getFeed().getAverageEntryInterval());
feed.setLastEntryDate(feedFetcherResult.getFeed().getLastEntryDate());
feed.setErrorCount(0);
feed.setMessage(null);
feed.setDisabledUntil(refreshIntervalCalculator.onFetchSuccess(fetchedFeed));
feed.setDisabledUntil(refreshIntervalCalculator.onFetchSuccess(feedFetcherResult.getFeed()));
handlePubSub(feed, fetchedFeed.getFeed());
handlePubSub(feed, feedFetcherResult.getFeed());
return new FeedAndEntries(feed, entries);
return new FeedRefreshWorkerResult(feed, entries);
} catch (NotModifiedException e) {
log.debug("Feed not modified : {} - {}", feed.getUrl(), e.getMessage());
@@ -90,7 +92,7 @@ public class FeedRefreshWorker {
feed.setEtagHeader(e.getNewEtagHeader());
}
return new FeedAndEntries(feed, Collections.emptyList());
return new FeedRefreshWorkerResult(feed, Collections.emptyList());
} catch (Exception e) {
String message = "Unable to refresh feed " + feed.getUrl() + " : " + e.getMessage();
log.debug(e.getClass().getName() + " " + message, e);
@@ -99,7 +101,7 @@ public class FeedRefreshWorker {
feed.setMessage(message);
feed.setDisabledUntil(refreshIntervalCalculator.onFetchError(feed));
return new FeedAndEntries(feed, Collections.emptyList());
return new FeedRefreshWorkerResult(feed, Collections.emptyList());
} finally {
feedFetched.mark();
}
@@ -127,4 +129,10 @@ public class FeedRefreshWorker {
}
}
/**
 * Value returned by {@code update}: the feed that was passed in, after its fields were updated, and the entries that were fetched.
 * Lombok's {@code @Value} generates the all-args constructor and the getters.
 */
@Value
public static class FeedRefreshWorkerResult {
// the same Feed instance that was given to update(), with its fields refreshed
Feed feed;
// fetched entries (limited to maxFeedCapacity when it is > 0); empty when the feed was not modified or the fetch failed
List<FeedEntry> entries;
}
}

View File

@@ -1,23 +0,0 @@
package com.commafeed.backend.feed;
import java.util.ArrayList;
import java.util.List;
import com.commafeed.backend.model.Feed;
import com.commafeed.backend.model.FeedEntry;
import lombok.Getter;
import lombok.Setter;
/**
 * Mutable holder for a fetched feed: the feed, its entries, its title and details about the http request. Lombok generates a getter and
 * a setter for every field.
 */
@Getter
@Setter
public class FetchedFeed {
// starts as an empty Feed that callers fill in
private Feed feed = new Feed();
// starts as an empty, mutable list that callers add entries to
private List<FeedEntry> entries = new ArrayList<>();
// title of the feed
private String title;
// url of the feed after http redirects were followed
private String urlAfterRedirect;
// duration of the fetch - presumably milliseconds, TODO confirm against HttpGetter
private long fetchDuration;
}

View File

@@ -53,7 +53,7 @@ public class FeedRefreshEngine implements Managed {
}
@Override
public void start() throws Exception {
public void start() {
Flowable<Feed> database = Flowable.fromCallable(() -> findNextUpdatableFeeds(getBatchSize(), getLastLoginThreshold()))
.onErrorResumeNext(e -> {
log.error("error while fetching next updatable feeds", e);
@@ -114,7 +114,7 @@ public class FeedRefreshEngine implements Managed {
}
@Override
public void stop() throws Exception {
public void stop() {
flow.dispose();
}
}

View File

@@ -45,8 +45,8 @@ import com.commafeed.backend.dao.FeedSubscriptionDAO;
import com.commafeed.backend.favicon.AbstractFaviconFetcher.Favicon;
import com.commafeed.backend.feed.FeedEntryKeyword;
import com.commafeed.backend.feed.FeedFetcher;
import com.commafeed.backend.feed.FeedFetcher.FeedFetcherResult;
import com.commafeed.backend.feed.FeedUtils;
import com.commafeed.backend.feed.FetchedFeed;
import com.commafeed.backend.model.Feed;
import com.commafeed.backend.model.FeedCategory;
import com.commafeed.backend.model.FeedEntry;
@@ -244,10 +244,10 @@ public class FeedREST {
url = StringUtils.trimToEmpty(url);
url = prependHttp(url);
try {
FetchedFeed feed = feedFetcher.fetch(url, true, null, null, null, null);
FeedFetcherResult feedFetcherResult = feedFetcher.fetch(url, true, null, null, null, null);
info = new FeedInfo();
info.setUrl(feed.getUrlAfterRedirect());
info.setTitle(feed.getTitle());
info.setUrl(feedFetcherResult.getUrlAfterRedirect());
info.setTitle(feedFetcherResult.getTitle());
} catch (Exception e) {
log.debug(e.getMessage(), e);

View File

@@ -26,7 +26,7 @@ import com.codahale.metrics.annotation.Timed;
import com.commafeed.CommaFeedConfiguration;
import com.commafeed.backend.dao.FeedDAO;
import com.commafeed.backend.feed.FeedParser;
import com.commafeed.backend.feed.FetchedFeed;
import com.commafeed.backend.feed.FeedParser.FeedParserResult;
import com.commafeed.backend.model.Feed;
import com.commafeed.backend.service.FeedRefreshEngine;
import com.google.common.base.Preconditions;
@@ -100,8 +100,8 @@ public class PubSubHubbubCallbackREST {
return Response.status(Status.BAD_REQUEST).entity("empty body received").build();
}
FetchedFeed fetchedFeed = parser.parse(null, bytes);
String topic = fetchedFeed.getFeed().getPushTopic();
FeedParserResult feedParserResult = parser.parse(null, bytes);
String topic = feedParserResult.getFeed().getPushTopic();
if (StringUtils.isBlank(topic)) {
return Response.status(Status.BAD_REQUEST).entity("empty topic received").build();
}