feed refresh engine now uses its own immutable model

This commit is contained in:
Athou
2024-01-07 10:51:35 +01:00
parent 60b6c69020
commit 7b335e2fd4
24 changed files with 623 additions and 565 deletions

View File

@@ -4,8 +4,8 @@ import java.util.List;
import org.apache.commons.codec.digest.DigestUtils;
import com.commafeed.backend.feed.parser.FeedParserResult.Entry;
import com.commafeed.backend.model.Feed;
import com.commafeed.backend.model.FeedEntry;
import com.commafeed.backend.model.FeedSubscription;
import com.commafeed.backend.model.User;
import com.commafeed.frontend.model.Category;
@@ -18,8 +18,8 @@ public abstract class CacheService {
public abstract void setLastEntries(Feed feed, List<String> entries);
public String buildUniqueEntryKey(Feed feed, FeedEntry entry) {
return DigestUtils.sha1Hex(entry.getGuid() + entry.getUrl());
public String buildUniqueEntryKey(Entry entry) {
return DigestUtils.sha1Hex(entry.guid() + entry.url());
}
// user categories

View File

@@ -3,7 +3,6 @@ package com.commafeed.backend.dao;
import java.util.Date;
import java.util.List;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.lang3.StringUtils;
import org.hibernate.SessionFactory;
@@ -45,8 +44,8 @@ public class FeedDAO extends GenericDAO<Feed> {
updateQuery(feed).set(feed.disabledUntil, date).where(feed.id.in(feedIds)).execute();
}
public Feed findByUrl(String normalizedUrl) {
List<Feed> feeds = query().selectFrom(feed).where(feed.normalizedUrlHash.eq(DigestUtils.sha1Hex(normalizedUrl))).fetch();
public Feed findByUrl(String normalizedUrl, String normalizedUrlHash) {
List<Feed> feeds = query().selectFrom(feed).where(feed.normalizedUrlHash.eq(normalizedUrlHash)).fetch();
Feed feed = Iterables.getFirst(feeds, null);
if (feed != null && StringUtils.equals(normalizedUrl, feed.getNormalizedUrl())) {
return feed;

View File

@@ -2,7 +2,6 @@ package com.commafeed.backend.dao;
import java.util.List;
import org.apache.commons.codec.digest.DigestUtils;
import org.hibernate.SessionFactory;
import com.commafeed.backend.model.Feed;
@@ -26,12 +25,8 @@ public class FeedEntryDAO extends GenericDAO<FeedEntry> {
super(sessionFactory);
}
public Long findExisting(String guid, Feed feed) {
return query().select(entry.id)
.from(entry)
.where(entry.guidHash.eq(DigestUtils.sha1Hex(guid)), entry.feed.eq(feed))
.limit(1)
.fetchOne();
public Long findExisting(String guidHash, Feed feed) {
return query().select(entry.id).from(entry).where(entry.guidHash.eq(guidHash), entry.feed.eq(feed)).limit(1).fetchOne();
}
public List<FeedCapacity> findFeedsExceedingCapacity(long maxCapacity, long max) {

View File

@@ -2,7 +2,6 @@ package com.commafeed.backend.feed;
import java.io.IOException;
import java.util.Date;
import java.util.List;
import java.util.Set;
import org.apache.commons.codec.binary.StringUtils;
@@ -11,16 +10,14 @@ import org.apache.commons.codec.digest.DigestUtils;
import com.commafeed.backend.HttpGetter;
import com.commafeed.backend.HttpGetter.HttpResult;
import com.commafeed.backend.HttpGetter.NotModifiedException;
import com.commafeed.backend.feed.FeedParser.FeedParserResult;
import com.commafeed.backend.model.Feed;
import com.commafeed.backend.model.FeedEntry;
import com.commafeed.backend.feed.parser.FeedParser;
import com.commafeed.backend.feed.parser.FeedParserResult;
import com.commafeed.backend.urlprovider.FeedURLProvider;
import com.rometools.rome.io.FeedException;
import jakarta.inject.Inject;
import jakarta.inject.Singleton;
import lombok.RequiredArgsConstructor;
import lombok.Value;
import lombok.extern.slf4j.Slf4j;
/**
@@ -79,20 +76,16 @@ public class FeedFetcher {
etagHeaderValueChanged ? result.getETag() : null);
}
if (lastPublishedDate != null && parserResult.getFeed().getLastPublishedDate() != null
&& lastPublishedDate.getTime() == parserResult.getFeed().getLastPublishedDate().getTime()) {
if (lastPublishedDate != null && parserResult.lastPublishedDate() != null
&& lastPublishedDate.getTime() == parserResult.lastPublishedDate().getTime()) {
log.debug("publishedDate not modified: {}", feedUrl);
throw new NotModifiedException("publishedDate not modified",
lastModifiedHeaderValueChanged ? result.getLastModifiedSince() : null,
etagHeaderValueChanged ? result.getETag() : null);
}
Feed feed = parserResult.getFeed();
feed.setLastModifiedHeader(result.getLastModifiedSince());
feed.setEtagHeader(FeedUtils.truncate(result.getETag(), 255));
feed.setLastContentHash(hash);
return new FeedFetcherResult(parserResult.getFeed(), parserResult.getEntries(), parserResult.getTitle(),
result.getUrlAfterRedirect(), result.getDuration());
return new FeedFetcherResult(parserResult, result.getUrlAfterRedirect(), result.getLastModifiedSince(), result.getETag(), hash,
result.getDuration());
}
private static String extractFeedUrl(Set<FeedURLProvider> urlProviders, String url, String urlContent) {
@@ -106,13 +99,8 @@ public class FeedFetcher {
return null;
}
@Value
public static class FeedFetcherResult {
Feed feed;
List<FeedEntry> entries;
String title;
String urlAfterRedirect;
long fetchDuration;
public record FeedFetcherResult(FeedParserResult feed, String urlAfterRedirect, String lastModifiedHeader, String lastETagHeader,
String contentHash, long fetchDuration) {
}
}

View File

@@ -1,263 +0,0 @@
package com.commafeed.backend.feed;
import java.io.StringReader;
import java.nio.charset.Charset;
import java.text.DateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.StringUtils;
import org.jdom2.Element;
import org.jdom2.Namespace;
import org.xml.sax.InputSource;
import com.commafeed.backend.model.Feed;
import com.commafeed.backend.model.FeedEntry;
import com.commafeed.backend.model.FeedEntryContent;
import com.google.common.collect.Iterables;
import com.rometools.modules.mediarss.MediaEntryModule;
import com.rometools.modules.mediarss.MediaModule;
import com.rometools.modules.mediarss.types.MediaGroup;
import com.rometools.modules.mediarss.types.Metadata;
import com.rometools.modules.mediarss.types.Thumbnail;
import com.rometools.rome.feed.synd.SyndCategory;
import com.rometools.rome.feed.synd.SyndContent;
import com.rometools.rome.feed.synd.SyndEnclosure;
import com.rometools.rome.feed.synd.SyndEntry;
import com.rometools.rome.feed.synd.SyndFeed;
import com.rometools.rome.feed.synd.SyndLink;
import com.rometools.rome.feed.synd.SyndLinkImpl;
import com.rometools.rome.io.FeedException;
import com.rometools.rome.io.SyndFeedInput;
import jakarta.inject.Inject;
import jakarta.inject.Singleton;
import lombok.Data;
import lombok.RequiredArgsConstructor;
import lombok.Value;
/**
* Parses raw xml as a Feed object
*/
@RequiredArgsConstructor(onConstructor = @__({ @Inject }))
@Singleton
public class FeedParser {
private static final String ATOM_10_URI = "http://www.w3.org/2005/Atom";
private static final Namespace ATOM_10_NS = Namespace.getNamespace(ATOM_10_URI);
private static final Date START = new Date(86400000);
private static final Date END = new Date(1000L * Integer.MAX_VALUE - 86400000);
public FeedParserResult parse(String feedUrl, byte[] xml) throws FeedException {
try {
Charset encoding = FeedUtils.guessEncoding(xml);
String xmlString = FeedUtils.trimInvalidXmlCharacters(new String(xml, encoding));
if (xmlString == null) {
throw new FeedException("Input string is null for url " + feedUrl);
}
xmlString = FeedUtils.replaceHtmlEntitiesWithNumericEntities(xmlString);
InputSource source = new InputSource(new StringReader(xmlString));
SyndFeed rss = new SyndFeedInput().build(source);
handleForeignMarkup(rss);
String title = rss.getTitle();
Feed feed = new Feed();
feed.setUrl(feedUrl);
feed.setLink(rss.getLink());
List<FeedEntry> entries = new ArrayList<>();
for (SyndEntry item : rss.getEntries()) {
FeedEntry entry = new FeedEntry();
String guid = item.getUri();
if (StringUtils.isBlank(guid)) {
guid = item.getLink();
}
if (StringUtils.isBlank(guid)) {
// no guid and no link, skip entry
continue;
}
entry.setGuid(FeedUtils.truncate(guid, 2048));
entry.setUpdated(validateDate(getEntryUpdateDate(item), true));
entry.setUrl(FeedUtils.truncate(FeedUtils.toAbsoluteUrl(item.getLink(), feed.getLink(), feedUrl), 2048));
// if link is empty but guid is used as url
if (StringUtils.isBlank(entry.getUrl()) && StringUtils.startsWith(entry.getGuid(), "http")) {
entry.setUrl(entry.getGuid());
}
FeedEntryContent content = new FeedEntryContent();
content.setContent(getContent(item));
content.setCategories(FeedUtils
.truncate(item.getCategories().stream().map(SyndCategory::getName).collect(Collectors.joining(", ")), 4096));
content.setTitle(getTitle(item));
content.setAuthor(StringUtils.trimToNull(item.getAuthor()));
SyndEnclosure enclosure = Iterables.getFirst(item.getEnclosures(), null);
if (enclosure != null) {
content.setEnclosureUrl(FeedUtils.truncate(enclosure.getUrl(), 2048));
content.setEnclosureType(enclosure.getType());
}
MediaEntryModule module = (MediaEntryModule) item.getModule(MediaModule.URI);
if (module != null) {
Media media = getMedia(module);
if (media != null) {
content.setMediaDescription(media.getDescription());
content.setMediaThumbnailUrl(FeedUtils.truncate(media.getThumbnailUrl(), 2048));
content.setMediaThumbnailWidth(media.getThumbnailWidth());
content.setMediaThumbnailHeight(media.getThumbnailHeight());
}
}
entry.setContent(content);
entries.add(entry);
}
Date lastEntryDate = null;
Date publishedDate = validateDate(rss.getPublishedDate(), false);
if (!entries.isEmpty()) {
List<Long> sortedTimestamps = FeedUtils.getSortedTimestamps(entries);
Long timestamp = sortedTimestamps.get(0);
lastEntryDate = new Date(timestamp);
publishedDate = (publishedDate == null || publishedDate.before(lastEntryDate)) ? lastEntryDate : publishedDate;
}
feed.setLastPublishedDate(publishedDate);
feed.setAverageEntryInterval(FeedUtils.averageTimeBetweenEntries(entries));
feed.setLastEntryDate(lastEntryDate);
return new FeedParserResult(feed, entries, title);
} catch (Exception e) {
throw new FeedException(String.format("Could not parse feed from %s : %s", feedUrl, e.getMessage()), e);
}
}
/**
* Adds atom links for rss feeds
*/
private void handleForeignMarkup(SyndFeed feed) {
List<Element> foreignMarkup = feed.getForeignMarkup();
if (foreignMarkup == null) {
return;
}
for (Element element : foreignMarkup) {
if ("link".equals(element.getName()) && ATOM_10_NS.equals(element.getNamespace())) {
SyndLink link = new SyndLinkImpl();
link.setRel(element.getAttributeValue("rel"));
link.setHref(element.getAttributeValue("href"));
feed.getLinks().add(link);
}
}
}
private Date getEntryUpdateDate(SyndEntry item) {
Date date = item.getUpdatedDate();
if (date == null) {
date = item.getPublishedDate();
}
if (date == null) {
date = new Date();
}
return date;
}
private Date validateDate(Date date, boolean nullToNow) {
Date now = new Date();
if (date == null) {
return nullToNow ? now : null;
}
if (date.before(START) || date.after(END)) {
return now;
}
if (date.after(now)) {
return now;
}
return date;
}
private String getContent(SyndEntry item) {
String content;
if (item.getContents().isEmpty()) {
content = item.getDescription() == null ? null : item.getDescription().getValue();
} else {
content = item.getContents().stream().map(SyndContent::getValue).collect(Collectors.joining(System.lineSeparator()));
}
return StringUtils.trimToNull(content);
}
private String getTitle(SyndEntry item) {
String title = item.getTitle();
if (StringUtils.isBlank(title)) {
Date date = item.getPublishedDate();
if (date != null) {
title = DateFormat.getInstance().format(date);
} else {
title = "(no title)";
}
}
return StringUtils.trimToNull(title);
}
private Media getMedia(MediaEntryModule module) {
Media media = getMedia(module.getMetadata());
if (media == null && ArrayUtils.isNotEmpty(module.getMediaGroups())) {
MediaGroup group = module.getMediaGroups()[0];
media = getMedia(group.getMetadata());
}
return media;
}
private Media getMedia(Metadata metadata) {
if (metadata == null) {
return null;
}
Media media = new Media();
media.setDescription(metadata.getDescription());
if (ArrayUtils.isNotEmpty(metadata.getThumbnail())) {
Thumbnail thumbnail = metadata.getThumbnail()[0];
media.setThumbnailWidth(thumbnail.getWidth());
media.setThumbnailHeight(thumbnail.getHeight());
if (thumbnail.getUrl() != null) {
media.setThumbnailUrl(thumbnail.getUrl().toString());
}
}
if (media.isEmpty()) {
return null;
}
return media;
}
@Data
private static class Media {
private String description;
private String thumbnailUrl;
private Integer thumbnailWidth;
private Integer thumbnailHeight;
public boolean isEmpty() {
return description == null && thumbnailUrl == null;
}
}
@Value
public static class FeedParserResult {
Feed feed;
List<FeedEntry> entries;
String title;
}
}

View File

@@ -156,7 +156,7 @@ public class FeedRefreshEngine implements Managed {
private void processFeedAsync(Feed feed) {
CompletableFuture.supplyAsync(() -> worker.update(feed), workerExecutor)
.thenApplyAsync(r -> updater.update(r.getFeed(), r.getEntries()), databaseUpdaterExecutor)
.thenApplyAsync(r -> updater.update(r.feed(), r.entries()), databaseUpdaterExecutor)
.whenComplete((data, ex) -> {
if (ex != null) {
log.error("error while processing feed {}", feed.getUrl(), ex);

View File

@@ -5,7 +5,6 @@ import java.util.Date;
import org.apache.commons.lang3.time.DateUtils;
import com.commafeed.CommaFeedConfiguration;
import com.commafeed.backend.model.Feed;
import jakarta.inject.Inject;
import jakarta.inject.Singleton;
@@ -22,18 +21,19 @@ public class FeedRefreshIntervalCalculator {
this.refreshIntervalMinutes = config.getApplicationSettings().getRefreshIntervalMinutes();
}
public Date onFetchSuccess(Feed feed) {
public Date onFetchSuccess(Date publishedDate, Long averageEntryInterval) {
Date defaultRefreshInterval = getDefaultRefreshInterval();
return heavyLoad ? computeRefreshIntervalForHeavyLoad(feed, defaultRefreshInterval) : defaultRefreshInterval;
return heavyLoad ? computeRefreshIntervalForHeavyLoad(publishedDate, averageEntryInterval, defaultRefreshInterval)
: defaultRefreshInterval;
}
public Date onFeedNotModified(Feed feed) {
public Date onFeedNotModified(Date publishedDate, Long averageEntryInterval) {
Date defaultRefreshInterval = getDefaultRefreshInterval();
return heavyLoad ? computeRefreshIntervalForHeavyLoad(feed, defaultRefreshInterval) : defaultRefreshInterval;
return heavyLoad ? computeRefreshIntervalForHeavyLoad(publishedDate, averageEntryInterval, defaultRefreshInterval)
: defaultRefreshInterval;
}
public Date onFetchError(Feed feed) {
int errorCount = feed.getErrorCount();
public Date onFetchError(int errorCount) {
int retriesBeforeDisable = 3;
if (errorCount < retriesBeforeDisable || !heavyLoad) {
return getDefaultRefreshInterval();
@@ -47,10 +47,8 @@ public class FeedRefreshIntervalCalculator {
return DateUtils.addMinutes(new Date(), refreshIntervalMinutes);
}
private Date computeRefreshIntervalForHeavyLoad(Feed feed, Date defaultRefreshInterval) {
private Date computeRefreshIntervalForHeavyLoad(Date publishedDate, Long averageEntryInterval, Date defaultRefreshInterval) {
Date now = new Date();
Date publishedDate = feed.getLastEntryDate();
Long averageEntryInterval = feed.getAverageEntryInterval();
if (publishedDate == null) {
// feed with no entries, recheck in 24 hours

View File

@@ -16,9 +16,9 @@ import com.codahale.metrics.MetricRegistry;
import com.commafeed.backend.cache.CacheService;
import com.commafeed.backend.dao.FeedSubscriptionDAO;
import com.commafeed.backend.dao.UnitOfWork;
import com.commafeed.backend.feed.parser.FeedParserResult.Content;
import com.commafeed.backend.feed.parser.FeedParserResult.Entry;
import com.commafeed.backend.model.Feed;
import com.commafeed.backend.model.FeedEntry;
import com.commafeed.backend.model.FeedEntryContent;
import com.commafeed.backend.model.FeedSubscription;
import com.commafeed.backend.model.User;
import com.commafeed.backend.service.FeedEntryService;
@@ -72,7 +72,7 @@ public class FeedRefreshUpdater implements Managed {
entryInserted = metrics.meter(MetricRegistry.name(getClass(), "entryInserted"));
}
private AddEntryResult addEntry(final Feed feed, final FeedEntry entry, final List<FeedSubscription> subscriptions) {
private AddEntryResult addEntry(final Feed feed, final Entry entry, final List<FeedSubscription> subscriptions) {
boolean processed = false;
boolean inserted = false;
@@ -82,8 +82,8 @@ public class FeedRefreshUpdater implements Managed {
// lock on content, make sure we are not updating the same entry
// twice at the same time
FeedEntryContent content = entry.getContent();
String key2 = DigestUtils.sha1Hex(StringUtils.trimToEmpty(content.getContent() + content.getTitle()));
Content content = entry.content();
String key2 = DigestUtils.sha1Hex(StringUtils.trimToEmpty(content.content() + content.title()));
Iterator<Lock> iterator = locks.bulkGet(Arrays.asList(key1, key2)).iterator();
Lock lock1 = iterator.next();
@@ -116,7 +116,7 @@ public class FeedRefreshUpdater implements Managed {
return new AddEntryResult(processed, inserted);
}
public boolean update(Feed feed, List<FeedEntry> entries) {
public boolean update(Feed feed, List<Entry> entries) {
boolean processed = true;
boolean insertedAtLeastOneEntry = false;
@@ -125,10 +125,10 @@ public class FeedRefreshUpdater implements Managed {
List<String> currentEntries = new ArrayList<>();
List<FeedSubscription> subscriptions = null;
for (FeedEntry entry : entries) {
String cacheKey = cache.buildUniqueEntryKey(feed, entry);
for (Entry entry : entries) {
String cacheKey = cache.buildUniqueEntryKey(entry);
if (!lastEntries.contains(cacheKey)) {
log.debug("cache miss for {}", entry.getUrl());
log.debug("cache miss for {}", entry.url());
if (subscriptions == null) {
subscriptions = unitOfWork.call(() -> feedSubscriptionDAO.findByFeed(feed));
}
@@ -138,7 +138,7 @@ public class FeedRefreshUpdater implements Managed {
entryCacheMiss.mark();
} else {
log.debug("cache hit for {}", entry.getUrl());
log.debug("cache hit for {}", entry.url());
entryCacheHit.mark();
}

View File

@@ -11,16 +11,15 @@ import com.codahale.metrics.MetricRegistry;
import com.commafeed.CommaFeedConfiguration;
import com.commafeed.backend.HttpGetter.NotModifiedException;
import com.commafeed.backend.feed.FeedFetcher.FeedFetcherResult;
import com.commafeed.backend.feed.parser.FeedParserResult.Entry;
import com.commafeed.backend.model.Feed;
import com.commafeed.backend.model.FeedEntry;
import jakarta.inject.Inject;
import jakarta.inject.Singleton;
import lombok.Value;
import lombok.extern.slf4j.Slf4j;
/**
* Calls {@link FeedFetcher} and updates the Feed object, but does not update the database ({@link FeedRefreshUpdater} does that)
* Calls {@link FeedFetcher} and updates the Feed object, but does not update the database ({@link FeedRefreshUpdater} does that)
*/
@Slf4j
@Singleton
@@ -44,32 +43,34 @@ public class FeedRefreshWorker {
public FeedRefreshWorkerResult update(Feed feed) {
try {
String url = Optional.ofNullable(feed.getUrlAfterRedirect()).orElse(feed.getUrl());
FeedFetcherResult feedFetcherResult = fetcher.fetch(url, false, feed.getLastModifiedHeader(), feed.getEtagHeader(),
FeedFetcherResult result = fetcher.fetch(url, false, feed.getLastModifiedHeader(), feed.getEtagHeader(),
feed.getLastPublishedDate(), feed.getLastContentHash());
// stops here if NotModifiedException or any other exception is thrown
List<FeedEntry> entries = feedFetcherResult.getEntries();
Integer maxFeedCapacity = config.getApplicationSettings().getMaxFeedCapacity();
List<Entry> entries = result.feed().entries();
if (maxFeedCapacity > 0) {
entries = entries.stream().limit(maxFeedCapacity).toList();
}
String urlAfterRedirect = feedFetcherResult.getUrlAfterRedirect();
String urlAfterRedirect = result.urlAfterRedirect();
if (StringUtils.equals(url, urlAfterRedirect)) {
urlAfterRedirect = null;
}
feed.setUrlAfterRedirect(urlAfterRedirect);
feed.setLink(feedFetcherResult.getFeed().getLink());
feed.setLastModifiedHeader(feedFetcherResult.getFeed().getLastModifiedHeader());
feed.setEtagHeader(feedFetcherResult.getFeed().getEtagHeader());
feed.setLastContentHash(feedFetcherResult.getFeed().getLastContentHash());
feed.setLastPublishedDate(feedFetcherResult.getFeed().getLastPublishedDate());
feed.setAverageEntryInterval(feedFetcherResult.getFeed().getAverageEntryInterval());
feed.setLastEntryDate(feedFetcherResult.getFeed().getLastEntryDate());
feed.setLink(result.feed().link());
feed.setLastModifiedHeader(result.lastModifiedHeader());
feed.setEtagHeader(result.lastETagHeader());
feed.setLastContentHash(result.contentHash());
feed.setLastPublishedDate(result.feed().lastPublishedDate());
feed.setAverageEntryInterval(result.feed().averageEntryInterval());
feed.setLastEntryDate(result.feed().lastEntryDate());
feed.setErrorCount(0);
feed.setMessage(null);
feed.setDisabledUntil(refreshIntervalCalculator.onFetchSuccess(feedFetcherResult.getFeed()));
feed.setDisabledUntil(
refreshIntervalCalculator.onFetchSuccess(result.feed().lastPublishedDate(), result.feed().averageEntryInterval()));
return new FeedRefreshWorkerResult(feed, entries);
} catch (NotModifiedException e) {
@@ -77,7 +78,7 @@ public class FeedRefreshWorker {
feed.setErrorCount(0);
feed.setMessage(e.getMessage());
feed.setDisabledUntil(refreshIntervalCalculator.onFeedNotModified(feed));
feed.setDisabledUntil(refreshIntervalCalculator.onFeedNotModified(feed.getLastPublishedDate(), feed.getAverageEntryInterval()));
if (e.getNewLastModifiedHeader() != null) {
feed.setLastModifiedHeader(e.getNewLastModifiedHeader());
@@ -93,7 +94,7 @@ public class FeedRefreshWorker {
feed.setErrorCount(feed.getErrorCount() + 1);
feed.setMessage("Unable to refresh feed : " + e.getMessage());
feed.setDisabledUntil(refreshIntervalCalculator.onFetchError(feed));
feed.setDisabledUntil(refreshIntervalCalculator.onFetchError(feed.getErrorCount()));
return new FeedRefreshWorkerResult(feed, Collections.emptyList());
} finally {
@@ -101,10 +102,7 @@ public class FeedRefreshWorker {
}
}
@Value
public static class FeedRefreshWorkerResult {
Feed feed;
List<FeedEntry> entries;
public record FeedRefreshWorkerResult(Feed feed, List<Entry> entries) {
}
}

View File

@@ -2,20 +2,12 @@ package com.commafeed.backend.feed;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Pattern;
import org.ahocorasick.trie.Emit;
import org.ahocorasick.trie.Trie;
import org.ahocorasick.trie.Trie.TrieBuilder;
import org.apache.commons.codec.binary.Base64;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.math3.stat.descriptive.SummaryStatistics;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
@@ -29,8 +21,6 @@ import com.commafeed.backend.model.FeedSubscription;
import com.commafeed.frontend.model.Entry;
import com.google.gwt.i18n.client.HasDirection.Direction;
import com.google.gwt.i18n.shared.BidiUtils;
import com.ibm.icu.text.CharsetDetector;
import com.ibm.icu.text.CharsetMatch;
import lombok.extern.slf4j.Slf4j;
@@ -50,70 +40,6 @@ public class FeedUtils {
return string;
}
/**
* Detect feed encoding by using the declared encoding in the xml processing instruction and by detecting the characters used in the
* feed
*
*/
public static Charset guessEncoding(byte[] bytes) {
String extracted = extractDeclaredEncoding(bytes);
if (StringUtils.startsWithIgnoreCase(extracted, "iso-8859-")) {
if (!StringUtils.endsWith(extracted, "1")) {
return Charset.forName(extracted);
}
} else if (StringUtils.startsWithIgnoreCase(extracted, "windows-")) {
return Charset.forName(extracted);
}
return detectEncoding(bytes);
}
/**
* Detect encoding by analyzing characters in the array
*/
public static Charset detectEncoding(byte[] bytes) {
String encoding = "UTF-8";
CharsetDetector detector = new CharsetDetector();
detector.setText(bytes);
CharsetMatch match = detector.detect();
if (match != null) {
encoding = match.getName();
}
if (encoding.equalsIgnoreCase("ISO-8859-1")) {
encoding = "windows-1252";
}
return Charset.forName(encoding);
}
public static String replaceHtmlEntitiesWithNumericEntities(String source) {
// Create a buffer sufficiently large that re-allocations are minimized.
StringBuilder sb = new StringBuilder(source.length() << 1);
TrieBuilder builder = Trie.builder();
builder.ignoreOverlaps();
for (String key : HtmlEntities.HTML_ENTITIES) {
builder.addKeyword(key);
}
Trie trie = builder.build();
Collection<Emit> emits = trie.parseText(source);
int prevIndex = 0;
for (Emit emit : emits) {
int matchIndex = emit.getStart();
sb.append(source, prevIndex, matchIndex);
sb.append(HtmlEntities.HTML_TO_NUMERIC_MAP.get(emit.getKeyword()));
prevIndex = emit.getEnd() + 1;
}
// Add the remainder of the string (contains no more matches).
sb.append(source.substring(prevIndex));
return sb.toString();
}
public static boolean isHttp(String url) {
return url.startsWith("http://");
}
@@ -122,6 +48,10 @@ public class FeedUtils {
return url.startsWith("https://");
}
public static boolean isAbsoluteUrl(String url) {
return isHttp(url) || isHttps(url);
}
/**
* Normalize the url. The resulting url is not meant to be fetched but rather used as a means to identify a feed and avoid duplicates
*/
@@ -163,25 +93,6 @@ public class FeedUtils {
return normalized;
}
/**
* Extract the declared encoding from the xml
*/
public static String extractDeclaredEncoding(byte[] bytes) {
int index = ArrayUtils.indexOf(bytes, (byte) '>');
if (index == -1) {
return null;
}
String pi = new String(ArrayUtils.subarray(bytes, 0, index + 1)).replace('\'', '"');
index = StringUtils.indexOf(pi, "encoding=\"");
if (index == -1) {
return null;
}
String encoding = pi.substring(index + 10);
encoding = encoding.substring(0, encoding.indexOf('"'));
return encoding;
}
public static boolean isRTL(FeedEntry entry) {
String text = entry.getContent().getContent();
@@ -202,52 +113,6 @@ public class FeedUtils {
return direction == Direction.RTL;
}
public static String trimInvalidXmlCharacters(String xml) {
if (StringUtils.isBlank(xml)) {
return null;
}
StringBuilder sb = new StringBuilder();
boolean firstTagFound = false;
for (int i = 0; i < xml.length(); i++) {
char c = xml.charAt(i);
if (!firstTagFound) {
if (c == '<') {
firstTagFound = true;
} else {
continue;
}
}
if (c >= 32 || c == 9 || c == 10 || c == 13) {
if (!Character.isHighSurrogate(c) && !Character.isLowSurrogate(c)) {
sb.append(c);
}
}
}
return sb.toString();
}
public static Long averageTimeBetweenEntries(List<FeedEntry> entries) {
if (entries.isEmpty() || entries.size() == 1) {
return null;
}
List<Long> timestamps = getSortedTimestamps(entries);
SummaryStatistics stats = new SummaryStatistics();
for (int i = 0; i < timestamps.size() - 1; i++) {
long diff = Math.abs(timestamps.get(i) - timestamps.get(i + 1));
stats.addValue(diff);
}
return (long) stats.getMean();
}
public static List<Long> getSortedTimestamps(List<FeedEntry> entries) {
return entries.stream().map(t -> t.getUpdated().getTime()).sorted(Collections.reverseOrder()).toList();
}
public static String removeTrailingSlash(String url) {
if (url.endsWith("/")) {
url = url.substring(0, url.length() - 1);
@@ -256,8 +121,8 @@ public class FeedUtils {
}
/**
*
* @param url
*
* @param relativeUrl
* the url of the entry
* @param feedLink
* the url of the feed as described in the feed
@@ -265,32 +130,18 @@ public class FeedUtils {
* the url of the feed that we used to fetch the feed
* @return an absolute url pointing to the entry
*/
public static String toAbsoluteUrl(String url, String feedLink, String feedUrl) {
url = StringUtils.trimToNull(StringUtils.normalizeSpace(url));
if (url == null || url.startsWith("http")) {
return url;
}
String baseUrl = (feedLink == null || isRelative(feedLink)) ? feedUrl : feedLink;
public static String toAbsoluteUrl(String relativeUrl, String feedLink, String feedUrl) {
String baseUrl = (feedLink != null && isAbsoluteUrl(feedLink)) ? feedLink : feedUrl;
if (baseUrl == null) {
return url;
return null;
}
String result;
try {
result = new URL(new URL(baseUrl), url).toString();
return new URL(new URL(baseUrl), relativeUrl).toString();
} catch (MalformedURLException e) {
log.debug("could not parse url : " + e.getMessage(), e);
result = url;
return null;
}
return result;
}
public static boolean isRelative(final String url) {
// the regex means "start with 'scheme://'"
return url.startsWith("/") || url.startsWith("#") || !url.matches("^\\w+\\:\\/\\/.*");
}
public static String getFaviconUrl(FeedSubscription subscription) {

View File

@@ -0,0 +1,70 @@
package com.commafeed.backend.feed.parser;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.StringUtils;
import com.ibm.icu.text.CharsetDetector;
import com.ibm.icu.text.CharsetMatch;
import jakarta.inject.Singleton;
@Singleton
class EncodingDetector {

	private static final String ENCODING_ATTRIBUTE = "encoding=\"";

	/**
	 * Detect feed encoding by using the declared encoding in the xml processing instruction and by detecting the characters used in the
	 * feed.
	 * <p>
	 * The declared encoding is only used for the "windows-*" names and for the "iso-8859-*" names other than iso-8859-1 itself. In every
	 * other case, or when the declared name is not a charset this JVM can provide, the encoding is detected from the bytes.
	 *
	 * @param bytes
	 *            the raw feed content
	 * @return the charset to decode the feed with
	 */
	public Charset getEncoding(byte[] bytes) {
		String declared = extractDeclaredEncoding(bytes);
		if (isTrustedDeclaration(declared)) {
			try {
				return Charset.forName(declared);
			} catch (IllegalArgumentException ignored) {
				// illegal or unsupported charset name (e.g. "windows-9999"): a bogus declaration must not make the whole feed
				// unparseable, fall through to detection
			}
		}
		return detectEncoding(bytes);
	}

	/**
	 * Extract the declared encoding from the xml
	 *
	 * @param bytes
	 *            the raw feed content
	 * @return the value of the first encoding attribute found before the first '&gt;', or null when there is no '&gt;', no encoding
	 *         attribute, or when the attribute value is not terminated by a quote
	 */
	public String extractDeclaredEncoding(byte[] bytes) {
		int tagEnd = ArrayUtils.indexOf(bytes, (byte) '>');
		if (tagEnd == -1) {
			return null;
		}

		// ISO-8859-1 maps every byte to exactly one char, which keeps this step independent of the platform default charset
		String pi = new String(bytes, 0, tagEnd + 1, StandardCharsets.ISO_8859_1).replace('\'', '"');
		int attributeStart = StringUtils.indexOf(pi, ENCODING_ATTRIBUTE);
		if (attributeStart == -1) {
			return null;
		}

		int valueStart = attributeStart + ENCODING_ATTRIBUTE.length();
		int valueEnd = pi.indexOf('"', valueStart);
		if (valueEnd == -1) {
			// unterminated attribute value, there is nothing reliable to extract
			return null;
		}
		return pi.substring(valueStart, valueEnd);
	}

	/**
	 * Tells whether a declared encoding is used as is instead of being double-checked by detection
	 */
	private static boolean isTrustedDeclaration(String declared) {
		if (StringUtils.startsWithIgnoreCase(declared, "iso-8859-")) {
			// only iso-8859-1 itself goes through detection, other parts of the family that happen to end with "1" (iso-8859-11)
			// are trusted like the rest
			return !"iso-8859-1".equalsIgnoreCase(declared);
		}
		return StringUtils.startsWithIgnoreCase(declared, "windows-");
	}

	/**
	 * Detect encoding by analyzing characters in the array. UTF-8 is used when nothing is detected or when the detected name is not a
	 * charset this JVM can provide, and a detected ISO-8859-1 is replaced with windows-1252.
	 */
	private Charset detectEncoding(byte[] bytes) {
		CharsetDetector detector = new CharsetDetector();
		detector.setText(bytes);
		CharsetMatch match = detector.detect();

		String encoding = match == null ? "UTF-8" : match.getName();
		if (encoding.equalsIgnoreCase("ISO-8859-1")) {
			encoding = "windows-1252";
		}

		try {
			return Charset.forName(encoding);
		} catch (IllegalArgumentException ignored) {
			// the detector reported a name the JVM has no charset for
			return StandardCharsets.UTF_8;
		}
	}

}

View File

@@ -0,0 +1,71 @@
package com.commafeed.backend.feed.parser;
import java.util.Collection;
import org.ahocorasick.trie.Emit;
import org.ahocorasick.trie.Trie;
import org.ahocorasick.trie.Trie.TrieBuilder;
import org.apache.commons.lang3.StringUtils;
import jakarta.inject.Singleton;
@Singleton
class FeedCleaner {

	// The keyword set is fixed, so the trie is built once instead of on every call.
	// NOTE(review): this shares one Trie between callers, confirm that parseText is safe for concurrent use in the aho-corasick
	// version on the classpath
	private static final Trie HTML_ENTITIES_TRIE = buildHtmlEntitiesTrie();

	/**
	 * Drops everything before the first '&lt;' and every character that is not allowed in an XML 1.0 document.
	 * <p>
	 * Kept characters are tab, line feed, carriage return and U+0020 to U+FFFD, minus surrogates. Since surrogate code units are dropped,
	 * characters outside of the Basic Multilingual Plane do not survive either.
	 *
	 * @param xml
	 *            the decoded feed document
	 * @return the cleaned document, an empty string when the input has no '&lt;', or null when the input is blank
	 */
	public String trimInvalidXmlCharacters(String xml) {
		if (StringUtils.isBlank(xml)) {
			return null;
		}

		int firstTag = xml.indexOf('<');
		if (firstTag == -1) {
			return "";
		}

		StringBuilder sb = new StringBuilder(xml.length() - firstTag);
		for (int i = firstTag; i < xml.length(); i++) {
			char c = xml.charAt(i);
			if (isAllowedXmlCharacter(c)) {
				sb.append(c);
			}
		}
		return sb.toString();
	}

	/**
	 * Replaces every named html entity found in the source with its numeric counterpart.
	 *
	 * @param source
	 *            the text to process
	 * @return the text with entities replaced, or the source itself when it is null, empty or has no entity to replace
	 */
	public String replaceHtmlEntitiesWithNumericEntities(String source) {
		if (source == null || source.isEmpty()) {
			return source;
		}

		Collection<Emit> emits = HTML_ENTITIES_TRIE.parseText(source);
		if (emits.isEmpty()) {
			return source;
		}

		// Create a buffer sufficiently large that re-allocations are minimized.
		StringBuilder sb = new StringBuilder(source.length() << 1);
		int prevIndex = 0;
		for (Emit emit : emits) {
			sb.append(source, prevIndex, emit.getStart());
			sb.append(HtmlEntities.HTML_TO_NUMERIC_MAP.get(emit.getKeyword()));
			prevIndex = emit.getEnd() + 1;
		}

		// Add the remainder of the string (contains no more matches).
		sb.append(source, prevIndex, source.length());
		return sb.toString();
	}

	/**
	 * Tells whether the given char may appear in an XML 1.0 document. U+FFFE and U+FFFF are excluded because they are not part of the
	 * XML 1.0 Char production, surrogates are always excluded.
	 */
	private static boolean isAllowedXmlCharacter(char c) {
		boolean whitespaceControl = c == '\t' || c == '\n' || c == '\r';
		boolean printableRange = c >= 0x20 && c <= 0xFFFD;
		return (whitespaceControl || printableRange) && !Character.isSurrogate(c);
	}

	/**
	 * Builds the trie matching all known html entities, overlapping matches are ignored
	 */
	private static Trie buildHtmlEntitiesTrie() {
		TrieBuilder builder = Trie.builder();
		builder.ignoreOverlaps();
		for (String key : HtmlEntities.HTML_ENTITIES) {
			builder.addKeyword(key);
		}
		return builder.build();
	}

}

View File

@@ -0,0 +1,271 @@
package com.commafeed.backend.feed.parser;
import java.io.StringReader;
import java.nio.charset.Charset;
import java.text.DateFormat;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.Date;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.math3.stat.descriptive.SummaryStatistics;
import org.jdom2.Element;
import org.jdom2.Namespace;
import org.xml.sax.InputSource;
import com.commafeed.backend.feed.FeedUtils;
import com.commafeed.backend.feed.parser.FeedParserResult.Content;
import com.commafeed.backend.feed.parser.FeedParserResult.Enclosure;
import com.commafeed.backend.feed.parser.FeedParserResult.Entry;
import com.commafeed.backend.feed.parser.FeedParserResult.Media;
import com.google.common.collect.Iterables;
import com.rometools.modules.mediarss.MediaEntryModule;
import com.rometools.modules.mediarss.MediaModule;
import com.rometools.modules.mediarss.types.MediaGroup;
import com.rometools.modules.mediarss.types.Metadata;
import com.rometools.modules.mediarss.types.Thumbnail;
import com.rometools.rome.feed.synd.SyndCategory;
import com.rometools.rome.feed.synd.SyndContent;
import com.rometools.rome.feed.synd.SyndEnclosure;
import com.rometools.rome.feed.synd.SyndEntry;
import com.rometools.rome.feed.synd.SyndFeed;
import com.rometools.rome.feed.synd.SyndLink;
import com.rometools.rome.feed.synd.SyndLinkImpl;
import com.rometools.rome.io.FeedException;
import com.rometools.rome.io.SyndFeedInput;
import jakarta.inject.Inject;
import jakarta.inject.Singleton;
import lombok.RequiredArgsConstructor;
/**
 * Parses raw xml into a FeedParserResult object
 */
@RequiredArgsConstructor(onConstructor = @__({ @Inject }))
@Singleton
public class FeedParser {

    private static final Namespace ATOM_10_NS = Namespace.getNamespace("http://www.w3.org/2005/Atom");

    // Dates outside of [START, END] are treated as bogus by validateDate():
    // START is one day after the epoch, END is one day before Integer.MAX_VALUE seconds after the epoch.
    private static final Date START = new Date(86400000);
    private static final Date END = new Date(1000L * Integer.MAX_VALUE - 86400000);

    private final EncodingDetector encodingDetector;
    private final FeedCleaner feedCleaner;

    /**
     * Decodes, cleans and parses a feed document.
     *
     * @param feedUrl
     *            url the document was fetched from, used to resolve relative entry links and in error messages
     * @param xml
     *            the raw bytes of the feed document
     * @return the parsed feed, its entries being sorted from the most recently updated to the least recently updated
     * @throws FeedException
     *             if anything goes wrong, the original exception being kept as the cause
     */
    public FeedParserResult parse(String feedUrl, byte[] xml) throws FeedException {
        try {
            Charset encoding = encodingDetector.getEncoding(xml);
            String xmlString = feedCleaner.trimInvalidXmlCharacters(new String(xml, encoding));
            if (xmlString == null) {
                throw new FeedException("Input string is null for url " + feedUrl);
            }
            xmlString = feedCleaner.replaceHtmlEntitiesWithNumericEntities(xmlString);

            InputSource source = new InputSource(new StringReader(xmlString));
            SyndFeed feed = new SyndFeedInput().build(source);
            handleForeignMarkup(feed);

            String title = feed.getTitle();
            String link = feed.getLink();
            List<Entry> entries = buildEntries(feed, feedUrl);

            // entries are sorted by descending update date, the first one is the most recent
            Date lastEntryDate = entries.stream().findFirst().map(Entry::updated).orElse(null);
            Date lastPublishedDate = validateDate(feed.getPublishedDate(), false);

            // lastEntryDate is null when the feed has no usable entry: Date.before(null) would throw a
            // NullPointerException and reject a feed that is otherwise valid, so only compare when there is an entry date
            if (lastPublishedDate == null || (lastEntryDate != null && lastPublishedDate.before(lastEntryDate))) {
                lastPublishedDate = lastEntryDate;
            }

            Long averageEntryInterval = averageTimeBetweenEntries(entries);

            return new FeedParserResult(title, link, lastPublishedDate, averageEntryInterval, lastEntryDate, entries);
        } catch (Exception e) {
            throw new FeedException(String.format("Could not parse feed from %s : %s", feedUrl, e.getMessage()), e);
        }
    }

    /**
     * Adds atom links for rss feeds
     */
    private void handleForeignMarkup(SyndFeed feed) {
        List<Element> foreignMarkup = feed.getForeignMarkup();
        if (foreignMarkup == null) {
            return;
        }
        for (Element element : foreignMarkup) {
            if ("link".equals(element.getName()) && ATOM_10_NS.equals(element.getNamespace())) {
                SyndLink link = new SyndLinkImpl();
                link.setRel(element.getAttributeValue("rel"));
                link.setHref(element.getAttributeValue("href"));
                feed.getLinks().add(link);
            }
        }
    }

    /**
     * Converts the entries of the feed, skipping those that have neither a guid nor a link.
     *
     * @return the converted entries, sorted from the most recently updated to the least recently updated
     */
    private List<Entry> buildEntries(SyndFeed feed, String feedUrl) {
        List<Entry> entries = new ArrayList<>();

        for (SyndEntry item : feed.getEntries()) {
            String guid = item.getUri();
            if (StringUtils.isBlank(guid)) {
                guid = item.getLink();
            }
            if (StringUtils.isBlank(guid)) {
                // no guid and no link, skip entry
                continue;
            }

            String url = buildEntryUrl(feed, feedUrl, item);
            if (StringUtils.isBlank(url) && FeedUtils.isAbsoluteUrl(guid)) {
                // if link is empty but guid is used as url, use guid
                url = guid;
            }

            Date updated = buildEntryUpdateDate(item);
            Content content = buildContent(item);

            entries.add(new Entry(guid, url, updated, content));
        }

        entries.sort(Comparator.comparing(Entry::updated).reversed());
        return entries;
    }

    /**
     * Gathers title, content, author, categories (joined with ", "), first enclosure and media of an entry.
     */
    private Content buildContent(SyndEntry item) {
        String title = getTitle(item);
        String content = getContent(item);
        String author = StringUtils.trimToNull(item.getAuthor());
        String categories = StringUtils
                .trimToNull(item.getCategories().stream().map(SyndCategory::getName).collect(Collectors.joining(", ")));

        Enclosure enclosure = buildEnclosure(item);
        Media media = buildMedia(item);
        return new Content(title, content, author, categories, enclosure, media);
    }

    /**
     * @return the first enclosure of the entry, or null if it has none
     */
    private Enclosure buildEnclosure(SyndEntry item) {
        SyndEnclosure enclosure = Iterables.getFirst(item.getEnclosures(), null);
        if (enclosure == null) {
            return null;
        }

        return new Enclosure(enclosure.getUrl(), enclosure.getType());
    }

    /**
     * @return the updated date of the entry, falling back to its published date and then to the current date, after
     *         validation by {@link #validateDate(Date, boolean)}
     */
    private Date buildEntryUpdateDate(SyndEntry item) {
        Date date = item.getUpdatedDate();
        if (date == null) {
            date = item.getPublishedDate();
        }
        if (date == null) {
            date = new Date();
        }
        return validateDate(date, true);
    }

    /**
     * @return the link of the entry, resolved against the feed link and the feed url if it is relative, or null if the
     *         entry has no link
     */
    private String buildEntryUrl(SyndFeed feed, String feedUrl, SyndEntry item) {
        String url = StringUtils.trimToNull(StringUtils.normalizeSpace(item.getLink()));
        if (url == null || FeedUtils.isAbsoluteUrl(url)) {
            // url is absent or already absolute, nothing to do
            return url;
        }

        // url is relative, trying to resolve it
        String feedLink = StringUtils.trimToNull(StringUtils.normalizeSpace(feed.getLink()));
        return FeedUtils.toAbsoluteUrl(url, feedLink, feedUrl);
    }

    /**
     * Replaces dates that cannot be trusted with the current date.
     *
     * @param date
     *            the date to check, may be null
     * @param nullToNow
     *            whether a null date is replaced with the current date or kept as null
     * @return the current date if the given date is outside of [START, END] or in the future, otherwise the date itself
     */
    private Date validateDate(Date date, boolean nullToNow) {
        Date now = new Date();
        if (date == null) {
            return nullToNow ? now : null;
        }
        if (date.before(START) || date.after(END)) {
            return now;
        }

        if (date.after(now)) {
            return now;
        }
        return date;
    }

    /**
     * @return the contents of the entry joined with the system line separator, or its description if it has no
     *         contents; null if the result is blank
     */
    private String getContent(SyndEntry item) {
        String content;
        if (item.getContents().isEmpty()) {
            content = item.getDescription() == null ? null : item.getDescription().getValue();
        } else {
            content = item.getContents().stream().map(SyndContent::getValue).collect(Collectors.joining(System.lineSeparator()));
        }
        return StringUtils.trimToNull(content);
    }

    /**
     * @return the title of the entry, or a replacement if it is blank: the formatted published date when there is
     *         one, "(no title)" otherwise
     */
    private String getTitle(SyndEntry item) {
        String title = item.getTitle();
        if (StringUtils.isBlank(title)) {
            Date date = item.getPublishedDate();
            if (date != null) {
                title = DateFormat.getInstance().format(date);
            } else {
                title = "(no title)";
            }
        }
        return StringUtils.trimToNull(title);
    }

    /**
     * @return the media built from the metadata of the media module of the entry or, failing that, from the metadata
     *         of its first media group; null if the entry has no media module or no usable metadata
     */
    private Media buildMedia(SyndEntry item) {
        MediaEntryModule module = (MediaEntryModule) item.getModule(MediaModule.URI);
        if (module == null) {
            return null;
        }

        Media media = buildMedia(module.getMetadata());
        if (media == null && ArrayUtils.isNotEmpty(module.getMediaGroups())) {
            MediaGroup group = module.getMediaGroups()[0];
            media = buildMedia(group.getMetadata());
        }

        return media;
    }

    /**
     * @return the description and first thumbnail of the metadata, or null if the metadata is null or provides neither
     *         a description nor a thumbnail url
     */
    private Media buildMedia(Metadata metadata) {
        if (metadata == null) {
            return null;
        }

        String description = metadata.getDescription();

        String thumbnailUrl = null;
        Integer thumbnailWidth = null;
        Integer thumbnailHeight = null;
        if (ArrayUtils.isNotEmpty(metadata.getThumbnail())) {
            Thumbnail thumbnail = metadata.getThumbnail()[0];
            thumbnailWidth = thumbnail.getWidth();
            thumbnailHeight = thumbnail.getHeight();
            if (thumbnail.getUrl() != null) {
                thumbnailUrl = thumbnail.getUrl().toString();
            }
        }

        if (description == null && thumbnailUrl == null) {
            return null;
        }

        return new Media(description, thumbnailUrl, thumbnailWidth, thumbnailHeight);
    }

    /**
     * @return the mean of the absolute differences, in milliseconds, between the update dates of consecutive entries,
     *         or null if there are fewer than two entries
     */
    private Long averageTimeBetweenEntries(List<Entry> entries) {
        if (entries.size() < 2) {
            return null;
        }

        SummaryStatistics stats = new SummaryStatistics();
        for (int i = 0; i < entries.size() - 1; i++) {
            long diff = Math.abs(entries.get(i).updated().getTime() - entries.get(i + 1).updated().getTime());
            stats.addValue(diff);
        }
        return (long) stats.getMean();
    }
}

View File

@@ -0,0 +1,20 @@
package com.commafeed.backend.feed.parser;
import java.util.Date;
import java.util.List;
/**
 * Result of parsing a feed document: feed level information plus the parsed entries.
 */
public record FeedParserResult(
        String title,
        String link,
        Date lastPublishedDate,
        Long averageEntryInterval,
        Date lastEntryDate,
        List<Entry> entries) {

    /**
     * A single entry of the feed.
     */
    public record Entry(
            String guid,
            String url,
            Date updated,
            Content content) {
    }

    /**
     * The displayable content of an entry.
     */
    public record Content(
            String title,
            String content,
            String author,
            String categories,
            Enclosure enclosure,
            Media media) {
    }

    /**
     * An enclosure attached to an entry.
     */
    public record Enclosure(
            String url,
            String type) {
    }

    /**
     * Media information attached to an entry.
     */
    public record Media(
            String description,
            String thumbnailUrl,
            Integer thumbnailWidth,
            Integer thumbnailHeight) {
    }
}

View File

@@ -1,10 +1,13 @@
package com.commafeed.backend.feed;
package com.commafeed.backend.feed.parser;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.Map;
public class HtmlEntities {
import lombok.experimental.UtilityClass;
@UtilityClass
class HtmlEntities {
public static final Map<String, String> HTML_TO_NUMERIC_MAP;
public static final String[] HTML_ENTITIES;
public static final String[] NUMERIC_ENTITIES;

View File

@@ -3,7 +3,6 @@ package com.commafeed.backend.model;
import java.sql.Types;
import java.util.Set;
import org.apache.commons.lang3.builder.EqualsBuilder;
import org.hibernate.annotations.JdbcTypeCode;
import jakarta.persistence.Column;
@@ -61,22 +60,4 @@ public class FeedEntryContent extends AbstractModel {
@OneToMany(mappedBy = "content")
private Set<FeedEntry> entries;
/**
 * Tells whether the given content holds the same values as this one for title, content, author, enclosure (url and
 * type), media (description, thumbnail url, width and height) and categories. No other field is compared.
 *
 * @param c
 *            the content to compare with, may be null
 * @return false if c is null or if any of the compared fields differs, true otherwise
 */
public boolean equivalentTo(FeedEntryContent c) {
    if (c == null) {
        return false;
    }

    EqualsBuilder comparison = new EqualsBuilder();

    // text
    comparison.append(title, c.title);
    comparison.append(content, c.content);
    comparison.append(author, c.author);

    // enclosure
    comparison.append(enclosureUrl, c.enclosureUrl);
    comparison.append(enclosureType, c.enclosureType);

    // media
    comparison.append(mediaDescription, c.mediaDescription);
    comparison.append(mediaThumbnailUrl, c.mediaThumbnailUrl);
    comparison.append(mediaThumbnailWidth, c.mediaThumbnailWidth);
    comparison.append(mediaThumbnailHeight, c.mediaThumbnailHeight);

    comparison.append(categories, c.categories);
    return comparison.isEquals();
}
}

View File

@@ -8,6 +8,7 @@ import java.util.Optional;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.builder.EqualsBuilder;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Document.OutputSettings;
@@ -23,6 +24,9 @@ import org.w3c.dom.css.CSSStyleDeclaration;
import com.commafeed.backend.dao.FeedEntryContentDAO;
import com.commafeed.backend.feed.FeedUtils;
import com.commafeed.backend.feed.parser.FeedParserResult.Content;
import com.commafeed.backend.feed.parser.FeedParserResult.Enclosure;
import com.commafeed.backend.feed.parser.FeedParserResult.Media;
import com.commafeed.backend.model.FeedEntryContent;
import com.steadystate.css.parser.CSSOMParser;
@@ -46,26 +50,65 @@ public class FeedEntryContentService {
/**
* this is NOT thread-safe
*/
public FeedEntryContent findOrCreate(FeedEntryContent content, String baseUrl) {
content.setAuthor(FeedUtils.truncate(handleContent(content.getAuthor(), baseUrl, true), 128));
content.setTitle(FeedUtils.truncate(handleContent(content.getTitle(), baseUrl, true), 2048));
content.setContent(handleContent(content.getContent(), baseUrl, false));
content.setMediaDescription(handleContent(content.getMediaDescription(), baseUrl, false));
public FeedEntryContent findOrCreate(Content content, String baseUrl) {
String title = FeedUtils.truncate(handleContent(content.title(), baseUrl, true), 2048);
String titleHash = DigestUtils.sha1Hex(StringUtils.trimToEmpty(title));
String contentHash = DigestUtils.sha1Hex(StringUtils.trimToEmpty(content.getContent()));
content.setContentHash(contentHash);
String titleHash = DigestUtils.sha1Hex(StringUtils.trimToEmpty(content.getTitle()));
content.setTitleHash(titleHash);
String contentString = handleContent(content.content(), baseUrl, false);
String contentHash = DigestUtils.sha1Hex(StringUtils.trimToEmpty(contentString));
List<FeedEntryContent> existing = feedEntryContentDAO.findExisting(contentHash, titleHash);
Optional<FeedEntryContent> equivalentContent = existing.stream().filter(content::equivalentTo).findFirst();
Optional<FeedEntryContent> equivalentContent = existing.stream()
.filter(c -> isEquivalent(c, content, title, contentString))
.findFirst();
if (equivalentContent.isPresent()) {
return equivalentContent.get();
}
feedEntryContentDAO.saveOrUpdate(content);
return content;
FeedEntryContent entryContent = new FeedEntryContent();
entryContent.setTitle(title);
entryContent.setTitleHash(titleHash);
entryContent.setContent(contentString);
entryContent.setContentHash(contentHash);
entryContent.setAuthor(FeedUtils.truncate(handleContent(content.author(), baseUrl, true), 128));
entryContent.setCategories(FeedUtils.truncate(content.categories(), 4096));
Enclosure enclosure = content.enclosure();
if (enclosure != null) {
entryContent.setEnclosureUrl(enclosure.url());
entryContent.setEnclosureType(enclosure.type());
}
Media media = content.media();
if (media != null) {
entryContent.setMediaDescription(handleContent(media.description(), baseUrl, false));
entryContent.setMediaThumbnailUrl(media.thumbnailUrl());
entryContent.setMediaThumbnailWidth(media.thumbnailWidth());
entryContent.setMediaThumbnailHeight(media.thumbnailHeight());
}
feedEntryContentDAO.saveOrUpdate(entryContent);
return entryContent;
}
/**
 * Tells whether an existing content holds the same data as a parsed content.
 * <p>
 * NOTE(review): author, categories and media description are compared with the raw parsed values, while findOrCreate
 * stores them after sanitizing and/or truncating them. Values altered by that processing may never match - confirm
 * whether the processed values should be compared instead.
 *
 * @param content
 *            the existing content
 * @param c
 *            the parsed content
 * @param title
 *            the title to compare with the title of the existing content
 * @param contentString
 *            the content to compare with the content of the existing content
 * @return true if all the compared fields are equal
 */
private boolean isEquivalent(FeedEntryContent content, Content c, String title, String contentString) {
    EqualsBuilder builder = new EqualsBuilder().append(content.getTitle(), title)
            .append(content.getContent(), contentString)
            .append(content.getAuthor(), c.author())
            .append(content.getCategories(), c.categories());

    // Enclosure and media fields are always compared: when the parsed content has no enclosure (or no media), the
    // existing content must not have one either, otherwise an existing content with an enclosure would be reused
    // for a parsed content without one.
    Enclosure enclosure = c.enclosure();
    builder.append(content.getEnclosureUrl(), enclosure == null ? null : enclosure.url())
            .append(content.getEnclosureType(), enclosure == null ? null : enclosure.type());

    Media media = c.media();
    builder.append(content.getMediaDescription(), media == null ? null : media.description())
            .append(content.getMediaThumbnailUrl(), media == null ? null : media.thumbnailUrl())
            .append(content.getMediaThumbnailWidth(), media == null ? null : media.thumbnailWidth())
            .append(content.getMediaThumbnailHeight(), media == null ? null : media.thumbnailHeight());

    return builder.build();
}
private static Safelist buildWhiteList() {

View File

@@ -10,9 +10,10 @@ import com.commafeed.backend.dao.FeedEntryDAO;
import com.commafeed.backend.dao.FeedEntryStatusDAO;
import com.commafeed.backend.dao.FeedSubscriptionDAO;
import com.commafeed.backend.feed.FeedEntryKeyword;
import com.commafeed.backend.feed.FeedUtils;
import com.commafeed.backend.feed.parser.FeedParserResult.Entry;
import com.commafeed.backend.model.Feed;
import com.commafeed.backend.model.FeedEntry;
import com.commafeed.backend.model.FeedEntryContent;
import com.commafeed.backend.model.FeedEntryStatus;
import com.commafeed.backend.model.FeedSubscription;
import com.commafeed.backend.model.User;
@@ -37,30 +38,27 @@ public class FeedEntryService {
/**
* this is NOT thread-safe
*/
public boolean addEntry(Feed feed, FeedEntry entry, List<FeedSubscription> subscriptions) {
Long existing = feedEntryDAO.findExisting(entry.getGuid(), feed);
public boolean addEntry(Feed feed, Entry entry, List<FeedSubscription> subscriptions) {
String guid = FeedUtils.truncate(entry.guid(), 2048);
String guidHash = DigestUtils.sha1Hex(entry.guid());
Long existing = feedEntryDAO.findExisting(guidHash, feed);
if (existing != null) {
return false;
}
FeedEntryContent content = feedEntryContentService.findOrCreate(entry.getContent(), feed.getLink());
entry.setGuidHash(DigestUtils.sha1Hex(entry.getGuid()));
entry.setContent(content);
entry.setInserted(new Date());
entry.setFeed(feed);
feedEntryDAO.saveOrUpdate(entry);
FeedEntry feedEntry = buildEntry(feed, entry, guid, guidHash);
feedEntryDAO.saveOrUpdate(feedEntry);
// if filter does not match the entry, mark it as read
for (FeedSubscription sub : subscriptions) {
boolean matches = true;
try {
matches = feedEntryFilteringService.filterMatchesEntry(sub.getFilter(), entry);
matches = feedEntryFilteringService.filterMatchesEntry(sub.getFilter(), feedEntry);
} catch (FeedEntryFilteringService.FeedEntryFilterException e) {
log.error("could not evaluate filter {}", sub.getFilter(), e);
}
if (!matches) {
FeedEntryStatus status = new FeedEntryStatus(sub.getUser(), sub, entry);
FeedEntryStatus status = new FeedEntryStatus(sub.getUser(), sub, feedEntry);
status.setRead(true);
feedEntryStatusDAO.saveOrUpdate(status);
}
@@ -69,8 +67,20 @@ public class FeedEntryService {
return true;
}
public void markEntry(User user, Long entryId, boolean read) {
/**
 * Creates the FeedEntry matching a parsed entry. The entry itself is not saved here, only its content is looked up
 * or created through the content service.
 *
 * @param feed
 *            the feed the entry belongs to, its link is used as base url for the content
 * @param e
 *            the parsed entry
 * @param guid
 *            the guid to set on the entry
 * @param guidHash
 *            the guid hash to set on the entry
 * @return the new entry, url being truncated to 2048 characters and inserted date being the current date
 */
private FeedEntry buildEntry(Feed feed, Entry e, String guid, String guidHash) {
    String truncatedUrl = FeedUtils.truncate(e.url(), 2048);
    Date updatedDate = e.updated();
    Date insertedDate = new Date();
    FeedEntryContent entryContent = feedEntryContentService.findOrCreate(e.content(), feed.getLink());

    FeedEntry feedEntry = new FeedEntry();
    feedEntry.setFeed(feed);
    feedEntry.setGuid(guid);
    feedEntry.setGuidHash(guidHash);
    feedEntry.setUrl(truncatedUrl);
    feedEntry.setUpdated(updatedDate);
    feedEntry.setInserted(insertedDate);
    feedEntry.setContent(entryContent);
    return feedEntry;
}
public void markEntry(User user, Long entryId, boolean read) {
FeedEntry entry = feedEntryDAO.findById(entryId);
if (entry == null) {
return;

View File

@@ -37,13 +37,14 @@ public class FeedService {
}
public synchronized Feed findOrCreate(String url) {
String normalized = FeedUtils.normalizeURL(url);
Feed feed = feedDAO.findByUrl(normalized);
String normalizedUrl = FeedUtils.normalizeURL(url);
String normalizedUrlHash = DigestUtils.sha1Hex(normalizedUrl);
Feed feed = feedDAO.findByUrl(normalizedUrl, normalizedUrlHash);
if (feed == null) {
feed = new Feed();
feed.setUrl(url);
feed.setNormalizedUrl(normalized);
feed.setNormalizedUrlHash(DigestUtils.sha1Hex(normalized));
feed.setNormalizedUrl(normalizedUrl);
feed.setNormalizedUrlHash(normalizedUrlHash);
feed.setDisabledUntil(new Date(0));
feedDAO.saveOrUpdate(feed);
}
@@ -55,6 +56,7 @@ public class FeedService {
feed.setNormalizedUrl(normalized);
feed.setNormalizedUrlHash(DigestUtils.sha1Hex(normalized));
feed.setLastUpdated(new Date());
feed.setEtagHeader(FeedUtils.truncate(feed.getEtagHeader(), 255));
feedDAO.saveOrUpdate(feed);
}

View File

@@ -245,8 +245,8 @@ public class FeedREST {
try {
FeedFetcherResult feedFetcherResult = feedFetcher.fetch(url, true, null, null, null, null);
info = new FeedInfo();
info.setUrl(feedFetcherResult.getUrlAfterRedirect());
info.setTitle(feedFetcherResult.getTitle());
info.setUrl(feedFetcherResult.urlAfterRedirect());
info.setTitle(feedFetcherResult.feed().title());
} catch (Exception e) {
log.debug(e.getMessage(), e);

View File

@@ -15,6 +15,7 @@ import org.mockito.junit.jupiter.MockitoExtension;
import com.commafeed.backend.HttpGetter;
import com.commafeed.backend.HttpGetter.HttpResult;
import com.commafeed.backend.HttpGetter.NotModifiedException;
import com.commafeed.backend.feed.parser.FeedParser;
import com.commafeed.backend.urlprovider.FeedURLProvider;
@ExtendWith(MockitoExtension.class)

View File

@@ -64,21 +64,6 @@ class FeedUtilsTest {
}
/**
 * The declared encoding is null when the document does not declare one, and is extracted whether it is enclosed in
 * double or single quotes, with or without a space before the end of the declaration.
 */
@Test
void testExtractDeclaredEncoding() {
    String[] withoutEncoding = { "<?xml ?>", "<feed></feed>" };
    for (String xml : withoutEncoding) {
        Assertions.assertNull(FeedUtils.extractDeclaredEncoding(xml.getBytes()));
    }

    String[] withUtf8Encoding = { "<?xml encoding=\"UTF-8\" ?>", "<?xml encoding='UTF-8' ?>", "<?xml encoding='UTF-8'?>" };
    for (String xml : withUtf8Encoding) {
        Assertions.assertEquals("UTF-8", FeedUtils.extractDeclaredEncoding(xml.getBytes()));
    }
}
/**
 * Named html entities are expected to be replaced with their numeric counterpart, the rest of the text being left
 * untouched.
 */
@Test
void testReplaceHtmlEntitiesWithNumericEntities() {
    String source = "<source>T&acute;l&acute;phone &prime;</source>";
    String expected = "<source>T&#180;l&#180;phone &#8242;</source>";

    String actual = FeedUtils.replaceHtmlEntitiesWithNumericEntities(source);

    Assertions.assertEquals(expected, actual);
}
@Test
void testRemoveTrailingSlash() {
final String url = "http://localhost/";

View File

@@ -0,0 +1,19 @@
package com.commafeed.backend.feed.parser;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
/**
 * Tests for {@link EncodingDetector}.
 */
class EncodingDetectorTest {

    EncodingDetector encodingDetector = new EncodingDetector();

    /**
     * The declared encoding is null when the document does not declare one, and is extracted whether it is enclosed
     * in double or single quotes, with or without a space before the end of the declaration.
     */
    @Test
    void testExtractDeclaredEncoding() {
        Assertions.assertNull(declaredEncodingOf("<?xml ?>"));
        Assertions.assertNull(declaredEncodingOf("<feed></feed>"));

        Assertions.assertEquals("UTF-8", declaredEncodingOf("<?xml encoding=\"UTF-8\" ?>"));
        Assertions.assertEquals("UTF-8", declaredEncodingOf("<?xml encoding='UTF-8' ?>"));
        Assertions.assertEquals("UTF-8", declaredEncodingOf("<?xml encoding='UTF-8'?>"));
    }

    /**
     * Runs the detector on the bytes of the given document.
     */
    private String declaredEncodingOf(String xml) {
        return encodingDetector.extractDeclaredEncoding(xml.getBytes());
    }
}

View File

@@ -0,0 +1,16 @@
package com.commafeed.backend.feed.parser;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
/**
 * Tests for {@link FeedCleaner}.
 */
class FeedCleanerTest {

    FeedCleaner feedCleaner = new FeedCleaner();

    /**
     * Named html entities are expected to be replaced with their numeric counterpart, the rest of the text being left
     * untouched.
     */
    @Test
    void testReplaceHtmlEntitiesWithNumericEntities() {
        String source = "<source>T&acute;l&acute;phone &prime;</source>";
        String expected = "<source>T&#180;l&#180;phone &#8242;</source>";

        String actual = feedCleaner.replaceHtmlEntitiesWithNumericEntities(source);

        Assertions.assertEquals(expected, actual);
    }
}