From 7b335e2fd4648c63b8b927fb47ec20eb4df3ccdd Mon Sep 17 00:00:00 2001 From: Athou Date: Sun, 7 Jan 2024 10:51:35 +0100 Subject: [PATCH] feed refresh engine now uses its own immutable model --- .../commafeed/backend/cache/CacheService.java | 6 +- .../com/commafeed/backend/dao/FeedDAO.java | 5 +- .../commafeed/backend/dao/FeedEntryDAO.java | 9 +- .../commafeed/backend/feed/FeedFetcher.java | 28 +- .../commafeed/backend/feed/FeedParser.java | 263 ----------------- .../backend/feed/FeedRefreshEngine.java | 2 +- .../feed/FeedRefreshIntervalCalculator.java | 18 +- .../backend/feed/FeedRefreshUpdater.java | 20 +- .../backend/feed/FeedRefreshWorker.java | 38 ++- .../com/commafeed/backend/feed/FeedUtils.java | 171 +---------- .../backend/feed/parser/EncodingDetector.java | 70 +++++ .../backend/feed/parser/FeedCleaner.java | 71 +++++ .../backend/feed/parser/FeedParser.java | 271 ++++++++++++++++++ .../backend/feed/parser/FeedParserResult.java | 20 ++ .../feed/{ => parser}/HtmlEntities.java | 7 +- .../backend/model/FeedEntryContent.java | 19 -- .../service/FeedEntryContentService.java | 69 ++++- .../backend/service/FeedEntryService.java | 36 ++- .../backend/service/FeedService.java | 10 +- .../commafeed/frontend/resource/FeedREST.java | 4 +- .../backend/feed/FeedFetcherTest.java | 1 + .../commafeed/backend/feed/FeedUtilsTest.java | 15 - .../feed/parser/EncodingDetectorTest.java | 19 ++ .../backend/feed/parser/FeedCleanerTest.java | 16 ++ 24 files changed, 623 insertions(+), 565 deletions(-) delete mode 100644 commafeed-server/src/main/java/com/commafeed/backend/feed/FeedParser.java create mode 100644 commafeed-server/src/main/java/com/commafeed/backend/feed/parser/EncodingDetector.java create mode 100644 commafeed-server/src/main/java/com/commafeed/backend/feed/parser/FeedCleaner.java create mode 100644 commafeed-server/src/main/java/com/commafeed/backend/feed/parser/FeedParser.java create mode 100644 commafeed-server/src/main/java/com/commafeed/backend/feed/parser/FeedParserResult.java rename commafeed-server/src/main/java/com/commafeed/backend/feed/{ => parser}/HtmlEntities.java (95%) create mode 100644 commafeed-server/src/test/java/com/commafeed/backend/feed/parser/EncodingDetectorTest.java create mode 100644 commafeed-server/src/test/java/com/commafeed/backend/feed/parser/FeedCleanerTest.java diff --git a/commafeed-server/src/main/java/com/commafeed/backend/cache/CacheService.java b/commafeed-server/src/main/java/com/commafeed/backend/cache/CacheService.java index a470d3c4..f4065a97 100644 --- a/commafeed-server/src/main/java/com/commafeed/backend/cache/CacheService.java +++ b/commafeed-server/src/main/java/com/commafeed/backend/cache/CacheService.java @@ -4,8 +4,8 @@ import java.util.List; import org.apache.commons.codec.digest.DigestUtils; +import com.commafeed.backend.feed.parser.FeedParserResult.Entry; import com.commafeed.backend.model.Feed; -import com.commafeed.backend.model.FeedEntry; import com.commafeed.backend.model.FeedSubscription; import com.commafeed.backend.model.User; import com.commafeed.frontend.model.Category; @@ -18,8 +18,8 @@ public abstract class CacheService { public abstract void setLastEntries(Feed feed, List entries); - public String buildUniqueEntryKey(Feed feed, FeedEntry entry) { - return DigestUtils.sha1Hex(entry.getGuid() + entry.getUrl()); + public String buildUniqueEntryKey(Entry entry) { + return DigestUtils.sha1Hex(entry.guid() + entry.url()); } // user categories diff --git a/commafeed-server/src/main/java/com/commafeed/backend/dao/FeedDAO.java b/commafeed-server/src/main/java/com/commafeed/backend/dao/FeedDAO.java index 417ebdce..2520c1a1 100644 --- a/commafeed-server/src/main/java/com/commafeed/backend/dao/FeedDAO.java +++ b/commafeed-server/src/main/java/com/commafeed/backend/dao/FeedDAO.java @@ -3,7 +3,6 @@ package com.commafeed.backend.dao; import java.util.Date; import java.util.List; -import org.apache.commons.codec.digest.DigestUtils; import org.apache.commons.lang3.StringUtils; import org.hibernate.SessionFactory; @@ -45,8 +44,8 @@ public class FeedDAO extends GenericDAO { updateQuery(feed).set(feed.disabledUntil, date).where(feed.id.in(feedIds)).execute(); } - public Feed findByUrl(String normalizedUrl) { - List feeds = query().selectFrom(feed).where(feed.normalizedUrlHash.eq(DigestUtils.sha1Hex(normalizedUrl))).fetch(); + public Feed findByUrl(String normalizedUrl, String normalizedUrlHash) { + List feeds = query().selectFrom(feed).where(feed.normalizedUrlHash.eq(normalizedUrlHash)).fetch(); Feed feed = Iterables.getFirst(feeds, null); if (feed != null && StringUtils.equals(normalizedUrl, feed.getNormalizedUrl())) { return feed; diff --git a/commafeed-server/src/main/java/com/commafeed/backend/dao/FeedEntryDAO.java b/commafeed-server/src/main/java/com/commafeed/backend/dao/FeedEntryDAO.java index b119efa9..34af1d88 100644 --- a/commafeed-server/src/main/java/com/commafeed/backend/dao/FeedEntryDAO.java +++ b/commafeed-server/src/main/java/com/commafeed/backend/dao/FeedEntryDAO.java @@ -2,7 +2,6 @@ package com.commafeed.backend.dao; import java.util.List; -import org.apache.commons.codec.digest.DigestUtils; import org.hibernate.SessionFactory; import com.commafeed.backend.model.Feed; @@ -26,12 +25,8 @@ public class FeedEntryDAO extends GenericDAO { super(sessionFactory); } - public Long findExisting(String guid, Feed feed) { - return query().select(entry.id) - .from(entry) - .where(entry.guidHash.eq(DigestUtils.sha1Hex(guid)), entry.feed.eq(feed)) - .limit(1) - .fetchOne(); + public Long findExisting(String guidHash, Feed feed) { + return query().select(entry.id).from(entry).where(entry.guidHash.eq(guidHash), entry.feed.eq(feed)).limit(1).fetchOne(); } public List findFeedsExceedingCapacity(long maxCapacity, long max) { diff --git a/commafeed-server/src/main/java/com/commafeed/backend/feed/FeedFetcher.java b/commafeed-server/src/main/java/com/commafeed/backend/feed/FeedFetcher.java index 2955d09e..592d5e6a 100644 --- a/commafeed-server/src/main/java/com/commafeed/backend/feed/FeedFetcher.java +++ b/commafeed-server/src/main/java/com/commafeed/backend/feed/FeedFetcher.java @@ -2,7 +2,6 @@ package com.commafeed.backend.feed; import java.io.IOException; import java.util.Date; -import java.util.List; import java.util.Set; import org.apache.commons.codec.binary.StringUtils; @@ -11,16 +10,14 @@ import org.apache.commons.codec.digest.DigestUtils; import com.commafeed.backend.HttpGetter; import com.commafeed.backend.HttpGetter.HttpResult; import com.commafeed.backend.HttpGetter.NotModifiedException; -import com.commafeed.backend.feed.FeedParser.FeedParserResult; -import com.commafeed.backend.model.Feed; -import com.commafeed.backend.model.FeedEntry; +import com.commafeed.backend.feed.parser.FeedParser; +import com.commafeed.backend.feed.parser.FeedParserResult; import com.commafeed.backend.urlprovider.FeedURLProvider; import com.rometools.rome.io.FeedException; import jakarta.inject.Inject; import jakarta.inject.Singleton; import lombok.RequiredArgsConstructor; -import lombok.Value; import lombok.extern.slf4j.Slf4j; /** @@ -79,20 +76,16 @@ public class FeedFetcher { etagHeaderValueChanged ? result.getETag() : null); } - if (lastPublishedDate != null && parserResult.getFeed().getLastPublishedDate() != null - && lastPublishedDate.getTime() == parserResult.getFeed().getLastPublishedDate().getTime()) { + if (lastPublishedDate != null && parserResult.lastPublishedDate() != null + && lastPublishedDate.getTime() == parserResult.lastPublishedDate().getTime()) { log.debug("publishedDate not modified: {}", feedUrl); throw new NotModifiedException("publishedDate not modified", lastModifiedHeaderValueChanged ? result.getLastModifiedSince() : null, etagHeaderValueChanged ? result.getETag() : null); } - Feed feed = parserResult.getFeed(); - feed.setLastModifiedHeader(result.getLastModifiedSince()); - feed.setEtagHeader(FeedUtils.truncate(result.getETag(), 255)); - feed.setLastContentHash(hash); - return new FeedFetcherResult(parserResult.getFeed(), parserResult.getEntries(), parserResult.getTitle(), - result.getUrlAfterRedirect(), result.getDuration()); + return new FeedFetcherResult(parserResult, result.getUrlAfterRedirect(), result.getLastModifiedSince(), result.getETag(), hash, + result.getDuration()); } private static String extractFeedUrl(Set urlProviders, String url, String urlContent) { @@ -106,13 +99,8 @@ public class FeedFetcher { return null; } - @Value - public static class FeedFetcherResult { - Feed feed; - List entries; - String title; - String urlAfterRedirect; - long fetchDuration; + public record FeedFetcherResult(FeedParserResult feed, String urlAfterRedirect, String lastModifiedHeader, String lastETagHeader, + String contentHash, long fetchDuration) { } } diff --git a/commafeed-server/src/main/java/com/commafeed/backend/feed/FeedParser.java b/commafeed-server/src/main/java/com/commafeed/backend/feed/FeedParser.java deleted file mode 100644 index 2af2fef0..00000000 --- a/commafeed-server/src/main/java/com/commafeed/backend/feed/FeedParser.java +++ /dev/null @@ -1,263 +0,0 @@ -package com.commafeed.backend.feed; - -import java.io.StringReader; -import java.nio.charset.Charset; -import java.text.DateFormat; -import java.util.ArrayList; -import java.util.Date; -import java.util.List; -import java.util.stream.Collectors; - -import org.apache.commons.lang3.ArrayUtils; -import org.apache.commons.lang3.StringUtils; -import org.jdom2.Element; -import org.jdom2.Namespace; -import org.xml.sax.InputSource; - -import com.commafeed.backend.model.Feed; -import com.commafeed.backend.model.FeedEntry; -import com.commafeed.backend.model.FeedEntryContent; -import com.google.common.collect.Iterables; -import com.rometools.modules.mediarss.MediaEntryModule; -import com.rometools.modules.mediarss.MediaModule; -import com.rometools.modules.mediarss.types.MediaGroup; -import com.rometools.modules.mediarss.types.Metadata; -import com.rometools.modules.mediarss.types.Thumbnail; -import com.rometools.rome.feed.synd.SyndCategory; -import com.rometools.rome.feed.synd.SyndContent; -import com.rometools.rome.feed.synd.SyndEnclosure; -import com.rometools.rome.feed.synd.SyndEntry; -import com.rometools.rome.feed.synd.SyndFeed; -import com.rometools.rome.feed.synd.SyndLink; -import com.rometools.rome.feed.synd.SyndLinkImpl; -import com.rometools.rome.io.FeedException; -import com.rometools.rome.io.SyndFeedInput; - -import jakarta.inject.Inject; -import jakarta.inject.Singleton; -import lombok.Data; -import lombok.RequiredArgsConstructor; -import lombok.Value; - -/** - * Parses raw xml as a Feed object - */ -@RequiredArgsConstructor(onConstructor = @__({ @Inject })) -@Singleton -public class FeedParser { - - private static final String ATOM_10_URI = "http://www.w3.org/2005/Atom"; - private static final Namespace ATOM_10_NS = Namespace.getNamespace(ATOM_10_URI); - - private static final Date START = new Date(86400000); - private static final Date END = new Date(1000L * Integer.MAX_VALUE - 86400000); - - public FeedParserResult parse(String feedUrl, byte[] xml) throws FeedException { - - try { - Charset encoding = FeedUtils.guessEncoding(xml); - String xmlString = FeedUtils.trimInvalidXmlCharacters(new String(xml, encoding)); - if (xmlString == null) { - throw new FeedException("Input string is null for url " + feedUrl); - } - xmlString = FeedUtils.replaceHtmlEntitiesWithNumericEntities(xmlString); - InputSource source = new InputSource(new StringReader(xmlString)); - - SyndFeed rss = new SyndFeedInput().build(source); - handleForeignMarkup(rss); - - String title = rss.getTitle(); - Feed feed = new Feed(); - feed.setUrl(feedUrl); - feed.setLink(rss.getLink()); - - List entries = new ArrayList<>(); - for (SyndEntry item : rss.getEntries()) { - FeedEntry entry = new FeedEntry(); - - String guid = item.getUri(); - if (StringUtils.isBlank(guid)) { - guid = item.getLink(); - } - if (StringUtils.isBlank(guid)) { - // no guid and no link, skip entry - continue; - } - entry.setGuid(FeedUtils.truncate(guid, 2048)); - entry.setUpdated(validateDate(getEntryUpdateDate(item), true)); - entry.setUrl(FeedUtils.truncate(FeedUtils.toAbsoluteUrl(item.getLink(), feed.getLink(), feedUrl), 2048)); - - // if link is empty but guid is used as url - if (StringUtils.isBlank(entry.getUrl()) && StringUtils.startsWith(entry.getGuid(), "http")) { - entry.setUrl(entry.getGuid()); - } - - FeedEntryContent content = new FeedEntryContent(); - content.setContent(getContent(item)); - content.setCategories(FeedUtils - .truncate(item.getCategories().stream().map(SyndCategory::getName).collect(Collectors.joining(", ")), 4096)); - content.setTitle(getTitle(item)); - content.setAuthor(StringUtils.trimToNull(item.getAuthor())); - - SyndEnclosure enclosure = Iterables.getFirst(item.getEnclosures(), null); - if (enclosure != null) { - content.setEnclosureUrl(FeedUtils.truncate(enclosure.getUrl(), 2048)); - content.setEnclosureType(enclosure.getType()); - } - - MediaEntryModule module = (MediaEntryModule) item.getModule(MediaModule.URI); - if (module != null) { - Media media = getMedia(module); - if (media != null) { - content.setMediaDescription(media.getDescription()); - content.setMediaThumbnailUrl(FeedUtils.truncate(media.getThumbnailUrl(), 2048)); - content.setMediaThumbnailWidth(media.getThumbnailWidth()); - content.setMediaThumbnailHeight(media.getThumbnailHeight()); - } - } - - entry.setContent(content); - - entries.add(entry); - } - - Date lastEntryDate = null; - Date publishedDate = validateDate(rss.getPublishedDate(), false); - if (!entries.isEmpty()) { - List sortedTimestamps = FeedUtils.getSortedTimestamps(entries); - Long timestamp = sortedTimestamps.get(0); - lastEntryDate = new Date(timestamp); - publishedDate = (publishedDate == null || publishedDate.before(lastEntryDate)) ? lastEntryDate : publishedDate; - } - feed.setLastPublishedDate(publishedDate); - feed.setAverageEntryInterval(FeedUtils.averageTimeBetweenEntries(entries)); - feed.setLastEntryDate(lastEntryDate); - - return new FeedParserResult(feed, entries, title); - } catch (Exception e) { - throw new FeedException(String.format("Could not parse feed from %s : %s", feedUrl, e.getMessage()), e); - } - } - - /** - * Adds atom links for rss feeds - */ - private void handleForeignMarkup(SyndFeed feed) { - List foreignMarkup = feed.getForeignMarkup(); - if (foreignMarkup == null) { - return; - } - for (Element element : foreignMarkup) { - if ("link".equals(element.getName()) && ATOM_10_NS.equals(element.getNamespace())) { - SyndLink link = new SyndLinkImpl(); - link.setRel(element.getAttributeValue("rel")); - link.setHref(element.getAttributeValue("href")); - feed.getLinks().add(link); - } - } - } - - private Date getEntryUpdateDate(SyndEntry item) { - Date date = item.getUpdatedDate(); - if (date == null) { - date = item.getPublishedDate(); - } - if (date == null) { - date = new Date(); - } - return date; - } - - private Date validateDate(Date date, boolean nullToNow) { - Date now = new Date(); - if (date == null) { - return nullToNow ? now : null; - } - if (date.before(START) || date.after(END)) { - return now; - } - - if (date.after(now)) { - return now; - } - return date; - } - - private String getContent(SyndEntry item) { - String content; - if (item.getContents().isEmpty()) { - content = item.getDescription() == null ? null : item.getDescription().getValue(); - } else { - content = item.getContents().stream().map(SyndContent::getValue).collect(Collectors.joining(System.lineSeparator())); - } - return StringUtils.trimToNull(content); - } - - private String getTitle(SyndEntry item) { - String title = item.getTitle(); - if (StringUtils.isBlank(title)) { - Date date = item.getPublishedDate(); - if (date != null) { - title = DateFormat.getInstance().format(date); - } else { - title = "(no title)"; - } - } - return StringUtils.trimToNull(title); - } - - private Media getMedia(MediaEntryModule module) { - Media media = getMedia(module.getMetadata()); - if (media == null && ArrayUtils.isNotEmpty(module.getMediaGroups())) { - MediaGroup group = module.getMediaGroups()[0]; - media = getMedia(group.getMetadata()); - } - - return media; - } - - private Media getMedia(Metadata metadata) { - if (metadata == null) { - return null; - } - - Media media = new Media(); - media.setDescription(metadata.getDescription()); - - if (ArrayUtils.isNotEmpty(metadata.getThumbnail())) { - Thumbnail thumbnail = metadata.getThumbnail()[0]; - media.setThumbnailWidth(thumbnail.getWidth()); - media.setThumbnailHeight(thumbnail.getHeight()); - - if (thumbnail.getUrl() != null) { - media.setThumbnailUrl(thumbnail.getUrl().toString()); - } - } - - if (media.isEmpty()) { - return null; - } - - return media; - } - - @Data - private static class Media { - private String description; - private String thumbnailUrl; - private Integer thumbnailWidth; - private Integer thumbnailHeight; - - public boolean isEmpty() { - return description == null && thumbnailUrl == null; - } - } - - @Value - public static class FeedParserResult { - Feed feed; - List entries; - String title; - } - -} diff --git a/commafeed-server/src/main/java/com/commafeed/backend/feed/FeedRefreshEngine.java b/commafeed-server/src/main/java/com/commafeed/backend/feed/FeedRefreshEngine.java index 3a828081..291d3061 100644 --- a/commafeed-server/src/main/java/com/commafeed/backend/feed/FeedRefreshEngine.java +++ b/commafeed-server/src/main/java/com/commafeed/backend/feed/FeedRefreshEngine.java @@ -156,7 +156,7 @@ public class FeedRefreshEngine implements Managed { private void processFeedAsync(Feed feed) { CompletableFuture.supplyAsync(() -> worker.update(feed), workerExecutor) - .thenApplyAsync(r -> updater.update(r.getFeed(), r.getEntries()), databaseUpdaterExecutor) + .thenApplyAsync(r -> updater.update(r.feed(), r.entries()), databaseUpdaterExecutor) .whenComplete((data, ex) -> { if (ex != null) { log.error("error while processing feed {}", feed.getUrl(), ex); diff --git a/commafeed-server/src/main/java/com/commafeed/backend/feed/FeedRefreshIntervalCalculator.java b/commafeed-server/src/main/java/com/commafeed/backend/feed/FeedRefreshIntervalCalculator.java index 3db7abff..3d19ca89 100644 --- a/commafeed-server/src/main/java/com/commafeed/backend/feed/FeedRefreshIntervalCalculator.java +++ b/commafeed-server/src/main/java/com/commafeed/backend/feed/FeedRefreshIntervalCalculator.java @@ -5,7 +5,6 @@ import java.util.Date; import org.apache.commons.lang3.time.DateUtils; import com.commafeed.CommaFeedConfiguration; -import com.commafeed.backend.model.Feed; import jakarta.inject.Inject; import jakarta.inject.Singleton; @@ -22,18 +21,19 @@ public class FeedRefreshIntervalCalculator { this.refreshIntervalMinutes = config.getApplicationSettings().getRefreshIntervalMinutes(); } - public Date onFetchSuccess(Feed feed) { + public Date onFetchSuccess(Date publishedDate, Long averageEntryInterval) { Date defaultRefreshInterval = getDefaultRefreshInterval(); - return heavyLoad ? computeRefreshIntervalForHeavyLoad(feed, defaultRefreshInterval) : defaultRefreshInterval; + return heavyLoad ? computeRefreshIntervalForHeavyLoad(publishedDate, averageEntryInterval, defaultRefreshInterval) + : defaultRefreshInterval; } - public Date onFeedNotModified(Feed feed) { + public Date onFeedNotModified(Date publishedDate, Long averageEntryInterval) { Date defaultRefreshInterval = getDefaultRefreshInterval(); - return heavyLoad ? computeRefreshIntervalForHeavyLoad(feed, defaultRefreshInterval) : defaultRefreshInterval; + return heavyLoad ? computeRefreshIntervalForHeavyLoad(publishedDate, averageEntryInterval, defaultRefreshInterval) + : defaultRefreshInterval; } - public Date onFetchError(Feed feed) { - int errorCount = feed.getErrorCount(); + public Date onFetchError(int errorCount) { int retriesBeforeDisable = 3; if (errorCount < retriesBeforeDisable || !heavyLoad) { return getDefaultRefreshInterval(); @@ -47,10 +47,8 @@ public class FeedRefreshIntervalCalculator { return DateUtils.addMinutes(new Date(), refreshIntervalMinutes); } - private Date computeRefreshIntervalForHeavyLoad(Feed feed, Date defaultRefreshInterval) { + private Date computeRefreshIntervalForHeavyLoad(Date publishedDate, Long averageEntryInterval, Date defaultRefreshInterval) { Date now = new Date(); - Date publishedDate = feed.getLastEntryDate(); - Long averageEntryInterval = feed.getAverageEntryInterval(); if (publishedDate == null) { // feed with no entries, recheck in 24 hours diff --git a/commafeed-server/src/main/java/com/commafeed/backend/feed/FeedRefreshUpdater.java b/commafeed-server/src/main/java/com/commafeed/backend/feed/FeedRefreshUpdater.java index d63a7e63..5e5a921d 100644 --- a/commafeed-server/src/main/java/com/commafeed/backend/feed/FeedRefreshUpdater.java +++ b/commafeed-server/src/main/java/com/commafeed/backend/feed/FeedRefreshUpdater.java @@ -16,9 +16,9 @@ import com.codahale.metrics.MetricRegistry; import com.commafeed.backend.cache.CacheService; import com.commafeed.backend.dao.FeedSubscriptionDAO; import com.commafeed.backend.dao.UnitOfWork; +import com.commafeed.backend.feed.parser.FeedParserResult.Content; +import com.commafeed.backend.feed.parser.FeedParserResult.Entry; import com.commafeed.backend.model.Feed; -import com.commafeed.backend.model.FeedEntry; -import com.commafeed.backend.model.FeedEntryContent; import com.commafeed.backend.model.FeedSubscription; import com.commafeed.backend.model.User; import com.commafeed.backend.service.FeedEntryService; @@ -72,7 +72,7 @@ public class FeedRefreshUpdater implements Managed { entryInserted = metrics.meter(MetricRegistry.name(getClass(), "entryInserted")); } - private AddEntryResult addEntry(final Feed feed, final FeedEntry entry, final List subscriptions) { + private AddEntryResult addEntry(final Feed feed, final Entry entry, final List subscriptions) { boolean processed = false; boolean inserted = false; @@ -82,8 +82,8 @@ public class FeedRefreshUpdater implements Managed { // lock on content, make sure we are not updating the same entry // twice at the same time - FeedEntryContent content = entry.getContent(); - String key2 = DigestUtils.sha1Hex(StringUtils.trimToEmpty(content.getContent() + content.getTitle())); + Content content = entry.content(); + String key2 = DigestUtils.sha1Hex(StringUtils.trimToEmpty(content.content() + content.title())); Iterator iterator = locks.bulkGet(Arrays.asList(key1, key2)).iterator(); Lock lock1 = iterator.next(); @@ -116,7 +116,7 @@ public class FeedRefreshUpdater implements Managed { return new AddEntryResult(processed, inserted); } - public boolean update(Feed feed, List entries) { + public boolean update(Feed feed, List entries) { boolean processed = true; boolean insertedAtLeastOneEntry = false; @@ -125,10 +125,10 @@ public class FeedRefreshUpdater implements Managed { List currentEntries = new ArrayList<>(); List subscriptions = null; - for (FeedEntry entry : entries) { - String cacheKey = cache.buildUniqueEntryKey(feed, entry); + for (Entry entry : entries) { + String cacheKey = cache.buildUniqueEntryKey(entry); if (!lastEntries.contains(cacheKey)) { - log.debug("cache miss for {}", entry.getUrl()); + log.debug("cache miss for {}", entry.url()); if (subscriptions == null) { subscriptions = unitOfWork.call(() -> feedSubscriptionDAO.findByFeed(feed)); } @@ -138,7 +138,7 @@ public class FeedRefreshUpdater implements Managed { entryCacheMiss.mark(); } else { - log.debug("cache hit for {}", entry.getUrl()); + log.debug("cache hit for {}", entry.url()); entryCacheHit.mark(); } diff --git a/commafeed-server/src/main/java/com/commafeed/backend/feed/FeedRefreshWorker.java b/commafeed-server/src/main/java/com/commafeed/backend/feed/FeedRefreshWorker.java index a8ed7b4c..11a0ebe4 100644 --- a/commafeed-server/src/main/java/com/commafeed/backend/feed/FeedRefreshWorker.java +++ b/commafeed-server/src/main/java/com/commafeed/backend/feed/FeedRefreshWorker.java @@ -11,16 +11,15 @@ import com.codahale.metrics.MetricRegistry; import com.commafeed.CommaFeedConfiguration; import com.commafeed.backend.HttpGetter.NotModifiedException; import com.commafeed.backend.feed.FeedFetcher.FeedFetcherResult; +import com.commafeed.backend.feed.parser.FeedParserResult.Entry; import com.commafeed.backend.model.Feed; -import com.commafeed.backend.model.FeedEntry; import jakarta.inject.Inject; import jakarta.inject.Singleton; -import lombok.Value; import lombok.extern.slf4j.Slf4j; /** - * Calls {@link FeedFetcher} and updates the Feed object, but does not update the database ({@link FeedRefreshUpdater} does that) + * Calls {@link FeedFetcher} and updates the Feed object, but does not update the database, ({@link FeedRefreshUpdater} does that) */ @Slf4j @Singleton @@ -44,32 +43,34 @@ public class FeedRefreshWorker { public FeedRefreshWorkerResult update(Feed feed) { try { String url = Optional.ofNullable(feed.getUrlAfterRedirect()).orElse(feed.getUrl()); - FeedFetcherResult feedFetcherResult = fetcher.fetch(url, false, feed.getLastModifiedHeader(), feed.getEtagHeader(), + FeedFetcherResult result = fetcher.fetch(url, false, feed.getLastModifiedHeader(), feed.getEtagHeader(), feed.getLastPublishedDate(), feed.getLastContentHash()); // stops here if NotModifiedException or any other exception is thrown - List entries = feedFetcherResult.getEntries(); Integer maxFeedCapacity = config.getApplicationSettings().getMaxFeedCapacity(); + List entries = result.feed().entries(); if (maxFeedCapacity > 0) { entries = entries.stream().limit(maxFeedCapacity).toList(); } - String urlAfterRedirect = feedFetcherResult.getUrlAfterRedirect(); + String urlAfterRedirect = result.urlAfterRedirect(); if (StringUtils.equals(url, urlAfterRedirect)) { urlAfterRedirect = null; } + feed.setUrlAfterRedirect(urlAfterRedirect); - feed.setLink(feedFetcherResult.getFeed().getLink()); - feed.setLastModifiedHeader(feedFetcherResult.getFeed().getLastModifiedHeader()); - feed.setEtagHeader(feedFetcherResult.getFeed().getEtagHeader()); - feed.setLastContentHash(feedFetcherResult.getFeed().getLastContentHash()); - feed.setLastPublishedDate(feedFetcherResult.getFeed().getLastPublishedDate()); - feed.setAverageEntryInterval(feedFetcherResult.getFeed().getAverageEntryInterval()); - feed.setLastEntryDate(feedFetcherResult.getFeed().getLastEntryDate()); + feed.setLink(result.feed().link()); + feed.setLastModifiedHeader(result.lastModifiedHeader()); + feed.setEtagHeader(result.lastETagHeader()); + feed.setLastContentHash(result.contentHash()); + feed.setLastPublishedDate(result.feed().lastPublishedDate()); + feed.setAverageEntryInterval(result.feed().averageEntryInterval()); + feed.setLastEntryDate(result.feed().lastEntryDate()); feed.setErrorCount(0); feed.setMessage(null); - feed.setDisabledUntil(refreshIntervalCalculator.onFetchSuccess(feedFetcherResult.getFeed())); + feed.setDisabledUntil( + refreshIntervalCalculator.onFetchSuccess(result.feed().lastPublishedDate(), result.feed().averageEntryInterval())); return new FeedRefreshWorkerResult(feed, entries); } catch (NotModifiedException e) { @@ -77,7 +78,7 @@ public class FeedRefreshWorker { feed.setErrorCount(0); feed.setMessage(e.getMessage()); - feed.setDisabledUntil(refreshIntervalCalculator.onFeedNotModified(feed)); + feed.setDisabledUntil(refreshIntervalCalculator.onFeedNotModified(feed.getLastPublishedDate(), feed.getAverageEntryInterval())); if (e.getNewLastModifiedHeader() != null) { feed.setLastModifiedHeader(e.getNewLastModifiedHeader()); @@ -93,7 +94,7 @@ public class FeedRefreshWorker { feed.setErrorCount(feed.getErrorCount() + 1); feed.setMessage("Unable to refresh feed : " + e.getMessage()); - feed.setDisabledUntil(refreshIntervalCalculator.onFetchError(feed)); + feed.setDisabledUntil(refreshIntervalCalculator.onFetchError(feed.getErrorCount())); return new FeedRefreshWorkerResult(feed, Collections.emptyList()); } finally { @@ -101,10 +102,7 @@ public class FeedRefreshWorker { } } - @Value - public static class FeedRefreshWorkerResult { - Feed feed; - List entries; + public record FeedRefreshWorkerResult(Feed feed, List entries) { } } diff --git a/commafeed-server/src/main/java/com/commafeed/backend/feed/FeedUtils.java b/commafeed-server/src/main/java/com/commafeed/backend/feed/FeedUtils.java index 3fa87e93..76904118 100644 --- a/commafeed-server/src/main/java/com/commafeed/backend/feed/FeedUtils.java +++ b/commafeed-server/src/main/java/com/commafeed/backend/feed/FeedUtils.java @@ -2,20 +2,12 @@ package com.commafeed.backend.feed; import java.net.MalformedURLException; import java.net.URL; -import java.nio.charset.Charset; -import java.util.Collection; -import java.util.Collections; import java.util.Iterator; import java.util.List; import java.util.regex.Pattern; -import org.ahocorasick.trie.Emit; -import org.ahocorasick.trie.Trie; -import org.ahocorasick.trie.Trie.TrieBuilder; import org.apache.commons.codec.binary.Base64; -import org.apache.commons.lang3.ArrayUtils; import org.apache.commons.lang3.StringUtils; -import org.apache.commons.math3.stat.descriptive.SummaryStatistics; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; @@ -29,8 +21,6 @@ import com.commafeed.backend.model.FeedSubscription; import com.commafeed.frontend.model.Entry; import com.google.gwt.i18n.client.HasDirection.Direction; import com.google.gwt.i18n.shared.BidiUtils; -import com.ibm.icu.text.CharsetDetector; -import com.ibm.icu.text.CharsetMatch; import lombok.extern.slf4j.Slf4j; @@ -50,70 +40,6 @@ public class FeedUtils { return string; } - /** - * Detect feed encoding by using the declared encoding in the xml processing instruction and by detecting the characters used in the - * feed - * - */ - public static Charset guessEncoding(byte[] bytes) { - String extracted = extractDeclaredEncoding(bytes); - if (StringUtils.startsWithIgnoreCase(extracted, "iso-8859-")) { - if (!StringUtils.endsWith(extracted, "1")) { - return Charset.forName(extracted); - } - } else if (StringUtils.startsWithIgnoreCase(extracted, "windows-")) { - return Charset.forName(extracted); - } - return detectEncoding(bytes); - } - - /** - * Detect encoding by analyzing characters in the array - */ - public static Charset detectEncoding(byte[] bytes) { - String encoding = "UTF-8"; - - CharsetDetector detector = new CharsetDetector(); - detector.setText(bytes); - CharsetMatch match = detector.detect(); - if (match != null) { - encoding = match.getName(); - } - if (encoding.equalsIgnoreCase("ISO-8859-1")) { - encoding = "windows-1252"; - } - return Charset.forName(encoding); - } - - public static String replaceHtmlEntitiesWithNumericEntities(String source) { - // Create a buffer sufficiently large that re-allocations are minimized. - StringBuilder sb = new StringBuilder(source.length() << 1); - - TrieBuilder builder = Trie.builder(); - builder.ignoreOverlaps(); - - for (String key : HtmlEntities.HTML_ENTITIES) { - builder.addKeyword(key); - } - - Trie trie = builder.build(); - Collection emits = trie.parseText(source); - - int prevIndex = 0; - for (Emit emit : emits) { - int matchIndex = emit.getStart(); - - sb.append(source, prevIndex, matchIndex); - sb.append(HtmlEntities.HTML_TO_NUMERIC_MAP.get(emit.getKeyword())); - prevIndex = emit.getEnd() + 1; - } - - // Add the remainder of the string (contains no more matches). - sb.append(source.substring(prevIndex)); - - return sb.toString(); - } - public static boolean isHttp(String url) { return url.startsWith("http://"); } @@ -122,6 +48,10 @@ public class FeedUtils { return url.startsWith("https://"); } + public static boolean isAbsoluteUrl(String url) { + return isHttp(url) || isHttps(url); + } + /** * Normalize the url. The resulting url is not meant to be fetched but rather used as a mean to identify a feed and avoid duplicates */ @@ -163,25 +93,6 @@ public class FeedUtils { return normalized; } - /** - * Extract the declared encoding from the xml - */ - public static String extractDeclaredEncoding(byte[] bytes) { - int index = ArrayUtils.indexOf(bytes, (byte) '>'); - if (index == -1) { - return null; - } - - String pi = new String(ArrayUtils.subarray(bytes, 0, index + 1)).replace('\'', '"'); - index = StringUtils.indexOf(pi, "encoding=\""); - if (index == -1) { - return null; - } - String encoding = pi.substring(index + 10); - encoding = encoding.substring(0, encoding.indexOf('"')); - return encoding; - } - public static boolean isRTL(FeedEntry entry) { String text = entry.getContent().getContent(); @@ -202,52 +113,6 @@ public class FeedUtils { return direction == Direction.RTL; } - public static String trimInvalidXmlCharacters(String xml) { - if (StringUtils.isBlank(xml)) { - return null; - } - StringBuilder sb = new StringBuilder(); - - boolean firstTagFound = false; - for (int i = 0; i < xml.length(); i++) { - char c = xml.charAt(i); - - if (!firstTagFound) { - if (c == '<') { - firstTagFound = true; - } else { - continue; - } - } - - if (c >= 32 || c == 9 || c == 10 || c == 13) { - if (!Character.isHighSurrogate(c) && !Character.isLowSurrogate(c)) { - sb.append(c); - } - } - } - return sb.toString(); - } - - public static Long averageTimeBetweenEntries(List entries) { - if (entries.isEmpty() || entries.size() == 1) { - return null; - } - - List timestamps = getSortedTimestamps(entries); - - SummaryStatistics stats = new SummaryStatistics(); - for (int i = 0; i < timestamps.size() - 1; i++) { - long diff = Math.abs(timestamps.get(i) - timestamps.get(i + 1)); - stats.addValue(diff); - } - return (long) stats.getMean(); - } - - public static List getSortedTimestamps(List entries) { - return entries.stream().map(t -> t.getUpdated().getTime()).sorted(Collections.reverseOrder()).toList(); - } - public static String removeTrailingSlash(String url) { if (url.endsWith("/")) { url = url.substring(0, url.length() - 1); @@ -256,8 +121,8 @@ public class FeedUtils { } /** - * - * @param url + * + * @param relativeUrl * the url of the entry * @param feedLink * the url of the feed as described in the feed @@ -265,32 +130,18 @@ public class FeedUtils { * the url of the feed that we used to fetch the feed * @return an absolute url pointing to the entry */ - public static String toAbsoluteUrl(String url, String feedLink, String feedUrl) { - url = StringUtils.trimToNull(StringUtils.normalizeSpace(url)); - if (url == null || url.startsWith("http")) { - return url; - } - - String baseUrl = (feedLink == null || isRelative(feedLink)) ? feedUrl : feedLink; - + public static String toAbsoluteUrl(String relativeUrl, String feedLink, String feedUrl) { + String baseUrl = (feedLink != null && isAbsoluteUrl(feedLink)) ? feedLink : feedUrl; if (baseUrl == null) { - return url; + return null; } - String result; try { - result = new URL(new URL(baseUrl), url).toString(); + return new URL(new URL(baseUrl), relativeUrl).toString(); } catch (MalformedURLException e) { log.debug("could not parse url : " + e.getMessage(), e); - result = url; + return null; } - - return result; - } - - public static boolean isRelative(final String url) { - // the regex means "start with 'scheme://'" - return url.startsWith("/") || url.startsWith("#") || !url.matches("^\\w+\\:\\/\\/.*"); } public static String getFaviconUrl(FeedSubscription subscription) { diff --git a/commafeed-server/src/main/java/com/commafeed/backend/feed/parser/EncodingDetector.java b/commafeed-server/src/main/java/com/commafeed/backend/feed/parser/EncodingDetector.java new file mode 100644 index 00000000..0dc092d7 --- /dev/null +++ b/commafeed-server/src/main/java/com/commafeed/backend/feed/parser/EncodingDetector.java @@ -0,0 +1,70 @@ +package com.commafeed.backend.feed.parser; + +import java.nio.charset.Charset; + +import org.apache.commons.lang3.ArrayUtils; +import org.apache.commons.lang3.StringUtils; + +import com.ibm.icu.text.CharsetDetector; +import com.ibm.icu.text.CharsetMatch; + +import jakarta.inject.Singleton; + +@Singleton +class EncodingDetector { + + /** + * Detect feed encoding by using the declared encoding in the xml processing instruction and by detecting the characters used in the + * feed + * + */ + public Charset getEncoding(byte[] bytes) { + String extracted = extractDeclaredEncoding(bytes); + if (StringUtils.startsWithIgnoreCase(extracted, "iso-8859-")) { + if (!StringUtils.endsWith(extracted, "1")) { + return Charset.forName(extracted); + } + } else if (StringUtils.startsWithIgnoreCase(extracted, "windows-")) { + return Charset.forName(extracted); + } + return detectEncoding(bytes); + } + + /** + * Extract the declared encoding from the xml + */ + public String extractDeclaredEncoding(byte[] bytes) { + int index = ArrayUtils.indexOf(bytes, (byte) '>'); + if (index == -1) { + return null; + } + + String pi = new String(ArrayUtils.subarray(bytes, 0, index + 1)).replace('\'', '"'); + index = StringUtils.indexOf(pi, "encoding=\""); + if (index == -1) { + return null; + } + String encoding = pi.substring(index + 10); + encoding = encoding.substring(0, encoding.indexOf('"')); + return encoding; + } + + /** + * Detect encoding by analyzing characters in the array + */ + private Charset detectEncoding(byte[] bytes) { + String encoding = "UTF-8"; + + CharsetDetector detector = new CharsetDetector(); + detector.setText(bytes); + CharsetMatch match = detector.detect(); + if (match != null) { + encoding = match.getName(); + } + if (encoding.equalsIgnoreCase("ISO-8859-1")) { + encoding = "windows-1252"; + } + return Charset.forName(encoding); + } + +} diff --git a/commafeed-server/src/main/java/com/commafeed/backend/feed/parser/FeedCleaner.java b/commafeed-server/src/main/java/com/commafeed/backend/feed/parser/FeedCleaner.java new file mode 100644 index 00000000..6a175239 --- /dev/null +++ b/commafeed-server/src/main/java/com/commafeed/backend/feed/parser/FeedCleaner.java @@ -0,0 +1,71 @@ +package com.commafeed.backend.feed.parser; + +import java.util.Collection; + +import org.ahocorasick.trie.Emit; +import org.ahocorasick.trie.Trie; +import org.ahocorasick.trie.Trie.TrieBuilder; +import org.apache.commons.lang3.StringUtils; + +import jakarta.inject.Singleton; + +@Singleton +class FeedCleaner { + + public String trimInvalidXmlCharacters(String xml) { + if (StringUtils.isBlank(xml)) { + return null; + } + StringBuilder sb = new StringBuilder(); + + boolean firstTagFound = false; + for (int i = 0; i < xml.length(); i++) { + char c = xml.charAt(i); + + if (!firstTagFound) { + if (c == '<') { + firstTagFound = true; + } else { + continue; + } + } + + if (c >= 32 || c == 9 || c == 10 || c == 13) { + if (!Character.isHighSurrogate(c) && !Character.isLowSurrogate(c)) { + sb.append(c); + } + } + } + return sb.toString(); + } + + public String replaceHtmlEntitiesWithNumericEntities(String source) { + // Create a buffer sufficiently large that re-allocations are minimized. + StringBuilder sb = new StringBuilder(source.length() << 1); + + TrieBuilder builder = Trie.builder(); + builder.ignoreOverlaps(); + + for (String key : HtmlEntities.HTML_ENTITIES) { + builder.addKeyword(key); + } + + Trie trie = builder.build(); + Collection emits = trie.parseText(source); + + int prevIndex = 0; + for (Emit emit : emits) { + int matchIndex = emit.getStart(); + + sb.append(source, prevIndex, matchIndex); + sb.append(HtmlEntities.HTML_TO_NUMERIC_MAP.get(emit.getKeyword())); + prevIndex = emit.getEnd() + 1; + } + + // Add the remainder of the string (contains no more matches). + sb.append(source.substring(prevIndex)); + + return sb.toString(); + } + +} diff --git a/commafeed-server/src/main/java/com/commafeed/backend/feed/parser/FeedParser.java b/commafeed-server/src/main/java/com/commafeed/backend/feed/parser/FeedParser.java new file mode 100644 index 00000000..951cb676 --- /dev/null +++ b/commafeed-server/src/main/java/com/commafeed/backend/feed/parser/FeedParser.java @@ -0,0 +1,271 @@ +package com.commafeed.backend.feed.parser; + +import java.io.StringReader; +import java.nio.charset.Charset; +import java.text.DateFormat; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.Date; +import java.util.List; +import java.util.stream.Collectors; + +import org.apache.commons.lang3.ArrayUtils; +import org.apache.commons.lang3.StringUtils; +import org.apache.commons.math3.stat.descriptive.SummaryStatistics; +import org.jdom2.Element; +import org.jdom2.Namespace; +import org.xml.sax.InputSource; + +import com.commafeed.backend.feed.FeedUtils; +import com.commafeed.backend.feed.parser.FeedParserResult.Content; +import com.commafeed.backend.feed.parser.FeedParserResult.Enclosure; +import com.commafeed.backend.feed.parser.FeedParserResult.Entry; +import com.commafeed.backend.feed.parser.FeedParserResult.Media; +import com.google.common.collect.Iterables; +import com.rometools.modules.mediarss.MediaEntryModule; +import com.rometools.modules.mediarss.MediaModule; +import com.rometools.modules.mediarss.types.MediaGroup; +import com.rometools.modules.mediarss.types.Metadata; +import com.rometools.modules.mediarss.types.Thumbnail; +import com.rometools.rome.feed.synd.SyndCategory; +import com.rometools.rome.feed.synd.SyndContent; +import com.rometools.rome.feed.synd.SyndEnclosure; +import com.rometools.rome.feed.synd.SyndEntry; +import com.rometools.rome.feed.synd.SyndFeed; +import com.rometools.rome.feed.synd.SyndLink; +import com.rometools.rome.feed.synd.SyndLinkImpl; +import com.rometools.rome.io.FeedException; +import com.rometools.rome.io.SyndFeedInput; + +import jakarta.inject.Inject; +import jakarta.inject.Singleton; +import lombok.RequiredArgsConstructor; + +/** + * Parses raw xml into a FeedParserResult object + */ +@RequiredArgsConstructor(onConstructor = @__({ @Inject })) +@Singleton +public class FeedParser { + + private static final Namespace ATOM_10_NS = Namespace.getNamespace("http://www.w3.org/2005/Atom"); + + private static final Date START = new Date(86400000); + private static final Date END = new Date(1000L * Integer.MAX_VALUE - 86400000); + + private final EncodingDetector encodingDetector; + private final FeedCleaner feedCleaner; + + public FeedParserResult parse(String feedUrl, byte[] xml) throws FeedException { + try { + Charset encoding = encodingDetector.getEncoding(xml); + String xmlString = feedCleaner.trimInvalidXmlCharacters(new String(xml, encoding)); + if (xmlString == null) { + throw new FeedException("Input string is null for url " + feedUrl); + } + xmlString = feedCleaner.replaceHtmlEntitiesWithNumericEntities(xmlString); + + InputSource source = new InputSource(new StringReader(xmlString)); + SyndFeed feed = new SyndFeedInput().build(source); + handleForeignMarkup(feed); + + String title = feed.getTitle(); + String link = feed.getLink(); + List entries = buildEntries(feed, feedUrl); + Date lastEntryDate = entries.stream().findFirst().map(Entry::updated).orElse(null); + Date lastPublishedDate = validateDate(feed.getPublishedDate(), false); + if (lastPublishedDate == null || lastPublishedDate.before(lastEntryDate)) { + lastPublishedDate = lastEntryDate; + } + Long averageEntryInterval = averageTimeBetweenEntries(entries); + + return new FeedParserResult(title, link, lastPublishedDate, averageEntryInterval, lastEntryDate, entries); + } catch (Exception e) { + throw new FeedException(String.format("Could not parse feed from %s : %s", feedUrl, e.getMessage()), e); + } + } + + /** + * Adds atom links for rss feeds + */ + private void handleForeignMarkup(SyndFeed feed) { + List foreignMarkup = feed.getForeignMarkup(); + if (foreignMarkup == null) { + return; + } + for (Element element : foreignMarkup) { + if ("link".equals(element.getName()) && ATOM_10_NS.equals(element.getNamespace())) { + SyndLink link = new SyndLinkImpl(); + link.setRel(element.getAttributeValue("rel")); + link.setHref(element.getAttributeValue("href")); + feed.getLinks().add(link); + } + } + } + + private List buildEntries(SyndFeed feed, String feedUrl) { + List entries = new ArrayList<>(); + + for (SyndEntry item : feed.getEntries()) { + String guid = item.getUri(); + if (StringUtils.isBlank(guid)) { + guid = item.getLink(); + } + if (StringUtils.isBlank(guid)) { + // no guid and no link, skip entry + continue; + } + + String url = buildEntryUrl(feed, feedUrl, item); + if (StringUtils.isBlank(url) && FeedUtils.isAbsoluteUrl(guid)) { + // if link is empty but guid is used as url, use guid + url = guid; + } + + Date updated = buildEntryUpdateDate(item); + Content content = buildContent(item); + + entries.add(new Entry(guid, url, updated, content)); + } + + entries.sort(Comparator.comparing(Entry::updated).reversed()); + return entries; + } + + private Content buildContent(SyndEntry item) { + String title = getTitle(item); + String content = getContent(item); + String author = StringUtils.trimToNull(item.getAuthor()); + String categories = StringUtils + .trimToNull(item.getCategories().stream().map(SyndCategory::getName).collect(Collectors.joining(", "))); + + Enclosure enclosure = buildEnclosure(item); + Media media = buildMedia(item); + return new Content(title, content, author, categories, enclosure, media); + } + + private Enclosure buildEnclosure(SyndEntry item) { + SyndEnclosure enclosure = Iterables.getFirst(item.getEnclosures(), null); + if (enclosure == null) { + return null; + } + + return new Enclosure(enclosure.getUrl(), enclosure.getType()); + } + + private Date buildEntryUpdateDate(SyndEntry item) { + Date date = item.getUpdatedDate(); + if (date == null) { + date = item.getPublishedDate(); + } + if (date == null) { + date = new Date(); + } + return validateDate(date, true); + } + + private String buildEntryUrl(SyndFeed feed, String feedUrl, SyndEntry item) { + String url = StringUtils.trimToNull(StringUtils.normalizeSpace(item.getLink())); + if (url == null || FeedUtils.isAbsoluteUrl(url)) { + // url is absolute, nothing to do + return url; + } + + // url is relative, trying to resolve it + String feedLink = StringUtils.trimToNull(StringUtils.normalizeSpace(feed.getLink())); + return FeedUtils.toAbsoluteUrl(url, feedLink, feedUrl); + } + + private Date validateDate(Date date, boolean nullToNow) { + Date now = new Date(); + if (date == null) { + return nullToNow ? now : null; + } + if (date.before(START) || date.after(END)) { + return now; + } + + if (date.after(now)) { + return now; + } + return date; + } + + private String getContent(SyndEntry item) { + String content; + if (item.getContents().isEmpty()) { + content = item.getDescription() == null ? null : item.getDescription().getValue(); + } else { + content = item.getContents().stream().map(SyndContent::getValue).collect(Collectors.joining(System.lineSeparator())); + } + return StringUtils.trimToNull(content); + } + + private String getTitle(SyndEntry item) { + String title = item.getTitle(); + if (StringUtils.isBlank(title)) { + Date date = item.getPublishedDate(); + if (date != null) { + title = DateFormat.getInstance().format(date); + } else { + title = "(no title)"; + } + } + return StringUtils.trimToNull(title); + } + + private Media buildMedia(SyndEntry item) { + MediaEntryModule module = (MediaEntryModule) item.getModule(MediaModule.URI); + if (module == null) { + return null; + } + + Media media = buildMedia(module.getMetadata()); + if (media == null && ArrayUtils.isNotEmpty(module.getMediaGroups())) { + MediaGroup group = module.getMediaGroups()[0]; + media = buildMedia(group.getMetadata()); + } + + return media; + } + + private Media buildMedia(Metadata metadata) { + if (metadata == null) { + return null; + } + + String description = metadata.getDescription(); + + String thumbnailUrl = null; + Integer thumbnailWidth = null; + Integer thumbnailHeight = null; + if (ArrayUtils.isNotEmpty(metadata.getThumbnail())) { + Thumbnail thumbnail = metadata.getThumbnail()[0]; + thumbnailWidth = thumbnail.getWidth(); + thumbnailHeight = thumbnail.getHeight(); + if (thumbnail.getUrl() != null) { + thumbnailUrl = thumbnail.getUrl().toString(); + } + } + + if (description == null && thumbnailUrl == null) { + return null; + } + + return new Media(description, thumbnailUrl, thumbnailWidth, thumbnailHeight); + } + + private Long averageTimeBetweenEntries(List entries) { + if (entries.isEmpty() || entries.size() == 1) { + return null; + } + + SummaryStatistics stats = new SummaryStatistics(); + for (int i = 0; i < entries.size() - 1; i++) { + long diff = Math.abs(entries.get(i).updated().getTime() - entries.get(i + 1).updated().getTime()); + stats.addValue(diff); + } + return (long) stats.getMean(); + } + +} diff --git a/commafeed-server/src/main/java/com/commafeed/backend/feed/parser/FeedParserResult.java b/commafeed-server/src/main/java/com/commafeed/backend/feed/parser/FeedParserResult.java new file mode 100644 index 00000000..912afee0 --- /dev/null +++ b/commafeed-server/src/main/java/com/commafeed/backend/feed/parser/FeedParserResult.java @@ -0,0 +1,20 @@ +package com.commafeed.backend.feed.parser; + +import java.util.Date; +import java.util.List; + +public record FeedParserResult(String title, String link, Date lastPublishedDate, Long averageEntryInterval, Date lastEntryDate, + List entries) { + public record Entry(String guid, String url, Date updated, Content content) { + } + + public record Content(String title, String content, String author, String categories, Enclosure enclosure, Media media) { + } + + public record Enclosure(String url, String type) { + } + + public record Media(String description, String thumbnailUrl, Integer thumbnailWidth, Integer thumbnailHeight) { + } + +} diff --git a/commafeed-server/src/main/java/com/commafeed/backend/feed/HtmlEntities.java b/commafeed-server/src/main/java/com/commafeed/backend/feed/parser/HtmlEntities.java similarity index 95% rename from commafeed-server/src/main/java/com/commafeed/backend/feed/HtmlEntities.java rename to commafeed-server/src/main/java/com/commafeed/backend/feed/parser/HtmlEntities.java index 8b30fd9a..d38cccea 100644 --- a/commafeed-server/src/main/java/com/commafeed/backend/feed/HtmlEntities.java +++ b/commafeed-server/src/main/java/com/commafeed/backend/feed/parser/HtmlEntities.java @@ -1,10 +1,13 @@ -package com.commafeed.backend.feed; +package com.commafeed.backend.feed.parser; import java.util.Collections; import java.util.LinkedHashMap; import java.util.Map; -public class HtmlEntities { +import lombok.experimental.UtilityClass; + +@UtilityClass +class HtmlEntities { public static final Map HTML_TO_NUMERIC_MAP; public static final String[] HTML_ENTITIES; public static final String[] NUMERIC_ENTITIES; diff --git a/commafeed-server/src/main/java/com/commafeed/backend/model/FeedEntryContent.java b/commafeed-server/src/main/java/com/commafeed/backend/model/FeedEntryContent.java index dbe953a1..04376ce2 100644 --- a/commafeed-server/src/main/java/com/commafeed/backend/model/FeedEntryContent.java +++ b/commafeed-server/src/main/java/com/commafeed/backend/model/FeedEntryContent.java @@ -3,7 +3,6 @@ package com.commafeed.backend.model; import java.sql.Types; import java.util.Set; -import org.apache.commons.lang3.builder.EqualsBuilder; import org.hibernate.annotations.JdbcTypeCode; import jakarta.persistence.Column; @@ -61,22 +60,4 @@ public class FeedEntryContent extends AbstractModel { @OneToMany(mappedBy = "content") private Set entries; - public boolean equivalentTo(FeedEntryContent c) { - if (c == null) { - return false; - } - - return new EqualsBuilder().append(title, c.title) - .append(content, c.content) - .append(author, c.author) - .append(enclosureUrl, c.enclosureUrl) - .append(enclosureType, c.enclosureType) - .append(mediaDescription, c.mediaDescription) - .append(mediaThumbnailUrl, c.mediaThumbnailUrl) - .append(mediaThumbnailWidth, c.mediaThumbnailWidth) - .append(mediaThumbnailHeight, c.mediaThumbnailHeight) - .append(categories, c.categories) - .build(); - } - } diff --git a/commafeed-server/src/main/java/com/commafeed/backend/service/FeedEntryContentService.java b/commafeed-server/src/main/java/com/commafeed/backend/service/FeedEntryContentService.java index 85eb1809..636b1a65 100644 --- a/commafeed-server/src/main/java/com/commafeed/backend/service/FeedEntryContentService.java +++ b/commafeed-server/src/main/java/com/commafeed/backend/service/FeedEntryContentService.java @@ -8,6 +8,7 @@ import java.util.Optional; import org.apache.commons.codec.digest.DigestUtils; import org.apache.commons.lang3.StringUtils; +import org.apache.commons.lang3.builder.EqualsBuilder; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Document.OutputSettings; @@ -23,6 +24,9 @@ import org.w3c.dom.css.CSSStyleDeclaration; import com.commafeed.backend.dao.FeedEntryContentDAO; import com.commafeed.backend.feed.FeedUtils; +import com.commafeed.backend.feed.parser.FeedParserResult.Content; +import com.commafeed.backend.feed.parser.FeedParserResult.Enclosure; +import com.commafeed.backend.feed.parser.FeedParserResult.Media; import com.commafeed.backend.model.FeedEntryContent; import com.steadystate.css.parser.CSSOMParser; @@ -46,26 +50,65 @@ public class FeedEntryContentService { /** * this is NOT thread-safe */ - public FeedEntryContent findOrCreate(FeedEntryContent content, String baseUrl) { - content.setAuthor(FeedUtils.truncate(handleContent(content.getAuthor(), baseUrl, true), 128)); - content.setTitle(FeedUtils.truncate(handleContent(content.getTitle(), baseUrl, true), 2048)); - content.setContent(handleContent(content.getContent(), baseUrl, false)); - content.setMediaDescription(handleContent(content.getMediaDescription(), baseUrl, false)); + public FeedEntryContent findOrCreate(Content content, String baseUrl) { + String title = FeedUtils.truncate(handleContent(content.title(), baseUrl, true), 2048); + String titleHash = DigestUtils.sha1Hex(StringUtils.trimToEmpty(title)); - String contentHash = DigestUtils.sha1Hex(StringUtils.trimToEmpty(content.getContent())); - content.setContentHash(contentHash); - - String titleHash = DigestUtils.sha1Hex(StringUtils.trimToEmpty(content.getTitle())); - content.setTitleHash(titleHash); + String contentString = handleContent(content.content(), baseUrl, false); + String contentHash = DigestUtils.sha1Hex(StringUtils.trimToEmpty(contentString)); List existing = feedEntryContentDAO.findExisting(contentHash, titleHash); - Optional equivalentContent = existing.stream().filter(content::equivalentTo).findFirst(); + Optional equivalentContent = existing.stream() + .filter(c -> isEquivalent(c, content, title, contentString)) + .findFirst(); if (equivalentContent.isPresent()) { return equivalentContent.get(); } - feedEntryContentDAO.saveOrUpdate(content); - return content; + FeedEntryContent entryContent = new FeedEntryContent(); + entryContent.setTitle(title); + entryContent.setTitleHash(titleHash); + entryContent.setContent(contentString); + entryContent.setContentHash(contentHash); + entryContent.setAuthor(FeedUtils.truncate(handleContent(content.author(), baseUrl, true), 128)); + entryContent.setCategories(FeedUtils.truncate(content.categories(), 4096)); + + Enclosure enclosure = content.enclosure(); + if (enclosure != null) { + entryContent.setEnclosureUrl(enclosure.url()); + entryContent.setEnclosureType(enclosure.type()); + } + + Media media = content.media(); + if (media != null) { + entryContent.setMediaDescription(handleContent(media.description(), baseUrl, false)); + entryContent.setMediaThumbnailUrl(media.thumbnailUrl()); + entryContent.setMediaThumbnailWidth(media.thumbnailWidth()); + entryContent.setMediaThumbnailHeight(media.thumbnailHeight()); + } + + feedEntryContentDAO.saveOrUpdate(entryContent); + return entryContent; + } + + private boolean isEquivalent(FeedEntryContent content, Content c, String title, String contentString) { + EqualsBuilder builder = new EqualsBuilder().append(content.getTitle(), title) + .append(content.getContent(), contentString) + .append(content.getAuthor(), c.author()) + .append(content.getCategories(), c.categories()); + + if (c.enclosure() != null) { + builder.append(content.getEnclosureUrl(), c.enclosure().url()).append(content.getEnclosureType(), c.enclosure().type()); + } + + if (c.media() != null) { + builder.append(content.getMediaDescription(), c.media().description()) + .append(content.getMediaThumbnailUrl(), c.media().thumbnailUrl()) + .append(content.getMediaThumbnailWidth(), c.media().thumbnailWidth()) + .append(content.getMediaThumbnailHeight(), c.media().thumbnailHeight()); + } + + return builder.build(); } private static Safelist buildWhiteList() { diff --git a/commafeed-server/src/main/java/com/commafeed/backend/service/FeedEntryService.java b/commafeed-server/src/main/java/com/commafeed/backend/service/FeedEntryService.java index e94d96b7..5bff1883 100644 --- a/commafeed-server/src/main/java/com/commafeed/backend/service/FeedEntryService.java +++ b/commafeed-server/src/main/java/com/commafeed/backend/service/FeedEntryService.java @@ -10,9 +10,10 @@ import com.commafeed.backend.dao.FeedEntryDAO; import com.commafeed.backend.dao.FeedEntryStatusDAO; import com.commafeed.backend.dao.FeedSubscriptionDAO; import com.commafeed.backend.feed.FeedEntryKeyword; +import com.commafeed.backend.feed.FeedUtils; +import com.commafeed.backend.feed.parser.FeedParserResult.Entry; import com.commafeed.backend.model.Feed; import com.commafeed.backend.model.FeedEntry; -import com.commafeed.backend.model.FeedEntryContent; import com.commafeed.backend.model.FeedEntryStatus; import com.commafeed.backend.model.FeedSubscription; import com.commafeed.backend.model.User; @@ -37,30 +38,27 @@ public class FeedEntryService { /** * this is NOT thread-safe */ - public boolean addEntry(Feed feed, FeedEntry entry, List subscriptions) { - - Long existing = feedEntryDAO.findExisting(entry.getGuid(), feed); + public boolean addEntry(Feed feed, Entry entry, List subscriptions) { + String guid = FeedUtils.truncate(entry.guid(), 2048); + String guidHash = DigestUtils.sha1Hex(entry.guid()); + Long existing = feedEntryDAO.findExisting(guidHash, feed); if (existing != null) { return false; } - FeedEntryContent content = feedEntryContentService.findOrCreate(entry.getContent(), feed.getLink()); - entry.setGuidHash(DigestUtils.sha1Hex(entry.getGuid())); - entry.setContent(content); - entry.setInserted(new Date()); - entry.setFeed(feed); - feedEntryDAO.saveOrUpdate(entry); + FeedEntry feedEntry = buildEntry(feed, entry, guid, guidHash); + feedEntryDAO.saveOrUpdate(feedEntry); // if filter does not match the entry, mark it as read for (FeedSubscription sub : subscriptions) { boolean matches = true; try { - matches = feedEntryFilteringService.filterMatchesEntry(sub.getFilter(), entry); + matches = feedEntryFilteringService.filterMatchesEntry(sub.getFilter(), feedEntry); } catch (FeedEntryFilteringService.FeedEntryFilterException e) { log.error("could not evaluate filter {}", sub.getFilter(), e); } if (!matches) { - FeedEntryStatus status = new FeedEntryStatus(sub.getUser(), sub, entry); + FeedEntryStatus status = new FeedEntryStatus(sub.getUser(), sub, feedEntry); status.setRead(true); feedEntryStatusDAO.saveOrUpdate(status); } @@ -69,8 +67,20 @@ public class FeedEntryService { return true; } - public void markEntry(User user, Long entryId, boolean read) { + private FeedEntry buildEntry(Feed feed, Entry e, String guid, String guidHash) { + FeedEntry entry = new FeedEntry(); + entry.setGuid(guid); + entry.setGuidHash(guidHash); + entry.setUrl(FeedUtils.truncate(e.url(), 2048)); + entry.setUpdated(e.updated()); + entry.setInserted(new Date()); + entry.setFeed(feed); + entry.setContent(feedEntryContentService.findOrCreate(e.content(), feed.getLink())); + return entry; + } + + public void markEntry(User user, Long entryId, boolean read) { FeedEntry entry = feedEntryDAO.findById(entryId); if (entry == null) { return; diff --git a/commafeed-server/src/main/java/com/commafeed/backend/service/FeedService.java b/commafeed-server/src/main/java/com/commafeed/backend/service/FeedService.java index 5fbc0b81..f8e6bb92 100644 --- a/commafeed-server/src/main/java/com/commafeed/backend/service/FeedService.java +++ b/commafeed-server/src/main/java/com/commafeed/backend/service/FeedService.java @@ -37,13 +37,14 @@ public class FeedService { } public synchronized Feed findOrCreate(String url) { - String normalized = FeedUtils.normalizeURL(url); - Feed feed = feedDAO.findByUrl(normalized); + String normalizedUrl = FeedUtils.normalizeURL(url); + String normalizedUrlHash = DigestUtils.sha1Hex(normalizedUrl); + Feed feed = feedDAO.findByUrl(normalizedUrl, normalizedUrlHash); if (feed == null) { feed = new Feed(); feed.setUrl(url); - feed.setNormalizedUrl(normalized); - feed.setNormalizedUrlHash(DigestUtils.sha1Hex(normalized)); + feed.setNormalizedUrl(normalizedUrl); + feed.setNormalizedUrlHash(normalizedUrlHash); feed.setDisabledUntil(new Date(0)); feedDAO.saveOrUpdate(feed); } @@ -55,6 +56,7 @@ public class FeedService { feed.setNormalizedUrl(normalized); feed.setNormalizedUrlHash(DigestUtils.sha1Hex(normalized)); feed.setLastUpdated(new Date()); + feed.setEtagHeader(FeedUtils.truncate(feed.getEtagHeader(), 255)); feedDAO.saveOrUpdate(feed); } diff --git a/commafeed-server/src/main/java/com/commafeed/frontend/resource/FeedREST.java b/commafeed-server/src/main/java/com/commafeed/frontend/resource/FeedREST.java index c3d116bc..8d91f2d5 100644 --- a/commafeed-server/src/main/java/com/commafeed/frontend/resource/FeedREST.java +++ b/commafeed-server/src/main/java/com/commafeed/frontend/resource/FeedREST.java @@ -245,8 +245,8 @@ public class FeedREST { try { FeedFetcherResult feedFetcherResult = feedFetcher.fetch(url, true, null, null, null, null); info = new FeedInfo(); - info.setUrl(feedFetcherResult.getUrlAfterRedirect()); - info.setTitle(feedFetcherResult.getTitle()); + info.setUrl(feedFetcherResult.urlAfterRedirect()); + info.setTitle(feedFetcherResult.feed().title()); } catch (Exception e) { log.debug(e.getMessage(), e); diff --git a/commafeed-server/src/test/java/com/commafeed/backend/feed/FeedFetcherTest.java b/commafeed-server/src/test/java/com/commafeed/backend/feed/FeedFetcherTest.java index 562f5cda..fcec5b41 100644 --- a/commafeed-server/src/test/java/com/commafeed/backend/feed/FeedFetcherTest.java +++ b/commafeed-server/src/test/java/com/commafeed/backend/feed/FeedFetcherTest.java @@ -15,6 +15,7 @@ import org.mockito.junit.jupiter.MockitoExtension; import com.commafeed.backend.HttpGetter; import com.commafeed.backend.HttpGetter.HttpResult; import com.commafeed.backend.HttpGetter.NotModifiedException; +import com.commafeed.backend.feed.parser.FeedParser; import com.commafeed.backend.urlprovider.FeedURLProvider; @ExtendWith(MockitoExtension.class) diff --git a/commafeed-server/src/test/java/com/commafeed/backend/feed/FeedUtilsTest.java b/commafeed-server/src/test/java/com/commafeed/backend/feed/FeedUtilsTest.java index 8ed36c6e..e4e3ef80 100644 --- a/commafeed-server/src/test/java/com/commafeed/backend/feed/FeedUtilsTest.java +++ b/commafeed-server/src/test/java/com/commafeed/backend/feed/FeedUtilsTest.java @@ -64,21 +64,6 @@ class FeedUtilsTest { } - @Test - void testExtractDeclaredEncoding() { - Assertions.assertNull(FeedUtils.extractDeclaredEncoding("".getBytes())); - Assertions.assertNull(FeedUtils.extractDeclaredEncoding("".getBytes())); - Assertions.assertEquals("UTF-8", FeedUtils.extractDeclaredEncoding("".getBytes())); - Assertions.assertEquals("UTF-8", FeedUtils.extractDeclaredEncoding("".getBytes())); - Assertions.assertEquals("UTF-8", FeedUtils.extractDeclaredEncoding("".getBytes())); - } - - @Test - void testReplaceHtmlEntitiesWithNumericEntities() { - String source = "T´l´phone ′"; - Assertions.assertEquals("T´l´phone ′", FeedUtils.replaceHtmlEntitiesWithNumericEntities(source)); - } - @Test void testRemoveTrailingSlash() { final String url = "http://localhost/"; diff --git a/commafeed-server/src/test/java/com/commafeed/backend/feed/parser/EncodingDetectorTest.java b/commafeed-server/src/test/java/com/commafeed/backend/feed/parser/EncodingDetectorTest.java new file mode 100644 index 00000000..cf88d11f --- /dev/null +++ b/commafeed-server/src/test/java/com/commafeed/backend/feed/parser/EncodingDetectorTest.java @@ -0,0 +1,19 @@ +package com.commafeed.backend.feed.parser; + +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +class EncodingDetectorTest { + + EncodingDetector encodingDetector = new EncodingDetector(); + + @Test + void testExtractDeclaredEncoding() { + Assertions.assertNull(encodingDetector.extractDeclaredEncoding("".getBytes())); + Assertions.assertNull(encodingDetector.extractDeclaredEncoding("".getBytes())); + Assertions.assertEquals("UTF-8", encodingDetector.extractDeclaredEncoding("".getBytes())); + Assertions.assertEquals("UTF-8", encodingDetector.extractDeclaredEncoding("".getBytes())); + Assertions.assertEquals("UTF-8", encodingDetector.extractDeclaredEncoding("".getBytes())); + } + +} \ No newline at end of file diff --git a/commafeed-server/src/test/java/com/commafeed/backend/feed/parser/FeedCleanerTest.java b/commafeed-server/src/test/java/com/commafeed/backend/feed/parser/FeedCleanerTest.java new file mode 100644 index 00000000..68a7584c --- /dev/null +++ b/commafeed-server/src/test/java/com/commafeed/backend/feed/parser/FeedCleanerTest.java @@ -0,0 +1,16 @@ +package com.commafeed.backend.feed.parser; + +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +class FeedCleanerTest { + + FeedCleaner feedCleaner = new FeedCleaner(); + + @Test + void testReplaceHtmlEntitiesWithNumericEntities() { + String source = "T´l´phone ′"; + Assertions.assertEquals("T´l´phone ′", feedCleaner.replaceHtmlEntitiesWithNumericEntities(source)); + } + +} \ No newline at end of file