forked from Archives/Athou_commafeed
feed refresh engine now uses its own immutable model
This commit is contained in:
@@ -4,8 +4,8 @@ import java.util.List;
|
||||
|
||||
import org.apache.commons.codec.digest.DigestUtils;
|
||||
|
||||
import com.commafeed.backend.feed.parser.FeedParserResult.Entry;
|
||||
import com.commafeed.backend.model.Feed;
|
||||
import com.commafeed.backend.model.FeedEntry;
|
||||
import com.commafeed.backend.model.FeedSubscription;
|
||||
import com.commafeed.backend.model.User;
|
||||
import com.commafeed.frontend.model.Category;
|
||||
@@ -18,8 +18,8 @@ public abstract class CacheService {
|
||||
|
||||
public abstract void setLastEntries(Feed feed, List<String> entries);
|
||||
|
||||
public String buildUniqueEntryKey(Feed feed, FeedEntry entry) {
|
||||
return DigestUtils.sha1Hex(entry.getGuid() + entry.getUrl());
|
||||
public String buildUniqueEntryKey(Entry entry) {
|
||||
return DigestUtils.sha1Hex(entry.guid() + entry.url());
|
||||
}
|
||||
|
||||
// user categories
|
||||
|
||||
@@ -3,7 +3,6 @@ package com.commafeed.backend.dao;
|
||||
import java.util.Date;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.commons.codec.digest.DigestUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.hibernate.SessionFactory;
|
||||
|
||||
@@ -45,8 +44,8 @@ public class FeedDAO extends GenericDAO<Feed> {
|
||||
updateQuery(feed).set(feed.disabledUntil, date).where(feed.id.in(feedIds)).execute();
|
||||
}
|
||||
|
||||
public Feed findByUrl(String normalizedUrl) {
|
||||
List<Feed> feeds = query().selectFrom(feed).where(feed.normalizedUrlHash.eq(DigestUtils.sha1Hex(normalizedUrl))).fetch();
|
||||
public Feed findByUrl(String normalizedUrl, String normalizedUrlHash) {
|
||||
List<Feed> feeds = query().selectFrom(feed).where(feed.normalizedUrlHash.eq(normalizedUrlHash)).fetch();
|
||||
Feed feed = Iterables.getFirst(feeds, null);
|
||||
if (feed != null && StringUtils.equals(normalizedUrl, feed.getNormalizedUrl())) {
|
||||
return feed;
|
||||
|
||||
@@ -2,7 +2,6 @@ package com.commafeed.backend.dao;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.commons.codec.digest.DigestUtils;
|
||||
import org.hibernate.SessionFactory;
|
||||
|
||||
import com.commafeed.backend.model.Feed;
|
||||
@@ -26,12 +25,8 @@ public class FeedEntryDAO extends GenericDAO<FeedEntry> {
|
||||
super(sessionFactory);
|
||||
}
|
||||
|
||||
public Long findExisting(String guid, Feed feed) {
|
||||
return query().select(entry.id)
|
||||
.from(entry)
|
||||
.where(entry.guidHash.eq(DigestUtils.sha1Hex(guid)), entry.feed.eq(feed))
|
||||
.limit(1)
|
||||
.fetchOne();
|
||||
public Long findExisting(String guidHash, Feed feed) {
|
||||
return query().select(entry.id).from(entry).where(entry.guidHash.eq(guidHash), entry.feed.eq(feed)).limit(1).fetchOne();
|
||||
}
|
||||
|
||||
public List<FeedCapacity> findFeedsExceedingCapacity(long maxCapacity, long max) {
|
||||
|
||||
@@ -2,7 +2,6 @@ package com.commafeed.backend.feed;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Date;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.commons.codec.binary.StringUtils;
|
||||
@@ -11,16 +10,14 @@ import org.apache.commons.codec.digest.DigestUtils;
|
||||
import com.commafeed.backend.HttpGetter;
|
||||
import com.commafeed.backend.HttpGetter.HttpResult;
|
||||
import com.commafeed.backend.HttpGetter.NotModifiedException;
|
||||
import com.commafeed.backend.feed.FeedParser.FeedParserResult;
|
||||
import com.commafeed.backend.model.Feed;
|
||||
import com.commafeed.backend.model.FeedEntry;
|
||||
import com.commafeed.backend.feed.parser.FeedParser;
|
||||
import com.commafeed.backend.feed.parser.FeedParserResult;
|
||||
import com.commafeed.backend.urlprovider.FeedURLProvider;
|
||||
import com.rometools.rome.io.FeedException;
|
||||
|
||||
import jakarta.inject.Inject;
|
||||
import jakarta.inject.Singleton;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.Value;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
/**
|
||||
@@ -79,20 +76,16 @@ public class FeedFetcher {
|
||||
etagHeaderValueChanged ? result.getETag() : null);
|
||||
}
|
||||
|
||||
if (lastPublishedDate != null && parserResult.getFeed().getLastPublishedDate() != null
|
||||
&& lastPublishedDate.getTime() == parserResult.getFeed().getLastPublishedDate().getTime()) {
|
||||
if (lastPublishedDate != null && parserResult.lastPublishedDate() != null
|
||||
&& lastPublishedDate.getTime() == parserResult.lastPublishedDate().getTime()) {
|
||||
log.debug("publishedDate not modified: {}", feedUrl);
|
||||
throw new NotModifiedException("publishedDate not modified",
|
||||
lastModifiedHeaderValueChanged ? result.getLastModifiedSince() : null,
|
||||
etagHeaderValueChanged ? result.getETag() : null);
|
||||
}
|
||||
|
||||
Feed feed = parserResult.getFeed();
|
||||
feed.setLastModifiedHeader(result.getLastModifiedSince());
|
||||
feed.setEtagHeader(FeedUtils.truncate(result.getETag(), 255));
|
||||
feed.setLastContentHash(hash);
|
||||
return new FeedFetcherResult(parserResult.getFeed(), parserResult.getEntries(), parserResult.getTitle(),
|
||||
result.getUrlAfterRedirect(), result.getDuration());
|
||||
return new FeedFetcherResult(parserResult, result.getUrlAfterRedirect(), result.getLastModifiedSince(), result.getETag(), hash,
|
||||
result.getDuration());
|
||||
}
|
||||
|
||||
private static String extractFeedUrl(Set<FeedURLProvider> urlProviders, String url, String urlContent) {
|
||||
@@ -106,13 +99,8 @@ public class FeedFetcher {
|
||||
return null;
|
||||
}
|
||||
|
||||
@Value
|
||||
public static class FeedFetcherResult {
|
||||
Feed feed;
|
||||
List<FeedEntry> entries;
|
||||
String title;
|
||||
String urlAfterRedirect;
|
||||
long fetchDuration;
|
||||
public record FeedFetcherResult(FeedParserResult feed, String urlAfterRedirect, String lastModifiedHeader, String lastETagHeader,
|
||||
String contentHash, long fetchDuration) {
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -1,263 +0,0 @@
|
||||
package com.commafeed.backend.feed;
|
||||
|
||||
import java.io.StringReader;
|
||||
import java.nio.charset.Charset;
|
||||
import java.text.DateFormat;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Date;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.lang3.ArrayUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.jdom2.Element;
|
||||
import org.jdom2.Namespace;
|
||||
import org.xml.sax.InputSource;
|
||||
|
||||
import com.commafeed.backend.model.Feed;
|
||||
import com.commafeed.backend.model.FeedEntry;
|
||||
import com.commafeed.backend.model.FeedEntryContent;
|
||||
import com.google.common.collect.Iterables;
|
||||
import com.rometools.modules.mediarss.MediaEntryModule;
|
||||
import com.rometools.modules.mediarss.MediaModule;
|
||||
import com.rometools.modules.mediarss.types.MediaGroup;
|
||||
import com.rometools.modules.mediarss.types.Metadata;
|
||||
import com.rometools.modules.mediarss.types.Thumbnail;
|
||||
import com.rometools.rome.feed.synd.SyndCategory;
|
||||
import com.rometools.rome.feed.synd.SyndContent;
|
||||
import com.rometools.rome.feed.synd.SyndEnclosure;
|
||||
import com.rometools.rome.feed.synd.SyndEntry;
|
||||
import com.rometools.rome.feed.synd.SyndFeed;
|
||||
import com.rometools.rome.feed.synd.SyndLink;
|
||||
import com.rometools.rome.feed.synd.SyndLinkImpl;
|
||||
import com.rometools.rome.io.FeedException;
|
||||
import com.rometools.rome.io.SyndFeedInput;
|
||||
|
||||
import jakarta.inject.Inject;
|
||||
import jakarta.inject.Singleton;
|
||||
import lombok.Data;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
import lombok.Value;
|
||||
|
||||
/**
|
||||
* Parses raw xml as a Feed object
|
||||
*/
|
||||
@RequiredArgsConstructor(onConstructor = @__({ @Inject }))
|
||||
@Singleton
|
||||
public class FeedParser {
|
||||
|
||||
private static final String ATOM_10_URI = "http://www.w3.org/2005/Atom";
|
||||
private static final Namespace ATOM_10_NS = Namespace.getNamespace(ATOM_10_URI);
|
||||
|
||||
private static final Date START = new Date(86400000);
|
||||
private static final Date END = new Date(1000L * Integer.MAX_VALUE - 86400000);
|
||||
|
||||
public FeedParserResult parse(String feedUrl, byte[] xml) throws FeedException {
|
||||
|
||||
try {
|
||||
Charset encoding = FeedUtils.guessEncoding(xml);
|
||||
String xmlString = FeedUtils.trimInvalidXmlCharacters(new String(xml, encoding));
|
||||
if (xmlString == null) {
|
||||
throw new FeedException("Input string is null for url " + feedUrl);
|
||||
}
|
||||
xmlString = FeedUtils.replaceHtmlEntitiesWithNumericEntities(xmlString);
|
||||
InputSource source = new InputSource(new StringReader(xmlString));
|
||||
|
||||
SyndFeed rss = new SyndFeedInput().build(source);
|
||||
handleForeignMarkup(rss);
|
||||
|
||||
String title = rss.getTitle();
|
||||
Feed feed = new Feed();
|
||||
feed.setUrl(feedUrl);
|
||||
feed.setLink(rss.getLink());
|
||||
|
||||
List<FeedEntry> entries = new ArrayList<>();
|
||||
for (SyndEntry item : rss.getEntries()) {
|
||||
FeedEntry entry = new FeedEntry();
|
||||
|
||||
String guid = item.getUri();
|
||||
if (StringUtils.isBlank(guid)) {
|
||||
guid = item.getLink();
|
||||
}
|
||||
if (StringUtils.isBlank(guid)) {
|
||||
// no guid and no link, skip entry
|
||||
continue;
|
||||
}
|
||||
entry.setGuid(FeedUtils.truncate(guid, 2048));
|
||||
entry.setUpdated(validateDate(getEntryUpdateDate(item), true));
|
||||
entry.setUrl(FeedUtils.truncate(FeedUtils.toAbsoluteUrl(item.getLink(), feed.getLink(), feedUrl), 2048));
|
||||
|
||||
// if link is empty but guid is used as url
|
||||
if (StringUtils.isBlank(entry.getUrl()) && StringUtils.startsWith(entry.getGuid(), "http")) {
|
||||
entry.setUrl(entry.getGuid());
|
||||
}
|
||||
|
||||
FeedEntryContent content = new FeedEntryContent();
|
||||
content.setContent(getContent(item));
|
||||
content.setCategories(FeedUtils
|
||||
.truncate(item.getCategories().stream().map(SyndCategory::getName).collect(Collectors.joining(", ")), 4096));
|
||||
content.setTitle(getTitle(item));
|
||||
content.setAuthor(StringUtils.trimToNull(item.getAuthor()));
|
||||
|
||||
SyndEnclosure enclosure = Iterables.getFirst(item.getEnclosures(), null);
|
||||
if (enclosure != null) {
|
||||
content.setEnclosureUrl(FeedUtils.truncate(enclosure.getUrl(), 2048));
|
||||
content.setEnclosureType(enclosure.getType());
|
||||
}
|
||||
|
||||
MediaEntryModule module = (MediaEntryModule) item.getModule(MediaModule.URI);
|
||||
if (module != null) {
|
||||
Media media = getMedia(module);
|
||||
if (media != null) {
|
||||
content.setMediaDescription(media.getDescription());
|
||||
content.setMediaThumbnailUrl(FeedUtils.truncate(media.getThumbnailUrl(), 2048));
|
||||
content.setMediaThumbnailWidth(media.getThumbnailWidth());
|
||||
content.setMediaThumbnailHeight(media.getThumbnailHeight());
|
||||
}
|
||||
}
|
||||
|
||||
entry.setContent(content);
|
||||
|
||||
entries.add(entry);
|
||||
}
|
||||
|
||||
Date lastEntryDate = null;
|
||||
Date publishedDate = validateDate(rss.getPublishedDate(), false);
|
||||
if (!entries.isEmpty()) {
|
||||
List<Long> sortedTimestamps = FeedUtils.getSortedTimestamps(entries);
|
||||
Long timestamp = sortedTimestamps.get(0);
|
||||
lastEntryDate = new Date(timestamp);
|
||||
publishedDate = (publishedDate == null || publishedDate.before(lastEntryDate)) ? lastEntryDate : publishedDate;
|
||||
}
|
||||
feed.setLastPublishedDate(publishedDate);
|
||||
feed.setAverageEntryInterval(FeedUtils.averageTimeBetweenEntries(entries));
|
||||
feed.setLastEntryDate(lastEntryDate);
|
||||
|
||||
return new FeedParserResult(feed, entries, title);
|
||||
} catch (Exception e) {
|
||||
throw new FeedException(String.format("Could not parse feed from %s : %s", feedUrl, e.getMessage()), e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds atom links for rss feeds
|
||||
*/
|
||||
private void handleForeignMarkup(SyndFeed feed) {
|
||||
List<Element> foreignMarkup = feed.getForeignMarkup();
|
||||
if (foreignMarkup == null) {
|
||||
return;
|
||||
}
|
||||
for (Element element : foreignMarkup) {
|
||||
if ("link".equals(element.getName()) && ATOM_10_NS.equals(element.getNamespace())) {
|
||||
SyndLink link = new SyndLinkImpl();
|
||||
link.setRel(element.getAttributeValue("rel"));
|
||||
link.setHref(element.getAttributeValue("href"));
|
||||
feed.getLinks().add(link);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private Date getEntryUpdateDate(SyndEntry item) {
|
||||
Date date = item.getUpdatedDate();
|
||||
if (date == null) {
|
||||
date = item.getPublishedDate();
|
||||
}
|
||||
if (date == null) {
|
||||
date = new Date();
|
||||
}
|
||||
return date;
|
||||
}
|
||||
|
||||
private Date validateDate(Date date, boolean nullToNow) {
|
||||
Date now = new Date();
|
||||
if (date == null) {
|
||||
return nullToNow ? now : null;
|
||||
}
|
||||
if (date.before(START) || date.after(END)) {
|
||||
return now;
|
||||
}
|
||||
|
||||
if (date.after(now)) {
|
||||
return now;
|
||||
}
|
||||
return date;
|
||||
}
|
||||
|
||||
private String getContent(SyndEntry item) {
|
||||
String content;
|
||||
if (item.getContents().isEmpty()) {
|
||||
content = item.getDescription() == null ? null : item.getDescription().getValue();
|
||||
} else {
|
||||
content = item.getContents().stream().map(SyndContent::getValue).collect(Collectors.joining(System.lineSeparator()));
|
||||
}
|
||||
return StringUtils.trimToNull(content);
|
||||
}
|
||||
|
||||
private String getTitle(SyndEntry item) {
|
||||
String title = item.getTitle();
|
||||
if (StringUtils.isBlank(title)) {
|
||||
Date date = item.getPublishedDate();
|
||||
if (date != null) {
|
||||
title = DateFormat.getInstance().format(date);
|
||||
} else {
|
||||
title = "(no title)";
|
||||
}
|
||||
}
|
||||
return StringUtils.trimToNull(title);
|
||||
}
|
||||
|
||||
private Media getMedia(MediaEntryModule module) {
|
||||
Media media = getMedia(module.getMetadata());
|
||||
if (media == null && ArrayUtils.isNotEmpty(module.getMediaGroups())) {
|
||||
MediaGroup group = module.getMediaGroups()[0];
|
||||
media = getMedia(group.getMetadata());
|
||||
}
|
||||
|
||||
return media;
|
||||
}
|
||||
|
||||
private Media getMedia(Metadata metadata) {
|
||||
if (metadata == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
Media media = new Media();
|
||||
media.setDescription(metadata.getDescription());
|
||||
|
||||
if (ArrayUtils.isNotEmpty(metadata.getThumbnail())) {
|
||||
Thumbnail thumbnail = metadata.getThumbnail()[0];
|
||||
media.setThumbnailWidth(thumbnail.getWidth());
|
||||
media.setThumbnailHeight(thumbnail.getHeight());
|
||||
|
||||
if (thumbnail.getUrl() != null) {
|
||||
media.setThumbnailUrl(thumbnail.getUrl().toString());
|
||||
}
|
||||
}
|
||||
|
||||
if (media.isEmpty()) {
|
||||
return null;
|
||||
}
|
||||
|
||||
return media;
|
||||
}
|
||||
|
||||
@Data
|
||||
private static class Media {
|
||||
private String description;
|
||||
private String thumbnailUrl;
|
||||
private Integer thumbnailWidth;
|
||||
private Integer thumbnailHeight;
|
||||
|
||||
public boolean isEmpty() {
|
||||
return description == null && thumbnailUrl == null;
|
||||
}
|
||||
}
|
||||
|
||||
@Value
|
||||
public static class FeedParserResult {
|
||||
Feed feed;
|
||||
List<FeedEntry> entries;
|
||||
String title;
|
||||
}
|
||||
|
||||
}
|
||||
@@ -156,7 +156,7 @@ public class FeedRefreshEngine implements Managed {
|
||||
|
||||
private void processFeedAsync(Feed feed) {
|
||||
CompletableFuture.supplyAsync(() -> worker.update(feed), workerExecutor)
|
||||
.thenApplyAsync(r -> updater.update(r.getFeed(), r.getEntries()), databaseUpdaterExecutor)
|
||||
.thenApplyAsync(r -> updater.update(r.feed(), r.entries()), databaseUpdaterExecutor)
|
||||
.whenComplete((data, ex) -> {
|
||||
if (ex != null) {
|
||||
log.error("error while processing feed {}", feed.getUrl(), ex);
|
||||
|
||||
@@ -5,7 +5,6 @@ import java.util.Date;
|
||||
import org.apache.commons.lang3.time.DateUtils;
|
||||
|
||||
import com.commafeed.CommaFeedConfiguration;
|
||||
import com.commafeed.backend.model.Feed;
|
||||
|
||||
import jakarta.inject.Inject;
|
||||
import jakarta.inject.Singleton;
|
||||
@@ -22,18 +21,19 @@ public class FeedRefreshIntervalCalculator {
|
||||
this.refreshIntervalMinutes = config.getApplicationSettings().getRefreshIntervalMinutes();
|
||||
}
|
||||
|
||||
public Date onFetchSuccess(Feed feed) {
|
||||
public Date onFetchSuccess(Date publishedDate, Long averageEntryInterval) {
|
||||
Date defaultRefreshInterval = getDefaultRefreshInterval();
|
||||
return heavyLoad ? computeRefreshIntervalForHeavyLoad(feed, defaultRefreshInterval) : defaultRefreshInterval;
|
||||
return heavyLoad ? computeRefreshIntervalForHeavyLoad(publishedDate, averageEntryInterval, defaultRefreshInterval)
|
||||
: defaultRefreshInterval;
|
||||
}
|
||||
|
||||
public Date onFeedNotModified(Feed feed) {
|
||||
public Date onFeedNotModified(Date publishedDate, Long averageEntryInterval) {
|
||||
Date defaultRefreshInterval = getDefaultRefreshInterval();
|
||||
return heavyLoad ? computeRefreshIntervalForHeavyLoad(feed, defaultRefreshInterval) : defaultRefreshInterval;
|
||||
return heavyLoad ? computeRefreshIntervalForHeavyLoad(publishedDate, averageEntryInterval, defaultRefreshInterval)
|
||||
: defaultRefreshInterval;
|
||||
}
|
||||
|
||||
public Date onFetchError(Feed feed) {
|
||||
int errorCount = feed.getErrorCount();
|
||||
public Date onFetchError(int errorCount) {
|
||||
int retriesBeforeDisable = 3;
|
||||
if (errorCount < retriesBeforeDisable || !heavyLoad) {
|
||||
return getDefaultRefreshInterval();
|
||||
@@ -47,10 +47,8 @@ public class FeedRefreshIntervalCalculator {
|
||||
return DateUtils.addMinutes(new Date(), refreshIntervalMinutes);
|
||||
}
|
||||
|
||||
private Date computeRefreshIntervalForHeavyLoad(Feed feed, Date defaultRefreshInterval) {
|
||||
private Date computeRefreshIntervalForHeavyLoad(Date publishedDate, Long averageEntryInterval, Date defaultRefreshInterval) {
|
||||
Date now = new Date();
|
||||
Date publishedDate = feed.getLastEntryDate();
|
||||
Long averageEntryInterval = feed.getAverageEntryInterval();
|
||||
|
||||
if (publishedDate == null) {
|
||||
// feed with no entries, recheck in 24 hours
|
||||
|
||||
@@ -16,9 +16,9 @@ import com.codahale.metrics.MetricRegistry;
|
||||
import com.commafeed.backend.cache.CacheService;
|
||||
import com.commafeed.backend.dao.FeedSubscriptionDAO;
|
||||
import com.commafeed.backend.dao.UnitOfWork;
|
||||
import com.commafeed.backend.feed.parser.FeedParserResult.Content;
|
||||
import com.commafeed.backend.feed.parser.FeedParserResult.Entry;
|
||||
import com.commafeed.backend.model.Feed;
|
||||
import com.commafeed.backend.model.FeedEntry;
|
||||
import com.commafeed.backend.model.FeedEntryContent;
|
||||
import com.commafeed.backend.model.FeedSubscription;
|
||||
import com.commafeed.backend.model.User;
|
||||
import com.commafeed.backend.service.FeedEntryService;
|
||||
@@ -72,7 +72,7 @@ public class FeedRefreshUpdater implements Managed {
|
||||
entryInserted = metrics.meter(MetricRegistry.name(getClass(), "entryInserted"));
|
||||
}
|
||||
|
||||
private AddEntryResult addEntry(final Feed feed, final FeedEntry entry, final List<FeedSubscription> subscriptions) {
|
||||
private AddEntryResult addEntry(final Feed feed, final Entry entry, final List<FeedSubscription> subscriptions) {
|
||||
boolean processed = false;
|
||||
boolean inserted = false;
|
||||
|
||||
@@ -82,8 +82,8 @@ public class FeedRefreshUpdater implements Managed {
|
||||
|
||||
// lock on content, make sure we are not updating the same entry
|
||||
// twice at the same time
|
||||
FeedEntryContent content = entry.getContent();
|
||||
String key2 = DigestUtils.sha1Hex(StringUtils.trimToEmpty(content.getContent() + content.getTitle()));
|
||||
Content content = entry.content();
|
||||
String key2 = DigestUtils.sha1Hex(StringUtils.trimToEmpty(content.content() + content.title()));
|
||||
|
||||
Iterator<Lock> iterator = locks.bulkGet(Arrays.asList(key1, key2)).iterator();
|
||||
Lock lock1 = iterator.next();
|
||||
@@ -116,7 +116,7 @@ public class FeedRefreshUpdater implements Managed {
|
||||
return new AddEntryResult(processed, inserted);
|
||||
}
|
||||
|
||||
public boolean update(Feed feed, List<FeedEntry> entries) {
|
||||
public boolean update(Feed feed, List<Entry> entries) {
|
||||
boolean processed = true;
|
||||
boolean insertedAtLeastOneEntry = false;
|
||||
|
||||
@@ -125,10 +125,10 @@ public class FeedRefreshUpdater implements Managed {
|
||||
List<String> currentEntries = new ArrayList<>();
|
||||
|
||||
List<FeedSubscription> subscriptions = null;
|
||||
for (FeedEntry entry : entries) {
|
||||
String cacheKey = cache.buildUniqueEntryKey(feed, entry);
|
||||
for (Entry entry : entries) {
|
||||
String cacheKey = cache.buildUniqueEntryKey(entry);
|
||||
if (!lastEntries.contains(cacheKey)) {
|
||||
log.debug("cache miss for {}", entry.getUrl());
|
||||
log.debug("cache miss for {}", entry.url());
|
||||
if (subscriptions == null) {
|
||||
subscriptions = unitOfWork.call(() -> feedSubscriptionDAO.findByFeed(feed));
|
||||
}
|
||||
@@ -138,7 +138,7 @@ public class FeedRefreshUpdater implements Managed {
|
||||
|
||||
entryCacheMiss.mark();
|
||||
} else {
|
||||
log.debug("cache hit for {}", entry.getUrl());
|
||||
log.debug("cache hit for {}", entry.url());
|
||||
entryCacheHit.mark();
|
||||
}
|
||||
|
||||
|
||||
@@ -11,16 +11,15 @@ import com.codahale.metrics.MetricRegistry;
|
||||
import com.commafeed.CommaFeedConfiguration;
|
||||
import com.commafeed.backend.HttpGetter.NotModifiedException;
|
||||
import com.commafeed.backend.feed.FeedFetcher.FeedFetcherResult;
|
||||
import com.commafeed.backend.feed.parser.FeedParserResult.Entry;
|
||||
import com.commafeed.backend.model.Feed;
|
||||
import com.commafeed.backend.model.FeedEntry;
|
||||
|
||||
import jakarta.inject.Inject;
|
||||
import jakarta.inject.Singleton;
|
||||
import lombok.Value;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
/**
|
||||
* Calls {@link FeedFetcher} and updates the Feed object, but does not update the database ({@link FeedRefreshUpdater} does that)
|
||||
* Calls {@link FeedFetcher} and updates the Feed object, but does not update the database ({@link FeedRefreshUpdater} does that)
|
||||
*/
|
||||
@Slf4j
|
||||
@Singleton
|
||||
@@ -44,32 +43,34 @@ public class FeedRefreshWorker {
|
||||
public FeedRefreshWorkerResult update(Feed feed) {
|
||||
try {
|
||||
String url = Optional.ofNullable(feed.getUrlAfterRedirect()).orElse(feed.getUrl());
|
||||
FeedFetcherResult feedFetcherResult = fetcher.fetch(url, false, feed.getLastModifiedHeader(), feed.getEtagHeader(),
|
||||
FeedFetcherResult result = fetcher.fetch(url, false, feed.getLastModifiedHeader(), feed.getEtagHeader(),
|
||||
feed.getLastPublishedDate(), feed.getLastContentHash());
|
||||
// stops here if NotModifiedException or any other exception is thrown
|
||||
List<FeedEntry> entries = feedFetcherResult.getEntries();
|
||||
|
||||
Integer maxFeedCapacity = config.getApplicationSettings().getMaxFeedCapacity();
|
||||
List<Entry> entries = result.feed().entries();
|
||||
if (maxFeedCapacity > 0) {
|
||||
entries = entries.stream().limit(maxFeedCapacity).toList();
|
||||
}
|
||||
|
||||
String urlAfterRedirect = feedFetcherResult.getUrlAfterRedirect();
|
||||
String urlAfterRedirect = result.urlAfterRedirect();
|
||||
if (StringUtils.equals(url, urlAfterRedirect)) {
|
||||
urlAfterRedirect = null;
|
||||
}
|
||||
|
||||
feed.setUrlAfterRedirect(urlAfterRedirect);
|
||||
feed.setLink(feedFetcherResult.getFeed().getLink());
|
||||
feed.setLastModifiedHeader(feedFetcherResult.getFeed().getLastModifiedHeader());
|
||||
feed.setEtagHeader(feedFetcherResult.getFeed().getEtagHeader());
|
||||
feed.setLastContentHash(feedFetcherResult.getFeed().getLastContentHash());
|
||||
feed.setLastPublishedDate(feedFetcherResult.getFeed().getLastPublishedDate());
|
||||
feed.setAverageEntryInterval(feedFetcherResult.getFeed().getAverageEntryInterval());
|
||||
feed.setLastEntryDate(feedFetcherResult.getFeed().getLastEntryDate());
|
||||
feed.setLink(result.feed().link());
|
||||
feed.setLastModifiedHeader(result.lastModifiedHeader());
|
||||
feed.setEtagHeader(result.lastETagHeader());
|
||||
feed.setLastContentHash(result.contentHash());
|
||||
feed.setLastPublishedDate(result.feed().lastPublishedDate());
|
||||
feed.setAverageEntryInterval(result.feed().averageEntryInterval());
|
||||
feed.setLastEntryDate(result.feed().lastEntryDate());
|
||||
|
||||
feed.setErrorCount(0);
|
||||
feed.setMessage(null);
|
||||
feed.setDisabledUntil(refreshIntervalCalculator.onFetchSuccess(feedFetcherResult.getFeed()));
|
||||
feed.setDisabledUntil(
|
||||
refreshIntervalCalculator.onFetchSuccess(result.feed().lastPublishedDate(), result.feed().averageEntryInterval()));
|
||||
|
||||
return new FeedRefreshWorkerResult(feed, entries);
|
||||
} catch (NotModifiedException e) {
|
||||
@@ -77,7 +78,7 @@ public class FeedRefreshWorker {
|
||||
|
||||
feed.setErrorCount(0);
|
||||
feed.setMessage(e.getMessage());
|
||||
feed.setDisabledUntil(refreshIntervalCalculator.onFeedNotModified(feed));
|
||||
feed.setDisabledUntil(refreshIntervalCalculator.onFeedNotModified(feed.getLastPublishedDate(), feed.getAverageEntryInterval()));
|
||||
|
||||
if (e.getNewLastModifiedHeader() != null) {
|
||||
feed.setLastModifiedHeader(e.getNewLastModifiedHeader());
|
||||
@@ -93,7 +94,7 @@ public class FeedRefreshWorker {
|
||||
|
||||
feed.setErrorCount(feed.getErrorCount() + 1);
|
||||
feed.setMessage("Unable to refresh feed : " + e.getMessage());
|
||||
feed.setDisabledUntil(refreshIntervalCalculator.onFetchError(feed));
|
||||
feed.setDisabledUntil(refreshIntervalCalculator.onFetchError(feed.getErrorCount()));
|
||||
|
||||
return new FeedRefreshWorkerResult(feed, Collections.emptyList());
|
||||
} finally {
|
||||
@@ -101,10 +102,7 @@ public class FeedRefreshWorker {
|
||||
}
|
||||
}
|
||||
|
||||
@Value
|
||||
public static class FeedRefreshWorkerResult {
|
||||
Feed feed;
|
||||
List<FeedEntry> entries;
|
||||
public record FeedRefreshWorkerResult(Feed feed, List<Entry> entries) {
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -2,20 +2,12 @@ package com.commafeed.backend.feed;
|
||||
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URL;
|
||||
import java.nio.charset.Charset;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.ahocorasick.trie.Emit;
|
||||
import org.ahocorasick.trie.Trie;
|
||||
import org.ahocorasick.trie.Trie.TrieBuilder;
|
||||
import org.apache.commons.codec.binary.Base64;
|
||||
import org.apache.commons.lang3.ArrayUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.math3.stat.descriptive.SummaryStatistics;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.jsoup.nodes.Element;
|
||||
@@ -29,8 +21,6 @@ import com.commafeed.backend.model.FeedSubscription;
|
||||
import com.commafeed.frontend.model.Entry;
|
||||
import com.google.gwt.i18n.client.HasDirection.Direction;
|
||||
import com.google.gwt.i18n.shared.BidiUtils;
|
||||
import com.ibm.icu.text.CharsetDetector;
|
||||
import com.ibm.icu.text.CharsetMatch;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
@@ -50,70 +40,6 @@ public class FeedUtils {
|
||||
return string;
|
||||
}
|
||||
|
||||
/**
|
||||
* Detect feed encoding by using the declared encoding in the xml processing instruction and by detecting the characters used in the
|
||||
* feed
|
||||
*
|
||||
*/
|
||||
public static Charset guessEncoding(byte[] bytes) {
|
||||
String extracted = extractDeclaredEncoding(bytes);
|
||||
if (StringUtils.startsWithIgnoreCase(extracted, "iso-8859-")) {
|
||||
if (!StringUtils.endsWith(extracted, "1")) {
|
||||
return Charset.forName(extracted);
|
||||
}
|
||||
} else if (StringUtils.startsWithIgnoreCase(extracted, "windows-")) {
|
||||
return Charset.forName(extracted);
|
||||
}
|
||||
return detectEncoding(bytes);
|
||||
}
|
||||
|
||||
/**
|
||||
* Detect encoding by analyzing characters in the array
|
||||
*/
|
||||
public static Charset detectEncoding(byte[] bytes) {
|
||||
String encoding = "UTF-8";
|
||||
|
||||
CharsetDetector detector = new CharsetDetector();
|
||||
detector.setText(bytes);
|
||||
CharsetMatch match = detector.detect();
|
||||
if (match != null) {
|
||||
encoding = match.getName();
|
||||
}
|
||||
if (encoding.equalsIgnoreCase("ISO-8859-1")) {
|
||||
encoding = "windows-1252";
|
||||
}
|
||||
return Charset.forName(encoding);
|
||||
}
|
||||
|
||||
public static String replaceHtmlEntitiesWithNumericEntities(String source) {
|
||||
// Create a buffer sufficiently large that re-allocations are minimized.
|
||||
StringBuilder sb = new StringBuilder(source.length() << 1);
|
||||
|
||||
TrieBuilder builder = Trie.builder();
|
||||
builder.ignoreOverlaps();
|
||||
|
||||
for (String key : HtmlEntities.HTML_ENTITIES) {
|
||||
builder.addKeyword(key);
|
||||
}
|
||||
|
||||
Trie trie = builder.build();
|
||||
Collection<Emit> emits = trie.parseText(source);
|
||||
|
||||
int prevIndex = 0;
|
||||
for (Emit emit : emits) {
|
||||
int matchIndex = emit.getStart();
|
||||
|
||||
sb.append(source, prevIndex, matchIndex);
|
||||
sb.append(HtmlEntities.HTML_TO_NUMERIC_MAP.get(emit.getKeyword()));
|
||||
prevIndex = emit.getEnd() + 1;
|
||||
}
|
||||
|
||||
// Add the remainder of the string (contains no more matches).
|
||||
sb.append(source.substring(prevIndex));
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
public static boolean isHttp(String url) {
|
||||
return url.startsWith("http://");
|
||||
}
|
||||
@@ -122,6 +48,10 @@ public class FeedUtils {
|
||||
return url.startsWith("https://");
|
||||
}
|
||||
|
||||
public static boolean isAbsoluteUrl(String url) {
|
||||
return isHttp(url) || isHttps(url);
|
||||
}
|
||||
|
||||
/**
|
||||
* Normalize the url. The resulting url is not meant to be fetched but rather used as a means to identify a feed and avoid duplicates
|
||||
*/
|
||||
@@ -163,25 +93,6 @@ public class FeedUtils {
|
||||
return normalized;
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract the declared encoding from the xml
|
||||
*/
|
||||
public static String extractDeclaredEncoding(byte[] bytes) {
|
||||
int index = ArrayUtils.indexOf(bytes, (byte) '>');
|
||||
if (index == -1) {
|
||||
return null;
|
||||
}
|
||||
|
||||
String pi = new String(ArrayUtils.subarray(bytes, 0, index + 1)).replace('\'', '"');
|
||||
index = StringUtils.indexOf(pi, "encoding=\"");
|
||||
if (index == -1) {
|
||||
return null;
|
||||
}
|
||||
String encoding = pi.substring(index + 10);
|
||||
encoding = encoding.substring(0, encoding.indexOf('"'));
|
||||
return encoding;
|
||||
}
|
||||
|
||||
public static boolean isRTL(FeedEntry entry) {
|
||||
String text = entry.getContent().getContent();
|
||||
|
||||
@@ -202,52 +113,6 @@ public class FeedUtils {
|
||||
return direction == Direction.RTL;
|
||||
}
|
||||
|
||||
public static String trimInvalidXmlCharacters(String xml) {
|
||||
if (StringUtils.isBlank(xml)) {
|
||||
return null;
|
||||
}
|
||||
StringBuilder sb = new StringBuilder();
|
||||
|
||||
boolean firstTagFound = false;
|
||||
for (int i = 0; i < xml.length(); i++) {
|
||||
char c = xml.charAt(i);
|
||||
|
||||
if (!firstTagFound) {
|
||||
if (c == '<') {
|
||||
firstTagFound = true;
|
||||
} else {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
if (c >= 32 || c == 9 || c == 10 || c == 13) {
|
||||
if (!Character.isHighSurrogate(c) && !Character.isLowSurrogate(c)) {
|
||||
sb.append(c);
|
||||
}
|
||||
}
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
public static Long averageTimeBetweenEntries(List<FeedEntry> entries) {
|
||||
if (entries.isEmpty() || entries.size() == 1) {
|
||||
return null;
|
||||
}
|
||||
|
||||
List<Long> timestamps = getSortedTimestamps(entries);
|
||||
|
||||
SummaryStatistics stats = new SummaryStatistics();
|
||||
for (int i = 0; i < timestamps.size() - 1; i++) {
|
||||
long diff = Math.abs(timestamps.get(i) - timestamps.get(i + 1));
|
||||
stats.addValue(diff);
|
||||
}
|
||||
return (long) stats.getMean();
|
||||
}
|
||||
|
||||
public static List<Long> getSortedTimestamps(List<FeedEntry> entries) {
|
||||
return entries.stream().map(t -> t.getUpdated().getTime()).sorted(Collections.reverseOrder()).toList();
|
||||
}
|
||||
|
||||
public static String removeTrailingSlash(String url) {
|
||||
if (url.endsWith("/")) {
|
||||
url = url.substring(0, url.length() - 1);
|
||||
@@ -256,8 +121,8 @@ public class FeedUtils {
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param url
|
||||
*
|
||||
* @param relativeUrl
|
||||
* the url of the entry
|
||||
* @param feedLink
|
||||
* the url of the feed as described in the feed
|
||||
@@ -265,32 +130,18 @@ public class FeedUtils {
|
||||
* the url of the feed that we used to fetch the feed
|
||||
* @return an absolute url pointing to the entry
|
||||
*/
|
||||
public static String toAbsoluteUrl(String url, String feedLink, String feedUrl) {
|
||||
url = StringUtils.trimToNull(StringUtils.normalizeSpace(url));
|
||||
if (url == null || url.startsWith("http")) {
|
||||
return url;
|
||||
}
|
||||
|
||||
String baseUrl = (feedLink == null || isRelative(feedLink)) ? feedUrl : feedLink;
|
||||
|
||||
public static String toAbsoluteUrl(String relativeUrl, String feedLink, String feedUrl) {
|
||||
String baseUrl = (feedLink != null && isAbsoluteUrl(feedLink)) ? feedLink : feedUrl;
|
||||
if (baseUrl == null) {
|
||||
return url;
|
||||
return null;
|
||||
}
|
||||
|
||||
String result;
|
||||
try {
|
||||
result = new URL(new URL(baseUrl), url).toString();
|
||||
return new URL(new URL(baseUrl), relativeUrl).toString();
|
||||
} catch (MalformedURLException e) {
|
||||
log.debug("could not parse url : " + e.getMessage(), e);
|
||||
result = url;
|
||||
return null;
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
public static boolean isRelative(final String url) {
|
||||
// the regex means "start with 'scheme://'"
|
||||
return url.startsWith("/") || url.startsWith("#") || !url.matches("^\\w+\\:\\/\\/.*");
|
||||
}
|
||||
|
||||
public static String getFaviconUrl(FeedSubscription subscription) {
|
||||
|
||||
@@ -0,0 +1,70 @@
|
||||
package com.commafeed.backend.feed.parser;
|
||||
|
||||
import java.nio.charset.Charset;
|
||||
|
||||
import org.apache.commons.lang3.ArrayUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import com.ibm.icu.text.CharsetDetector;
|
||||
import com.ibm.icu.text.CharsetMatch;
|
||||
|
||||
import jakarta.inject.Singleton;
|
||||
|
||||
@Singleton
|
||||
class EncodingDetector {
|
||||
|
||||
/**
|
||||
* Detect feed encoding by using the declared encoding in the xml processing instruction and by detecting the characters used in the
|
||||
* feed
|
||||
*
|
||||
*/
|
||||
public Charset getEncoding(byte[] bytes) {
|
||||
String extracted = extractDeclaredEncoding(bytes);
|
||||
if (StringUtils.startsWithIgnoreCase(extracted, "iso-8859-")) {
|
||||
if (!StringUtils.endsWith(extracted, "1")) {
|
||||
return Charset.forName(extracted);
|
||||
}
|
||||
} else if (StringUtils.startsWithIgnoreCase(extracted, "windows-")) {
|
||||
return Charset.forName(extracted);
|
||||
}
|
||||
return detectEncoding(bytes);
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract the declared encoding from the xml
|
||||
*/
|
||||
public String extractDeclaredEncoding(byte[] bytes) {
|
||||
int index = ArrayUtils.indexOf(bytes, (byte) '>');
|
||||
if (index == -1) {
|
||||
return null;
|
||||
}
|
||||
|
||||
String pi = new String(ArrayUtils.subarray(bytes, 0, index + 1)).replace('\'', '"');
|
||||
index = StringUtils.indexOf(pi, "encoding=\"");
|
||||
if (index == -1) {
|
||||
return null;
|
||||
}
|
||||
String encoding = pi.substring(index + 10);
|
||||
encoding = encoding.substring(0, encoding.indexOf('"'));
|
||||
return encoding;
|
||||
}
|
||||
|
||||
/**
|
||||
* Detect encoding by analyzing characters in the array
|
||||
*/
|
||||
private Charset detectEncoding(byte[] bytes) {
|
||||
String encoding = "UTF-8";
|
||||
|
||||
CharsetDetector detector = new CharsetDetector();
|
||||
detector.setText(bytes);
|
||||
CharsetMatch match = detector.detect();
|
||||
if (match != null) {
|
||||
encoding = match.getName();
|
||||
}
|
||||
if (encoding.equalsIgnoreCase("ISO-8859-1")) {
|
||||
encoding = "windows-1252";
|
||||
}
|
||||
return Charset.forName(encoding);
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,71 @@
|
||||
package com.commafeed.backend.feed.parser;
|
||||
|
||||
import java.util.Collection;
|
||||
|
||||
import org.ahocorasick.trie.Emit;
|
||||
import org.ahocorasick.trie.Trie;
|
||||
import org.ahocorasick.trie.Trie.TrieBuilder;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import jakarta.inject.Singleton;
|
||||
|
||||
@Singleton
|
||||
class FeedCleaner {
|
||||
|
||||
public String trimInvalidXmlCharacters(String xml) {
|
||||
if (StringUtils.isBlank(xml)) {
|
||||
return null;
|
||||
}
|
||||
StringBuilder sb = new StringBuilder();
|
||||
|
||||
boolean firstTagFound = false;
|
||||
for (int i = 0; i < xml.length(); i++) {
|
||||
char c = xml.charAt(i);
|
||||
|
||||
if (!firstTagFound) {
|
||||
if (c == '<') {
|
||||
firstTagFound = true;
|
||||
} else {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
if (c >= 32 || c == 9 || c == 10 || c == 13) {
|
||||
if (!Character.isHighSurrogate(c) && !Character.isLowSurrogate(c)) {
|
||||
sb.append(c);
|
||||
}
|
||||
}
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
public String replaceHtmlEntitiesWithNumericEntities(String source) {
|
||||
// Create a buffer sufficiently large that re-allocations are minimized.
|
||||
StringBuilder sb = new StringBuilder(source.length() << 1);
|
||||
|
||||
TrieBuilder builder = Trie.builder();
|
||||
builder.ignoreOverlaps();
|
||||
|
||||
for (String key : HtmlEntities.HTML_ENTITIES) {
|
||||
builder.addKeyword(key);
|
||||
}
|
||||
|
||||
Trie trie = builder.build();
|
||||
Collection<Emit> emits = trie.parseText(source);
|
||||
|
||||
int prevIndex = 0;
|
||||
for (Emit emit : emits) {
|
||||
int matchIndex = emit.getStart();
|
||||
|
||||
sb.append(source, prevIndex, matchIndex);
|
||||
sb.append(HtmlEntities.HTML_TO_NUMERIC_MAP.get(emit.getKeyword()));
|
||||
prevIndex = emit.getEnd() + 1;
|
||||
}
|
||||
|
||||
// Add the remainder of the string (contains no more matches).
|
||||
sb.append(source.substring(prevIndex));
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,271 @@
|
||||
package com.commafeed.backend.feed.parser;
|
||||
|
||||
import java.io.StringReader;
|
||||
import java.nio.charset.Charset;
|
||||
import java.text.DateFormat;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.Date;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.lang3.ArrayUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.math3.stat.descriptive.SummaryStatistics;
|
||||
import org.jdom2.Element;
|
||||
import org.jdom2.Namespace;
|
||||
import org.xml.sax.InputSource;
|
||||
|
||||
import com.commafeed.backend.feed.FeedUtils;
|
||||
import com.commafeed.backend.feed.parser.FeedParserResult.Content;
|
||||
import com.commafeed.backend.feed.parser.FeedParserResult.Enclosure;
|
||||
import com.commafeed.backend.feed.parser.FeedParserResult.Entry;
|
||||
import com.commafeed.backend.feed.parser.FeedParserResult.Media;
|
||||
import com.google.common.collect.Iterables;
|
||||
import com.rometools.modules.mediarss.MediaEntryModule;
|
||||
import com.rometools.modules.mediarss.MediaModule;
|
||||
import com.rometools.modules.mediarss.types.MediaGroup;
|
||||
import com.rometools.modules.mediarss.types.Metadata;
|
||||
import com.rometools.modules.mediarss.types.Thumbnail;
|
||||
import com.rometools.rome.feed.synd.SyndCategory;
|
||||
import com.rometools.rome.feed.synd.SyndContent;
|
||||
import com.rometools.rome.feed.synd.SyndEnclosure;
|
||||
import com.rometools.rome.feed.synd.SyndEntry;
|
||||
import com.rometools.rome.feed.synd.SyndFeed;
|
||||
import com.rometools.rome.feed.synd.SyndLink;
|
||||
import com.rometools.rome.feed.synd.SyndLinkImpl;
|
||||
import com.rometools.rome.io.FeedException;
|
||||
import com.rometools.rome.io.SyndFeedInput;
|
||||
|
||||
import jakarta.inject.Inject;
|
||||
import jakarta.inject.Singleton;
|
||||
import lombok.RequiredArgsConstructor;
|
||||
|
||||
/**
|
||||
* Parses raw xml into a FeedParserResult object
|
||||
*/
|
||||
@RequiredArgsConstructor(onConstructor = @__({ @Inject }))
|
||||
@Singleton
|
||||
public class FeedParser {
|
||||
|
||||
private static final Namespace ATOM_10_NS = Namespace.getNamespace("http://www.w3.org/2005/Atom");
|
||||
|
||||
private static final Date START = new Date(86400000);
|
||||
private static final Date END = new Date(1000L * Integer.MAX_VALUE - 86400000);
|
||||
|
||||
private final EncodingDetector encodingDetector;
|
||||
private final FeedCleaner feedCleaner;
|
||||
|
||||
public FeedParserResult parse(String feedUrl, byte[] xml) throws FeedException {
|
||||
try {
|
||||
Charset encoding = encodingDetector.getEncoding(xml);
|
||||
String xmlString = feedCleaner.trimInvalidXmlCharacters(new String(xml, encoding));
|
||||
if (xmlString == null) {
|
||||
throw new FeedException("Input string is null for url " + feedUrl);
|
||||
}
|
||||
xmlString = feedCleaner.replaceHtmlEntitiesWithNumericEntities(xmlString);
|
||||
|
||||
InputSource source = new InputSource(new StringReader(xmlString));
|
||||
SyndFeed feed = new SyndFeedInput().build(source);
|
||||
handleForeignMarkup(feed);
|
||||
|
||||
String title = feed.getTitle();
|
||||
String link = feed.getLink();
|
||||
List<Entry> entries = buildEntries(feed, feedUrl);
|
||||
Date lastEntryDate = entries.stream().findFirst().map(Entry::updated).orElse(null);
|
||||
Date lastPublishedDate = validateDate(feed.getPublishedDate(), false);
|
||||
if (lastPublishedDate == null || lastPublishedDate.before(lastEntryDate)) {
|
||||
lastPublishedDate = lastEntryDate;
|
||||
}
|
||||
Long averageEntryInterval = averageTimeBetweenEntries(entries);
|
||||
|
||||
return new FeedParserResult(title, link, lastPublishedDate, averageEntryInterval, lastEntryDate, entries);
|
||||
} catch (Exception e) {
|
||||
throw new FeedException(String.format("Could not parse feed from %s : %s", feedUrl, e.getMessage()), e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds atom links for rss feeds
|
||||
*/
|
||||
private void handleForeignMarkup(SyndFeed feed) {
|
||||
List<Element> foreignMarkup = feed.getForeignMarkup();
|
||||
if (foreignMarkup == null) {
|
||||
return;
|
||||
}
|
||||
for (Element element : foreignMarkup) {
|
||||
if ("link".equals(element.getName()) && ATOM_10_NS.equals(element.getNamespace())) {
|
||||
SyndLink link = new SyndLinkImpl();
|
||||
link.setRel(element.getAttributeValue("rel"));
|
||||
link.setHref(element.getAttributeValue("href"));
|
||||
feed.getLinks().add(link);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private List<Entry> buildEntries(SyndFeed feed, String feedUrl) {
|
||||
List<Entry> entries = new ArrayList<>();
|
||||
|
||||
for (SyndEntry item : feed.getEntries()) {
|
||||
String guid = item.getUri();
|
||||
if (StringUtils.isBlank(guid)) {
|
||||
guid = item.getLink();
|
||||
}
|
||||
if (StringUtils.isBlank(guid)) {
|
||||
// no guid and no link, skip entry
|
||||
continue;
|
||||
}
|
||||
|
||||
String url = buildEntryUrl(feed, feedUrl, item);
|
||||
if (StringUtils.isBlank(url) && FeedUtils.isAbsoluteUrl(guid)) {
|
||||
// if link is empty but guid is used as url, use guid
|
||||
url = guid;
|
||||
}
|
||||
|
||||
Date updated = buildEntryUpdateDate(item);
|
||||
Content content = buildContent(item);
|
||||
|
||||
entries.add(new Entry(guid, url, updated, content));
|
||||
}
|
||||
|
||||
entries.sort(Comparator.comparing(Entry::updated).reversed());
|
||||
return entries;
|
||||
}
|
||||
|
||||
private Content buildContent(SyndEntry item) {
|
||||
String title = getTitle(item);
|
||||
String content = getContent(item);
|
||||
String author = StringUtils.trimToNull(item.getAuthor());
|
||||
String categories = StringUtils
|
||||
.trimToNull(item.getCategories().stream().map(SyndCategory::getName).collect(Collectors.joining(", ")));
|
||||
|
||||
Enclosure enclosure = buildEnclosure(item);
|
||||
Media media = buildMedia(item);
|
||||
return new Content(title, content, author, categories, enclosure, media);
|
||||
}
|
||||
|
||||
private Enclosure buildEnclosure(SyndEntry item) {
|
||||
SyndEnclosure enclosure = Iterables.getFirst(item.getEnclosures(), null);
|
||||
if (enclosure == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
return new Enclosure(enclosure.getUrl(), enclosure.getType());
|
||||
}
|
||||
|
||||
private Date buildEntryUpdateDate(SyndEntry item) {
|
||||
Date date = item.getUpdatedDate();
|
||||
if (date == null) {
|
||||
date = item.getPublishedDate();
|
||||
}
|
||||
if (date == null) {
|
||||
date = new Date();
|
||||
}
|
||||
return validateDate(date, true);
|
||||
}
|
||||
|
||||
private String buildEntryUrl(SyndFeed feed, String feedUrl, SyndEntry item) {
|
||||
String url = StringUtils.trimToNull(StringUtils.normalizeSpace(item.getLink()));
|
||||
if (url == null || FeedUtils.isAbsoluteUrl(url)) {
|
||||
// url is absolute, nothing to do
|
||||
return url;
|
||||
}
|
||||
|
||||
// url is relative, trying to resolve it
|
||||
String feedLink = StringUtils.trimToNull(StringUtils.normalizeSpace(feed.getLink()));
|
||||
return FeedUtils.toAbsoluteUrl(url, feedLink, feedUrl);
|
||||
}
|
||||
|
||||
private Date validateDate(Date date, boolean nullToNow) {
|
||||
Date now = new Date();
|
||||
if (date == null) {
|
||||
return nullToNow ? now : null;
|
||||
}
|
||||
if (date.before(START) || date.after(END)) {
|
||||
return now;
|
||||
}
|
||||
|
||||
if (date.after(now)) {
|
||||
return now;
|
||||
}
|
||||
return date;
|
||||
}
|
||||
|
||||
private String getContent(SyndEntry item) {
|
||||
String content;
|
||||
if (item.getContents().isEmpty()) {
|
||||
content = item.getDescription() == null ? null : item.getDescription().getValue();
|
||||
} else {
|
||||
content = item.getContents().stream().map(SyndContent::getValue).collect(Collectors.joining(System.lineSeparator()));
|
||||
}
|
||||
return StringUtils.trimToNull(content);
|
||||
}
|
||||
|
||||
private String getTitle(SyndEntry item) {
|
||||
String title = item.getTitle();
|
||||
if (StringUtils.isBlank(title)) {
|
||||
Date date = item.getPublishedDate();
|
||||
if (date != null) {
|
||||
title = DateFormat.getInstance().format(date);
|
||||
} else {
|
||||
title = "(no title)";
|
||||
}
|
||||
}
|
||||
return StringUtils.trimToNull(title);
|
||||
}
|
||||
|
||||
private Media buildMedia(SyndEntry item) {
|
||||
MediaEntryModule module = (MediaEntryModule) item.getModule(MediaModule.URI);
|
||||
if (module == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
Media media = buildMedia(module.getMetadata());
|
||||
if (media == null && ArrayUtils.isNotEmpty(module.getMediaGroups())) {
|
||||
MediaGroup group = module.getMediaGroups()[0];
|
||||
media = buildMedia(group.getMetadata());
|
||||
}
|
||||
|
||||
return media;
|
||||
}
|
||||
|
||||
private Media buildMedia(Metadata metadata) {
|
||||
if (metadata == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
String description = metadata.getDescription();
|
||||
|
||||
String thumbnailUrl = null;
|
||||
Integer thumbnailWidth = null;
|
||||
Integer thumbnailHeight = null;
|
||||
if (ArrayUtils.isNotEmpty(metadata.getThumbnail())) {
|
||||
Thumbnail thumbnail = metadata.getThumbnail()[0];
|
||||
thumbnailWidth = thumbnail.getWidth();
|
||||
thumbnailHeight = thumbnail.getHeight();
|
||||
if (thumbnail.getUrl() != null) {
|
||||
thumbnailUrl = thumbnail.getUrl().toString();
|
||||
}
|
||||
}
|
||||
|
||||
if (description == null && thumbnailUrl == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
return new Media(description, thumbnailUrl, thumbnailWidth, thumbnailHeight);
|
||||
}
|
||||
|
||||
private Long averageTimeBetweenEntries(List<Entry> entries) {
|
||||
if (entries.isEmpty() || entries.size() == 1) {
|
||||
return null;
|
||||
}
|
||||
|
||||
SummaryStatistics stats = new SummaryStatistics();
|
||||
for (int i = 0; i < entries.size() - 1; i++) {
|
||||
long diff = Math.abs(entries.get(i).updated().getTime() - entries.get(i + 1).updated().getTime());
|
||||
stats.addValue(diff);
|
||||
}
|
||||
return (long) stats.getMean();
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,20 @@
|
||||
package com.commafeed.backend.feed.parser;
|
||||
|
||||
import java.util.Date;
|
||||
import java.util.List;
|
||||
|
||||
/**
 * Result of parsing a feed, as built by FeedParser.
 *
 * @param title
 *            title of the feed
 * @param link
 *            link of the feed, as declared in the feed
 * @param lastPublishedDate
 *            publication date of the feed, or the date of the most recent entry if that one is more recent or if the feed has no
 *            publication date
 * @param averageEntryInterval
 *            mean interval in milliseconds between consecutive entries, null when there are less than two entries
 * @param lastEntryDate
 *            updated date of the most recent entry, null when there are no entries
 * @param entries
 *            entries of the feed, sorted by FeedParser from most recently updated to least recently updated
 */
public record FeedParserResult(String title, String link, Date lastPublishedDate, Long averageEntryInterval, Date lastEntryDate,
		List<Entry> entries) {

	/**
	 * An entry of the feed.
	 *
	 * @param guid
	 *            identifier of the entry
	 * @param url
	 *            url of the entry
	 * @param updated
	 *            date the entry was last updated
	 * @param content
	 *            content of the entry
	 */
	public record Entry(String guid, String url, Date updated, Content content) {
	}

	/**
	 * The content of an entry. enclosure and media are null when the entry has none.
	 */
	public record Content(String title, String content, String author, String categories, Enclosure enclosure, Media media) {
	}

	/**
	 * An enclosure attached to an entry, described by its url and its type.
	 */
	public record Enclosure(String url, String type) {
	}

	/**
	 * Media information attached to an entry: a description and a thumbnail with its dimensions.
	 */
	public record Media(String description, String thumbnailUrl, Integer thumbnailWidth, Integer thumbnailHeight) {
	}

}
|
||||
@@ -1,10 +1,13 @@
|
||||
package com.commafeed.backend.feed;
|
||||
package com.commafeed.backend.feed.parser;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.Map;
|
||||
|
||||
public class HtmlEntities {
|
||||
import lombok.experimental.UtilityClass;
|
||||
|
||||
@UtilityClass
|
||||
class HtmlEntities {
|
||||
public static final Map<String, String> HTML_TO_NUMERIC_MAP;
|
||||
public static final String[] HTML_ENTITIES;
|
||||
public static final String[] NUMERIC_ENTITIES;
|
||||
@@ -3,7 +3,6 @@ package com.commafeed.backend.model;
|
||||
import java.sql.Types;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.commons.lang3.builder.EqualsBuilder;
|
||||
import org.hibernate.annotations.JdbcTypeCode;
|
||||
|
||||
import jakarta.persistence.Column;
|
||||
@@ -61,22 +60,4 @@ public class FeedEntryContent extends AbstractModel {
|
||||
@OneToMany(mappedBy = "content")
|
||||
private Set<FeedEntry> entries;
|
||||
|
||||
public boolean equivalentTo(FeedEntryContent c) {
|
||||
if (c == null) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return new EqualsBuilder().append(title, c.title)
|
||||
.append(content, c.content)
|
||||
.append(author, c.author)
|
||||
.append(enclosureUrl, c.enclosureUrl)
|
||||
.append(enclosureType, c.enclosureType)
|
||||
.append(mediaDescription, c.mediaDescription)
|
||||
.append(mediaThumbnailUrl, c.mediaThumbnailUrl)
|
||||
.append(mediaThumbnailWidth, c.mediaThumbnailWidth)
|
||||
.append(mediaThumbnailHeight, c.mediaThumbnailHeight)
|
||||
.append(categories, c.categories)
|
||||
.build();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -8,6 +8,7 @@ import java.util.Optional;
|
||||
|
||||
import org.apache.commons.codec.digest.DigestUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.lang3.builder.EqualsBuilder;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.jsoup.nodes.Document.OutputSettings;
|
||||
@@ -23,6 +24,9 @@ import org.w3c.dom.css.CSSStyleDeclaration;
|
||||
|
||||
import com.commafeed.backend.dao.FeedEntryContentDAO;
|
||||
import com.commafeed.backend.feed.FeedUtils;
|
||||
import com.commafeed.backend.feed.parser.FeedParserResult.Content;
|
||||
import com.commafeed.backend.feed.parser.FeedParserResult.Enclosure;
|
||||
import com.commafeed.backend.feed.parser.FeedParserResult.Media;
|
||||
import com.commafeed.backend.model.FeedEntryContent;
|
||||
import com.steadystate.css.parser.CSSOMParser;
|
||||
|
||||
@@ -46,26 +50,65 @@ public class FeedEntryContentService {
|
||||
/**
|
||||
* this is NOT thread-safe
|
||||
*/
|
||||
public FeedEntryContent findOrCreate(FeedEntryContent content, String baseUrl) {
|
||||
content.setAuthor(FeedUtils.truncate(handleContent(content.getAuthor(), baseUrl, true), 128));
|
||||
content.setTitle(FeedUtils.truncate(handleContent(content.getTitle(), baseUrl, true), 2048));
|
||||
content.setContent(handleContent(content.getContent(), baseUrl, false));
|
||||
content.setMediaDescription(handleContent(content.getMediaDescription(), baseUrl, false));
|
||||
public FeedEntryContent findOrCreate(Content content, String baseUrl) {
|
||||
String title = FeedUtils.truncate(handleContent(content.title(), baseUrl, true), 2048);
|
||||
String titleHash = DigestUtils.sha1Hex(StringUtils.trimToEmpty(title));
|
||||
|
||||
String contentHash = DigestUtils.sha1Hex(StringUtils.trimToEmpty(content.getContent()));
|
||||
content.setContentHash(contentHash);
|
||||
|
||||
String titleHash = DigestUtils.sha1Hex(StringUtils.trimToEmpty(content.getTitle()));
|
||||
content.setTitleHash(titleHash);
|
||||
String contentString = handleContent(content.content(), baseUrl, false);
|
||||
String contentHash = DigestUtils.sha1Hex(StringUtils.trimToEmpty(contentString));
|
||||
|
||||
List<FeedEntryContent> existing = feedEntryContentDAO.findExisting(contentHash, titleHash);
|
||||
Optional<FeedEntryContent> equivalentContent = existing.stream().filter(content::equivalentTo).findFirst();
|
||||
Optional<FeedEntryContent> equivalentContent = existing.stream()
|
||||
.filter(c -> isEquivalent(c, content, title, contentString))
|
||||
.findFirst();
|
||||
if (equivalentContent.isPresent()) {
|
||||
return equivalentContent.get();
|
||||
}
|
||||
|
||||
feedEntryContentDAO.saveOrUpdate(content);
|
||||
return content;
|
||||
FeedEntryContent entryContent = new FeedEntryContent();
|
||||
entryContent.setTitle(title);
|
||||
entryContent.setTitleHash(titleHash);
|
||||
entryContent.setContent(contentString);
|
||||
entryContent.setContentHash(contentHash);
|
||||
entryContent.setAuthor(FeedUtils.truncate(handleContent(content.author(), baseUrl, true), 128));
|
||||
entryContent.setCategories(FeedUtils.truncate(content.categories(), 4096));
|
||||
|
||||
Enclosure enclosure = content.enclosure();
|
||||
if (enclosure != null) {
|
||||
entryContent.setEnclosureUrl(enclosure.url());
|
||||
entryContent.setEnclosureType(enclosure.type());
|
||||
}
|
||||
|
||||
Media media = content.media();
|
||||
if (media != null) {
|
||||
entryContent.setMediaDescription(handleContent(media.description(), baseUrl, false));
|
||||
entryContent.setMediaThumbnailUrl(media.thumbnailUrl());
|
||||
entryContent.setMediaThumbnailWidth(media.thumbnailWidth());
|
||||
entryContent.setMediaThumbnailHeight(media.thumbnailHeight());
|
||||
}
|
||||
|
||||
feedEntryContentDAO.saveOrUpdate(entryContent);
|
||||
return entryContent;
|
||||
}
|
||||
|
||||
private boolean isEquivalent(FeedEntryContent content, Content c, String title, String contentString) {
|
||||
EqualsBuilder builder = new EqualsBuilder().append(content.getTitle(), title)
|
||||
.append(content.getContent(), contentString)
|
||||
.append(content.getAuthor(), c.author())
|
||||
.append(content.getCategories(), c.categories());
|
||||
|
||||
if (c.enclosure() != null) {
|
||||
builder.append(content.getEnclosureUrl(), c.enclosure().url()).append(content.getEnclosureType(), c.enclosure().type());
|
||||
}
|
||||
|
||||
if (c.media() != null) {
|
||||
builder.append(content.getMediaDescription(), c.media().description())
|
||||
.append(content.getMediaThumbnailUrl(), c.media().thumbnailUrl())
|
||||
.append(content.getMediaThumbnailWidth(), c.media().thumbnailWidth())
|
||||
.append(content.getMediaThumbnailHeight(), c.media().thumbnailHeight());
|
||||
}
|
||||
|
||||
return builder.build();
|
||||
}
|
||||
|
||||
private static Safelist buildWhiteList() {
|
||||
|
||||
@@ -10,9 +10,10 @@ import com.commafeed.backend.dao.FeedEntryDAO;
|
||||
import com.commafeed.backend.dao.FeedEntryStatusDAO;
|
||||
import com.commafeed.backend.dao.FeedSubscriptionDAO;
|
||||
import com.commafeed.backend.feed.FeedEntryKeyword;
|
||||
import com.commafeed.backend.feed.FeedUtils;
|
||||
import com.commafeed.backend.feed.parser.FeedParserResult.Entry;
|
||||
import com.commafeed.backend.model.Feed;
|
||||
import com.commafeed.backend.model.FeedEntry;
|
||||
import com.commafeed.backend.model.FeedEntryContent;
|
||||
import com.commafeed.backend.model.FeedEntryStatus;
|
||||
import com.commafeed.backend.model.FeedSubscription;
|
||||
import com.commafeed.backend.model.User;
|
||||
@@ -37,30 +38,27 @@ public class FeedEntryService {
|
||||
/**
|
||||
* this is NOT thread-safe
|
||||
*/
|
||||
public boolean addEntry(Feed feed, FeedEntry entry, List<FeedSubscription> subscriptions) {
|
||||
|
||||
Long existing = feedEntryDAO.findExisting(entry.getGuid(), feed);
|
||||
public boolean addEntry(Feed feed, Entry entry, List<FeedSubscription> subscriptions) {
|
||||
String guid = FeedUtils.truncate(entry.guid(), 2048);
|
||||
String guidHash = DigestUtils.sha1Hex(entry.guid());
|
||||
Long existing = feedEntryDAO.findExisting(guidHash, feed);
|
||||
if (existing != null) {
|
||||
return false;
|
||||
}
|
||||
|
||||
FeedEntryContent content = feedEntryContentService.findOrCreate(entry.getContent(), feed.getLink());
|
||||
entry.setGuidHash(DigestUtils.sha1Hex(entry.getGuid()));
|
||||
entry.setContent(content);
|
||||
entry.setInserted(new Date());
|
||||
entry.setFeed(feed);
|
||||
feedEntryDAO.saveOrUpdate(entry);
|
||||
FeedEntry feedEntry = buildEntry(feed, entry, guid, guidHash);
|
||||
feedEntryDAO.saveOrUpdate(feedEntry);
|
||||
|
||||
// if filter does not match the entry, mark it as read
|
||||
for (FeedSubscription sub : subscriptions) {
|
||||
boolean matches = true;
|
||||
try {
|
||||
matches = feedEntryFilteringService.filterMatchesEntry(sub.getFilter(), entry);
|
||||
matches = feedEntryFilteringService.filterMatchesEntry(sub.getFilter(), feedEntry);
|
||||
} catch (FeedEntryFilteringService.FeedEntryFilterException e) {
|
||||
log.error("could not evaluate filter {}", sub.getFilter(), e);
|
||||
}
|
||||
if (!matches) {
|
||||
FeedEntryStatus status = new FeedEntryStatus(sub.getUser(), sub, entry);
|
||||
FeedEntryStatus status = new FeedEntryStatus(sub.getUser(), sub, feedEntry);
|
||||
status.setRead(true);
|
||||
feedEntryStatusDAO.saveOrUpdate(status);
|
||||
}
|
||||
@@ -69,8 +67,20 @@ public class FeedEntryService {
|
||||
return true;
|
||||
}
|
||||
|
||||
public void markEntry(User user, Long entryId, boolean read) {
|
||||
private FeedEntry buildEntry(Feed feed, Entry e, String guid, String guidHash) {
|
||||
FeedEntry entry = new FeedEntry();
|
||||
entry.setGuid(guid);
|
||||
entry.setGuidHash(guidHash);
|
||||
entry.setUrl(FeedUtils.truncate(e.url(), 2048));
|
||||
entry.setUpdated(e.updated());
|
||||
entry.setInserted(new Date());
|
||||
entry.setFeed(feed);
|
||||
|
||||
entry.setContent(feedEntryContentService.findOrCreate(e.content(), feed.getLink()));
|
||||
return entry;
|
||||
}
|
||||
|
||||
public void markEntry(User user, Long entryId, boolean read) {
|
||||
FeedEntry entry = feedEntryDAO.findById(entryId);
|
||||
if (entry == null) {
|
||||
return;
|
||||
|
||||
@@ -37,13 +37,14 @@ public class FeedService {
|
||||
}
|
||||
|
||||
public synchronized Feed findOrCreate(String url) {
|
||||
String normalized = FeedUtils.normalizeURL(url);
|
||||
Feed feed = feedDAO.findByUrl(normalized);
|
||||
String normalizedUrl = FeedUtils.normalizeURL(url);
|
||||
String normalizedUrlHash = DigestUtils.sha1Hex(normalizedUrl);
|
||||
Feed feed = feedDAO.findByUrl(normalizedUrl, normalizedUrlHash);
|
||||
if (feed == null) {
|
||||
feed = new Feed();
|
||||
feed.setUrl(url);
|
||||
feed.setNormalizedUrl(normalized);
|
||||
feed.setNormalizedUrlHash(DigestUtils.sha1Hex(normalized));
|
||||
feed.setNormalizedUrl(normalizedUrl);
|
||||
feed.setNormalizedUrlHash(normalizedUrlHash);
|
||||
feed.setDisabledUntil(new Date(0));
|
||||
feedDAO.saveOrUpdate(feed);
|
||||
}
|
||||
@@ -55,6 +56,7 @@ public class FeedService {
|
||||
feed.setNormalizedUrl(normalized);
|
||||
feed.setNormalizedUrlHash(DigestUtils.sha1Hex(normalized));
|
||||
feed.setLastUpdated(new Date());
|
||||
feed.setEtagHeader(FeedUtils.truncate(feed.getEtagHeader(), 255));
|
||||
feedDAO.saveOrUpdate(feed);
|
||||
}
|
||||
|
||||
|
||||
@@ -245,8 +245,8 @@ public class FeedREST {
|
||||
try {
|
||||
FeedFetcherResult feedFetcherResult = feedFetcher.fetch(url, true, null, null, null, null);
|
||||
info = new FeedInfo();
|
||||
info.setUrl(feedFetcherResult.getUrlAfterRedirect());
|
||||
info.setTitle(feedFetcherResult.getTitle());
|
||||
info.setUrl(feedFetcherResult.urlAfterRedirect());
|
||||
info.setTitle(feedFetcherResult.feed().title());
|
||||
|
||||
} catch (Exception e) {
|
||||
log.debug(e.getMessage(), e);
|
||||
|
||||
@@ -15,6 +15,7 @@ import org.mockito.junit.jupiter.MockitoExtension;
|
||||
import com.commafeed.backend.HttpGetter;
|
||||
import com.commafeed.backend.HttpGetter.HttpResult;
|
||||
import com.commafeed.backend.HttpGetter.NotModifiedException;
|
||||
import com.commafeed.backend.feed.parser.FeedParser;
|
||||
import com.commafeed.backend.urlprovider.FeedURLProvider;
|
||||
|
||||
@ExtendWith(MockitoExtension.class)
|
||||
|
||||
@@ -64,21 +64,6 @@ class FeedUtilsTest {
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
void testExtractDeclaredEncoding() {
|
||||
Assertions.assertNull(FeedUtils.extractDeclaredEncoding("<?xml ?>".getBytes()));
|
||||
Assertions.assertNull(FeedUtils.extractDeclaredEncoding("<feed></feed>".getBytes()));
|
||||
Assertions.assertEquals("UTF-8", FeedUtils.extractDeclaredEncoding("<?xml encoding=\"UTF-8\" ?>".getBytes()));
|
||||
Assertions.assertEquals("UTF-8", FeedUtils.extractDeclaredEncoding("<?xml encoding='UTF-8' ?>".getBytes()));
|
||||
Assertions.assertEquals("UTF-8", FeedUtils.extractDeclaredEncoding("<?xml encoding='UTF-8'?>".getBytes()));
|
||||
}
|
||||
|
||||
@Test
|
||||
void testReplaceHtmlEntitiesWithNumericEntities() {
|
||||
String source = "<source>T´l´phone ′</source>";
|
||||
Assertions.assertEquals("<source>T´l´phone ′</source>", FeedUtils.replaceHtmlEntitiesWithNumericEntities(source));
|
||||
}
|
||||
|
||||
@Test
|
||||
void testRemoveTrailingSlash() {
|
||||
final String url = "http://localhost/";
|
||||
|
||||
@@ -0,0 +1,19 @@
|
||||
package com.commafeed.backend.feed.parser;
|
||||
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
/**
 * Tests for EncodingDetector
 */
class EncodingDetectorTest {

	EncodingDetector encodingDetector = new EncodingDetector();

	/**
	 * Checks that null is returned when there is no encoding attribute or no xml declaration, and that the encoding is extracted
	 * whether it is surrounded by double quotes or single quotes, with or without a space before the closing "?>".
	 */
	@Test
	void testExtractDeclaredEncoding() {
		Assertions.assertNull(encodingDetector.extractDeclaredEncoding("<?xml ?>".getBytes()));
		Assertions.assertNull(encodingDetector.extractDeclaredEncoding("<feed></feed>".getBytes()));
		Assertions.assertEquals("UTF-8", encodingDetector.extractDeclaredEncoding("<?xml encoding=\"UTF-8\" ?>".getBytes()));
		Assertions.assertEquals("UTF-8", encodingDetector.extractDeclaredEncoding("<?xml encoding='UTF-8' ?>".getBytes()));
		Assertions.assertEquals("UTF-8", encodingDetector.extractDeclaredEncoding("<?xml encoding='UTF-8'?>".getBytes()));
	}

}
|
||||
@@ -0,0 +1,16 @@
|
||||
package com.commafeed.backend.feed.parser;
|
||||
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
/**
 * Tests for FeedCleaner
 */
class FeedCleanerTest {

	FeedCleaner feedCleaner = new FeedCleaner();

	/**
	 * Runs replaceHtmlEntitiesWithNumericEntities on a short xml snippet and compares the output with the expected text.
	 *
	 * NOTE(review): the input and the expected value are the same string here. Presumably the input is meant to contain named
	 * entities and the expected value their numeric counterparts - confirm that the literals have not been altered.
	 */
	@Test
	void testReplaceHtmlEntitiesWithNumericEntities() {
		String source = "<source>T´l´phone ′</source>";
		Assertions.assertEquals("<source>T´l´phone ′</source>", feedCleaner.replaceHtmlEntitiesWithNumericEntities(source));
	}

}
|
||||
Reference in New Issue
Block a user