split client and server into maven modules

This commit is contained in:
Athou
2022-08-13 10:34:59 +02:00
parent 4c4868a2b6
commit ac7b6eeb21
277 changed files with 645 additions and 521 deletions

View File

@@ -0,0 +1,39 @@
package com.commafeed.backend.feed;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.lang3.StringUtils;
import lombok.Getter;
import lombok.RequiredArgsConstructor;
/**
* A keyword used in a search query
*/
@Getter
@RequiredArgsConstructor
public class FeedEntryKeyword {
public enum Mode {
INCLUDE, EXCLUDE;
}
private final String keyword;
private final Mode mode;
public static List<FeedEntryKeyword> fromQueryString(String keywords) {
List<FeedEntryKeyword> list = new ArrayList<>();
if (keywords != null) {
for (String keyword : StringUtils.split(keywords)) {
boolean not = false;
if (keyword.startsWith("-") || keyword.startsWith("!")) {
not = true;
keyword = keyword.substring(1);
}
list.add(new FeedEntryKeyword(keyword, not ? Mode.EXCLUDE : Mode.INCLUDE));
}
}
return list;
}
}

View File

@@ -0,0 +1,96 @@
package com.commafeed.backend.feed;
import java.io.IOException;
import java.util.Date;
import java.util.Set;
import javax.inject.Inject;
import javax.inject.Singleton;
import org.apache.commons.codec.binary.StringUtils;
import org.apache.commons.codec.digest.DigestUtils;
import com.commafeed.backend.HttpGetter;
import com.commafeed.backend.HttpGetter.HttpResult;
import com.commafeed.backend.HttpGetter.NotModifiedException;
import com.commafeed.backend.model.Feed;
import com.commafeed.backend.urlprovider.FeedURLProvider;
import com.rometools.rome.io.FeedException;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@RequiredArgsConstructor(onConstructor = @__({ @Inject }))
@Singleton
public class FeedFetcher {
private final FeedParser parser;
private final HttpGetter getter;
private final Set<FeedURLProvider> urlProviders;
public FetchedFeed fetch(String feedUrl, boolean extractFeedUrlFromHtml, String lastModified, String eTag, Date lastPublishedDate,
String lastContentHash) throws FeedException, IOException, NotModifiedException {
log.debug("Fetching feed {}", feedUrl);
FetchedFeed fetchedFeed = null;
int timeout = 20000;
HttpResult result = getter.getBinary(feedUrl, lastModified, eTag, timeout);
byte[] content = result.getContent();
try {
fetchedFeed = parser.parse(result.getUrlAfterRedirect(), content);
} catch (FeedException e) {
if (extractFeedUrlFromHtml) {
String extractedUrl = extractFeedUrl(urlProviders, feedUrl, StringUtils.newStringUtf8(result.getContent()));
if (org.apache.commons.lang3.StringUtils.isNotBlank(extractedUrl)) {
feedUrl = extractedUrl;
result = getter.getBinary(extractedUrl, lastModified, eTag, timeout);
content = result.getContent();
fetchedFeed = parser.parse(result.getUrlAfterRedirect(), content);
} else {
throw e;
}
} else {
throw e;
}
}
if (content == null) {
throw new IOException("Feed content is empty.");
}
String hash = DigestUtils.sha1Hex(content);
if (lastContentHash != null && hash != null && lastContentHash.equals(hash)) {
log.debug("content hash not modified: {}", feedUrl);
throw new NotModifiedException("content hash not modified");
}
if (lastPublishedDate != null && fetchedFeed.getFeed().getLastPublishedDate() != null
&& lastPublishedDate.getTime() == fetchedFeed.getFeed().getLastPublishedDate().getTime()) {
log.debug("publishedDate not modified: {}", feedUrl);
throw new NotModifiedException("publishedDate not modified");
}
Feed feed = fetchedFeed.getFeed();
feed.setLastModifiedHeader(result.getLastModifiedSince());
feed.setEtagHeader(FeedUtils.truncate(result.getETag(), 255));
feed.setLastContentHash(hash);
fetchedFeed.setFetchDuration(result.getDuration());
fetchedFeed.setUrlAfterRedirect(result.getUrlAfterRedirect());
return fetchedFeed;
}
private static String extractFeedUrl(Set<FeedURLProvider> urlProviders, String url, String urlContent) {
for (FeedURLProvider urlProvider : urlProviders) {
String feedUrl = urlProvider.get(url, urlContent);
if (feedUrl != null) {
return feedUrl;
}
}
return null;
}
}

View File

@@ -0,0 +1,276 @@
package com.commafeed.backend.feed;
import java.io.StringReader;
import java.nio.charset.Charset;
import java.text.DateFormat;
import java.util.Date;
import java.util.List;
import java.util.stream.Collectors;
import javax.inject.Inject;
import javax.inject.Singleton;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.StringUtils;
import org.jdom2.Element;
import org.jdom2.Namespace;
import org.xml.sax.InputSource;
import com.commafeed.backend.model.Feed;
import com.commafeed.backend.model.FeedEntry;
import com.commafeed.backend.model.FeedEntryContent;
import com.google.common.collect.Iterables;
import com.rometools.modules.mediarss.MediaEntryModule;
import com.rometools.modules.mediarss.MediaModule;
import com.rometools.modules.mediarss.types.MediaGroup;
import com.rometools.modules.mediarss.types.Metadata;
import com.rometools.modules.mediarss.types.Thumbnail;
import com.rometools.rome.feed.synd.SyndCategory;
import com.rometools.rome.feed.synd.SyndContent;
import com.rometools.rome.feed.synd.SyndEnclosure;
import com.rometools.rome.feed.synd.SyndEntry;
import com.rometools.rome.feed.synd.SyndFeed;
import com.rometools.rome.feed.synd.SyndLink;
import com.rometools.rome.feed.synd.SyndLinkImpl;
import com.rometools.rome.io.FeedException;
import com.rometools.rome.io.SyndFeedInput;
import lombok.Data;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@RequiredArgsConstructor(onConstructor = @__({ @Inject }))
@Singleton
public class FeedParser {
private static final String ATOM_10_URI = "http://www.w3.org/2005/Atom";
private static final Namespace ATOM_10_NS = Namespace.getNamespace(ATOM_10_URI);
private static final Date START = new Date(86400000);
private static final Date END = new Date(1000L * Integer.MAX_VALUE - 86400000);
public FetchedFeed parse(String feedUrl, byte[] xml) throws FeedException {
FetchedFeed fetchedFeed = new FetchedFeed();
Feed feed = fetchedFeed.getFeed();
List<FeedEntry> entries = fetchedFeed.getEntries();
try {
Charset encoding = FeedUtils.guessEncoding(xml);
String xmlString = FeedUtils.trimInvalidXmlCharacters(new String(xml, encoding));
if (xmlString == null) {
throw new FeedException("Input string is null for url " + feedUrl);
}
xmlString = FeedUtils.replaceHtmlEntitiesWithNumericEntities(xmlString);
InputSource source = new InputSource(new StringReader(xmlString));
SyndFeed rss = new SyndFeedInput().build(source);
handleForeignMarkup(rss);
fetchedFeed.setTitle(rss.getTitle());
feed.setPushHub(findHub(rss));
feed.setPushTopic(findSelf(rss));
feed.setUrl(feedUrl);
feed.setLink(rss.getLink());
List<SyndEntry> items = rss.getEntries();
for (SyndEntry item : items) {
FeedEntry entry = new FeedEntry();
String guid = item.getUri();
if (StringUtils.isBlank(guid)) {
guid = item.getLink();
}
if (StringUtils.isBlank(guid)) {
// no guid and no link, skip entry
continue;
}
entry.setGuid(FeedUtils.truncate(guid, 2048));
entry.setUpdated(validateDate(getEntryUpdateDate(item), true));
entry.setUrl(FeedUtils.truncate(FeedUtils.toAbsoluteUrl(item.getLink(), feed.getLink(), feedUrl), 2048));
// if link is empty but guid is used as url
if (StringUtils.isBlank(entry.getUrl()) && StringUtils.startsWith(entry.getGuid(), "http")) {
entry.setUrl(entry.getGuid());
}
FeedEntryContent content = new FeedEntryContent();
content.setContent(getContent(item));
content.setCategories(FeedUtils
.truncate(item.getCategories().stream().map(SyndCategory::getName).collect(Collectors.joining(", ")), 4096));
content.setTitle(getTitle(item));
content.setAuthor(StringUtils.trimToNull(item.getAuthor()));
SyndEnclosure enclosure = Iterables.getFirst(item.getEnclosures(), null);
if (enclosure != null) {
content.setEnclosureUrl(FeedUtils.truncate(enclosure.getUrl(), 2048));
content.setEnclosureType(enclosure.getType());
}
MediaEntryModule module = (MediaEntryModule) item.getModule(MediaModule.URI);
if (module != null) {
Media media = getMedia(module);
if (media != null) {
content.setMediaDescription(media.getDescription());
content.setMediaThumbnailUrl(FeedUtils.truncate(media.getThumbnailUrl(), 2048));
content.setMediaThumbnailWidth(media.getThumbnailWidth());
content.setMediaThumbnailHeight(media.getThumbnailHeight());
}
}
entry.setContent(content);
entries.add(entry);
}
Date lastEntryDate = null;
Date publishedDate = validateDate(rss.getPublishedDate(), false);
if (!entries.isEmpty()) {
List<Long> sortedTimestamps = FeedUtils.getSortedTimestamps(entries);
Long timestamp = sortedTimestamps.get(0);
lastEntryDate = new Date(timestamp);
publishedDate = (publishedDate == null || publishedDate.before(lastEntryDate)) ? lastEntryDate : publishedDate;
}
feed.setLastPublishedDate(publishedDate);
feed.setAverageEntryInterval(FeedUtils.averageTimeBetweenEntries(entries));
feed.setLastEntryDate(lastEntryDate);
} catch (Exception e) {
throw new FeedException(String.format("Could not parse feed from %s : %s", feedUrl, e.getMessage()), e);
}
return fetchedFeed;
}
/**
* Adds atom links for rss feeds
*/
private void handleForeignMarkup(SyndFeed feed) {
List<Element> foreignMarkup = feed.getForeignMarkup();
if (foreignMarkup == null) {
return;
}
for (Element element : foreignMarkup) {
if ("link".equals(element.getName()) && ATOM_10_NS.equals(element.getNamespace())) {
SyndLink link = new SyndLinkImpl();
link.setRel(element.getAttributeValue("rel"));
link.setHref(element.getAttributeValue("href"));
feed.getLinks().add(link);
}
}
}
private Date getEntryUpdateDate(SyndEntry item) {
Date date = item.getUpdatedDate();
if (date == null) {
date = item.getPublishedDate();
}
if (date == null) {
date = new Date();
}
return date;
}
private Date validateDate(Date date, boolean nullToNow) {
Date now = new Date();
if (date == null) {
return nullToNow ? now : null;
}
if (date.before(START) || date.after(END)) {
return now;
}
if (date.after(now)) {
return now;
}
return date;
}
private String getContent(SyndEntry item) {
String content = null;
if (item.getContents().isEmpty()) {
content = item.getDescription() == null ? null : item.getDescription().getValue();
} else {
content = item.getContents().stream().map(SyndContent::getValue).collect(Collectors.joining(System.lineSeparator()));
}
return StringUtils.trimToNull(content);
}
private String getTitle(SyndEntry item) {
String title = item.getTitle();
if (StringUtils.isBlank(title)) {
Date date = item.getPublishedDate();
if (date != null) {
title = DateFormat.getInstance().format(date);
} else {
title = "(no title)";
}
}
return StringUtils.trimToNull(title);
}
private Media getMedia(MediaEntryModule module) {
Media media = getMedia(module.getMetadata());
if (media == null && ArrayUtils.isNotEmpty(module.getMediaGroups())) {
MediaGroup group = module.getMediaGroups()[0];
media = getMedia(group.getMetadata());
}
return media;
}
private Media getMedia(Metadata metadata) {
if (metadata == null) {
return null;
}
Media media = new Media();
media.setDescription(metadata.getDescription());
if (ArrayUtils.isNotEmpty(metadata.getThumbnail())) {
Thumbnail thumbnail = metadata.getThumbnail()[0];
media.setThumbnailWidth(thumbnail.getWidth());
media.setThumbnailHeight(thumbnail.getHeight());
if (thumbnail.getUrl() != null) {
media.setThumbnailUrl(thumbnail.getUrl().toString());
}
}
if (media.isEmpty()) {
return null;
}
return media;
}
private String findHub(SyndFeed feed) {
for (SyndLink l : feed.getLinks()) {
if ("hub".equalsIgnoreCase(l.getRel())) {
log.debug("found hub {} for feed {}", l.getHref(), feed.getLink());
return l.getHref();
}
}
return null;
}
private String findSelf(SyndFeed feed) {
for (SyndLink l : feed.getLinks()) {
if ("self".equalsIgnoreCase(l.getRel())) {
log.debug("found self {} for feed {}", l.getHref(), feed.getLink());
return l.getHref();
}
}
return null;
}
@Data
private static class Media {
private String description;
private String thumbnailUrl;
private Integer thumbnailWidth;
private Integer thumbnailHeight;
public boolean isEmpty() {
return description == null && thumbnailUrl == null;
}
}
}

View File

@@ -0,0 +1,160 @@
package com.commafeed.backend.feed;
import java.util.ArrayList;
import java.util.Date;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.stream.Collectors;
import javax.inject.Inject;
import javax.inject.Singleton;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.lang3.time.DateUtils;
import org.hibernate.SessionFactory;
import com.codahale.metrics.Gauge;
import com.codahale.metrics.Meter;
import com.codahale.metrics.MetricRegistry;
import com.commafeed.CommaFeedConfiguration;
import com.commafeed.backend.dao.FeedDAO;
import com.commafeed.backend.dao.UnitOfWork;
import com.commafeed.backend.model.Feed;
@Singleton
public class FeedQueues {
private SessionFactory sessionFactory;
private final FeedDAO feedDAO;
private final CommaFeedConfiguration config;
private Queue<FeedRefreshContext> addQueue = new ConcurrentLinkedQueue<>();
private Queue<FeedRefreshContext> takeQueue = new ConcurrentLinkedQueue<>();
private Queue<Feed> giveBackQueue = new ConcurrentLinkedQueue<>();
private Meter refill;
@Inject
public FeedQueues(SessionFactory sessionFactory, FeedDAO feedDAO, CommaFeedConfiguration config, MetricRegistry metrics) {
this.sessionFactory = sessionFactory;
this.config = config;
this.feedDAO = feedDAO;
refill = metrics.meter(MetricRegistry.name(getClass(), "refill"));
metrics.register(MetricRegistry.name(getClass(), "addQueue"), new Gauge<Integer>() {
@Override
public Integer getValue() {
return addQueue.size();
}
});
metrics.register(MetricRegistry.name(getClass(), "takeQueue"), new Gauge<Integer>() {
@Override
public Integer getValue() {
return takeQueue.size();
}
});
metrics.register(MetricRegistry.name(getClass(), "giveBackQueue"), new Gauge<Integer>() {
@Override
public Integer getValue() {
return giveBackQueue.size();
}
});
}
/**
* take a feed from the refresh queue
*/
public synchronized FeedRefreshContext take() {
FeedRefreshContext context = takeQueue.poll();
if (context == null) {
refill();
context = takeQueue.poll();
}
return context;
}
/**
* add a feed to the refresh queue
*/
public void add(Feed feed, boolean urgent) {
int refreshInterval = config.getApplicationSettings().getRefreshIntervalMinutes();
if (feed.getLastUpdated() == null || feed.getLastUpdated().before(DateUtils.addMinutes(new Date(), -1 * refreshInterval))) {
boolean alreadyQueued = addQueue.stream().anyMatch(c -> c.getFeed().getId().equals(feed.getId()));
if (!alreadyQueued) {
addQueue.add(new FeedRefreshContext(feed, urgent));
}
}
}
/**
* refills the refresh queue and empties the giveBack queue while at it
*/
private void refill() {
refill.mark();
List<FeedRefreshContext> contexts = new ArrayList<>();
int batchSize = Math.min(100, 3 * config.getApplicationSettings().getBackgroundThreads());
// add feeds we got from the add() method
int addQueueSize = addQueue.size();
for (int i = 0; i < Math.min(batchSize, addQueueSize); i++) {
contexts.add(addQueue.poll());
}
// add feeds that are up to refresh from the database
int count = batchSize - contexts.size();
if (count > 0) {
List<Feed> feeds = UnitOfWork.call(sessionFactory, () -> feedDAO.findNextUpdatable(count, getLastLoginThreshold()));
for (Feed feed : feeds) {
contexts.add(new FeedRefreshContext(feed, false));
}
}
// set the disabledDate as we use it in feedDAO to decide what to refresh next. We also use a map to remove
// duplicates.
Map<Long, FeedRefreshContext> map = new LinkedHashMap<>();
for (FeedRefreshContext context : contexts) {
Feed feed = context.getFeed();
feed.setDisabledUntil(DateUtils.addMinutes(new Date(), config.getApplicationSettings().getRefreshIntervalMinutes()));
map.put(feed.getId(), context);
}
// refill the queue
takeQueue.addAll(map.values());
// add feeds from the giveBack queue to the map, overriding duplicates
int giveBackQueueSize = giveBackQueue.size();
for (int i = 0; i < giveBackQueueSize; i++) {
Feed feed = giveBackQueue.poll();
map.put(feed.getId(), new FeedRefreshContext(feed, false));
}
// update all feeds in the database
List<Feed> feeds = map.values().stream().map(c -> c.getFeed()).collect(Collectors.toList());
UnitOfWork.run(sessionFactory, () -> feedDAO.saveOrUpdate(feeds));
}
/**
* give a feed back, updating it to the database during the next refill()
*/
public void giveBack(Feed feed) {
String normalized = FeedUtils.normalizeURL(feed.getUrl());
feed.setNormalizedUrl(normalized);
feed.setNormalizedUrlHash(DigestUtils.sha1Hex(normalized));
feed.setLastUpdated(new Date());
giveBackQueue.add(feed);
}
private Date getLastLoginThreshold() {
if (config.getApplicationSettings().getHeavyLoad()) {
return DateUtils.addDays(new Date(), -30);
} else {
return null;
}
}
}

View File

@@ -0,0 +1,22 @@
package com.commafeed.backend.feed;
import java.util.List;
import com.commafeed.backend.model.Feed;
import com.commafeed.backend.model.FeedEntry;
import lombok.Getter;
import lombok.Setter;
@Getter
@Setter
public class FeedRefreshContext {
private Feed feed;
private List<FeedEntry> entries;
private boolean urgent;
public FeedRefreshContext(Feed feed, boolean isUrgent) {
this.feed = feed;
this.urgent = isUrgent;
}
}

View File

@@ -0,0 +1,99 @@
package com.commafeed.backend.feed;
import java.util.concurrent.LinkedBlockingDeque;
import java.util.concurrent.RejectedExecutionHandler;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import com.codahale.metrics.Gauge;
import com.codahale.metrics.MetricRegistry;
import lombok.extern.slf4j.Slf4j;
/**
* Wraps a {@link ThreadPoolExecutor} instance. Blocks when queue is full instead of rejecting the task. Allow priority queueing by using
* {@link Task} instead of {@link Runnable}
*
*/
@Slf4j
public class FeedRefreshExecutor {
private String poolName;
private ThreadPoolExecutor pool;
private LinkedBlockingDeque<Runnable> queue;
public FeedRefreshExecutor(final String poolName, int threads, int queueCapacity, MetricRegistry metrics) {
log.info("Creating pool {} with {} threads", poolName, threads);
this.poolName = poolName;
pool = new ThreadPoolExecutor(threads, threads, 0, TimeUnit.MILLISECONDS, queue = new LinkedBlockingDeque<Runnable>(queueCapacity) {
private static final long serialVersionUID = 1L;
@Override
public boolean offer(Runnable r) {
Task task = (Task) r;
if (task.isUrgent()) {
return offerFirst(r);
} else {
return offerLast(r);
}
}
}) {
@Override
protected void afterExecute(Runnable r, Throwable t) {
if (t != null) {
log.error("thread from pool {} threw a runtime exception", poolName, t);
}
}
};
pool.setRejectedExecutionHandler(new RejectedExecutionHandler() {
@Override
public void rejectedExecution(Runnable r, ThreadPoolExecutor e) {
log.debug("{} thread queue full, waiting...", poolName);
try {
Task task = (Task) r;
if (task.isUrgent()) {
queue.putFirst(r);
} else {
queue.put(r);
}
} catch (InterruptedException e1) {
log.error(poolName + " interrupted while waiting for queue.", e1);
}
}
});
metrics.register(MetricRegistry.name(getClass(), poolName, "active"), new Gauge<Integer>() {
@Override
public Integer getValue() {
return pool.getActiveCount();
}
});
metrics.register(MetricRegistry.name(getClass(), poolName, "pending"), new Gauge<Integer>() {
@Override
public Integer getValue() {
return queue.size();
}
});
}
public void execute(Task task) {
pool.execute(task);
}
public void shutdown() {
pool.shutdownNow();
while (!pool.isTerminated()) {
try {
Thread.sleep(100);
} catch (InterruptedException e) {
log.error("{} interrupted while waiting for threads to finish.", poolName);
}
}
}
public interface Task extends Runnable {
boolean isUrgent();
}
}

View File

@@ -0,0 +1,85 @@
package com.commafeed.backend.feed;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import javax.inject.Inject;
import javax.inject.Singleton;
import com.codahale.metrics.Meter;
import com.codahale.metrics.MetricRegistry;
import com.commafeed.CommaFeedConfiguration;
import com.commafeed.backend.dao.FeedDAO;
import io.dropwizard.lifecycle.Managed;
import lombok.extern.slf4j.Slf4j;
/**
* Infinite loop fetching feeds from @FeedQueues and queuing them to the {@link FeedRefreshWorker} pool.
*
*/
@Slf4j
@Singleton
public class FeedRefreshTaskGiver implements Managed {
private final FeedQueues queues;
private final FeedRefreshWorker worker;
private final ExecutorService executor;
private final Meter feedRefreshed;
private final Meter threadWaited;
@Inject
public FeedRefreshTaskGiver(FeedQueues queues, FeedDAO feedDAO, FeedRefreshWorker worker, CommaFeedConfiguration config,
MetricRegistry metrics) {
this.queues = queues;
this.worker = worker;
executor = Executors.newFixedThreadPool(1);
feedRefreshed = metrics.meter(MetricRegistry.name(getClass(), "feedRefreshed"));
threadWaited = metrics.meter(MetricRegistry.name(getClass(), "threadWaited"));
}
@Override
public void stop() {
log.info("shutting down feed refresh task giver");
executor.shutdownNow();
while (!executor.isTerminated()) {
try {
Thread.sleep(100);
} catch (InterruptedException e) {
log.error("interrupted while waiting for threads to finish.");
}
}
}
@Override
public void start() {
log.info("starting feed refresh task giver");
executor.execute(new Runnable() {
@Override
public void run() {
while (!executor.isShutdown()) {
try {
FeedRefreshContext context = queues.take();
if (context != null) {
feedRefreshed.mark();
worker.updateFeed(context);
} else {
log.debug("nothing to do, sleeping for 15s");
threadWaited.mark();
try {
Thread.sleep(15000);
} catch (InterruptedException e) {
log.debug("interrupted while sleeping");
}
}
} catch (Exception e) {
log.error(e.getMessage(), e);
}
}
}
});
}
}

View File

@@ -0,0 +1,226 @@
package com.commafeed.backend.feed;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.locks.Lock;
import java.util.stream.Collectors;
import javax.inject.Inject;
import javax.inject.Singleton;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.time.DateUtils;
import org.hibernate.SessionFactory;
import com.codahale.metrics.Meter;
import com.codahale.metrics.MetricRegistry;
import com.commafeed.CommaFeedConfiguration;
import com.commafeed.CommaFeedConfiguration.ApplicationSettings;
import com.commafeed.backend.cache.CacheService;
import com.commafeed.backend.dao.FeedSubscriptionDAO;
import com.commafeed.backend.dao.UnitOfWork;
import com.commafeed.backend.feed.FeedRefreshExecutor.Task;
import com.commafeed.backend.model.Feed;
import com.commafeed.backend.model.FeedEntry;
import com.commafeed.backend.model.FeedEntryContent;
import com.commafeed.backend.model.FeedSubscription;
import com.commafeed.backend.model.User;
import com.commafeed.backend.service.FeedUpdateService;
import com.commafeed.backend.service.PubSubService;
import com.google.common.util.concurrent.Striped;
import io.dropwizard.lifecycle.Managed;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@Singleton
public class FeedRefreshUpdater implements Managed {
private final SessionFactory sessionFactory;
private final FeedUpdateService feedUpdateService;
private final PubSubService pubSubService;
private final FeedQueues queues;
private final CommaFeedConfiguration config;
private final FeedSubscriptionDAO feedSubscriptionDAO;
private final CacheService cache;
private final FeedRefreshExecutor pool;
private final Striped<Lock> locks;
private final Meter entryCacheMiss;
private final Meter entryCacheHit;
private final Meter feedUpdated;
private final Meter entryInserted;
@Inject
public FeedRefreshUpdater(SessionFactory sessionFactory, FeedUpdateService feedUpdateService, PubSubService pubSubService,
FeedQueues queues, CommaFeedConfiguration config, MetricRegistry metrics, FeedSubscriptionDAO feedSubscriptionDAO,
CacheService cache) {
this.sessionFactory = sessionFactory;
this.feedUpdateService = feedUpdateService;
this.pubSubService = pubSubService;
this.queues = queues;
this.config = config;
this.feedSubscriptionDAO = feedSubscriptionDAO;
this.cache = cache;
ApplicationSettings settings = config.getApplicationSettings();
int threads = Math.max(settings.getDatabaseUpdateThreads(), 1);
pool = new FeedRefreshExecutor("feed-refresh-updater", threads, Math.min(50 * threads, 1000), metrics);
locks = Striped.lazyWeakLock(threads * 100000);
entryCacheMiss = metrics.meter(MetricRegistry.name(getClass(), "entryCacheMiss"));
entryCacheHit = metrics.meter(MetricRegistry.name(getClass(), "entryCacheHit"));
feedUpdated = metrics.meter(MetricRegistry.name(getClass(), "feedUpdated"));
entryInserted = metrics.meter(MetricRegistry.name(getClass(), "entryInserted"));
}
@Override
public void start() throws Exception {
}
@Override
public void stop() throws Exception {
log.info("shutting down feed refresh updater");
pool.shutdown();
}
public void updateFeed(FeedRefreshContext context) {
pool.execute(new EntryTask(context));
}
private boolean addEntry(final Feed feed, final FeedEntry entry, final List<FeedSubscription> subscriptions) {
boolean success = false;
// lock on feed, make sure we are not updating the same feed twice at
// the same time
String key1 = StringUtils.trimToEmpty("" + feed.getId());
// lock on content, make sure we are not updating the same entry
// twice at the same time
FeedEntryContent content = entry.getContent();
String key2 = DigestUtils.sha1Hex(StringUtils.trimToEmpty(content.getContent() + content.getTitle()));
Iterator<Lock> iterator = locks.bulkGet(Arrays.asList(key1, key2)).iterator();
Lock lock1 = iterator.next();
Lock lock2 = iterator.next();
boolean locked1 = false;
boolean locked2 = false;
try {
locked1 = lock1.tryLock(1, TimeUnit.MINUTES);
locked2 = lock2.tryLock(1, TimeUnit.MINUTES);
if (locked1 && locked2) {
boolean inserted = UnitOfWork.call(sessionFactory, () -> feedUpdateService.addEntry(feed, entry, subscriptions));
if (inserted) {
entryInserted.mark();
}
success = true;
} else {
log.error("lock timeout for " + feed.getUrl() + " - " + key1);
}
} catch (InterruptedException e) {
log.error("interrupted while waiting for lock for " + feed.getUrl() + " : " + e.getMessage(), e);
} finally {
if (locked1) {
lock1.unlock();
}
if (locked2) {
lock2.unlock();
}
}
return success;
}
private void handlePubSub(final Feed feed) {
if (feed.getPushHub() != null && feed.getPushTopic() != null) {
Date lastPing = feed.getPushLastPing();
Date now = new Date();
if (lastPing == null || lastPing.before(DateUtils.addDays(now, -3))) {
new Thread() {
@Override
public void run() {
try {
// make sure the feed has been updated in the database so that the
// callback works
Thread.sleep(30000);
} catch (InterruptedException e1) {
// do nothing
}
pubSubService.subscribe(feed);
}
}.start();
}
}
}
private class EntryTask implements Task {
private final FeedRefreshContext context;
public EntryTask(FeedRefreshContext context) {
this.context = context;
}
@Override
public void run() {
boolean ok = true;
final Feed feed = context.getFeed();
List<FeedEntry> entries = context.getEntries();
if (entries.isEmpty()) {
feed.setMessage("Feed has no entries");
} else {
List<String> lastEntries = cache.getLastEntries(feed);
List<String> currentEntries = new ArrayList<>();
List<FeedSubscription> subscriptions = null;
for (FeedEntry entry : entries) {
String cacheKey = cache.buildUniqueEntryKey(feed, entry);
if (!lastEntries.contains(cacheKey)) {
log.debug("cache miss for {}", entry.getUrl());
if (subscriptions == null) {
subscriptions = UnitOfWork.call(sessionFactory, () -> feedSubscriptionDAO.findByFeed(feed));
}
ok &= addEntry(feed, entry, subscriptions);
entryCacheMiss.mark();
} else {
log.debug("cache hit for {}", entry.getUrl());
entryCacheHit.mark();
}
currentEntries.add(cacheKey);
}
cache.setLastEntries(feed, currentEntries);
if (subscriptions == null) {
feed.setMessage("No new entries found");
} else if (!subscriptions.isEmpty()) {
List<User> users = subscriptions.stream().map(FeedSubscription::getUser).collect(Collectors.toList());
cache.invalidateUnreadCount(subscriptions.toArray(new FeedSubscription[0]));
cache.invalidateUserRootCategory(users.toArray(new User[0]));
}
}
if (config.getApplicationSettings().getPubsubhubbub()) {
handlePubSub(feed);
}
if (!ok) {
// requeue asap
feed.setDisabledUntil(new Date(0));
}
feedUpdated.mark();
queues.giveBack(feed);
}
@Override
public boolean isUrgent() {
return context.isUrgent();
}
}
}

View File

@@ -0,0 +1,167 @@
package com.commafeed.backend.feed;
import java.util.Date;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
import javax.inject.Inject;
import javax.inject.Singleton;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.time.DateUtils;
import com.codahale.metrics.MetricRegistry;
import com.commafeed.CommaFeedConfiguration;
import com.commafeed.backend.HttpGetter.NotModifiedException;
import com.commafeed.backend.feed.FeedRefreshExecutor.Task;
import com.commafeed.backend.model.Feed;
import com.commafeed.backend.model.FeedEntry;
import io.dropwizard.lifecycle.Managed;
import lombok.extern.slf4j.Slf4j;
/**
* Calls {@link FeedFetcher} and handles its outcome
*
*/
@Slf4j
@Singleton
public class FeedRefreshWorker implements Managed {
private final FeedRefreshUpdater feedRefreshUpdater;
private final FeedFetcher fetcher;
private final FeedQueues queues;
private final CommaFeedConfiguration config;
private final FeedRefreshExecutor pool;
@Inject
public FeedRefreshWorker(FeedRefreshUpdater feedRefreshUpdater, FeedFetcher fetcher, FeedQueues queues, CommaFeedConfiguration config,
MetricRegistry metrics) {
this.feedRefreshUpdater = feedRefreshUpdater;
this.fetcher = fetcher;
this.config = config;
this.queues = queues;
int threads = config.getApplicationSettings().getBackgroundThreads();
pool = new FeedRefreshExecutor("feed-refresh-worker", threads, Math.min(20 * threads, 1000), metrics);
}
@Override
public void start() throws Exception {
}
@Override
public void stop() throws Exception {
pool.shutdown();
}
public void updateFeed(FeedRefreshContext context) {
pool.execute(new FeedTask(context));
}
private void update(FeedRefreshContext context) {
Feed feed = context.getFeed();
int refreshInterval = config.getApplicationSettings().getRefreshIntervalMinutes();
Date disabledUntil = DateUtils.addMinutes(new Date(), refreshInterval);
try {
String url = Optional.ofNullable(feed.getUrlAfterRedirect()).orElse(feed.getUrl());
FetchedFeed fetchedFeed = fetcher.fetch(url, false, feed.getLastModifiedHeader(), feed.getEtagHeader(),
feed.getLastPublishedDate(), feed.getLastContentHash());
// stops here if NotModifiedException or any other exception is thrown
List<FeedEntry> entries = fetchedFeed.getEntries();
Integer maxFeedCapacity = config.getApplicationSettings().getMaxFeedCapacity();
if (maxFeedCapacity > 0) {
entries = entries.stream().limit(maxFeedCapacity).collect(Collectors.toList());
}
if (config.getApplicationSettings().getHeavyLoad()) {
disabledUntil = FeedUtils.buildDisabledUntil(fetchedFeed.getFeed().getLastEntryDate(),
fetchedFeed.getFeed().getAverageEntryInterval(), disabledUntil);
}
String urlAfterRedirect = fetchedFeed.getUrlAfterRedirect();
if (StringUtils.equals(url, urlAfterRedirect)) {
urlAfterRedirect = null;
}
feed.setUrlAfterRedirect(urlAfterRedirect);
feed.setLink(fetchedFeed.getFeed().getLink());
feed.setLastModifiedHeader(fetchedFeed.getFeed().getLastModifiedHeader());
feed.setEtagHeader(fetchedFeed.getFeed().getEtagHeader());
feed.setLastContentHash(fetchedFeed.getFeed().getLastContentHash());
feed.setLastPublishedDate(fetchedFeed.getFeed().getLastPublishedDate());
feed.setAverageEntryInterval(fetchedFeed.getFeed().getAverageEntryInterval());
feed.setLastEntryDate(fetchedFeed.getFeed().getLastEntryDate());
feed.setErrorCount(0);
feed.setMessage(null);
feed.setDisabledUntil(disabledUntil);
handlePubSub(feed, fetchedFeed.getFeed());
context.setEntries(entries);
feedRefreshUpdater.updateFeed(context);
} catch (NotModifiedException e) {
log.debug("Feed not modified : {} - {}", feed.getUrl(), e.getMessage());
if (config.getApplicationSettings().getHeavyLoad()) {
disabledUntil = FeedUtils.buildDisabledUntil(feed.getLastEntryDate(), feed.getAverageEntryInterval(), disabledUntil);
}
feed.setErrorCount(0);
feed.setMessage(e.getMessage());
feed.setDisabledUntil(disabledUntil);
queues.giveBack(feed);
} catch (Exception e) {
String message = "Unable to refresh feed " + feed.getUrl() + " : " + e.getMessage();
log.debug(e.getClass().getName() + " " + message, e);
feed.setErrorCount(feed.getErrorCount() + 1);
feed.setMessage(message);
feed.setDisabledUntil(FeedUtils.buildDisabledUntil(feed.getErrorCount()));
queues.giveBack(feed);
}
}
private void handlePubSub(Feed feed, Feed fetchedFeed) {
String hub = fetchedFeed.getPushHub();
String topic = fetchedFeed.getPushTopic();
if (hub != null && topic != null) {
if (hub.contains("hubbub.api.typepad.com")) {
// that hub does not exist anymore
return;
}
if (topic.startsWith("www.")) {
topic = "http://" + topic;
} else if (topic.startsWith("feed://")) {
topic = "http://" + topic.substring(7);
} else if (!topic.startsWith("http")) {
topic = "http://" + topic;
}
log.debug("feed {} has pubsub info: {}", feed.getUrl(), topic);
feed.setPushHub(hub);
feed.setPushTopic(topic);
feed.setPushTopicHash(DigestUtils.sha1Hex(topic));
}
}
private class FeedTask implements Task {
private final FeedRefreshContext context;
public FeedTask(FeedRefreshContext context) {
this.context = context;
}
@Override
public void run() {
update(context);
}
@Override
public boolean isUrgent() {
return context.isUrgent();
}
}
}

View File

@@ -0,0 +1,556 @@
package com.commafeed.backend.feed;
import java.io.StringReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import org.ahocorasick.trie.Emit;
import org.ahocorasick.trie.Trie;
import org.ahocorasick.trie.Trie.TrieBuilder;
import org.apache.commons.codec.binary.Base64;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.time.DateUtils;
import org.apache.commons.math3.stat.descriptive.SummaryStatistics;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Document.OutputSettings;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Entities.EscapeMode;
import org.jsoup.safety.Cleaner;
import org.jsoup.safety.Safelist;
import org.jsoup.select.Elements;
import org.w3c.css.sac.InputSource;
import org.w3c.dom.css.CSSStyleDeclaration;
import com.commafeed.backend.feed.FeedEntryKeyword.Mode;
import com.commafeed.backend.model.FeedEntry;
import com.commafeed.backend.model.FeedSubscription;
import com.commafeed.frontend.model.Entry;
import com.google.gwt.i18n.client.HasDirection.Direction;
import com.google.gwt.i18n.shared.BidiUtils;
import com.ibm.icu.text.CharsetDetector;
import com.ibm.icu.text.CharsetMatch;
import com.steadystate.css.parser.CSSOMParser;
import edu.uci.ics.crawler4j.url.URLCanonicalizer;
import lombok.extern.slf4j.Slf4j;
/**
* Utility methods related to feed handling
*
*/
@Slf4j
public class FeedUtils {
private static final String ESCAPED_QUESTION_MARK = Pattern.quote("?");
private static final List<String> ALLOWED_IFRAME_CSS_RULES = Arrays.asList("height", "width", "border");
private static final List<String> ALLOWED_IMG_CSS_RULES = Arrays.asList("display", "width", "height");
private static final char[] FORBIDDEN_CSS_RULE_CHARACTERS = new char[] { '(', ')' };
private static final Safelist WHITELIST = buildWhiteList();
public static String truncate(String string, int length) {
if (string != null) {
string = string.substring(0, Math.min(length, string.length()));
}
return string;
}
private static synchronized Safelist buildWhiteList() {
Safelist whitelist = new Safelist();
whitelist.addTags("a", "b", "blockquote", "br", "caption", "cite", "code", "col", "colgroup", "dd", "div", "dl", "dt", "em", "h1",
"h2", "h3", "h4", "h5", "h6", "i", "iframe", "img", "li", "ol", "p", "pre", "q", "small", "strike", "strong", "sub", "sup",
"table", "tbody", "td", "tfoot", "th", "thead", "tr", "u", "ul");
whitelist.addAttributes("div", "dir");
whitelist.addAttributes("pre", "dir");
whitelist.addAttributes("code", "dir");
whitelist.addAttributes("table", "dir");
whitelist.addAttributes("p", "dir");
whitelist.addAttributes("a", "href", "title");
whitelist.addAttributes("blockquote", "cite");
whitelist.addAttributes("col", "span", "width");
whitelist.addAttributes("colgroup", "span", "width");
whitelist.addAttributes("iframe", "src", "height", "width", "allowfullscreen", "frameborder", "style");
whitelist.addAttributes("img", "align", "alt", "height", "src", "title", "width", "style");
whitelist.addAttributes("ol", "start", "type");
whitelist.addAttributes("q", "cite");
whitelist.addAttributes("table", "border", "bordercolor", "summary", "width");
whitelist.addAttributes("td", "border", "bordercolor", "abbr", "axis", "colspan", "rowspan", "width");
whitelist.addAttributes("th", "border", "bordercolor", "abbr", "axis", "colspan", "rowspan", "scope", "width");
whitelist.addAttributes("ul", "type");
whitelist.addProtocols("a", "href", "ftp", "http", "https", "magnet", "mailto");
whitelist.addProtocols("blockquote", "cite", "http", "https");
whitelist.addProtocols("img", "src", "http", "https");
whitelist.addProtocols("q", "cite", "http", "https");
whitelist.addEnforcedAttribute("a", "target", "_blank");
whitelist.addEnforcedAttribute("a", "rel", "noreferrer");
return whitelist;
}
/**
* Detect feed encoding by using the declared encoding in the xml processing instruction and by detecting the characters used in the
* feed
*
*/
public static Charset guessEncoding(byte[] bytes) {
String extracted = extractDeclaredEncoding(bytes);
if (StringUtils.startsWithIgnoreCase(extracted, "iso-8859-")) {
if (!StringUtils.endsWith(extracted, "1")) {
return Charset.forName(extracted);
}
} else if (StringUtils.startsWithIgnoreCase(extracted, "windows-")) {
return Charset.forName(extracted);
}
return detectEncoding(bytes);
}
/**
* Detect encoding by analyzing characters in the array
*/
public static Charset detectEncoding(byte[] bytes) {
String encoding = "UTF-8";
CharsetDetector detector = new CharsetDetector();
detector.setText(bytes);
CharsetMatch match = detector.detect();
if (match != null) {
encoding = match.getName();
}
if (encoding.equalsIgnoreCase("ISO-8859-1")) {
encoding = "windows-1252";
}
return Charset.forName(encoding);
}
public static String replaceHtmlEntitiesWithNumericEntities(String source) {
// Create a buffer sufficiently large that re-allocations are minimized.
StringBuilder sb = new StringBuilder(source.length() << 1);
TrieBuilder builder = Trie.builder();
builder.ignoreOverlaps();
for (String key : HtmlEntities.HTML_ENTITIES) {
builder.addKeyword(key);
}
Trie trie = builder.build();
Collection<Emit> emits = trie.parseText(source);
int prevIndex = 0;
for (Emit emit : emits) {
int matchIndex = emit.getStart();
sb.append(source.substring(prevIndex, matchIndex));
sb.append(HtmlEntities.HTML_TO_NUMERIC_MAP.get(emit.getKeyword()));
prevIndex = emit.getEnd() + 1;
}
// Add the remainder of the string (contains no more matches).
sb.append(source.substring(prevIndex));
return sb.toString();
}
public static boolean isHttp(String url) {
return url.startsWith("http://");
}
public static boolean isHttps(String url) {
return url.startsWith("https://");
}
/**
* Normalize the url. The resulting url is not meant to be fetched but rather used as a mean to identify a feed and avoid duplicates
*/
public static String normalizeURL(String url) {
if (url == null) {
return null;
}
String normalized = URLCanonicalizer.getCanonicalURL(url);
if (normalized == null) {
normalized = url;
}
// convert to lower case, the url probably won't work in some cases
// after that but we don't care we just want to compare urls to avoid
// duplicates
normalized = normalized.toLowerCase();
// store all urls as http
if (normalized.startsWith("https")) {
normalized = "http" + normalized.substring(5);
}
// remove the www. part
normalized = normalized.replace("//www.", "//");
// feedproxy redirects to feedburner
normalized = normalized.replace("feedproxy.google.com", "feeds.feedburner.com");
// feedburner feeds have a special treatment
if (normalized.split(ESCAPED_QUESTION_MARK)[0].contains("feedburner.com")) {
normalized = normalized.replace("feeds2.feedburner.com", "feeds.feedburner.com");
normalized = normalized.split(ESCAPED_QUESTION_MARK)[0];
normalized = StringUtils.removeEnd(normalized, "/");
}
return normalized;
}
/**
* Extract the declared encoding from the xml
*/
public static String extractDeclaredEncoding(byte[] bytes) {
int index = ArrayUtils.indexOf(bytes, (byte) '>');
if (index == -1) {
return null;
}
String pi = new String(ArrayUtils.subarray(bytes, 0, index + 1)).replace('\'', '"');
index = StringUtils.indexOf(pi, "encoding=\"");
if (index == -1) {
return null;
}
String encoding = pi.substring(index + 10, pi.length());
encoding = encoding.substring(0, encoding.indexOf('"'));
return encoding;
}
public static String handleContent(String content, String baseUri, boolean keepTextOnly) {
if (StringUtils.isNotBlank(content)) {
baseUri = StringUtils.trimToEmpty(baseUri);
Document dirty = Jsoup.parseBodyFragment(content, baseUri);
Cleaner cleaner = new Cleaner(WHITELIST);
Document clean = cleaner.clean(dirty);
for (Element e : clean.select("iframe[style]")) {
String style = e.attr("style");
String escaped = escapeIFrameCss(style);
e.attr("style", escaped);
}
for (Element e : clean.select("img[style]")) {
String style = e.attr("style");
String escaped = escapeImgCss(style);
e.attr("style", escaped);
}
clean.outputSettings(new OutputSettings().escapeMode(EscapeMode.base).prettyPrint(false));
Element body = clean.body();
if (keepTextOnly) {
content = body.text();
} else {
content = body.html();
}
}
return content;
}
public static String escapeIFrameCss(String orig) {
String rule = "";
CSSOMParser parser = new CSSOMParser();
try {
List<String> rules = new ArrayList<>();
CSSStyleDeclaration decl = parser.parseStyleDeclaration(new InputSource(new StringReader(orig)));
for (int i = 0; i < decl.getLength(); i++) {
String property = decl.item(i);
String value = decl.getPropertyValue(property);
if (StringUtils.isBlank(property) || StringUtils.isBlank(value)) {
continue;
}
if (ALLOWED_IFRAME_CSS_RULES.contains(property) && StringUtils.containsNone(value, FORBIDDEN_CSS_RULE_CHARACTERS)) {
rules.add(property + ":" + decl.getPropertyValue(property) + ";");
}
}
rule = StringUtils.join(rules, "");
} catch (Exception e) {
log.error(e.getMessage(), e);
}
return rule;
}
public static String escapeImgCss(String orig) {
String rule = "";
CSSOMParser parser = new CSSOMParser();
try {
List<String> rules = new ArrayList<>();
CSSStyleDeclaration decl = parser.parseStyleDeclaration(new InputSource(new StringReader(orig)));
for (int i = 0; i < decl.getLength(); i++) {
String property = decl.item(i);
String value = decl.getPropertyValue(property);
if (StringUtils.isBlank(property) || StringUtils.isBlank(value)) {
continue;
}
if (ALLOWED_IMG_CSS_RULES.contains(property) && StringUtils.containsNone(value, FORBIDDEN_CSS_RULE_CHARACTERS)) {
rules.add(property + ":" + decl.getPropertyValue(property) + ";");
}
}
rule = StringUtils.join(rules, "");
} catch (Exception e) {
log.error(e.getMessage(), e);
}
return rule;
}
public static boolean isRTL(FeedEntry entry) {
String text = entry.getContent().getContent();
if (StringUtils.isBlank(text)) {
text = entry.getContent().getTitle();
}
if (StringUtils.isBlank(text)) {
return false;
}
text = Jsoup.parse(text).text();
if (StringUtils.isBlank(text)) {
return false;
}
Direction direction = BidiUtils.get().estimateDirection(text);
return direction == Direction.RTL;
}
public static String trimInvalidXmlCharacters(String xml) {
if (StringUtils.isBlank(xml)) {
return null;
}
StringBuilder sb = new StringBuilder();
boolean firstTagFound = false;
for (int i = 0; i < xml.length(); i++) {
char c = xml.charAt(i);
if (!firstTagFound) {
if (c == '<') {
firstTagFound = true;
} else {
continue;
}
}
if (c >= 32 || c == 9 || c == 10 || c == 13) {
if (!Character.isHighSurrogate(c) && !Character.isLowSurrogate(c)) {
sb.append(c);
}
}
}
return sb.toString();
}
/**
* When there was an error fetching the feed
*
*/
public static Date buildDisabledUntil(int errorCount) {
Date now = new Date();
int retriesBeforeDisable = 3;
if (errorCount >= retriesBeforeDisable) {
int disabledHours = errorCount - retriesBeforeDisable + 1;
disabledHours = Math.min(24 * 7, disabledHours);
return DateUtils.addHours(now, disabledHours);
}
return now;
}
/**
* When the feed was refreshed successfully
*/
public static Date buildDisabledUntil(Date publishedDate, Long averageEntryInterval, Date defaultRefreshInterval) {
Date now = new Date();
if (publishedDate == null) {
// feed with no entries, recheck in 24 hours
return DateUtils.addHours(now, 24);
} else if (publishedDate.before(DateUtils.addMonths(now, -1))) {
// older than a month, recheck in 24 hours
return DateUtils.addHours(now, 24);
} else if (publishedDate.before(DateUtils.addDays(now, -14))) {
// older than two weeks, recheck in 12 hours
return DateUtils.addHours(now, 12);
} else if (publishedDate.before(DateUtils.addDays(now, -7))) {
// older than a week, recheck in 6 hours
return DateUtils.addHours(now, 6);
} else if (averageEntryInterval != null) {
// use average time between entries to decide when to refresh next, divided by factor
int factor = 2;
// not more than 6 hours
long date = Math.min(DateUtils.addHours(now, 6).getTime(), now.getTime() + averageEntryInterval / factor);
// not less than default refresh interval
date = Math.max(defaultRefreshInterval.getTime(), date);
return new Date(date);
} else {
// unknown case, recheck in 24 hours
return DateUtils.addHours(now, 24);
}
}
public static Long averageTimeBetweenEntries(List<FeedEntry> entries) {
if (entries.isEmpty() || entries.size() == 1) {
return null;
}
List<Long> timestamps = getSortedTimestamps(entries);
SummaryStatistics stats = new SummaryStatistics();
for (int i = 0; i < timestamps.size() - 1; i++) {
long diff = Math.abs(timestamps.get(i) - timestamps.get(i + 1));
stats.addValue(diff);
}
return (long) stats.getMean();
}
public static List<Long> getSortedTimestamps(List<FeedEntry> entries) {
return entries.stream().map(t -> t.getUpdated().getTime()).sorted(Collections.reverseOrder()).collect(Collectors.toList());
}
public static String removeTrailingSlash(String url) {
if (url.endsWith("/")) {
url = url.substring(0, url.length() - 1);
}
return url;
}
/**
*
* @param url
* the url of the entry
* @param feedLink
* the url of the feed as described in the feed
* @param feedUrl
* the url of the feed that we used to fetch the feed
* @return an absolute url pointing to the entry
*/
public static String toAbsoluteUrl(String url, String feedLink, String feedUrl) {
url = StringUtils.trimToNull(StringUtils.normalizeSpace(url));
if (url == null || url.startsWith("http")) {
return url;
}
String baseUrl = (feedLink == null || isRelative(feedLink)) ? feedUrl : feedLink;
if (baseUrl == null) {
return url;
}
String result = null;
try {
result = new URL(new URL(baseUrl), url).toString();
} catch (MalformedURLException e) {
log.debug("could not parse url : " + e.getMessage(), e);
result = url;
}
return result;
}
public static boolean isRelative(final String url) {
// the regex means "start with 'scheme://'"
return url.startsWith("/") || url.startsWith("#") || !url.matches("^\\w+\\:\\/\\/.*");
}
public static String getFaviconUrl(FeedSubscription subscription, String publicUrl) {
return removeTrailingSlash(publicUrl) + "/rest/feed/favicon/" + subscription.getId();
}
public static String proxyImages(String content, String publicUrl) {
if (StringUtils.isBlank(content)) {
return content;
}
Document doc = Jsoup.parse(content);
Elements elements = doc.select("img");
for (Element element : elements) {
String href = element.attr("src");
if (href != null) {
String proxy = proxyImage(href, publicUrl);
element.attr("src", proxy);
}
}
return doc.body().html();
}
public static String proxyImage(String url, String publicUrl) {
if (StringUtils.isBlank(url)) {
return url;
}
return removeTrailingSlash(publicUrl) + "/rest/server/proxy?u=" + imageProxyEncoder(url);
}
public static String rot13(String msg) {
StringBuilder message = new StringBuilder();
for (char c : msg.toCharArray()) {
if (c >= 'a' && c <= 'm') {
c += 13;
} else if (c >= 'n' && c <= 'z') {
c -= 13;
} else if (c >= 'A' && c <= 'M') {
c += 13;
} else if (c >= 'N' && c <= 'Z') {
c -= 13;
}
message.append(c);
}
return message.toString();
}
public static String imageProxyEncoder(String url) {
return Base64.encodeBase64String(rot13(url).getBytes());
}
public static String imageProxyDecoder(String code) {
return rot13(new String(Base64.decodeBase64(code)));
}
public static void removeUnwantedFromSearch(List<Entry> entries, List<FeedEntryKeyword> keywords) {
Iterator<Entry> it = entries.iterator();
while (it.hasNext()) {
Entry entry = it.next();
boolean keep = true;
for (FeedEntryKeyword keyword : keywords) {
String title = entry.getTitle() == null ? null : Jsoup.parse(entry.getTitle()).text();
String content = entry.getContent() == null ? null : Jsoup.parse(entry.getContent()).text();
boolean condition = !StringUtils.containsIgnoreCase(content, keyword.getKeyword())
&& !StringUtils.containsIgnoreCase(title, keyword.getKeyword());
if (keyword.getMode() == Mode.EXCLUDE) {
condition = !condition;
}
if (condition) {
keep = false;
break;
}
}
if (!keep) {
it.remove();
}
}
}
}

View File

@@ -0,0 +1,23 @@
package com.commafeed.backend.feed;
import java.util.ArrayList;
import java.util.List;
import com.commafeed.backend.model.Feed;
import com.commafeed.backend.model.FeedEntry;
import lombok.Getter;
import lombok.Setter;
@Getter
@Setter
public class FetchedFeed {
private Feed feed = new Feed();
private List<FeedEntry> entries = new ArrayList<>();
private String title;
private String urlAfterRedirect;
private long fetchDuration;
}

View File

@@ -0,0 +1,269 @@
package com.commafeed.backend.feed;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.Map;
public class HtmlEntities {
public static final Map<String, String> HTML_TO_NUMERIC_MAP;
public static final String[] HTML_ENTITIES;
public static final String[] NUMERIC_ENTITIES;
static {
Map<String, String> map = new LinkedHashMap<>();
map.put("&Aacute;", "&#193;");
map.put("&aacute;", "&#225;");
map.put("&Acirc;", "&#194;");
map.put("&acirc;", "&#226;");
map.put("&acute;", "&#180;");
map.put("&AElig;", "&#198;");
map.put("&aelig;", "&#230;");
map.put("&Agrave;", "&#192;");
map.put("&agrave;", "&#224;");
map.put("&alefsym;", "&#8501;");
map.put("&Alpha;", "&#913;");
map.put("&alpha;", "&#945;");
map.put("&amp;", "&#38;");
map.put("&and;", "&#8743;");
map.put("&ang;", "&#8736;");
map.put("&Aring;", "&#197;");
map.put("&aring;", "&#229;");
map.put("&asymp;", "&#8776;");
map.put("&Atilde;", "&#195;");
map.put("&atilde;", "&#227;");
map.put("&Auml;", "&#196;");
map.put("&auml;", "&#228;");
map.put("&bdquo;", "&#8222;");
map.put("&Beta;", "&#914;");
map.put("&beta;", "&#946;");
map.put("&brvbar;", "&#166;");
map.put("&bull;", "&#8226;");
map.put("&cap;", "&#8745;");
map.put("&Ccedil;", "&#199;");
map.put("&ccedil;", "&#231;");
map.put("&cedil;", "&#184;");
map.put("&cent;", "&#162;");
map.put("&Chi;", "&#935;");
map.put("&chi;", "&#967;");
map.put("&circ;", "&#710;");
map.put("&clubs;", "&#9827;");
map.put("&cong;", "&#8773;");
map.put("&copy;", "&#169;");
map.put("&crarr;", "&#8629;");
map.put("&cup;", "&#8746;");
map.put("&curren;", "&#164;");
map.put("&dagger;", "&#8224;");
map.put("&Dagger;", "&#8225;");
map.put("&darr;", "&#8595;");
map.put("&dArr;", "&#8659;");
map.put("&deg;", "&#176;");
map.put("&Delta;", "&#916;");
map.put("&delta;", "&#948;");
map.put("&diams;", "&#9830;");
map.put("&divide;", "&#247;");
map.put("&Eacute;", "&#201;");
map.put("&eacute;", "&#233;");
map.put("&Ecirc;", "&#202;");
map.put("&ecirc;", "&#234;");
map.put("&Egrave;", "&#200;");
map.put("&egrave;", "&#232;");
map.put("&empty;", "&#8709;");
map.put("&emsp;", "&#8195;");
map.put("&ensp;", "&#8194;");
map.put("&Epsilon;", "&#917;");
map.put("&epsilon;", "&#949;");
map.put("&equiv;", "&#8801;");
map.put("&Eta;", "&#919;");
map.put("&eta;", "&#951;");
map.put("&ETH;", "&#208;");
map.put("&eth;", "&#240;");
map.put("&Euml;", "&#203;");
map.put("&euml;", "&#235;");
map.put("&euro;", "&#8364;");
map.put("&exist;", "&#8707;");
map.put("&fnof;", "&#402;");
map.put("&forall;", "&#8704;");
map.put("&frac12;", "&#189;");
map.put("&frac14;", "&#188;");
map.put("&frac34;", "&#190;");
map.put("&frasl;", "&#8260;");
map.put("&Gamma;", "&#915;");
map.put("&gamma;", "&#947;");
map.put("&ge;", "&#8805;");
map.put("&harr;", "&#8596;");
map.put("&hArr;", "&#8660;");
map.put("&hearts;", "&#9829;");
map.put("&hellip;", "&#8230;");
map.put("&Iacute;", "&#205;");
map.put("&iacute;", "&#237;");
map.put("&Icirc;", "&#206;");
map.put("&icirc;", "&#238;");
map.put("&iexcl;", "&#161;");
map.put("&Igrave;", "&#204;");
map.put("&igrave;", "&#236;");
map.put("&image;", "&#8465;");
map.put("&infin;", "&#8734;");
map.put("&int;", "&#8747;");
map.put("&Iota;", "&#921;");
map.put("&iota;", "&#953;");
map.put("&iquest;", "&#191;");
map.put("&isin;", "&#8712;");
map.put("&Iuml;", "&#207;");
map.put("&iuml;", "&#239;");
map.put("&Kappa;", "&#922;");
map.put("&kappa;", "&#954;");
map.put("&Lambda;", "&#923;");
map.put("&lambda;", "&#955;");
map.put("&lang;", "&#9001;");
map.put("&laquo;", "&#171;");
map.put("&larr;", "&#8592;");
map.put("&lArr;", "&#8656;");
map.put("&lceil;", "&#8968;");
map.put("&ldquo;", "&#8220;");
map.put("&le;", "&#8804;");
map.put("&lfloor;", "&#8970;");
map.put("&lowast;", "&#8727;");
map.put("&loz;", "&#9674;");
map.put("&lrm;", "&#8206;");
map.put("&lsaquo;", "&#8249;");
map.put("&lsquo;", "&#8216;");
map.put("&macr;", "&#175;");
map.put("&mdash;", "&#8212;");
map.put("&micro;", "&#181;");
map.put("&middot;", "&#183;");
map.put("&minus;", "&#8722;");
map.put("&Mu;", "&#924;");
map.put("&mu;", "&#956;");
map.put("&nabla;", "&#8711;");
map.put("&nbsp;", "&#160;");
map.put("&ndash;", "&#8211;");
map.put("&ne;", "&#8800;");
map.put("&ni;", "&#8715;");
map.put("&not;", "&#172;");
map.put("&notin;", "&#8713;");
map.put("&nsub;", "&#8836;");
map.put("&Ntilde;", "&#209;");
map.put("&ntilde;", "&#241;");
map.put("&Nu;", "&#925;");
map.put("&nu;", "&#957;");
map.put("&Oacute;", "&#211;");
map.put("&oacute;", "&#243;");
map.put("&Ocirc;", "&#212;");
map.put("&ocirc;", "&#244;");
map.put("&OElig;", "&#338;");
map.put("&oelig;", "&#339;");
map.put("&Ograve;", "&#210;");
map.put("&ograve;", "&#242;");
map.put("&oline;", "&#8254;");
map.put("&Omega;", "&#937;");
map.put("&omega;", "&#969;");
map.put("&Omicron;", "&#927;");
map.put("&omicron;", "&#959;");
map.put("&oplus;", "&#8853;");
map.put("&or;", "&#8744;");
map.put("&ordf;", "&#170;");
map.put("&ordm;", "&#186;");
map.put("&Oslash;", "&#216;");
map.put("&oslash;", "&#248;");
map.put("&Otilde;", "&#213;");
map.put("&otilde;", "&#245;");
map.put("&otimes;", "&#8855;");
map.put("&Ouml;", "&#214;");
map.put("&ouml;", "&#246;");
map.put("&para;", "&#182;");
map.put("&part;", "&#8706;");
map.put("&permil;", "&#8240;");
map.put("&perp;", "&#8869;");
map.put("&Phi;", "&#934;");
map.put("&phi;", "&#966;");
map.put("&Pi;", "&#928;");
map.put("&pi;", "&#960;");
map.put("&piv;", "&#982;");
map.put("&plusmn;", "&#177;");
map.put("&pound;", "&#163;");
map.put("&prime;", "&#8242;");
map.put("&Prime;", "&#8243;");
map.put("&prod;", "&#8719;");
map.put("&prop;", "&#8733;");
map.put("&Psi;", "&#936;");
map.put("&psi;", "&#968;");
map.put("&quot;", "&#34;");
map.put("&radic;", "&#8730;");
map.put("&rang;", "&#9002;");
map.put("&raquo;", "&#187;");
map.put("&rarr;", "&#8594;");
map.put("&rArr;", "&#8658;");
map.put("&rceil;", "&#8969;");
map.put("&rdquo;", "&#8221;");
map.put("&real;", "&#8476;");
map.put("&reg;", "&#174;");
map.put("&rfloor;", "&#8971;");
map.put("&Rho;", "&#929;");
map.put("&rho;", "&#961;");
map.put("&rlm;", "&#8207;");
map.put("&rsaquo;", "&#8250;");
map.put("&rsquo;", "&#8217;");
map.put("&sbquo;", "&#8218;");
map.put("&Scaron;", "&#352;");
map.put("&scaron;", "&#353;");
map.put("&sdot;", "&#8901;");
map.put("&sect;", "&#167;");
map.put("&shy;", "&#173;");
map.put("&Sigma;", "&#931;");
map.put("&sigma;", "&#963;");
map.put("&sigmaf;", "&#962;");
map.put("&sim;", "&#8764;");
map.put("&spades;", "&#9824;");
map.put("&sub;", "&#8834;");
map.put("&sube;", "&#8838;");
map.put("&sum;", "&#8721;");
map.put("&sup1;", "&#185;");
map.put("&sup2;", "&#178;");
map.put("&sup3;", "&#179;");
map.put("&sup;", "&#8835;");
map.put("&supe;", "&#8839;");
map.put("&szlig;", "&#223;");
map.put("&Tau;", "&#932;");
map.put("&tau;", "&#964;");
map.put("&there4;", "&#8756;");
map.put("&Theta;", "&#920;");
map.put("&theta;", "&#952;");
map.put("&thetasym;", "&#977;");
map.put("&thinsp;", "&#8201;");
map.put("&THORN;", "&#222;");
map.put("&thorn;", "&#254;");
map.put("&tilde;", "&#732;");
map.put("&times;", "&#215;");
map.put("&trade;", "&#8482;");
map.put("&Uacute;", "&#218;");
map.put("&uacute;", "&#250;");
map.put("&uarr;", "&#8593;");
map.put("&uArr;", "&#8657;");
map.put("&Ucirc;", "&#219;");
map.put("&ucirc;", "&#251;");
map.put("&Ugrave;", "&#217;");
map.put("&ugrave;", "&#249;");
map.put("&uml;", "&#168;");
map.put("&upsih;", "&#978;");
map.put("&Upsilon;", "&#933;");
map.put("&upsilon;", "&#965;");
map.put("&Uuml;", "&#220;");
map.put("&uuml;", "&#252;");
map.put("&weierp;", "&#8472;");
map.put("&Xi;", "&#926;");
map.put("&xi;", "&#958;");
map.put("&Yacute;", "&#221;");
map.put("&yacute;", "&#253;");
map.put("&yen;", "&#165;");
map.put("&yuml;", "&#255;");
map.put("&Yuml;", "&#376;");
map.put("&Zeta;", "&#918;");
map.put("&zeta;", "&#950;");
map.put("&zwj;", "&#8205;");
map.put("&zwnj;", "&#8204;");
HTML_TO_NUMERIC_MAP = Collections.unmodifiableMap(map);
HTML_ENTITIES = map.keySet().toArray(new String[map.size()]);
NUMERIC_ENTITIES = map.values().toArray(new String[map.size()]);
}
}