removed wicket and tomee, use dropwizard instead. remove wro4j, use gulp instead

This commit is contained in:
Athou
2014-08-08 16:49:02 +02:00
parent bbcd79e49f
commit 986fd25942
357 changed files with 2178 additions and 19556 deletions

View File

@@ -0,0 +1,156 @@
package com.commafeed.backend.feed;
import java.util.Arrays;
import java.util.List;
import lombok.AllArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import com.commafeed.backend.HttpGetter;
import com.commafeed.backend.HttpGetter.HttpResult;
/**
* Inspired/Ported from https://github.com/potatolondon/getfavicon
*
*/
@Slf4j
@AllArgsConstructor
public class FaviconFetcher {
private static long MIN_ICON_LENGTH = 100;
private static long MAX_ICON_LENGTH = 100000;
private static int TIMEOUT = 4000;
protected static List<String> ICON_MIMETYPES = Arrays.asList("image/x-icon", "image/vnd.microsoft.icon", "image/ico", "image/icon",
"text/ico", "application/ico", "image/x-ms-bmp", "image/x-bmp", "image/gif", "image/png", "image/jpeg");
private static List<String> ICON_MIMETYPE_BLACKLIST = Arrays.asList("application/xml", "text/html");
private final HttpGetter getter;
public byte[] fetch(String url) {
if (url == null) {
log.debug("url is null");
return null;
}
int doubleSlash = url.indexOf("//");
if (doubleSlash == -1) {
doubleSlash = 0;
} else {
doubleSlash += 2;
}
int firstSlash = url.indexOf('/', doubleSlash);
if (firstSlash != -1) {
url = url.substring(0, firstSlash);
}
byte[] icon = getIconAtRoot(url);
if (icon == null) {
icon = getIconInPage(url);
}
return icon;
}
private byte[] getIconAtRoot(String url) {
byte[] bytes = null;
String contentType = null;
try {
url = FeedUtils.removeTrailingSlash(url) + "/favicon.ico";
log.debug("getting root icon at {}", url);
HttpResult result = getter.getBinary(url, TIMEOUT);
bytes = result.getContent();
contentType = result.getContentType();
} catch (Exception e) {
log.debug("Failed to retrieve iconAtRoot: " + e.getMessage(), e);
}
if (!isValidIconResponse(bytes, contentType)) {
bytes = null;
}
return bytes;
}
private boolean isValidIconResponse(byte[] content, String contentType) {
if (content == null) {
return false;
}
long length = content.length;
if (StringUtils.isNotBlank(contentType)) {
contentType = contentType.split(";")[0];
}
if (ICON_MIMETYPE_BLACKLIST.contains(contentType)) {
log.debug("Content-Type {} is blacklisted", contentType);
return false;
}
if (length < MIN_ICON_LENGTH) {
log.debug("Length {} below MIN_ICON_LENGTH {}", length, MIN_ICON_LENGTH);
return false;
}
if (length > MAX_ICON_LENGTH) {
log.debug("Length {} greater than MAX_ICON_LENGTH {}", length, MAX_ICON_LENGTH);
return false;
}
return true;
}
private byte[] getIconInPage(String url) {
Document doc = null;
try {
HttpResult result = getter.getBinary(url, TIMEOUT);
doc = Jsoup.parse(new String(result.getContent()), url);
} catch (Exception e) {
log.debug("Failed to retrieve page to find icon");
return null;
}
Elements icons = doc.select("link[rel~=(?i)^(shortcut|icon|shortcut icon)$]");
if (icons.isEmpty()) {
log.debug("No icon found in page {}", url);
return null;
}
String href = icons.get(0).attr("abs:href");
if (StringUtils.isBlank(href)) {
log.debug("No icon found in page");
return null;
}
log.debug("Found unconfirmed iconInPage at {}", href);
byte[] bytes = null;
String contentType = null;
try {
HttpResult result = getter.getBinary(href, TIMEOUT);
bytes = result.getContent();
contentType = result.getContentType();
} catch (Exception e) {
log.debug("Failed to retrieve icon found in page {}", href);
return null;
}
if (!isValidIconResponse(bytes, contentType)) {
log.debug("Invalid icon found for {}", href);
return null;
}
return bytes;
}
}

View File

@@ -0,0 +1,99 @@
package com.commafeed.backend.feed;
import java.io.IOException;
import java.util.Date;
import lombok.AllArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.codec.binary.StringUtils;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.http.client.ClientProtocolException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import com.commafeed.backend.HttpGetter;
import com.commafeed.backend.HttpGetter.HttpResult;
import com.commafeed.backend.HttpGetter.NotModifiedException;
import com.commafeed.backend.model.Feed;
import com.sun.syndication.io.FeedException;
@Slf4j
@AllArgsConstructor
public class FeedFetcher {
private final FeedParser parser;
private final HttpGetter getter;
public FetchedFeed fetch(String feedUrl, boolean extractFeedUrlFromHtml, String lastModified, String eTag, Date lastPublishedDate,
String lastContentHash) throws FeedException, ClientProtocolException, IOException, NotModifiedException {
log.debug("Fetching feed {}", feedUrl);
FetchedFeed fetchedFeed = null;
int timeout = 20000;
HttpResult result = getter.getBinary(feedUrl, lastModified, eTag, timeout);
byte[] content = result.getContent();
try {
fetchedFeed = parser.parse(feedUrl, content);
} catch (FeedException e) {
if (extractFeedUrlFromHtml) {
String extractedUrl = extractFeedUrl(StringUtils.newStringUtf8(result.getContent()), feedUrl);
if (org.apache.commons.lang.StringUtils.isNotBlank(extractedUrl)) {
feedUrl = extractedUrl;
result = getter.getBinary(extractedUrl, lastModified, eTag, timeout);
content = result.getContent();
fetchedFeed = parser.parse(feedUrl, content);
} else {
throw e;
}
} else {
throw e;
}
}
if (content == null) {
throw new IOException("Feed content is empty.");
}
String hash = DigestUtils.sha1Hex(content);
if (lastContentHash != null && hash != null && lastContentHash.equals(hash)) {
log.debug("content hash not modified: {}", feedUrl);
throw new NotModifiedException("content hash not modified");
}
if (lastPublishedDate != null && fetchedFeed.getFeed().getLastPublishedDate() != null
&& lastPublishedDate.getTime() == fetchedFeed.getFeed().getLastPublishedDate().getTime()) {
log.debug("publishedDate not modified: {}", feedUrl);
throw new NotModifiedException("publishedDate not modified");
}
Feed feed = fetchedFeed.getFeed();
feed.setLastModifiedHeader(result.getLastModifiedSince());
feed.setEtagHeader(FeedUtils.truncate(result.geteTag(), 255));
feed.setLastContentHash(hash);
fetchedFeed.setFetchDuration(result.getDuration());
fetchedFeed.setUrlAfterRedirect(result.getUrlAfterRedirect());
return fetchedFeed;
}
private String extractFeedUrl(String html, String baseUri) {
String foundUrl = null;
Document doc = Jsoup.parse(html, baseUri);
String root = doc.children().get(0).tagName();
if ("html".equals(root)) {
Elements atom = doc.select("link[type=application/atom+xml]");
Elements rss = doc.select("link[type=application/rss+xml]");
if (!atom.isEmpty()) {
foundUrl = atom.get(0).attr("abs:href").toString();
} else if (!rss.isEmpty()) {
foundUrl = rss.get(0).attr("abs:href").toString();
}
}
return foundUrl;
}
}

View File

@@ -0,0 +1,218 @@
package com.commafeed.backend.feed;
import java.io.StringReader;
import java.text.DateFormat;
import java.util.Date;
import java.util.List;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang.SystemUtils;
import org.jdom.Element;
import org.jdom.Namespace;
import org.xml.sax.InputSource;
import com.commafeed.backend.model.Feed;
import com.commafeed.backend.model.FeedEntry;
import com.commafeed.backend.model.FeedEntryContent;
import com.google.common.base.Function;
import com.google.common.collect.Collections2;
import com.google.common.collect.Iterables;
import com.sun.syndication.feed.synd.SyndContent;
import com.sun.syndication.feed.synd.SyndEnclosure;
import com.sun.syndication.feed.synd.SyndEntry;
import com.sun.syndication.feed.synd.SyndFeed;
import com.sun.syndication.feed.synd.SyndLink;
import com.sun.syndication.feed.synd.SyndLinkImpl;
import com.sun.syndication.io.FeedException;
import com.sun.syndication.io.SyndFeedInput;
@Slf4j
public class FeedParser {
private static final String ATOM_10_URI = "http://www.w3.org/2005/Atom";
private static final Namespace ATOM_10_NS = Namespace.getNamespace(ATOM_10_URI);
private static final Date START = new Date(86400000);
private static final Date END = new Date(1000l * Integer.MAX_VALUE - 86400000);
private static final Function<SyndContent, String> CONTENT_TO_STRING = new Function<SyndContent, String>() {
public String apply(SyndContent content) {
return content.getValue();
}
};
@SuppressWarnings("unchecked")
public FetchedFeed parse(String feedUrl, byte[] xml) throws FeedException {
FetchedFeed fetchedFeed = new FetchedFeed();
Feed feed = fetchedFeed.getFeed();
List<FeedEntry> entries = fetchedFeed.getEntries();
try {
String encoding = FeedUtils.guessEncoding(xml);
String xmlString = FeedUtils.trimInvalidXmlCharacters(new String(xml, encoding));
if (xmlString == null) {
throw new FeedException("Input string is null for url " + feedUrl);
}
xmlString = FeedUtils.replaceHtmlEntitiesWithNumericEntities(xmlString);
InputSource source = new InputSource(new StringReader(xmlString));
SyndFeed rss = new SyndFeedInput().build(source);
handleForeignMarkup(rss);
fetchedFeed.setTitle(rss.getTitle());
feed.setPushHub(findHub(rss));
feed.setPushTopic(findSelf(rss));
feed.setUrl(feedUrl);
feed.setLink(rss.getLink());
List<SyndEntry> items = rss.getEntries();
for (SyndEntry item : items) {
FeedEntry entry = new FeedEntry();
String guid = item.getUri();
if (StringUtils.isBlank(guid)) {
guid = item.getLink();
}
if (StringUtils.isBlank(guid)) {
// no guid and no link, skip entry
continue;
}
entry.setGuid(FeedUtils.truncate(guid, 2048));
entry.setUpdated(validateDate(getEntryUpdateDate(item), true));
entry.setUrl(FeedUtils.truncate(FeedUtils.toAbsoluteUrl(item.getLink(), feed.getLink(), feed.getUrlAfterRedirect()), 2048));
// if link is empty but guid is used as url
if (StringUtils.isBlank(entry.getUrl()) && StringUtils.startsWith(entry.getGuid(), "http")) {
entry.setUrl(entry.getGuid());
}
FeedEntryContent content = new FeedEntryContent();
content.setContent(getContent(item));
content.setTitle(getTitle(item));
content.setAuthor(StringUtils.trimToNull(item.getAuthor()));
SyndEnclosure enclosure = (SyndEnclosure) Iterables.getFirst(item.getEnclosures(), null);
if (enclosure != null) {
content.setEnclosureUrl(FeedUtils.truncate(enclosure.getUrl(), 2048));
content.setEnclosureType(enclosure.getType());
}
entry.setContent(content);
entries.add(entry);
}
Date lastEntryDate = null;
Date publishedDate = validateDate(rss.getPublishedDate(), false);
if (!entries.isEmpty()) {
List<Long> sortedTimestamps = FeedUtils.getSortedTimestamps(entries);
Long timestamp = sortedTimestamps.get(0);
lastEntryDate = new Date(timestamp);
publishedDate = (publishedDate == null || publishedDate.before(lastEntryDate)) ? lastEntryDate : publishedDate;
}
feed.setLastPublishedDate(publishedDate);
feed.setAverageEntryInterval(FeedUtils.averageTimeBetweenEntries(entries));
feed.setLastEntryDate(lastEntryDate);
} catch (Exception e) {
throw new FeedException(String.format("Could not parse feed from %s : %s", feedUrl, e.getMessage()), e);
}
return fetchedFeed;
}
/**
* Adds atom links for rss feeds
*/
@SuppressWarnings({ "unchecked", "rawtypes" })
private void handleForeignMarkup(SyndFeed feed) {
Object foreignMarkup = feed.getForeignMarkup();
if (foreignMarkup == null) {
return;
}
if (foreignMarkup instanceof List) {
List elements = (List) foreignMarkup;
for (Object object : elements) {
if (object instanceof Element) {
Element element = (Element) object;
if ("link".equals(element.getName()) && ATOM_10_NS.equals(element.getNamespace())) {
SyndLink link = new SyndLinkImpl();
link.setRel(element.getAttributeValue("rel"));
link.setHref(element.getAttributeValue("href"));
feed.getLinks().add(link);
}
}
}
}
}
private Date getEntryUpdateDate(SyndEntry item) {
Date date = item.getUpdatedDate();
if (date == null) {
date = item.getPublishedDate();
}
if (date == null) {
date = new Date();
}
return date;
}
private Date validateDate(Date date, boolean nullToNow) {
Date now = new Date();
if (date == null) {
return nullToNow ? now : null;
}
if (date.before(START) || date.after(END)) {
return now;
}
if (date.after(now)) {
return now;
}
return date;
}
@SuppressWarnings("unchecked")
private String getContent(SyndEntry item) {
String content = null;
if (item.getContents().isEmpty()) {
content = item.getDescription() == null ? null : item.getDescription().getValue();
} else {
content = StringUtils.join(Collections2.transform(item.getContents(), CONTENT_TO_STRING), SystemUtils.LINE_SEPARATOR);
}
return StringUtils.trimToNull(content);
}
private String getTitle(SyndEntry item) {
String title = item.getTitle();
if (StringUtils.isBlank(title)) {
Date date = item.getPublishedDate();
if (date != null) {
title = DateFormat.getInstance().format(date);
} else {
title = "(no title)";
}
}
return StringUtils.trimToNull(title);
}
@SuppressWarnings("unchecked")
private String findHub(SyndFeed feed) {
for (SyndLink l : (List<SyndLink>) feed.getLinks()) {
if ("hub".equalsIgnoreCase(l.getRel())) {
log.debug("found hub {} for feed {}", l.getHref(), feed.getLink());
return l.getHref();
}
}
return null;
}
@SuppressWarnings("unchecked")
private String findSelf(SyndFeed feed) {
for (SyndLink l : (List<SyndLink>) feed.getLinks()) {
if ("self".equalsIgnoreCase(l.getRel())) {
log.debug("found self {} for feed {}", l.getHref(), feed.getLink());
return l.getHref();
}
}
return null;
}
}

View File

@@ -0,0 +1,155 @@
package com.commafeed.backend.feed;
import java.util.Date;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import javax.inject.Inject;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.lang.time.DateUtils;
import com.codahale.metrics.Gauge;
import com.codahale.metrics.Meter;
import com.codahale.metrics.MetricRegistry;
import com.commafeed.CommaFeedConfiguration;
import com.commafeed.backend.dao.FeedDAO;
import com.commafeed.backend.model.Feed;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Queues;
public class FeedQueues {
private final FeedDAO feedDAO;
private final CommaFeedConfiguration config;
private Queue<FeedRefreshContext> addQueue = Queues.newConcurrentLinkedQueue();
private Queue<FeedRefreshContext> takeQueue = Queues.newConcurrentLinkedQueue();
private Queue<Feed> giveBackQueue = Queues.newConcurrentLinkedQueue();
private Meter refill;
@Inject
public FeedQueues(FeedDAO feedDAO, CommaFeedConfiguration config, MetricRegistry metrics) {
this.config = config;
this.feedDAO = feedDAO;
refill = metrics.meter(MetricRegistry.name(getClass(), "refill"));
metrics.register(MetricRegistry.name(getClass(), "addQueue"), new Gauge<Integer>() {
@Override
public Integer getValue() {
return addQueue.size();
}
});
metrics.register(MetricRegistry.name(getClass(), "takeQueue"), new Gauge<Integer>() {
@Override
public Integer getValue() {
return takeQueue.size();
}
});
metrics.register(MetricRegistry.name(getClass(), "giveBackQueue"), new Gauge<Integer>() {
@Override
public Integer getValue() {
return giveBackQueue.size();
}
});
}
/**
* take a feed from the refresh queue
*/
public synchronized FeedRefreshContext take() {
FeedRefreshContext context = takeQueue.poll();
if (context == null) {
refill();
context = takeQueue.poll();
}
return context;
}
/**
* add a feed to the refresh queue
*/
public void add(Feed feed, boolean urgent) {
int refreshInterval = config.getApplicationSettings().getRefreshIntervalMinutes();
if (feed.getLastUpdated() == null || feed.getLastUpdated().before(DateUtils.addMinutes(new Date(), -1 * refreshInterval))) {
addQueue.add(new FeedRefreshContext(feed, urgent));
}
}
/**
* refills the refresh queue and empties the giveBack queue while at it
*/
private void refill() {
refill.mark();
List<FeedRefreshContext> contexts = Lists.newArrayList();
int batchSize = Math.min(100, 3 * config.getApplicationSettings().getBackgroundThreads());
// add feeds we got from the add() method
int addQueueSize = addQueue.size();
for (int i = 0; i < Math.min(batchSize, addQueueSize); i++) {
contexts.add(addQueue.poll());
}
// add feeds that are up to refresh from the database
if (!config.getApplicationSettings().isCrawlingPaused()) {
int count = batchSize - contexts.size();
if (count > 0) {
List<Feed> feeds = feedDAO.findNextUpdatable(count, getLastLoginThreshold());
for (Feed feed : feeds) {
contexts.add(new FeedRefreshContext(feed, false));
}
}
}
// set the disabledDate as we use it in feedDAO to decide what to refresh next. We also use a map to remove
// duplicates.
Map<Long, FeedRefreshContext> map = Maps.newLinkedHashMap();
for (FeedRefreshContext context : contexts) {
Feed feed = context.getFeed();
feed.setDisabledUntil(DateUtils.addMinutes(new Date(), config.getApplicationSettings().getRefreshIntervalMinutes()));
map.put(feed.getId(), context);
}
// refill the queue
takeQueue.addAll(map.values());
// add feeds from the giveBack queue to the map, overriding duplicates
int giveBackQueueSize = giveBackQueue.size();
for (int i = 0; i < giveBackQueueSize; i++) {
Feed feed = giveBackQueue.poll();
map.put(feed.getId(), new FeedRefreshContext(feed, false));
}
// update all feeds in the database
List<Feed> feeds = Lists.newArrayList();
for (FeedRefreshContext context : map.values()) {
feeds.add(context.getFeed());
}
feedDAO.merge(feeds);
}
/**
* give a feed back, updating it to the database during the next refill()
*/
public void giveBack(Feed feed) {
String normalized = FeedUtils.normalizeURL(feed.getUrl());
feed.setNormalizedUrl(normalized);
feed.setNormalizedUrlHash(DigestUtils.sha1Hex(normalized));
feed.setLastUpdated(new Date());
giveBackQueue.add(feed);
}
private Date getLastLoginThreshold() {
if (config.getApplicationSettings().isHeavyLoad()) {
return DateUtils.addDays(new Date(), -30);
} else {
return null;
}
}
}

View File

@@ -0,0 +1,42 @@
package com.commafeed.backend.feed;
import java.util.List;
import com.commafeed.backend.model.Feed;
import com.commafeed.backend.model.FeedEntry;
public class FeedRefreshContext {
private Feed feed;
private List<FeedEntry> entries;
private boolean urgent;
public FeedRefreshContext(Feed feed, boolean isUrgent) {
this.feed = feed;
this.urgent = isUrgent;
}
public Feed getFeed() {
return feed;
}
public void setFeed(Feed feed) {
this.feed = feed;
}
public boolean isUrgent() {
return urgent;
}
public void setUrgent(boolean urgent) {
this.urgent = urgent;
}
public List<FeedEntry> getEntries() {
return entries;
}
public void setEntries(List<FeedEntry> entries) {
this.entries = entries;
}
}

View File

@@ -0,0 +1,91 @@
package com.commafeed.backend.feed;
import java.util.concurrent.LinkedBlockingDeque;
import java.util.concurrent.RejectedExecutionHandler;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import lombok.extern.slf4j.Slf4j;
import com.codahale.metrics.Gauge;
import com.codahale.metrics.MetricRegistry;
/**
* Wraps a {@link ThreadPoolExecutor} instance. Blocks when queue is full instead of rejecting the task. Allow priority queueing by using
* {@link Task} instead of {@link Runnable}
*
*/
@Slf4j
public class FeedRefreshExecutor {
private String poolName;
private ThreadPoolExecutor pool;
private LinkedBlockingDeque<Runnable> queue;
public FeedRefreshExecutor(final String poolName, int threads, int queueCapacity, MetricRegistry metrics) {
log.info("Creating pool {} with {} threads", poolName, threads);
this.poolName = poolName;
pool = new ThreadPoolExecutor(threads, threads, 0, TimeUnit.MILLISECONDS, queue = new LinkedBlockingDeque<Runnable>(queueCapacity) {
private static final long serialVersionUID = 1L;
@Override
public boolean offer(Runnable r) {
Task task = (Task) r;
if (task.isUrgent()) {
return offerFirst(r);
} else {
return offerLast(r);
}
}
});
pool.setRejectedExecutionHandler(new RejectedExecutionHandler() {
@Override
public void rejectedExecution(Runnable r, ThreadPoolExecutor e) {
log.debug("{} thread queue full, waiting...", poolName);
try {
Task task = (Task) r;
if (task.isUrgent()) {
queue.putFirst(r);
} else {
queue.put(r);
}
} catch (InterruptedException e1) {
log.error(poolName + " interrupted while waiting for queue.", e1);
}
}
});
metrics.register(MetricRegistry.name(getClass(), poolName, "active"), new Gauge<Integer>() {
@Override
public Integer getValue() {
return pool.getActiveCount();
}
});
metrics.register(MetricRegistry.name(getClass(), poolName, "pending"), new Gauge<Integer>() {
@Override
public Integer getValue() {
return queue.size();
}
});
}
public void execute(Task task) {
pool.execute(task);
}
public static interface Task extends Runnable {
boolean isUrgent();
}
public void shutdown() {
pool.shutdownNow();
while (!pool.isTerminated()) {
try {
Thread.sleep(100);
} catch (InterruptedException e) {
log.error("{} interrupted while waiting for threads to finish.", poolName);
}
}
}
}

View File

@@ -0,0 +1,95 @@
package com.commafeed.backend.feed;
import io.dropwizard.lifecycle.Managed;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import javax.inject.Inject;
import lombok.extern.slf4j.Slf4j;
import org.hibernate.SessionFactory;
import com.codahale.metrics.Meter;
import com.codahale.metrics.MetricRegistry;
import com.commafeed.CommaFeedConfiguration;
import com.commafeed.backend.dao.FeedDAO;
import com.commafeed.backend.dao.UnitOfWork;
/**
* Infinite loop fetching feeds from @FeedQueues and queuing them to the {@link FeedRefreshWorker} pool.
*
*/
@Slf4j
public class FeedRefreshTaskGiver implements Managed {
private final SessionFactory sessionFactory;
private final FeedQueues queues;
private final FeedRefreshWorker worker;
private ExecutorService executor;
private Meter feedRefreshed;
private Meter threadWaited;
@Inject
public FeedRefreshTaskGiver(SessionFactory sessionFactory, FeedQueues queues, FeedDAO feedDAO, FeedRefreshWorker worker,
CommaFeedConfiguration config, MetricRegistry metrics) {
this.sessionFactory = sessionFactory;
this.queues = queues;
this.worker = worker;
executor = Executors.newFixedThreadPool(1);
feedRefreshed = metrics.meter(MetricRegistry.name(getClass(), "feedRefreshed"));
threadWaited = metrics.meter(MetricRegistry.name(getClass(), "threadWaited"));
}
@Override
public void stop() {
executor.shutdownNow();
while (!executor.isTerminated()) {
try {
Thread.sleep(100);
} catch (InterruptedException e) {
log.error("interrupted while waiting for threads to finish.");
}
}
}
@Override
public void start() {
log.info("starting feed refresh task giver");
executor.execute(new Runnable() {
@Override
public void run() {
while (!executor.isShutdown()) {
try {
FeedRefreshContext context = new UnitOfWork<FeedRefreshContext>(sessionFactory) {
@Override
protected FeedRefreshContext runInSession() throws Exception {
FeedRefreshContext context = queues.take();
if (context != null) {
feedRefreshed.mark();
worker.updateFeed(context);
}
return context;
}
}.run();
if (context == null) {
log.debug("nothing to do, sleeping for 15s");
threadWaited.mark();
try {
Thread.sleep(15000);
} catch (InterruptedException e) {
log.error("interrupted while sleeping");
}
}
} catch (Exception e) {
log.error(e.getMessage(), e);
}
}
}
});
}
}

View File

@@ -0,0 +1,228 @@
package com.commafeed.backend.feed;
import io.dropwizard.lifecycle.Managed;
import java.util.Arrays;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.locks.Lock;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang.time.DateUtils;
import org.hibernate.SessionFactory;
import com.codahale.metrics.Meter;
import com.codahale.metrics.MetricRegistry;
import com.commafeed.CommaFeedConfiguration;
import com.commafeed.CommaFeedConfiguration.ApplicationSettings;
import com.commafeed.backend.cache.CacheService;
import com.commafeed.backend.dao.FeedSubscriptionDAO;
import com.commafeed.backend.dao.UnitOfWork;
import com.commafeed.backend.feed.FeedRefreshExecutor.Task;
import com.commafeed.backend.model.Feed;
import com.commafeed.backend.model.FeedEntry;
import com.commafeed.backend.model.FeedEntryContent;
import com.commafeed.backend.model.FeedSubscription;
import com.commafeed.backend.model.User;
import com.commafeed.backend.service.FeedUpdateService;
import com.commafeed.backend.service.PubSubService;
import com.google.common.collect.Lists;
import com.google.common.util.concurrent.Striped;
@Slf4j
public class FeedRefreshUpdater implements Managed {
private final SessionFactory sessionFactory;
private final FeedUpdateService feedUpdateService;
private final PubSubService pubSubService;
private final FeedQueues queues;
private final CommaFeedConfiguration config;
private final FeedSubscriptionDAO feedSubscriptionDAO;
private final CacheService cache;
private FeedRefreshExecutor pool;
private Striped<Lock> locks;
private Meter entryCacheMiss;
private Meter entryCacheHit;
private Meter feedUpdated;
private Meter entryInserted;
public FeedRefreshUpdater(SessionFactory sessionFactory, FeedUpdateService feedUpdateService, PubSubService pubSubService,
FeedQueues queues, CommaFeedConfiguration config, MetricRegistry metrics, FeedSubscriptionDAO feedSubscriptionDAO,
CacheService cache) {
this.sessionFactory = sessionFactory;
this.feedUpdateService = feedUpdateService;
this.pubSubService = pubSubService;
this.queues = queues;
this.config = config;
this.feedSubscriptionDAO = feedSubscriptionDAO;
this.cache = cache;
ApplicationSettings settings = config.getApplicationSettings();
int threads = Math.max(settings.getDatabaseUpdateThreads(), 1);
pool = new FeedRefreshExecutor("feed-refresh-updater", threads, Math.min(50 * threads, 1000), metrics);
locks = Striped.lazyWeakLock(threads * 100000);
entryCacheMiss = metrics.meter(MetricRegistry.name(getClass(), "entryCacheMiss"));
entryCacheHit = metrics.meter(MetricRegistry.name(getClass(), "entryCacheHit"));
feedUpdated = metrics.meter(MetricRegistry.name(getClass(), "feedUpdated"));
entryInserted = metrics.meter(MetricRegistry.name(getClass(), "entryInserted"));
}
@Override
public void start() throws Exception {
}
@Override
public void stop() throws Exception {
pool.shutdown();
}
public void updateFeed(FeedRefreshContext context) {
pool.execute(new EntryTask(context));
}
private class EntryTask implements Task {
private FeedRefreshContext context;
public EntryTask(FeedRefreshContext context) {
this.context = context;
}
@Override
public void run() {
new UnitOfWork<Void>(sessionFactory) {
@Override
protected Void runInSession() throws Exception {
internalRun();
return null;
}
}.run();
}
public void internalRun() {
boolean ok = true;
Feed feed = context.getFeed();
List<FeedEntry> entries = context.getEntries();
if (entries.isEmpty()) {
feed.setMessage("Feed has no entries");
} else {
List<String> lastEntries = cache.getLastEntries(feed);
List<String> currentEntries = Lists.newArrayList();
List<FeedSubscription> subscriptions = null;
for (FeedEntry entry : entries) {
String cacheKey = cache.buildUniqueEntryKey(feed, entry);
if (!lastEntries.contains(cacheKey)) {
log.debug("cache miss for {}", entry.getUrl());
if (subscriptions == null) {
subscriptions = feedSubscriptionDAO.findByFeed(feed);
}
ok &= addEntry(feed, entry, subscriptions);
entryCacheMiss.mark();
} else {
log.debug("cache hit for {}", entry.getUrl());
entryCacheHit.mark();
}
currentEntries.add(cacheKey);
}
cache.setLastEntries(feed, currentEntries);
if (subscriptions == null) {
feed.setMessage("No new entries found");
}
if (CollectionUtils.isNotEmpty(subscriptions)) {
List<User> users = Lists.newArrayList();
for (FeedSubscription sub : subscriptions) {
users.add(sub.getUser());
}
cache.invalidateUnreadCount(subscriptions.toArray(new FeedSubscription[0]));
cache.invalidateUserRootCategory(users.toArray(new User[0]));
}
}
if (config.getApplicationSettings().isPubsubhubbub()) {
handlePubSub(feed);
}
if (!ok) {
// requeue asap
feed.setDisabledUntil(new Date(0));
}
feedUpdated.mark();
queues.giveBack(feed);
}
@Override
public boolean isUrgent() {
return context.isUrgent();
}
}
private boolean addEntry(final Feed feed, final FeedEntry entry, final List<FeedSubscription> subscriptions) {
boolean success = false;
// lock on feed, make sure we are not updating the same feed twice at
// the same time
String key1 = StringUtils.trimToEmpty("" + feed.getId());
// lock on content, make sure we are not updating the same entry
// twice at the same time
FeedEntryContent content = entry.getContent();
String key2 = DigestUtils.sha1Hex(StringUtils.trimToEmpty(content.getContent() + content.getTitle()));
Iterator<Lock> iterator = locks.bulkGet(Arrays.asList(key1, key2)).iterator();
Lock lock1 = iterator.next();
Lock lock2 = iterator.next();
boolean locked1 = false;
boolean locked2 = false;
try {
locked1 = lock1.tryLock(1, TimeUnit.MINUTES);
locked2 = lock2.tryLock(1, TimeUnit.MINUTES);
if (locked1 && locked2) {
boolean inserted = feedUpdateService.addEntry(feed, entry);
if (inserted) {
entryInserted.mark();
}
success = true;
} else {
log.error("lock timeout for " + feed.getUrl() + " - " + key1);
}
} catch (InterruptedException e) {
log.error("interrupted while waiting for lock for " + feed.getUrl() + " : " + e.getMessage(), e);
} finally {
if (locked1) {
lock1.unlock();
}
if (locked2) {
lock2.unlock();
}
}
return success;
}
private void handlePubSub(final Feed feed) {
if (feed.getPushHub() != null && feed.getPushTopic() != null) {
Date lastPing = feed.getPushLastPing();
Date now = new Date();
if (lastPing == null || lastPing.before(DateUtils.addDays(now, -3))) {
new Thread() {
@Override
public void run() {
pubSubService.subscribe(feed);
}
}.start();
}
}
}
}

View File

@@ -0,0 +1,157 @@
package com.commafeed.backend.feed;
import io.dropwizard.lifecycle.Managed;
import java.util.Date;
import java.util.List;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang.time.DateUtils;
import com.codahale.metrics.MetricRegistry;
import com.commafeed.CommaFeedConfiguration;
import com.commafeed.backend.HttpGetter.NotModifiedException;
import com.commafeed.backend.feed.FeedRefreshExecutor.Task;
import com.commafeed.backend.model.Feed;
import com.commafeed.backend.model.FeedEntry;
import com.google.common.base.Optional;
/**
* Calls {@link FeedFetcher} and handles its outcome
*
*/
@Slf4j
public class FeedRefreshWorker implements Managed {
private final FeedRefreshUpdater feedRefreshUpdater;
private final FeedFetcher fetcher;
private final FeedQueues queues;
private final CommaFeedConfiguration config;
private final FeedRefreshExecutor pool;
public FeedRefreshWorker(FeedRefreshUpdater feedRefreshUpdater, FeedFetcher fetcher, FeedQueues queues, CommaFeedConfiguration config,
MetricRegistry metrics) {
this.feedRefreshUpdater = feedRefreshUpdater;
this.fetcher = fetcher;
this.config = config;
this.queues = queues;
int threads = config.getApplicationSettings().getBackgroundThreads();
pool = new FeedRefreshExecutor("feed-refresh-worker", threads, Math.min(20 * threads, 1000), metrics);
}
@Override
public void start() throws Exception {
}
@Override
public void stop() throws Exception {
pool.shutdown();
}
public void updateFeed(FeedRefreshContext context) {
pool.execute(new FeedTask(context));
}
private class FeedTask implements Task {
private FeedRefreshContext context;
public FeedTask(FeedRefreshContext context) {
this.context = context;
}
@Override
public void run() {
update(context);
}
@Override
public boolean isUrgent() {
return context.isUrgent();
}
}
private void update(FeedRefreshContext context) {
Feed feed = context.getFeed();
int refreshInterval = config.getApplicationSettings().getRefreshIntervalMinutes();
Date disabledUntil = DateUtils.addMinutes(new Date(), refreshInterval);
try {
String url = Optional.fromNullable(feed.getUrlAfterRedirect()).or(feed.getUrl());
FetchedFeed fetchedFeed = fetcher.fetch(url, false, feed.getLastModifiedHeader(), feed.getEtagHeader(),
feed.getLastPublishedDate(), feed.getLastContentHash());
// stops here if NotModifiedException or any other exception is thrown
List<FeedEntry> entries = fetchedFeed.getEntries();
if (config.getApplicationSettings().isHeavyLoad()) {
disabledUntil = FeedUtils.buildDisabledUntil(fetchedFeed.getFeed().getLastEntryDate(), fetchedFeed.getFeed()
.getAverageEntryInterval(), disabledUntil);
}
String urlAfterRedirect = fetchedFeed.getUrlAfterRedirect();
if (StringUtils.equals(url, urlAfterRedirect)) {
urlAfterRedirect = null;
}
feed.setUrlAfterRedirect(urlAfterRedirect);
feed.setLink(fetchedFeed.getFeed().getLink());
feed.setLastModifiedHeader(fetchedFeed.getFeed().getLastModifiedHeader());
feed.setEtagHeader(fetchedFeed.getFeed().getEtagHeader());
feed.setLastContentHash(fetchedFeed.getFeed().getLastContentHash());
feed.setLastPublishedDate(fetchedFeed.getFeed().getLastPublishedDate());
feed.setAverageEntryInterval(fetchedFeed.getFeed().getAverageEntryInterval());
feed.setLastEntryDate(fetchedFeed.getFeed().getLastEntryDate());
feed.setErrorCount(0);
feed.setMessage(null);
feed.setDisabledUntil(disabledUntil);
handlePubSub(feed, fetchedFeed.getFeed());
context.setEntries(entries);
feedRefreshUpdater.updateFeed(context);
} catch (NotModifiedException e) {
log.debug("Feed not modified : {} - {}", feed.getUrl(), e.getMessage());
if (config.getApplicationSettings().isHeavyLoad()) {
disabledUntil = FeedUtils.buildDisabledUntil(feed.getLastEntryDate(), feed.getAverageEntryInterval(), disabledUntil);
}
feed.setErrorCount(0);
feed.setMessage(e.getMessage());
feed.setDisabledUntil(disabledUntil);
queues.giveBack(feed);
} catch (Exception e) {
String message = "Unable to refresh feed " + feed.getUrl() + " : " + e.getMessage();
log.debug(e.getClass().getName() + " " + message, e);
feed.setErrorCount(feed.getErrorCount() + 1);
feed.setMessage(message);
feed.setDisabledUntil(FeedUtils.buildDisabledUntil(feed.getErrorCount()));
queues.giveBack(feed);
}
}
private void handlePubSub(Feed feed, Feed fetchedFeed) {
String hub = fetchedFeed.getPushHub();
String topic = fetchedFeed.getPushTopic();
if (hub != null && topic != null) {
if (hub.contains("hubbub.api.typepad.com")) {
// that hub does not exist anymore
return;
}
if (topic.startsWith("www.")) {
topic = "http://" + topic;
} else if (topic.startsWith("feed://")) {
topic = "http://" + topic.substring(7);
} else if (topic.startsWith("http") == false) {
topic = "http://" + topic;
}
log.debug("feed {} has pubsub info: {}", feed.getUrl(), topic);
feed.setPushHub(hub);
feed.setPushTopic(topic);
feed.setPushTopicHash(DigestUtils.sha1Hex(topic));
}
}
}

View File

@@ -0,0 +1,524 @@
package com.commafeed.backend.feed;
import java.io.StringReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Arrays;
import java.util.Collections;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Pattern;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.codec.binary.Base64;
import org.apache.commons.lang.ArrayUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang.time.DateUtils;
import org.apache.commons.math.stat.descriptive.SummaryStatistics;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Document.OutputSettings;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Entities.EscapeMode;
import org.jsoup.safety.Cleaner;
import org.jsoup.safety.Whitelist;
import org.jsoup.select.Elements;
import org.mozilla.universalchardet.UniversalDetector;
import org.w3c.css.sac.InputSource;
import org.w3c.dom.css.CSSStyleDeclaration;
import com.commafeed.backend.model.FeedEntry;
import com.commafeed.backend.model.FeedSubscription;
import com.commafeed.frontend.model.Entry;
import com.google.common.collect.Lists;
import com.google.gwt.i18n.client.HasDirection.Direction;
import com.google.gwt.i18n.shared.BidiUtils;
import com.steadystate.css.parser.CSSOMParser;
import edu.uci.ics.crawler4j.url.URLCanonicalizer;
/**
* Utility methods related to feed handling
*
*/
@Slf4j
public class FeedUtils {
private static final String ESCAPED_QUESTION_MARK = Pattern.quote("?");
private static final List<String> ALLOWED_IFRAME_CSS_RULES = Arrays.asList("height", "width", "border");
private static final List<String> ALLOWED_IMG_CSS_RULES = Arrays.asList("display", "width", "height");
private static final char[] FORBIDDEN_CSS_RULE_CHARACTERS = new char[] { '(', ')' };
private static final Whitelist WHITELIST = buildWhiteList();
public static String truncate(String string, int length) {
if (string != null) {
string = string.substring(0, Math.min(length, string.length()));
}
return string;
}
private static synchronized Whitelist buildWhiteList() {
Whitelist whitelist = new Whitelist();
whitelist.addTags("a", "b", "blockquote", "br", "caption", "cite", "code", "col", "colgroup", "dd", "div", "dl", "dt", "em", "h1",
"h2", "h3", "h4", "h5", "h6", "i", "iframe", "img", "li", "ol", "p", "pre", "q", "small", "strike", "strong", "sub", "sup",
"table", "tbody", "td", "tfoot", "th", "thead", "tr", "u", "ul");
whitelist.addAttributes("div", "dir");
whitelist.addAttributes("pre", "dir");
whitelist.addAttributes("code", "dir");
whitelist.addAttributes("table", "dir");
whitelist.addAttributes("p", "dir");
whitelist.addAttributes("a", "href", "title");
whitelist.addAttributes("blockquote", "cite");
whitelist.addAttributes("col", "span", "width");
whitelist.addAttributes("colgroup", "span", "width");
whitelist.addAttributes("iframe", "src", "height", "width", "allowfullscreen", "frameborder", "style");
whitelist.addAttributes("img", "align", "alt", "height", "src", "title", "width", "style");
whitelist.addAttributes("ol", "start", "type");
whitelist.addAttributes("q", "cite");
whitelist.addAttributes("table", "border", "bordercolor", "summary", "width");
whitelist.addAttributes("td", "border", "bordercolor", "abbr", "axis", "colspan", "rowspan", "width");
whitelist.addAttributes("th", "border", "bordercolor", "abbr", "axis", "colspan", "rowspan", "scope", "width");
whitelist.addAttributes("ul", "type");
whitelist.addProtocols("a", "href", "ftp", "http", "https", "mailto");
whitelist.addProtocols("blockquote", "cite", "http", "https");
whitelist.addProtocols("img", "src", "http", "https");
whitelist.addProtocols("q", "cite", "http", "https");
whitelist.addEnforcedAttribute("a", "target", "_blank");
return whitelist;
}
/**
* Detect feed encoding by using the declared encoding in the xml processing instruction and by detecting the characters used in the
* feed
*
*/
public static String guessEncoding(byte[] bytes) {
String extracted = extractDeclaredEncoding(bytes);
if (StringUtils.startsWithIgnoreCase(extracted, "iso-8859-")) {
if (StringUtils.endsWith(extracted, "1") == false) {
return extracted;
}
} else if (StringUtils.startsWithIgnoreCase(extracted, "windows-")) {
return extracted;
}
return detectEncoding(bytes);
}
/**
* Detect encoding by analyzing characters in the array
*/
public static String detectEncoding(byte[] bytes) {
String DEFAULT_ENCODING = "UTF-8";
UniversalDetector detector = new UniversalDetector(null);
detector.handleData(bytes, 0, bytes.length);
detector.dataEnd();
String encoding = detector.getDetectedCharset();
detector.reset();
if (encoding == null) {
encoding = DEFAULT_ENCODING;
} else if (encoding.equalsIgnoreCase("ISO-8859-1")) {
encoding = "windows-1252";
}
return encoding;
}
public static String replaceHtmlEntitiesWithNumericEntities(String source) {
String result = source;
for (String entity : HtmlEntities.NUMERIC_MAPPING.keySet()) {
result = result.replace(entity, HtmlEntities.NUMERIC_MAPPING.get(entity));
}
return result;
}
/**
* Normalize the url. The resulting url is not meant to be fetched but rather used as a mean to identify a feed and avoid duplicates
*/
public static String normalizeURL(String url) {
if (url == null) {
return null;
}
String normalized = URLCanonicalizer.getCanonicalURL(url);
if (normalized == null) {
normalized = url;
}
// convert to lower case, the url probably won't work in some cases
// after that but we don't care we just want to compare urls to avoid
// duplicates
normalized = normalized.toLowerCase();
// store all urls as http
if (normalized.startsWith("https")) {
normalized = "http" + normalized.substring(5);
}
// remove the www. part
normalized = normalized.replace("//www.", "//");
// feedproxy redirects to feedburner
normalized = normalized.replace("feedproxy.google.com", "feeds.feedburner.com");
// feedburner feeds have a special treatment
if (normalized.split(ESCAPED_QUESTION_MARK)[0].contains("feedburner.com")) {
normalized = normalized.replace("feeds2.feedburner.com", "feeds.feedburner.com");
normalized = normalized.split(ESCAPED_QUESTION_MARK)[0];
normalized = StringUtils.removeEnd(normalized, "/");
}
return normalized;
}
/**
* Extract the declared encoding from the xml
*/
public static String extractDeclaredEncoding(byte[] bytes) {
int index = ArrayUtils.indexOf(bytes, (byte) '>');
if (index == -1) {
return null;
}
String pi = new String(ArrayUtils.subarray(bytes, 0, index + 1));
index = StringUtils.indexOf(pi, "encoding=\"");
if (index == -1) {
return null;
}
String encoding = pi.substring(index + 10, pi.length());
encoding = encoding.substring(0, encoding.indexOf('"'));
return encoding;
}
public static String handleContent(String content, String baseUri, boolean keepTextOnly) {
if (StringUtils.isNotBlank(content)) {
baseUri = StringUtils.trimToEmpty(baseUri);
Document dirty = Jsoup.parseBodyFragment(content, baseUri);
Cleaner cleaner = new Cleaner(WHITELIST);
Document clean = cleaner.clean(dirty);
for (Element e : clean.select("iframe[style]")) {
String style = e.attr("style");
String escaped = escapeIFrameCss(style);
e.attr("style", escaped);
}
for (Element e : clean.select("img[style]")) {
String style = e.attr("style");
String escaped = escapeImgCss(style);
e.attr("style", escaped);
}
clean.outputSettings(new OutputSettings().escapeMode(EscapeMode.base).prettyPrint(false));
Element body = clean.body();
if (keepTextOnly) {
content = body.text();
} else {
content = body.html();
}
}
return content;
}
public static String escapeIFrameCss(String orig) {
String rule = "";
CSSOMParser parser = new CSSOMParser();
try {
List<String> rules = Lists.newArrayList();
CSSStyleDeclaration decl = parser.parseStyleDeclaration(new InputSource(new StringReader(orig)));
for (int i = 0; i < decl.getLength(); i++) {
String property = decl.item(i);
String value = decl.getPropertyValue(property);
if (StringUtils.isBlank(property) || StringUtils.isBlank(value)) {
continue;
}
if (ALLOWED_IFRAME_CSS_RULES.contains(property) && StringUtils.containsNone(value, FORBIDDEN_CSS_RULE_CHARACTERS)) {
rules.add(property + ":" + decl.getPropertyValue(property) + ";");
}
}
rule = StringUtils.join(rules, "");
} catch (Exception e) {
log.error(e.getMessage(), e);
}
return rule;
}
public static String escapeImgCss(String orig) {
String rule = "";
CSSOMParser parser = new CSSOMParser();
try {
List<String> rules = Lists.newArrayList();
CSSStyleDeclaration decl = parser.parseStyleDeclaration(new InputSource(new StringReader(orig)));
for (int i = 0; i < decl.getLength(); i++) {
String property = decl.item(i);
String value = decl.getPropertyValue(property);
if (StringUtils.isBlank(property) || StringUtils.isBlank(value)) {
continue;
}
if (ALLOWED_IMG_CSS_RULES.contains(property) && StringUtils.containsNone(value, FORBIDDEN_CSS_RULE_CHARACTERS)) {
rules.add(property + ":" + decl.getPropertyValue(property) + ";");
}
}
rule = StringUtils.join(rules, "");
} catch (Exception e) {
log.error(e.getMessage(), e);
}
return rule;
}
public static boolean isRTL(FeedEntry entry) {
String text = entry.getContent().getContent();
if (StringUtils.isBlank(text)) {
text = entry.getContent().getTitle();
}
if (StringUtils.isBlank(text)) {
return false;
}
text = Jsoup.parse(text).text();
if (StringUtils.isBlank(text)) {
return false;
}
Direction direction = BidiUtils.get().estimateDirection(text);
return direction == Direction.RTL;
}
public static String trimInvalidXmlCharacters(String xml) {
if (StringUtils.isBlank(xml)) {
return null;
}
StringBuilder sb = new StringBuilder();
boolean firstTagFound = false;
for (int i = 0; i < xml.length(); i++) {
char c = xml.charAt(i);
if (!firstTagFound) {
if (c == '<') {
firstTagFound = true;
} else {
continue;
}
}
if (c >= 32 || c == 9 || c == 10 || c == 13) {
if (!Character.isHighSurrogate(c) && !Character.isLowSurrogate(c)) {
sb.append(c);
}
}
}
return sb.toString();
}
/**
* When there was an error fetching the feed
*
*/
public static Date buildDisabledUntil(int errorCount) {
Date now = new Date();
int retriesBeforeDisable = 3;
if (errorCount >= retriesBeforeDisable) {
int disabledHours = errorCount - retriesBeforeDisable + 1;
disabledHours = Math.min(24 * 7, disabledHours);
return DateUtils.addHours(now, disabledHours);
}
return now;
}
/**
* When the feed was refreshed successfully
*/
public static Date buildDisabledUntil(Date publishedDate, Long averageEntryInterval, Date defaultRefreshInterval) {
Date now = new Date();
if (publishedDate == null) {
// feed with no entries, recheck in 24 hours
return DateUtils.addHours(now, 24);
} else if (publishedDate.before(DateUtils.addMonths(now, -1))) {
// older than a month, recheck in 24 hours
return DateUtils.addHours(now, 24);
} else if (publishedDate.before(DateUtils.addDays(now, -14))) {
// older than two weeks, recheck in 12 hours
return DateUtils.addHours(now, 12);
} else if (publishedDate.before(DateUtils.addDays(now, -7))) {
// older than a week, recheck in 6 hours
return DateUtils.addHours(now, 6);
} else if (averageEntryInterval != null) {
// use average time between entries to decide when to refresh next, divided by factor
int factor = 2;
// not more than 6 hours
long date = Math.min(DateUtils.addHours(now, 6).getTime(), now.getTime() + averageEntryInterval / factor);
// not less than default refresh interval
date = Math.max(defaultRefreshInterval.getTime(), date);
return new Date(date);
} else {
// unknown case, recheck in 24 hours
return DateUtils.addHours(now, 24);
}
}
public static Long averageTimeBetweenEntries(List<FeedEntry> entries) {
if (entries.isEmpty() || entries.size() == 1) {
return null;
}
List<Long> timestamps = getSortedTimestamps(entries);
SummaryStatistics stats = new SummaryStatistics();
for (int i = 0; i < timestamps.size() - 1; i++) {
long diff = Math.abs(timestamps.get(i) - timestamps.get(i + 1));
stats.addValue(diff);
}
return (long) stats.getMean();
}
public static List<Long> getSortedTimestamps(List<FeedEntry> entries) {
List<Long> timestamps = Lists.newArrayList();
for (FeedEntry entry : entries) {
timestamps.add(entry.getUpdated().getTime());
}
Collections.sort(timestamps);
Collections.reverse(timestamps);
return timestamps;
}
public static String removeTrailingSlash(String url) {
if (url.endsWith("/")) {
url = url.substring(0, url.length() - 1);
}
return url;
}
/**
*
* @param url
* the url of the entry
* @param feedLink
* the url of the feed as described in the feed
* @param feedUrl
* the url of the feed that we used to fetch the feed
* @return an absolute url pointing to the entry
*/
public static String toAbsoluteUrl(String url, String feedLink, String feedUrl) {
url = StringUtils.trimToNull(StringUtils.normalizeSpace(url));
if (url == null || url.startsWith("http")) {
return url;
}
String baseUrl = (feedLink == null || isRelative(feedLink)) ? feedUrl : feedLink;
if (baseUrl == null) {
return url;
}
String result = null;
try {
result = new URL(new URL(baseUrl), url).toString();
} catch (MalformedURLException e) {
log.debug("could not parse url : " + e.getMessage(), e);
result = url;
}
return result;
}
public static boolean isRelative(final String url) {
// the regex means "doesn't start with 'scheme://'"
if ((url != null) && (url.startsWith("/") == false) && (!url.matches("^\\w+\\:\\/\\/.*")) && !(url.startsWith("#"))) {
return true;
} else {
return false;
}
}
public static String getFaviconUrl(FeedSubscription subscription, String publicUrl) {
return removeTrailingSlash(publicUrl) + "/rest/feed/favicon/" + subscription.getId();
}
public static String proxyImages(String content, String publicUrl, boolean proxyImages) {
if (!proxyImages) {
return content;
}
if (StringUtils.isBlank(content)) {
return content;
}
Document doc = Jsoup.parse(content);
Elements elements = doc.select("img");
for (Element element : elements) {
String href = element.attr("src");
if (href != null) {
String proxy = removeTrailingSlash(publicUrl) + "/rest/server/proxy?u=" + imageProxyEncoder(href);
element.attr("src", proxy);
}
}
return doc.body().html();
}
public static String rot13(String msg) {
StringBuilder message = new StringBuilder();
for (char c : msg.toCharArray()) {
if (c >= 'a' && c <= 'm')
c += 13;
else if (c >= 'n' && c <= 'z')
c -= 13;
else if (c >= 'A' && c <= 'M')
c += 13;
else if (c >= 'N' && c <= 'Z')
c -= 13;
message.append(c);
}
return message.toString();
}
public static String imageProxyEncoder(String url) {
return Base64.encodeBase64String(rot13(url).getBytes());
}
public static String imageProxyDecoder(String code) {
return rot13(new String(Base64.decodeBase64(code)));
}
public static void removeUnwantedFromSearch(List<Entry> entries, String keywords) {
if (StringUtils.isBlank(keywords)) {
return;
}
Iterator<Entry> it = entries.iterator();
while (it.hasNext()) {
Entry entry = it.next();
boolean keep = true;
for (String keyword : keywords.split(" ")) {
String title = Jsoup.parse(entry.getTitle()).text();
String content = Jsoup.parse(entry.getContent()).text();
if (!StringUtils.containsIgnoreCase(content, keyword) && !StringUtils.containsIgnoreCase(title, keyword)) {
keep = false;
break;
}
}
if (!keep) {
it.remove();
}
}
}
}

View File

@@ -0,0 +1,58 @@
package com.commafeed.backend.feed;
import java.util.List;
import com.commafeed.backend.model.Feed;
import com.commafeed.backend.model.FeedEntry;
import com.google.common.collect.Lists;
public class FetchedFeed {
private Feed feed = new Feed();
private List<FeedEntry> entries = Lists.newArrayList();
private String title;
private String urlAfterRedirect;
private long fetchDuration;
public Feed getFeed() {
return feed;
}
public void setFeed(Feed feed) {
this.feed = feed;
}
public List<FeedEntry> getEntries() {
return entries;
}
public void setEntries(List<FeedEntry> entries) {
this.entries = entries;
}
public String getTitle() {
return title;
}
public void setTitle(String title) {
this.title = title;
}
public long getFetchDuration() {
return fetchDuration;
}
public void setFetchDuration(long fetchDuration) {
this.fetchDuration = fetchDuration;
}
public String getUrlAfterRedirect() {
return urlAfterRedirect;
}
public void setUrlAfterRedirect(String urlAfterRedirect) {
this.urlAfterRedirect = urlAfterRedirect;
}
}

View File

@@ -0,0 +1,266 @@
package com.commafeed.backend.feed;
import java.util.Collections;
import java.util.Map;
import com.google.gwt.thirdparty.guava.common.collect.Maps;
public class HtmlEntities {
public static final Map<String, String> NUMERIC_MAPPING = Collections.unmodifiableMap(loadMap());
private static synchronized Map<String, String> loadMap() {
Map<String, String> map = Maps.newLinkedHashMap();
map.put("&Aacute;", "&#193;");
map.put("&aacute;", "&#225;");
map.put("&Acirc;", "&#194;");
map.put("&acirc;", "&#226;");
map.put("&acute;", "&#180;");
map.put("&AElig;", "&#198;");
map.put("&aelig;", "&#230;");
map.put("&Agrave;", "&#192;");
map.put("&agrave;", "&#224;");
map.put("&alefsym;", "&#8501;");
map.put("&Alpha;", "&#913;");
map.put("&alpha;", "&#945;");
map.put("&amp;", "&#38;");
map.put("&and;", "&#8743;");
map.put("&ang;", "&#8736;");
map.put("&Aring;", "&#197;");
map.put("&aring;", "&#229;");
map.put("&asymp;", "&#8776;");
map.put("&Atilde;", "&#195;");
map.put("&atilde;", "&#227;");
map.put("&Auml;", "&#196;");
map.put("&auml;", "&#228;");
map.put("&bdquo;", "&#8222;");
map.put("&Beta;", "&#914;");
map.put("&beta;", "&#946;");
map.put("&brvbar;", "&#166;");
map.put("&bull;", "&#8226;");
map.put("&cap;", "&#8745;");
map.put("&Ccedil;", "&#199;");
map.put("&ccedil;", "&#231;");
map.put("&cedil;", "&#184;");
map.put("&cent;", "&#162;");
map.put("&Chi;", "&#935;");
map.put("&chi;", "&#967;");
map.put("&circ;", "&#710;");
map.put("&clubs;", "&#9827;");
map.put("&cong;", "&#8773;");
map.put("&copy;", "&#169;");
map.put("&crarr;", "&#8629;");
map.put("&cup;", "&#8746;");
map.put("&curren;", "&#164;");
map.put("&dagger;", "&#8224;");
map.put("&Dagger;", "&#8225;");
map.put("&darr;", "&#8595;");
map.put("&dArr;", "&#8659;");
map.put("&deg;", "&#176;");
map.put("&Delta;", "&#916;");
map.put("&delta;", "&#948;");
map.put("&diams;", "&#9830;");
map.put("&divide;", "&#247;");
map.put("&Eacute;", "&#201;");
map.put("&eacute;", "&#233;");
map.put("&Ecirc;", "&#202;");
map.put("&ecirc;", "&#234;");
map.put("&Egrave;", "&#200;");
map.put("&egrave;", "&#232;");
map.put("&empty;", "&#8709;");
map.put("&emsp;", "&#8195;");
map.put("&ensp;", "&#8194;");
map.put("&Epsilon;", "&#917;");
map.put("&epsilon;", "&#949;");
map.put("&equiv;", "&#8801;");
map.put("&Eta;", "&#919;");
map.put("&eta;", "&#951;");
map.put("&ETH;", "&#208;");
map.put("&eth;", "&#240;");
map.put("&Euml;", "&#203;");
map.put("&euml;", "&#235;");
map.put("&euro;", "&#8364;");
map.put("&exist;", "&#8707;");
map.put("&fnof;", "&#402;");
map.put("&forall;", "&#8704;");
map.put("&frac12;", "&#189;");
map.put("&frac14;", "&#188;");
map.put("&frac34;", "&#190;");
map.put("&frasl;", "&#8260;");
map.put("&Gamma;", "&#915;");
map.put("&gamma;", "&#947;");
map.put("&ge;", "&#8805;");
map.put("&harr;", "&#8596;");
map.put("&hArr;", "&#8660;");
map.put("&hearts;", "&#9829;");
map.put("&hellip;", "&#8230;");
map.put("&Iacute;", "&#205;");
map.put("&iacute;", "&#237;");
map.put("&Icirc;", "&#206;");
map.put("&icirc;", "&#238;");
map.put("&iexcl;", "&#161;");
map.put("&Igrave;", "&#204;");
map.put("&igrave;", "&#236;");
map.put("&image;", "&#8465;");
map.put("&infin;", "&#8734;");
map.put("&int;", "&#8747;");
map.put("&Iota;", "&#921;");
map.put("&iota;", "&#953;");
map.put("&iquest;", "&#191;");
map.put("&isin;", "&#8712;");
map.put("&Iuml;", "&#207;");
map.put("&iuml;", "&#239;");
map.put("&Kappa;", "&#922;");
map.put("&kappa;", "&#954;");
map.put("&Lambda;", "&#923;");
map.put("&lambda;", "&#955;");
map.put("&lang;", "&#9001;");
map.put("&laquo;", "&#171;");
map.put("&larr;", "&#8592;");
map.put("&lArr;", "&#8656;");
map.put("&lceil;", "&#8968;");
map.put("&ldquo;", "&#8220;");
map.put("&le;", "&#8804;");
map.put("&lfloor;", "&#8970;");
map.put("&lowast;", "&#8727;");
map.put("&loz;", "&#9674;");
map.put("&lrm;", "&#8206;");
map.put("&lsaquo;", "&#8249;");
map.put("&lsquo;", "&#8216;");
map.put("&macr;", "&#175;");
map.put("&mdash;", "&#8212;");
map.put("&micro;", "&#181;");
map.put("&middot;", "&#183;");
map.put("&minus;", "&#8722;");
map.put("&Mu;", "&#924;");
map.put("&mu;", "&#956;");
map.put("&nabla;", "&#8711;");
map.put("&nbsp;", "&#160;");
map.put("&ndash;", "&#8211;");
map.put("&ne;", "&#8800;");
map.put("&ni;", "&#8715;");
map.put("&not;", "&#172;");
map.put("&notin;", "&#8713;");
map.put("&nsub;", "&#8836;");
map.put("&Ntilde;", "&#209;");
map.put("&ntilde;", "&#241;");
map.put("&Nu;", "&#925;");
map.put("&nu;", "&#957;");
map.put("&Oacute;", "&#211;");
map.put("&oacute;", "&#243;");
map.put("&Ocirc;", "&#212;");
map.put("&ocirc;", "&#244;");
map.put("&OElig;", "&#338;");
map.put("&oelig;", "&#339;");
map.put("&Ograve;", "&#210;");
map.put("&ograve;", "&#242;");
map.put("&oline;", "&#8254;");
map.put("&Omega;", "&#937;");
map.put("&omega;", "&#969;");
map.put("&Omicron;", "&#927;");
map.put("&omicron;", "&#959;");
map.put("&oplus;", "&#8853;");
map.put("&or;", "&#8744;");
map.put("&ordf;", "&#170;");
map.put("&ordm;", "&#186;");
map.put("&Oslash;", "&#216;");
map.put("&oslash;", "&#248;");
map.put("&Otilde;", "&#213;");
map.put("&otilde;", "&#245;");
map.put("&otimes;", "&#8855;");
map.put("&Ouml;", "&#214;");
map.put("&ouml;", "&#246;");
map.put("&para;", "&#182;");
map.put("&part;", "&#8706;");
map.put("&permil;", "&#8240;");
map.put("&perp;", "&#8869;");
map.put("&Phi;", "&#934;");
map.put("&phi;", "&#966;");
map.put("&Pi;", "&#928;");
map.put("&pi;", "&#960;");
map.put("&piv;", "&#982;");
map.put("&plusmn;", "&#177;");
map.put("&pound;", "&#163;");
map.put("&prime;", "&#8242;");
map.put("&Prime;", "&#8243;");
map.put("&prod;", "&#8719;");
map.put("&prop;", "&#8733;");
map.put("&Psi;", "&#936;");
map.put("&psi;", "&#968;");
map.put("&quot;", "&#34;");
map.put("&radic;", "&#8730;");
map.put("&rang;", "&#9002;");
map.put("&raquo;", "&#187;");
map.put("&rarr;", "&#8594;");
map.put("&rArr;", "&#8658;");
map.put("&rceil;", "&#8969;");
map.put("&rdquo;", "&#8221;");
map.put("&real;", "&#8476;");
map.put("&reg;", "&#174;");
map.put("&rfloor;", "&#8971;");
map.put("&Rho;", "&#929;");
map.put("&rho;", "&#961;");
map.put("&rlm;", "&#8207;");
map.put("&rsaquo;", "&#8250;");
map.put("&rsquo;", "&#8217;");
map.put("&sbquo;", "&#8218;");
map.put("&Scaron;", "&#352;");
map.put("&scaron;", "&#353;");
map.put("&sdot;", "&#8901;");
map.put("&sect;", "&#167;");
map.put("&shy;", "&#173;");
map.put("&Sigma;", "&#931;");
map.put("&sigma;", "&#963;");
map.put("&sigmaf;", "&#962;");
map.put("&sim;", "&#8764;");
map.put("&spades;", "&#9824;");
map.put("&sub;", "&#8834;");
map.put("&sube;", "&#8838;");
map.put("&sum;", "&#8721;");
map.put("&sup1;", "&#185;");
map.put("&sup2;", "&#178;");
map.put("&sup3;", "&#179;");
map.put("&sup;", "&#8835;");
map.put("&supe;", "&#8839;");
map.put("&szlig;", "&#223;");
map.put("&Tau;", "&#932;");
map.put("&tau;", "&#964;");
map.put("&there4;", "&#8756;");
map.put("&Theta;", "&#920;");
map.put("&theta;", "&#952;");
map.put("&thetasym;", "&#977;");
map.put("&thinsp;", "&#8201;");
map.put("&THORN;", "&#222;");
map.put("&thorn;", "&#254;");
map.put("&tilde;", "&#732;");
map.put("&times;", "&#215;");
map.put("&trade;", "&#8482;");
map.put("&Uacute;", "&#218;");
map.put("&uacute;", "&#250;");
map.put("&uarr;", "&#8593;");
map.put("&uArr;", "&#8657;");
map.put("&Ucirc;", "&#219;");
map.put("&ucirc;", "&#251;");
map.put("&Ugrave;", "&#217;");
map.put("&ugrave;", "&#249;");
map.put("&uml;", "&#168;");
map.put("&upsih;", "&#978;");
map.put("&Upsilon;", "&#933;");
map.put("&upsilon;", "&#965;");
map.put("&Uuml;", "&#220;");
map.put("&uuml;", "&#252;");
map.put("&weierp;", "&#8472;");
map.put("&Xi;", "&#926;");
map.put("&xi;", "&#958;");
map.put("&Yacute;", "&#221;");
map.put("&yacute;", "&#253;");
map.put("&yen;", "&#165;");
map.put("&yuml;", "&#255;");
map.put("&Yuml;", "&#376;");
map.put("&Zeta;", "&#918;");
map.put("&zeta;", "&#950;");
map.put("&zwj;", "&#8205;");
map.put("&zwnj;", "&#8204;");
return map;
}
}