mirror of
https://github.com/Athou/commafeed.git
synced 2026-03-21 21:37:29 +00:00
removed wicket and tomee, use dropwizard instead. remove wro4j, use gulp instead
This commit is contained in:
156
src/main/java/com/commafeed/backend/feed/FaviconFetcher.java
Normal file
156
src/main/java/com/commafeed/backend/feed/FaviconFetcher.java
Normal file
@@ -0,0 +1,156 @@
|
||||
package com.commafeed.backend.feed;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.jsoup.select.Elements;
|
||||
|
||||
import com.commafeed.backend.HttpGetter;
|
||||
import com.commafeed.backend.HttpGetter.HttpResult;
|
||||
|
||||
/**
|
||||
* Inspired/Ported from https://github.com/potatolondon/getfavicon
|
||||
*
|
||||
*/
|
||||
@Slf4j
|
||||
@AllArgsConstructor
|
||||
public class FaviconFetcher {
|
||||
|
||||
private static long MIN_ICON_LENGTH = 100;
|
||||
private static long MAX_ICON_LENGTH = 100000;
|
||||
private static int TIMEOUT = 4000;
|
||||
|
||||
protected static List<String> ICON_MIMETYPES = Arrays.asList("image/x-icon", "image/vnd.microsoft.icon", "image/ico", "image/icon",
|
||||
"text/ico", "application/ico", "image/x-ms-bmp", "image/x-bmp", "image/gif", "image/png", "image/jpeg");
|
||||
private static List<String> ICON_MIMETYPE_BLACKLIST = Arrays.asList("application/xml", "text/html");
|
||||
|
||||
private final HttpGetter getter;
|
||||
|
||||
public byte[] fetch(String url) {
|
||||
|
||||
if (url == null) {
|
||||
log.debug("url is null");
|
||||
return null;
|
||||
}
|
||||
|
||||
int doubleSlash = url.indexOf("//");
|
||||
if (doubleSlash == -1) {
|
||||
doubleSlash = 0;
|
||||
} else {
|
||||
doubleSlash += 2;
|
||||
}
|
||||
int firstSlash = url.indexOf('/', doubleSlash);
|
||||
if (firstSlash != -1) {
|
||||
url = url.substring(0, firstSlash);
|
||||
}
|
||||
|
||||
byte[] icon = getIconAtRoot(url);
|
||||
|
||||
if (icon == null) {
|
||||
icon = getIconInPage(url);
|
||||
}
|
||||
|
||||
return icon;
|
||||
}
|
||||
|
||||
private byte[] getIconAtRoot(String url) {
|
||||
byte[] bytes = null;
|
||||
String contentType = null;
|
||||
|
||||
try {
|
||||
url = FeedUtils.removeTrailingSlash(url) + "/favicon.ico";
|
||||
log.debug("getting root icon at {}", url);
|
||||
HttpResult result = getter.getBinary(url, TIMEOUT);
|
||||
bytes = result.getContent();
|
||||
contentType = result.getContentType();
|
||||
} catch (Exception e) {
|
||||
log.debug("Failed to retrieve iconAtRoot: " + e.getMessage(), e);
|
||||
}
|
||||
|
||||
if (!isValidIconResponse(bytes, contentType)) {
|
||||
bytes = null;
|
||||
}
|
||||
return bytes;
|
||||
}
|
||||
|
||||
private boolean isValidIconResponse(byte[] content, String contentType) {
|
||||
if (content == null) {
|
||||
return false;
|
||||
}
|
||||
|
||||
long length = content.length;
|
||||
|
||||
if (StringUtils.isNotBlank(contentType)) {
|
||||
contentType = contentType.split(";")[0];
|
||||
}
|
||||
|
||||
if (ICON_MIMETYPE_BLACKLIST.contains(contentType)) {
|
||||
log.debug("Content-Type {} is blacklisted", contentType);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (length < MIN_ICON_LENGTH) {
|
||||
log.debug("Length {} below MIN_ICON_LENGTH {}", length, MIN_ICON_LENGTH);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (length > MAX_ICON_LENGTH) {
|
||||
log.debug("Length {} greater than MAX_ICON_LENGTH {}", length, MAX_ICON_LENGTH);
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
private byte[] getIconInPage(String url) {
|
||||
|
||||
Document doc = null;
|
||||
try {
|
||||
HttpResult result = getter.getBinary(url, TIMEOUT);
|
||||
doc = Jsoup.parse(new String(result.getContent()), url);
|
||||
} catch (Exception e) {
|
||||
log.debug("Failed to retrieve page to find icon");
|
||||
return null;
|
||||
}
|
||||
|
||||
Elements icons = doc.select("link[rel~=(?i)^(shortcut|icon|shortcut icon)$]");
|
||||
|
||||
if (icons.isEmpty()) {
|
||||
log.debug("No icon found in page {}", url);
|
||||
return null;
|
||||
}
|
||||
|
||||
String href = icons.get(0).attr("abs:href");
|
||||
if (StringUtils.isBlank(href)) {
|
||||
log.debug("No icon found in page");
|
||||
return null;
|
||||
}
|
||||
|
||||
log.debug("Found unconfirmed iconInPage at {}", href);
|
||||
|
||||
byte[] bytes = null;
|
||||
String contentType = null;
|
||||
try {
|
||||
HttpResult result = getter.getBinary(href, TIMEOUT);
|
||||
bytes = result.getContent();
|
||||
contentType = result.getContentType();
|
||||
} catch (Exception e) {
|
||||
log.debug("Failed to retrieve icon found in page {}", href);
|
||||
return null;
|
||||
}
|
||||
|
||||
if (!isValidIconResponse(bytes, contentType)) {
|
||||
log.debug("Invalid icon found for {}", href);
|
||||
return null;
|
||||
}
|
||||
|
||||
return bytes;
|
||||
}
|
||||
|
||||
}
|
||||
99
src/main/java/com/commafeed/backend/feed/FeedFetcher.java
Normal file
99
src/main/java/com/commafeed/backend/feed/FeedFetcher.java
Normal file
@@ -0,0 +1,99 @@
|
||||
package com.commafeed.backend.feed;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Date;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
import org.apache.commons.codec.binary.StringUtils;
|
||||
import org.apache.commons.codec.digest.DigestUtils;
|
||||
import org.apache.http.client.ClientProtocolException;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.jsoup.select.Elements;
|
||||
|
||||
import com.commafeed.backend.HttpGetter;
|
||||
import com.commafeed.backend.HttpGetter.HttpResult;
|
||||
import com.commafeed.backend.HttpGetter.NotModifiedException;
|
||||
import com.commafeed.backend.model.Feed;
|
||||
import com.sun.syndication.io.FeedException;
|
||||
|
||||
@Slf4j
|
||||
@AllArgsConstructor
|
||||
public class FeedFetcher {
|
||||
|
||||
private final FeedParser parser;
|
||||
private final HttpGetter getter;
|
||||
|
||||
public FetchedFeed fetch(String feedUrl, boolean extractFeedUrlFromHtml, String lastModified, String eTag, Date lastPublishedDate,
|
||||
String lastContentHash) throws FeedException, ClientProtocolException, IOException, NotModifiedException {
|
||||
log.debug("Fetching feed {}", feedUrl);
|
||||
FetchedFeed fetchedFeed = null;
|
||||
|
||||
int timeout = 20000;
|
||||
|
||||
HttpResult result = getter.getBinary(feedUrl, lastModified, eTag, timeout);
|
||||
byte[] content = result.getContent();
|
||||
|
||||
try {
|
||||
fetchedFeed = parser.parse(feedUrl, content);
|
||||
} catch (FeedException e) {
|
||||
if (extractFeedUrlFromHtml) {
|
||||
String extractedUrl = extractFeedUrl(StringUtils.newStringUtf8(result.getContent()), feedUrl);
|
||||
if (org.apache.commons.lang.StringUtils.isNotBlank(extractedUrl)) {
|
||||
feedUrl = extractedUrl;
|
||||
|
||||
result = getter.getBinary(extractedUrl, lastModified, eTag, timeout);
|
||||
content = result.getContent();
|
||||
fetchedFeed = parser.parse(feedUrl, content);
|
||||
} else {
|
||||
throw e;
|
||||
}
|
||||
} else {
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
|
||||
if (content == null) {
|
||||
throw new IOException("Feed content is empty.");
|
||||
}
|
||||
|
||||
String hash = DigestUtils.sha1Hex(content);
|
||||
if (lastContentHash != null && hash != null && lastContentHash.equals(hash)) {
|
||||
log.debug("content hash not modified: {}", feedUrl);
|
||||
throw new NotModifiedException("content hash not modified");
|
||||
}
|
||||
|
||||
if (lastPublishedDate != null && fetchedFeed.getFeed().getLastPublishedDate() != null
|
||||
&& lastPublishedDate.getTime() == fetchedFeed.getFeed().getLastPublishedDate().getTime()) {
|
||||
log.debug("publishedDate not modified: {}", feedUrl);
|
||||
throw new NotModifiedException("publishedDate not modified");
|
||||
}
|
||||
|
||||
Feed feed = fetchedFeed.getFeed();
|
||||
feed.setLastModifiedHeader(result.getLastModifiedSince());
|
||||
feed.setEtagHeader(FeedUtils.truncate(result.geteTag(), 255));
|
||||
feed.setLastContentHash(hash);
|
||||
fetchedFeed.setFetchDuration(result.getDuration());
|
||||
fetchedFeed.setUrlAfterRedirect(result.getUrlAfterRedirect());
|
||||
return fetchedFeed;
|
||||
}
|
||||
|
||||
private String extractFeedUrl(String html, String baseUri) {
|
||||
String foundUrl = null;
|
||||
|
||||
Document doc = Jsoup.parse(html, baseUri);
|
||||
String root = doc.children().get(0).tagName();
|
||||
if ("html".equals(root)) {
|
||||
Elements atom = doc.select("link[type=application/atom+xml]");
|
||||
Elements rss = doc.select("link[type=application/rss+xml]");
|
||||
if (!atom.isEmpty()) {
|
||||
foundUrl = atom.get(0).attr("abs:href").toString();
|
||||
} else if (!rss.isEmpty()) {
|
||||
foundUrl = rss.get(0).attr("abs:href").toString();
|
||||
}
|
||||
}
|
||||
return foundUrl;
|
||||
}
|
||||
}
|
||||
218
src/main/java/com/commafeed/backend/feed/FeedParser.java
Normal file
218
src/main/java/com/commafeed/backend/feed/FeedParser.java
Normal file
@@ -0,0 +1,218 @@
|
||||
package com.commafeed.backend.feed;
|
||||
|
||||
import java.io.StringReader;
|
||||
import java.text.DateFormat;
|
||||
import java.util.Date;
|
||||
import java.util.List;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
import org.apache.commons.lang.SystemUtils;
|
||||
import org.jdom.Element;
|
||||
import org.jdom.Namespace;
|
||||
import org.xml.sax.InputSource;
|
||||
|
||||
import com.commafeed.backend.model.Feed;
|
||||
import com.commafeed.backend.model.FeedEntry;
|
||||
import com.commafeed.backend.model.FeedEntryContent;
|
||||
import com.google.common.base.Function;
|
||||
import com.google.common.collect.Collections2;
|
||||
import com.google.common.collect.Iterables;
|
||||
import com.sun.syndication.feed.synd.SyndContent;
|
||||
import com.sun.syndication.feed.synd.SyndEnclosure;
|
||||
import com.sun.syndication.feed.synd.SyndEntry;
|
||||
import com.sun.syndication.feed.synd.SyndFeed;
|
||||
import com.sun.syndication.feed.synd.SyndLink;
|
||||
import com.sun.syndication.feed.synd.SyndLinkImpl;
|
||||
import com.sun.syndication.io.FeedException;
|
||||
import com.sun.syndication.io.SyndFeedInput;
|
||||
|
||||
@Slf4j
|
||||
public class FeedParser {
|
||||
|
||||
private static final String ATOM_10_URI = "http://www.w3.org/2005/Atom";
|
||||
private static final Namespace ATOM_10_NS = Namespace.getNamespace(ATOM_10_URI);
|
||||
|
||||
private static final Date START = new Date(86400000);
|
||||
private static final Date END = new Date(1000l * Integer.MAX_VALUE - 86400000);
|
||||
|
||||
private static final Function<SyndContent, String> CONTENT_TO_STRING = new Function<SyndContent, String>() {
|
||||
public String apply(SyndContent content) {
|
||||
return content.getValue();
|
||||
}
|
||||
};
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
public FetchedFeed parse(String feedUrl, byte[] xml) throws FeedException {
|
||||
FetchedFeed fetchedFeed = new FetchedFeed();
|
||||
Feed feed = fetchedFeed.getFeed();
|
||||
List<FeedEntry> entries = fetchedFeed.getEntries();
|
||||
|
||||
try {
|
||||
String encoding = FeedUtils.guessEncoding(xml);
|
||||
String xmlString = FeedUtils.trimInvalidXmlCharacters(new String(xml, encoding));
|
||||
if (xmlString == null) {
|
||||
throw new FeedException("Input string is null for url " + feedUrl);
|
||||
}
|
||||
xmlString = FeedUtils.replaceHtmlEntitiesWithNumericEntities(xmlString);
|
||||
InputSource source = new InputSource(new StringReader(xmlString));
|
||||
SyndFeed rss = new SyndFeedInput().build(source);
|
||||
handleForeignMarkup(rss);
|
||||
|
||||
fetchedFeed.setTitle(rss.getTitle());
|
||||
feed.setPushHub(findHub(rss));
|
||||
feed.setPushTopic(findSelf(rss));
|
||||
feed.setUrl(feedUrl);
|
||||
feed.setLink(rss.getLink());
|
||||
List<SyndEntry> items = rss.getEntries();
|
||||
|
||||
for (SyndEntry item : items) {
|
||||
FeedEntry entry = new FeedEntry();
|
||||
|
||||
String guid = item.getUri();
|
||||
if (StringUtils.isBlank(guid)) {
|
||||
guid = item.getLink();
|
||||
}
|
||||
if (StringUtils.isBlank(guid)) {
|
||||
// no guid and no link, skip entry
|
||||
continue;
|
||||
}
|
||||
entry.setGuid(FeedUtils.truncate(guid, 2048));
|
||||
entry.setUpdated(validateDate(getEntryUpdateDate(item), true));
|
||||
entry.setUrl(FeedUtils.truncate(FeedUtils.toAbsoluteUrl(item.getLink(), feed.getLink(), feed.getUrlAfterRedirect()), 2048));
|
||||
|
||||
// if link is empty but guid is used as url
|
||||
if (StringUtils.isBlank(entry.getUrl()) && StringUtils.startsWith(entry.getGuid(), "http")) {
|
||||
entry.setUrl(entry.getGuid());
|
||||
}
|
||||
|
||||
FeedEntryContent content = new FeedEntryContent();
|
||||
content.setContent(getContent(item));
|
||||
content.setTitle(getTitle(item));
|
||||
content.setAuthor(StringUtils.trimToNull(item.getAuthor()));
|
||||
SyndEnclosure enclosure = (SyndEnclosure) Iterables.getFirst(item.getEnclosures(), null);
|
||||
if (enclosure != null) {
|
||||
content.setEnclosureUrl(FeedUtils.truncate(enclosure.getUrl(), 2048));
|
||||
content.setEnclosureType(enclosure.getType());
|
||||
}
|
||||
entry.setContent(content);
|
||||
|
||||
entries.add(entry);
|
||||
}
|
||||
Date lastEntryDate = null;
|
||||
Date publishedDate = validateDate(rss.getPublishedDate(), false);
|
||||
if (!entries.isEmpty()) {
|
||||
List<Long> sortedTimestamps = FeedUtils.getSortedTimestamps(entries);
|
||||
Long timestamp = sortedTimestamps.get(0);
|
||||
lastEntryDate = new Date(timestamp);
|
||||
publishedDate = (publishedDate == null || publishedDate.before(lastEntryDate)) ? lastEntryDate : publishedDate;
|
||||
}
|
||||
feed.setLastPublishedDate(publishedDate);
|
||||
feed.setAverageEntryInterval(FeedUtils.averageTimeBetweenEntries(entries));
|
||||
feed.setLastEntryDate(lastEntryDate);
|
||||
|
||||
} catch (Exception e) {
|
||||
throw new FeedException(String.format("Could not parse feed from %s : %s", feedUrl, e.getMessage()), e);
|
||||
}
|
||||
return fetchedFeed;
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds atom links for rss feeds
|
||||
*/
|
||||
@SuppressWarnings({ "unchecked", "rawtypes" })
|
||||
private void handleForeignMarkup(SyndFeed feed) {
|
||||
Object foreignMarkup = feed.getForeignMarkup();
|
||||
if (foreignMarkup == null) {
|
||||
return;
|
||||
}
|
||||
if (foreignMarkup instanceof List) {
|
||||
List elements = (List) foreignMarkup;
|
||||
for (Object object : elements) {
|
||||
if (object instanceof Element) {
|
||||
Element element = (Element) object;
|
||||
if ("link".equals(element.getName()) && ATOM_10_NS.equals(element.getNamespace())) {
|
||||
SyndLink link = new SyndLinkImpl();
|
||||
link.setRel(element.getAttributeValue("rel"));
|
||||
link.setHref(element.getAttributeValue("href"));
|
||||
feed.getLinks().add(link);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private Date getEntryUpdateDate(SyndEntry item) {
|
||||
Date date = item.getUpdatedDate();
|
||||
if (date == null) {
|
||||
date = item.getPublishedDate();
|
||||
}
|
||||
if (date == null) {
|
||||
date = new Date();
|
||||
}
|
||||
return date;
|
||||
}
|
||||
|
||||
private Date validateDate(Date date, boolean nullToNow) {
|
||||
Date now = new Date();
|
||||
if (date == null) {
|
||||
return nullToNow ? now : null;
|
||||
}
|
||||
if (date.before(START) || date.after(END)) {
|
||||
return now;
|
||||
}
|
||||
|
||||
if (date.after(now)) {
|
||||
return now;
|
||||
}
|
||||
return date;
|
||||
}
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
private String getContent(SyndEntry item) {
|
||||
String content = null;
|
||||
if (item.getContents().isEmpty()) {
|
||||
content = item.getDescription() == null ? null : item.getDescription().getValue();
|
||||
} else {
|
||||
content = StringUtils.join(Collections2.transform(item.getContents(), CONTENT_TO_STRING), SystemUtils.LINE_SEPARATOR);
|
||||
}
|
||||
return StringUtils.trimToNull(content);
|
||||
}
|
||||
|
||||
private String getTitle(SyndEntry item) {
|
||||
String title = item.getTitle();
|
||||
if (StringUtils.isBlank(title)) {
|
||||
Date date = item.getPublishedDate();
|
||||
if (date != null) {
|
||||
title = DateFormat.getInstance().format(date);
|
||||
} else {
|
||||
title = "(no title)";
|
||||
}
|
||||
}
|
||||
return StringUtils.trimToNull(title);
|
||||
}
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
private String findHub(SyndFeed feed) {
|
||||
for (SyndLink l : (List<SyndLink>) feed.getLinks()) {
|
||||
if ("hub".equalsIgnoreCase(l.getRel())) {
|
||||
log.debug("found hub {} for feed {}", l.getHref(), feed.getLink());
|
||||
return l.getHref();
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
private String findSelf(SyndFeed feed) {
|
||||
for (SyndLink l : (List<SyndLink>) feed.getLinks()) {
|
||||
if ("self".equalsIgnoreCase(l.getRel())) {
|
||||
log.debug("found self {} for feed {}", l.getHref(), feed.getLink());
|
||||
return l.getHref();
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
}
|
||||
155
src/main/java/com/commafeed/backend/feed/FeedQueues.java
Normal file
155
src/main/java/com/commafeed/backend/feed/FeedQueues.java
Normal file
@@ -0,0 +1,155 @@
|
||||
package com.commafeed.backend.feed;
|
||||
|
||||
import java.util.Date;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Queue;
|
||||
|
||||
import javax.inject.Inject;
|
||||
|
||||
import org.apache.commons.codec.digest.DigestUtils;
|
||||
import org.apache.commons.lang.time.DateUtils;
|
||||
|
||||
import com.codahale.metrics.Gauge;
|
||||
import com.codahale.metrics.Meter;
|
||||
import com.codahale.metrics.MetricRegistry;
|
||||
import com.commafeed.CommaFeedConfiguration;
|
||||
import com.commafeed.backend.dao.FeedDAO;
|
||||
import com.commafeed.backend.model.Feed;
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.common.collect.Maps;
|
||||
import com.google.common.collect.Queues;
|
||||
|
||||
public class FeedQueues {
|
||||
|
||||
private final FeedDAO feedDAO;
|
||||
private final CommaFeedConfiguration config;
|
||||
|
||||
private Queue<FeedRefreshContext> addQueue = Queues.newConcurrentLinkedQueue();
|
||||
private Queue<FeedRefreshContext> takeQueue = Queues.newConcurrentLinkedQueue();
|
||||
private Queue<Feed> giveBackQueue = Queues.newConcurrentLinkedQueue();
|
||||
|
||||
private Meter refill;
|
||||
|
||||
@Inject
|
||||
public FeedQueues(FeedDAO feedDAO, CommaFeedConfiguration config, MetricRegistry metrics) {
|
||||
this.config = config;
|
||||
this.feedDAO = feedDAO;
|
||||
|
||||
refill = metrics.meter(MetricRegistry.name(getClass(), "refill"));
|
||||
metrics.register(MetricRegistry.name(getClass(), "addQueue"), new Gauge<Integer>() {
|
||||
@Override
|
||||
public Integer getValue() {
|
||||
return addQueue.size();
|
||||
}
|
||||
});
|
||||
metrics.register(MetricRegistry.name(getClass(), "takeQueue"), new Gauge<Integer>() {
|
||||
@Override
|
||||
public Integer getValue() {
|
||||
return takeQueue.size();
|
||||
}
|
||||
});
|
||||
metrics.register(MetricRegistry.name(getClass(), "giveBackQueue"), new Gauge<Integer>() {
|
||||
@Override
|
||||
public Integer getValue() {
|
||||
return giveBackQueue.size();
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* take a feed from the refresh queue
|
||||
*/
|
||||
public synchronized FeedRefreshContext take() {
|
||||
FeedRefreshContext context = takeQueue.poll();
|
||||
|
||||
if (context == null) {
|
||||
refill();
|
||||
context = takeQueue.poll();
|
||||
}
|
||||
return context;
|
||||
}
|
||||
|
||||
/**
|
||||
* add a feed to the refresh queue
|
||||
*/
|
||||
public void add(Feed feed, boolean urgent) {
|
||||
int refreshInterval = config.getApplicationSettings().getRefreshIntervalMinutes();
|
||||
if (feed.getLastUpdated() == null || feed.getLastUpdated().before(DateUtils.addMinutes(new Date(), -1 * refreshInterval))) {
|
||||
addQueue.add(new FeedRefreshContext(feed, urgent));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* refills the refresh queue and empties the giveBack queue while at it
|
||||
*/
|
||||
private void refill() {
|
||||
refill.mark();
|
||||
|
||||
List<FeedRefreshContext> contexts = Lists.newArrayList();
|
||||
int batchSize = Math.min(100, 3 * config.getApplicationSettings().getBackgroundThreads());
|
||||
|
||||
// add feeds we got from the add() method
|
||||
int addQueueSize = addQueue.size();
|
||||
for (int i = 0; i < Math.min(batchSize, addQueueSize); i++) {
|
||||
contexts.add(addQueue.poll());
|
||||
}
|
||||
|
||||
// add feeds that are up to refresh from the database
|
||||
if (!config.getApplicationSettings().isCrawlingPaused()) {
|
||||
int count = batchSize - contexts.size();
|
||||
if (count > 0) {
|
||||
List<Feed> feeds = feedDAO.findNextUpdatable(count, getLastLoginThreshold());
|
||||
for (Feed feed : feeds) {
|
||||
contexts.add(new FeedRefreshContext(feed, false));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// set the disabledDate as we use it in feedDAO to decide what to refresh next. We also use a map to remove
|
||||
// duplicates.
|
||||
Map<Long, FeedRefreshContext> map = Maps.newLinkedHashMap();
|
||||
for (FeedRefreshContext context : contexts) {
|
||||
Feed feed = context.getFeed();
|
||||
feed.setDisabledUntil(DateUtils.addMinutes(new Date(), config.getApplicationSettings().getRefreshIntervalMinutes()));
|
||||
map.put(feed.getId(), context);
|
||||
}
|
||||
|
||||
// refill the queue
|
||||
takeQueue.addAll(map.values());
|
||||
|
||||
// add feeds from the giveBack queue to the map, overriding duplicates
|
||||
int giveBackQueueSize = giveBackQueue.size();
|
||||
for (int i = 0; i < giveBackQueueSize; i++) {
|
||||
Feed feed = giveBackQueue.poll();
|
||||
map.put(feed.getId(), new FeedRefreshContext(feed, false));
|
||||
}
|
||||
|
||||
// update all feeds in the database
|
||||
List<Feed> feeds = Lists.newArrayList();
|
||||
for (FeedRefreshContext context : map.values()) {
|
||||
feeds.add(context.getFeed());
|
||||
}
|
||||
feedDAO.merge(feeds);
|
||||
}
|
||||
|
||||
/**
|
||||
* give a feed back, updating it to the database during the next refill()
|
||||
*/
|
||||
public void giveBack(Feed feed) {
|
||||
String normalized = FeedUtils.normalizeURL(feed.getUrl());
|
||||
feed.setNormalizedUrl(normalized);
|
||||
feed.setNormalizedUrlHash(DigestUtils.sha1Hex(normalized));
|
||||
feed.setLastUpdated(new Date());
|
||||
giveBackQueue.add(feed);
|
||||
}
|
||||
|
||||
private Date getLastLoginThreshold() {
|
||||
if (config.getApplicationSettings().isHeavyLoad()) {
|
||||
return DateUtils.addDays(new Date(), -30);
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,42 @@
|
||||
package com.commafeed.backend.feed;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import com.commafeed.backend.model.Feed;
|
||||
import com.commafeed.backend.model.FeedEntry;
|
||||
|
||||
public class FeedRefreshContext {
|
||||
private Feed feed;
|
||||
private List<FeedEntry> entries;
|
||||
private boolean urgent;
|
||||
|
||||
public FeedRefreshContext(Feed feed, boolean isUrgent) {
|
||||
this.feed = feed;
|
||||
this.urgent = isUrgent;
|
||||
}
|
||||
|
||||
public Feed getFeed() {
|
||||
return feed;
|
||||
}
|
||||
|
||||
public void setFeed(Feed feed) {
|
||||
this.feed = feed;
|
||||
}
|
||||
|
||||
public boolean isUrgent() {
|
||||
return urgent;
|
||||
}
|
||||
|
||||
public void setUrgent(boolean urgent) {
|
||||
this.urgent = urgent;
|
||||
}
|
||||
|
||||
public List<FeedEntry> getEntries() {
|
||||
return entries;
|
||||
}
|
||||
|
||||
public void setEntries(List<FeedEntry> entries) {
|
||||
this.entries = entries;
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,91 @@
|
||||
package com.commafeed.backend.feed;
|
||||
|
||||
import java.util.concurrent.LinkedBlockingDeque;
|
||||
import java.util.concurrent.RejectedExecutionHandler;
|
||||
import java.util.concurrent.ThreadPoolExecutor;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
import com.codahale.metrics.Gauge;
|
||||
import com.codahale.metrics.MetricRegistry;
|
||||
|
||||
/**
|
||||
* Wraps a {@link ThreadPoolExecutor} instance. Blocks when queue is full instead of rejecting the task. Allow priority queueing by using
|
||||
* {@link Task} instead of {@link Runnable}
|
||||
*
|
||||
*/
|
||||
@Slf4j
|
||||
public class FeedRefreshExecutor {
|
||||
|
||||
private String poolName;
|
||||
private ThreadPoolExecutor pool;
|
||||
private LinkedBlockingDeque<Runnable> queue;
|
||||
|
||||
public FeedRefreshExecutor(final String poolName, int threads, int queueCapacity, MetricRegistry metrics) {
|
||||
log.info("Creating pool {} with {} threads", poolName, threads);
|
||||
this.poolName = poolName;
|
||||
pool = new ThreadPoolExecutor(threads, threads, 0, TimeUnit.MILLISECONDS, queue = new LinkedBlockingDeque<Runnable>(queueCapacity) {
|
||||
private static final long serialVersionUID = 1L;
|
||||
|
||||
@Override
|
||||
public boolean offer(Runnable r) {
|
||||
Task task = (Task) r;
|
||||
if (task.isUrgent()) {
|
||||
return offerFirst(r);
|
||||
} else {
|
||||
return offerLast(r);
|
||||
}
|
||||
}
|
||||
});
|
||||
pool.setRejectedExecutionHandler(new RejectedExecutionHandler() {
|
||||
@Override
|
||||
public void rejectedExecution(Runnable r, ThreadPoolExecutor e) {
|
||||
log.debug("{} thread queue full, waiting...", poolName);
|
||||
try {
|
||||
Task task = (Task) r;
|
||||
if (task.isUrgent()) {
|
||||
queue.putFirst(r);
|
||||
} else {
|
||||
queue.put(r);
|
||||
}
|
||||
} catch (InterruptedException e1) {
|
||||
log.error(poolName + " interrupted while waiting for queue.", e1);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
metrics.register(MetricRegistry.name(getClass(), poolName, "active"), new Gauge<Integer>() {
|
||||
@Override
|
||||
public Integer getValue() {
|
||||
return pool.getActiveCount();
|
||||
}
|
||||
});
|
||||
|
||||
metrics.register(MetricRegistry.name(getClass(), poolName, "pending"), new Gauge<Integer>() {
|
||||
@Override
|
||||
public Integer getValue() {
|
||||
return queue.size();
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
public void execute(Task task) {
|
||||
pool.execute(task);
|
||||
}
|
||||
|
||||
public static interface Task extends Runnable {
|
||||
boolean isUrgent();
|
||||
}
|
||||
|
||||
public void shutdown() {
|
||||
pool.shutdownNow();
|
||||
while (!pool.isTerminated()) {
|
||||
try {
|
||||
Thread.sleep(100);
|
||||
} catch (InterruptedException e) {
|
||||
log.error("{} interrupted while waiting for threads to finish.", poolName);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,95 @@
|
||||
package com.commafeed.backend.feed;
|
||||
|
||||
import io.dropwizard.lifecycle.Managed;
|
||||
|
||||
import java.util.concurrent.ExecutorService;
|
||||
import java.util.concurrent.Executors;
|
||||
|
||||
import javax.inject.Inject;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
import org.hibernate.SessionFactory;
|
||||
|
||||
import com.codahale.metrics.Meter;
|
||||
import com.codahale.metrics.MetricRegistry;
|
||||
import com.commafeed.CommaFeedConfiguration;
|
||||
import com.commafeed.backend.dao.FeedDAO;
|
||||
import com.commafeed.backend.dao.UnitOfWork;
|
||||
|
||||
/**
|
||||
* Infinite loop fetching feeds from @FeedQueues and queuing them to the {@link FeedRefreshWorker} pool.
|
||||
*
|
||||
*/
|
||||
@Slf4j
|
||||
public class FeedRefreshTaskGiver implements Managed {
|
||||
|
||||
private final SessionFactory sessionFactory;
|
||||
private final FeedQueues queues;
|
||||
private final FeedRefreshWorker worker;
|
||||
|
||||
private ExecutorService executor;
|
||||
|
||||
private Meter feedRefreshed;
|
||||
private Meter threadWaited;
|
||||
|
||||
@Inject
|
||||
public FeedRefreshTaskGiver(SessionFactory sessionFactory, FeedQueues queues, FeedDAO feedDAO, FeedRefreshWorker worker,
|
||||
CommaFeedConfiguration config, MetricRegistry metrics) {
|
||||
this.sessionFactory = sessionFactory;
|
||||
this.queues = queues;
|
||||
this.worker = worker;
|
||||
|
||||
executor = Executors.newFixedThreadPool(1);
|
||||
feedRefreshed = metrics.meter(MetricRegistry.name(getClass(), "feedRefreshed"));
|
||||
threadWaited = metrics.meter(MetricRegistry.name(getClass(), "threadWaited"));
|
||||
}
|
||||
|
||||
@Override
|
||||
public void stop() {
|
||||
executor.shutdownNow();
|
||||
while (!executor.isTerminated()) {
|
||||
try {
|
||||
Thread.sleep(100);
|
||||
} catch (InterruptedException e) {
|
||||
log.error("interrupted while waiting for threads to finish.");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void start() {
|
||||
log.info("starting feed refresh task giver");
|
||||
executor.execute(new Runnable() {
|
||||
@Override
|
||||
public void run() {
|
||||
while (!executor.isShutdown()) {
|
||||
try {
|
||||
FeedRefreshContext context = new UnitOfWork<FeedRefreshContext>(sessionFactory) {
|
||||
@Override
|
||||
protected FeedRefreshContext runInSession() throws Exception {
|
||||
FeedRefreshContext context = queues.take();
|
||||
if (context != null) {
|
||||
feedRefreshed.mark();
|
||||
worker.updateFeed(context);
|
||||
}
|
||||
return context;
|
||||
}
|
||||
}.run();
|
||||
if (context == null) {
|
||||
log.debug("nothing to do, sleeping for 15s");
|
||||
threadWaited.mark();
|
||||
try {
|
||||
Thread.sleep(15000);
|
||||
} catch (InterruptedException e) {
|
||||
log.error("interrupted while sleeping");
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
log.error(e.getMessage(), e);
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
228
src/main/java/com/commafeed/backend/feed/FeedRefreshUpdater.java
Normal file
228
src/main/java/com/commafeed/backend/feed/FeedRefreshUpdater.java
Normal file
@@ -0,0 +1,228 @@
|
||||
package com.commafeed.backend.feed;
|
||||
|
||||
import io.dropwizard.lifecycle.Managed;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.Date;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.concurrent.locks.Lock;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
import org.apache.commons.codec.digest.DigestUtils;
|
||||
import org.apache.commons.collections.CollectionUtils;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
import org.apache.commons.lang.time.DateUtils;
|
||||
import org.hibernate.SessionFactory;
|
||||
|
||||
import com.codahale.metrics.Meter;
|
||||
import com.codahale.metrics.MetricRegistry;
|
||||
import com.commafeed.CommaFeedConfiguration;
|
||||
import com.commafeed.CommaFeedConfiguration.ApplicationSettings;
|
||||
import com.commafeed.backend.cache.CacheService;
|
||||
import com.commafeed.backend.dao.FeedSubscriptionDAO;
|
||||
import com.commafeed.backend.dao.UnitOfWork;
|
||||
import com.commafeed.backend.feed.FeedRefreshExecutor.Task;
|
||||
import com.commafeed.backend.model.Feed;
|
||||
import com.commafeed.backend.model.FeedEntry;
|
||||
import com.commafeed.backend.model.FeedEntryContent;
|
||||
import com.commafeed.backend.model.FeedSubscription;
|
||||
import com.commafeed.backend.model.User;
|
||||
import com.commafeed.backend.service.FeedUpdateService;
|
||||
import com.commafeed.backend.service.PubSubService;
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.common.util.concurrent.Striped;
|
||||
|
||||
@Slf4j
|
||||
public class FeedRefreshUpdater implements Managed {
|
||||
|
||||
private final SessionFactory sessionFactory;
|
||||
private final FeedUpdateService feedUpdateService;
|
||||
private final PubSubService pubSubService;
|
||||
private final FeedQueues queues;
|
||||
private final CommaFeedConfiguration config;
|
||||
private final FeedSubscriptionDAO feedSubscriptionDAO;
|
||||
private final CacheService cache;
|
||||
|
||||
private FeedRefreshExecutor pool;
|
||||
private Striped<Lock> locks;
|
||||
|
||||
private Meter entryCacheMiss;
|
||||
private Meter entryCacheHit;
|
||||
private Meter feedUpdated;
|
||||
private Meter entryInserted;
|
||||
|
||||
public FeedRefreshUpdater(SessionFactory sessionFactory, FeedUpdateService feedUpdateService, PubSubService pubSubService,
|
||||
FeedQueues queues, CommaFeedConfiguration config, MetricRegistry metrics, FeedSubscriptionDAO feedSubscriptionDAO,
|
||||
CacheService cache) {
|
||||
this.sessionFactory = sessionFactory;
|
||||
this.feedUpdateService = feedUpdateService;
|
||||
this.pubSubService = pubSubService;
|
||||
this.queues = queues;
|
||||
this.config = config;
|
||||
this.feedSubscriptionDAO = feedSubscriptionDAO;
|
||||
this.cache = cache;
|
||||
|
||||
ApplicationSettings settings = config.getApplicationSettings();
|
||||
int threads = Math.max(settings.getDatabaseUpdateThreads(), 1);
|
||||
pool = new FeedRefreshExecutor("feed-refresh-updater", threads, Math.min(50 * threads, 1000), metrics);
|
||||
locks = Striped.lazyWeakLock(threads * 100000);
|
||||
|
||||
entryCacheMiss = metrics.meter(MetricRegistry.name(getClass(), "entryCacheMiss"));
|
||||
entryCacheHit = metrics.meter(MetricRegistry.name(getClass(), "entryCacheHit"));
|
||||
feedUpdated = metrics.meter(MetricRegistry.name(getClass(), "feedUpdated"));
|
||||
entryInserted = metrics.meter(MetricRegistry.name(getClass(), "entryInserted"));
|
||||
}
|
||||
|
||||
@Override
|
||||
public void start() throws Exception {
|
||||
}
|
||||
|
||||
@Override
|
||||
public void stop() throws Exception {
|
||||
pool.shutdown();
|
||||
}
|
||||
|
||||
public void updateFeed(FeedRefreshContext context) {
|
||||
pool.execute(new EntryTask(context));
|
||||
}
|
||||
|
||||
private class EntryTask implements Task {
|
||||
|
||||
private FeedRefreshContext context;
|
||||
|
||||
public EntryTask(FeedRefreshContext context) {
|
||||
this.context = context;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void run() {
|
||||
new UnitOfWork<Void>(sessionFactory) {
|
||||
@Override
|
||||
protected Void runInSession() throws Exception {
|
||||
internalRun();
|
||||
return null;
|
||||
}
|
||||
}.run();
|
||||
}
|
||||
|
||||
public void internalRun() {
|
||||
boolean ok = true;
|
||||
Feed feed = context.getFeed();
|
||||
List<FeedEntry> entries = context.getEntries();
|
||||
if (entries.isEmpty()) {
|
||||
feed.setMessage("Feed has no entries");
|
||||
} else {
|
||||
List<String> lastEntries = cache.getLastEntries(feed);
|
||||
List<String> currentEntries = Lists.newArrayList();
|
||||
|
||||
List<FeedSubscription> subscriptions = null;
|
||||
for (FeedEntry entry : entries) {
|
||||
String cacheKey = cache.buildUniqueEntryKey(feed, entry);
|
||||
if (!lastEntries.contains(cacheKey)) {
|
||||
log.debug("cache miss for {}", entry.getUrl());
|
||||
if (subscriptions == null) {
|
||||
subscriptions = feedSubscriptionDAO.findByFeed(feed);
|
||||
}
|
||||
ok &= addEntry(feed, entry, subscriptions);
|
||||
entryCacheMiss.mark();
|
||||
} else {
|
||||
log.debug("cache hit for {}", entry.getUrl());
|
||||
entryCacheHit.mark();
|
||||
}
|
||||
|
||||
currentEntries.add(cacheKey);
|
||||
}
|
||||
cache.setLastEntries(feed, currentEntries);
|
||||
|
||||
if (subscriptions == null) {
|
||||
feed.setMessage("No new entries found");
|
||||
}
|
||||
|
||||
if (CollectionUtils.isNotEmpty(subscriptions)) {
|
||||
List<User> users = Lists.newArrayList();
|
||||
for (FeedSubscription sub : subscriptions) {
|
||||
users.add(sub.getUser());
|
||||
}
|
||||
cache.invalidateUnreadCount(subscriptions.toArray(new FeedSubscription[0]));
|
||||
cache.invalidateUserRootCategory(users.toArray(new User[0]));
|
||||
}
|
||||
}
|
||||
|
||||
if (config.getApplicationSettings().isPubsubhubbub()) {
|
||||
handlePubSub(feed);
|
||||
}
|
||||
if (!ok) {
|
||||
// requeue asap
|
||||
feed.setDisabledUntil(new Date(0));
|
||||
}
|
||||
feedUpdated.mark();
|
||||
queues.giveBack(feed);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isUrgent() {
|
||||
return context.isUrgent();
|
||||
}
|
||||
}
|
||||
|
||||
private boolean addEntry(final Feed feed, final FeedEntry entry, final List<FeedSubscription> subscriptions) {
|
||||
boolean success = false;
|
||||
|
||||
// lock on feed, make sure we are not updating the same feed twice at
|
||||
// the same time
|
||||
String key1 = StringUtils.trimToEmpty("" + feed.getId());
|
||||
|
||||
// lock on content, make sure we are not updating the same entry
|
||||
// twice at the same time
|
||||
FeedEntryContent content = entry.getContent();
|
||||
String key2 = DigestUtils.sha1Hex(StringUtils.trimToEmpty(content.getContent() + content.getTitle()));
|
||||
|
||||
Iterator<Lock> iterator = locks.bulkGet(Arrays.asList(key1, key2)).iterator();
|
||||
Lock lock1 = iterator.next();
|
||||
Lock lock2 = iterator.next();
|
||||
boolean locked1 = false;
|
||||
boolean locked2 = false;
|
||||
try {
|
||||
locked1 = lock1.tryLock(1, TimeUnit.MINUTES);
|
||||
locked2 = lock2.tryLock(1, TimeUnit.MINUTES);
|
||||
if (locked1 && locked2) {
|
||||
boolean inserted = feedUpdateService.addEntry(feed, entry);
|
||||
if (inserted) {
|
||||
entryInserted.mark();
|
||||
}
|
||||
success = true;
|
||||
} else {
|
||||
log.error("lock timeout for " + feed.getUrl() + " - " + key1);
|
||||
}
|
||||
} catch (InterruptedException e) {
|
||||
log.error("interrupted while waiting for lock for " + feed.getUrl() + " : " + e.getMessage(), e);
|
||||
} finally {
|
||||
if (locked1) {
|
||||
lock1.unlock();
|
||||
}
|
||||
if (locked2) {
|
||||
lock2.unlock();
|
||||
}
|
||||
}
|
||||
return success;
|
||||
}
|
||||
|
||||
private void handlePubSub(final Feed feed) {
|
||||
if (feed.getPushHub() != null && feed.getPushTopic() != null) {
|
||||
Date lastPing = feed.getPushLastPing();
|
||||
Date now = new Date();
|
||||
if (lastPing == null || lastPing.before(DateUtils.addDays(now, -3))) {
|
||||
new Thread() {
|
||||
@Override
|
||||
public void run() {
|
||||
pubSubService.subscribe(feed);
|
||||
}
|
||||
}.start();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
157
src/main/java/com/commafeed/backend/feed/FeedRefreshWorker.java
Normal file
157
src/main/java/com/commafeed/backend/feed/FeedRefreshWorker.java
Normal file
@@ -0,0 +1,157 @@
|
||||
package com.commafeed.backend.feed;
|
||||
|
||||
import io.dropwizard.lifecycle.Managed;
|
||||
|
||||
import java.util.Date;
|
||||
import java.util.List;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
import org.apache.commons.codec.digest.DigestUtils;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
import org.apache.commons.lang.time.DateUtils;
|
||||
|
||||
import com.codahale.metrics.MetricRegistry;
|
||||
import com.commafeed.CommaFeedConfiguration;
|
||||
import com.commafeed.backend.HttpGetter.NotModifiedException;
|
||||
import com.commafeed.backend.feed.FeedRefreshExecutor.Task;
|
||||
import com.commafeed.backend.model.Feed;
|
||||
import com.commafeed.backend.model.FeedEntry;
|
||||
import com.google.common.base.Optional;
|
||||
|
||||
/**
|
||||
* Calls {@link FeedFetcher} and handles its outcome
|
||||
*
|
||||
*/
|
||||
@Slf4j
|
||||
public class FeedRefreshWorker implements Managed {
|
||||
|
||||
private final FeedRefreshUpdater feedRefreshUpdater;
|
||||
private final FeedFetcher fetcher;
|
||||
private final FeedQueues queues;
|
||||
private final CommaFeedConfiguration config;
|
||||
private final FeedRefreshExecutor pool;
|
||||
|
||||
public FeedRefreshWorker(FeedRefreshUpdater feedRefreshUpdater, FeedFetcher fetcher, FeedQueues queues, CommaFeedConfiguration config,
|
||||
MetricRegistry metrics) {
|
||||
this.feedRefreshUpdater = feedRefreshUpdater;
|
||||
this.fetcher = fetcher;
|
||||
this.config = config;
|
||||
this.queues = queues;
|
||||
int threads = config.getApplicationSettings().getBackgroundThreads();
|
||||
pool = new FeedRefreshExecutor("feed-refresh-worker", threads, Math.min(20 * threads, 1000), metrics);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void start() throws Exception {
|
||||
}
|
||||
|
||||
@Override
|
||||
public void stop() throws Exception {
|
||||
pool.shutdown();
|
||||
}
|
||||
|
||||
public void updateFeed(FeedRefreshContext context) {
|
||||
pool.execute(new FeedTask(context));
|
||||
}
|
||||
|
||||
private class FeedTask implements Task {
|
||||
|
||||
private FeedRefreshContext context;
|
||||
|
||||
public FeedTask(FeedRefreshContext context) {
|
||||
this.context = context;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void run() {
|
||||
update(context);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isUrgent() {
|
||||
return context.isUrgent();
|
||||
}
|
||||
}
|
||||
|
||||
private void update(FeedRefreshContext context) {
|
||||
Feed feed = context.getFeed();
|
||||
int refreshInterval = config.getApplicationSettings().getRefreshIntervalMinutes();
|
||||
Date disabledUntil = DateUtils.addMinutes(new Date(), refreshInterval);
|
||||
try {
|
||||
String url = Optional.fromNullable(feed.getUrlAfterRedirect()).or(feed.getUrl());
|
||||
FetchedFeed fetchedFeed = fetcher.fetch(url, false, feed.getLastModifiedHeader(), feed.getEtagHeader(),
|
||||
feed.getLastPublishedDate(), feed.getLastContentHash());
|
||||
// stops here if NotModifiedException or any other exception is thrown
|
||||
List<FeedEntry> entries = fetchedFeed.getEntries();
|
||||
|
||||
if (config.getApplicationSettings().isHeavyLoad()) {
|
||||
disabledUntil = FeedUtils.buildDisabledUntil(fetchedFeed.getFeed().getLastEntryDate(), fetchedFeed.getFeed()
|
||||
.getAverageEntryInterval(), disabledUntil);
|
||||
}
|
||||
String urlAfterRedirect = fetchedFeed.getUrlAfterRedirect();
|
||||
if (StringUtils.equals(url, urlAfterRedirect)) {
|
||||
urlAfterRedirect = null;
|
||||
}
|
||||
feed.setUrlAfterRedirect(urlAfterRedirect);
|
||||
feed.setLink(fetchedFeed.getFeed().getLink());
|
||||
feed.setLastModifiedHeader(fetchedFeed.getFeed().getLastModifiedHeader());
|
||||
feed.setEtagHeader(fetchedFeed.getFeed().getEtagHeader());
|
||||
feed.setLastContentHash(fetchedFeed.getFeed().getLastContentHash());
|
||||
feed.setLastPublishedDate(fetchedFeed.getFeed().getLastPublishedDate());
|
||||
feed.setAverageEntryInterval(fetchedFeed.getFeed().getAverageEntryInterval());
|
||||
feed.setLastEntryDate(fetchedFeed.getFeed().getLastEntryDate());
|
||||
|
||||
feed.setErrorCount(0);
|
||||
feed.setMessage(null);
|
||||
feed.setDisabledUntil(disabledUntil);
|
||||
|
||||
handlePubSub(feed, fetchedFeed.getFeed());
|
||||
context.setEntries(entries);
|
||||
feedRefreshUpdater.updateFeed(context);
|
||||
|
||||
} catch (NotModifiedException e) {
|
||||
log.debug("Feed not modified : {} - {}", feed.getUrl(), e.getMessage());
|
||||
|
||||
if (config.getApplicationSettings().isHeavyLoad()) {
|
||||
disabledUntil = FeedUtils.buildDisabledUntil(feed.getLastEntryDate(), feed.getAverageEntryInterval(), disabledUntil);
|
||||
}
|
||||
feed.setErrorCount(0);
|
||||
feed.setMessage(e.getMessage());
|
||||
feed.setDisabledUntil(disabledUntil);
|
||||
|
||||
queues.giveBack(feed);
|
||||
} catch (Exception e) {
|
||||
String message = "Unable to refresh feed " + feed.getUrl() + " : " + e.getMessage();
|
||||
log.debug(e.getClass().getName() + " " + message, e);
|
||||
|
||||
feed.setErrorCount(feed.getErrorCount() + 1);
|
||||
feed.setMessage(message);
|
||||
feed.setDisabledUntil(FeedUtils.buildDisabledUntil(feed.getErrorCount()));
|
||||
|
||||
queues.giveBack(feed);
|
||||
}
|
||||
}
|
||||
|
||||
private void handlePubSub(Feed feed, Feed fetchedFeed) {
|
||||
String hub = fetchedFeed.getPushHub();
|
||||
String topic = fetchedFeed.getPushTopic();
|
||||
if (hub != null && topic != null) {
|
||||
if (hub.contains("hubbub.api.typepad.com")) {
|
||||
// that hub does not exist anymore
|
||||
return;
|
||||
}
|
||||
if (topic.startsWith("www.")) {
|
||||
topic = "http://" + topic;
|
||||
} else if (topic.startsWith("feed://")) {
|
||||
topic = "http://" + topic.substring(7);
|
||||
} else if (topic.startsWith("http") == false) {
|
||||
topic = "http://" + topic;
|
||||
}
|
||||
log.debug("feed {} has pubsub info: {}", feed.getUrl(), topic);
|
||||
feed.setPushHub(hub);
|
||||
feed.setPushTopic(topic);
|
||||
feed.setPushTopicHash(DigestUtils.sha1Hex(topic));
|
||||
}
|
||||
}
|
||||
}
|
||||
524
src/main/java/com/commafeed/backend/feed/FeedUtils.java
Normal file
524
src/main/java/com/commafeed/backend/feed/FeedUtils.java
Normal file
@@ -0,0 +1,524 @@
|
||||
package com.commafeed.backend.feed;
|
||||
|
||||
import java.io.StringReader;
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URL;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.Date;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
|
||||
import org.apache.commons.codec.binary.Base64;
|
||||
import org.apache.commons.lang.ArrayUtils;
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
import org.apache.commons.lang.time.DateUtils;
|
||||
import org.apache.commons.math.stat.descriptive.SummaryStatistics;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.jsoup.nodes.Document.OutputSettings;
|
||||
import org.jsoup.nodes.Element;
|
||||
import org.jsoup.nodes.Entities.EscapeMode;
|
||||
import org.jsoup.safety.Cleaner;
|
||||
import org.jsoup.safety.Whitelist;
|
||||
import org.jsoup.select.Elements;
|
||||
import org.mozilla.universalchardet.UniversalDetector;
|
||||
import org.w3c.css.sac.InputSource;
|
||||
import org.w3c.dom.css.CSSStyleDeclaration;
|
||||
|
||||
import com.commafeed.backend.model.FeedEntry;
|
||||
import com.commafeed.backend.model.FeedSubscription;
|
||||
import com.commafeed.frontend.model.Entry;
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.gwt.i18n.client.HasDirection.Direction;
|
||||
import com.google.gwt.i18n.shared.BidiUtils;
|
||||
import com.steadystate.css.parser.CSSOMParser;
|
||||
|
||||
import edu.uci.ics.crawler4j.url.URLCanonicalizer;
|
||||
|
||||
/**
|
||||
* Utility methods related to feed handling
|
||||
*
|
||||
*/
|
||||
@Slf4j
|
||||
public class FeedUtils {
|
||||
|
||||
private static final String ESCAPED_QUESTION_MARK = Pattern.quote("?");
|
||||
|
||||
private static final List<String> ALLOWED_IFRAME_CSS_RULES = Arrays.asList("height", "width", "border");
|
||||
private static final List<String> ALLOWED_IMG_CSS_RULES = Arrays.asList("display", "width", "height");
|
||||
private static final char[] FORBIDDEN_CSS_RULE_CHARACTERS = new char[] { '(', ')' };
|
||||
|
||||
private static final Whitelist WHITELIST = buildWhiteList();
|
||||
|
||||
public static String truncate(String string, int length) {
|
||||
if (string != null) {
|
||||
string = string.substring(0, Math.min(length, string.length()));
|
||||
}
|
||||
return string;
|
||||
}
|
||||
|
||||
private static synchronized Whitelist buildWhiteList() {
|
||||
Whitelist whitelist = new Whitelist();
|
||||
whitelist.addTags("a", "b", "blockquote", "br", "caption", "cite", "code", "col", "colgroup", "dd", "div", "dl", "dt", "em", "h1",
|
||||
"h2", "h3", "h4", "h5", "h6", "i", "iframe", "img", "li", "ol", "p", "pre", "q", "small", "strike", "strong", "sub", "sup",
|
||||
"table", "tbody", "td", "tfoot", "th", "thead", "tr", "u", "ul");
|
||||
|
||||
whitelist.addAttributes("div", "dir");
|
||||
whitelist.addAttributes("pre", "dir");
|
||||
whitelist.addAttributes("code", "dir");
|
||||
whitelist.addAttributes("table", "dir");
|
||||
whitelist.addAttributes("p", "dir");
|
||||
whitelist.addAttributes("a", "href", "title");
|
||||
whitelist.addAttributes("blockquote", "cite");
|
||||
whitelist.addAttributes("col", "span", "width");
|
||||
whitelist.addAttributes("colgroup", "span", "width");
|
||||
whitelist.addAttributes("iframe", "src", "height", "width", "allowfullscreen", "frameborder", "style");
|
||||
whitelist.addAttributes("img", "align", "alt", "height", "src", "title", "width", "style");
|
||||
whitelist.addAttributes("ol", "start", "type");
|
||||
whitelist.addAttributes("q", "cite");
|
||||
whitelist.addAttributes("table", "border", "bordercolor", "summary", "width");
|
||||
whitelist.addAttributes("td", "border", "bordercolor", "abbr", "axis", "colspan", "rowspan", "width");
|
||||
whitelist.addAttributes("th", "border", "bordercolor", "abbr", "axis", "colspan", "rowspan", "scope", "width");
|
||||
whitelist.addAttributes("ul", "type");
|
||||
|
||||
whitelist.addProtocols("a", "href", "ftp", "http", "https", "mailto");
|
||||
whitelist.addProtocols("blockquote", "cite", "http", "https");
|
||||
whitelist.addProtocols("img", "src", "http", "https");
|
||||
whitelist.addProtocols("q", "cite", "http", "https");
|
||||
|
||||
whitelist.addEnforcedAttribute("a", "target", "_blank");
|
||||
return whitelist;
|
||||
}
|
||||
|
||||
/**
|
||||
* Detect feed encoding by using the declared encoding in the xml processing instruction and by detecting the characters used in the
|
||||
* feed
|
||||
*
|
||||
*/
|
||||
public static String guessEncoding(byte[] bytes) {
|
||||
String extracted = extractDeclaredEncoding(bytes);
|
||||
if (StringUtils.startsWithIgnoreCase(extracted, "iso-8859-")) {
|
||||
if (StringUtils.endsWith(extracted, "1") == false) {
|
||||
return extracted;
|
||||
}
|
||||
} else if (StringUtils.startsWithIgnoreCase(extracted, "windows-")) {
|
||||
return extracted;
|
||||
}
|
||||
return detectEncoding(bytes);
|
||||
}
|
||||
|
||||
/**
|
||||
* Detect encoding by analyzing characters in the array
|
||||
*/
|
||||
public static String detectEncoding(byte[] bytes) {
|
||||
String DEFAULT_ENCODING = "UTF-8";
|
||||
UniversalDetector detector = new UniversalDetector(null);
|
||||
detector.handleData(bytes, 0, bytes.length);
|
||||
detector.dataEnd();
|
||||
String encoding = detector.getDetectedCharset();
|
||||
detector.reset();
|
||||
if (encoding == null) {
|
||||
encoding = DEFAULT_ENCODING;
|
||||
} else if (encoding.equalsIgnoreCase("ISO-8859-1")) {
|
||||
encoding = "windows-1252";
|
||||
}
|
||||
return encoding;
|
||||
}
|
||||
|
||||
public static String replaceHtmlEntitiesWithNumericEntities(String source) {
|
||||
String result = source;
|
||||
for (String entity : HtmlEntities.NUMERIC_MAPPING.keySet()) {
|
||||
result = result.replace(entity, HtmlEntities.NUMERIC_MAPPING.get(entity));
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Normalize the url. The resulting url is not meant to be fetched but rather used as a mean to identify a feed and avoid duplicates
|
||||
*/
|
||||
public static String normalizeURL(String url) {
|
||||
if (url == null) {
|
||||
return null;
|
||||
}
|
||||
String normalized = URLCanonicalizer.getCanonicalURL(url);
|
||||
if (normalized == null) {
|
||||
normalized = url;
|
||||
}
|
||||
|
||||
// convert to lower case, the url probably won't work in some cases
|
||||
// after that but we don't care we just want to compare urls to avoid
|
||||
// duplicates
|
||||
normalized = normalized.toLowerCase();
|
||||
|
||||
// store all urls as http
|
||||
if (normalized.startsWith("https")) {
|
||||
normalized = "http" + normalized.substring(5);
|
||||
}
|
||||
|
||||
// remove the www. part
|
||||
normalized = normalized.replace("//www.", "//");
|
||||
|
||||
// feedproxy redirects to feedburner
|
||||
normalized = normalized.replace("feedproxy.google.com", "feeds.feedburner.com");
|
||||
|
||||
// feedburner feeds have a special treatment
|
||||
if (normalized.split(ESCAPED_QUESTION_MARK)[0].contains("feedburner.com")) {
|
||||
normalized = normalized.replace("feeds2.feedburner.com", "feeds.feedburner.com");
|
||||
normalized = normalized.split(ESCAPED_QUESTION_MARK)[0];
|
||||
normalized = StringUtils.removeEnd(normalized, "/");
|
||||
}
|
||||
|
||||
return normalized;
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract the declared encoding from the xml
|
||||
*/
|
||||
public static String extractDeclaredEncoding(byte[] bytes) {
|
||||
int index = ArrayUtils.indexOf(bytes, (byte) '>');
|
||||
if (index == -1) {
|
||||
return null;
|
||||
}
|
||||
|
||||
String pi = new String(ArrayUtils.subarray(bytes, 0, index + 1));
|
||||
index = StringUtils.indexOf(pi, "encoding=\"");
|
||||
if (index == -1) {
|
||||
return null;
|
||||
}
|
||||
String encoding = pi.substring(index + 10, pi.length());
|
||||
encoding = encoding.substring(0, encoding.indexOf('"'));
|
||||
return encoding;
|
||||
}
|
||||
|
||||
public static String handleContent(String content, String baseUri, boolean keepTextOnly) {
|
||||
if (StringUtils.isNotBlank(content)) {
|
||||
baseUri = StringUtils.trimToEmpty(baseUri);
|
||||
|
||||
Document dirty = Jsoup.parseBodyFragment(content, baseUri);
|
||||
Cleaner cleaner = new Cleaner(WHITELIST);
|
||||
Document clean = cleaner.clean(dirty);
|
||||
|
||||
for (Element e : clean.select("iframe[style]")) {
|
||||
String style = e.attr("style");
|
||||
String escaped = escapeIFrameCss(style);
|
||||
e.attr("style", escaped);
|
||||
}
|
||||
|
||||
for (Element e : clean.select("img[style]")) {
|
||||
String style = e.attr("style");
|
||||
String escaped = escapeImgCss(style);
|
||||
e.attr("style", escaped);
|
||||
}
|
||||
|
||||
clean.outputSettings(new OutputSettings().escapeMode(EscapeMode.base).prettyPrint(false));
|
||||
Element body = clean.body();
|
||||
if (keepTextOnly) {
|
||||
content = body.text();
|
||||
} else {
|
||||
content = body.html();
|
||||
}
|
||||
}
|
||||
return content;
|
||||
}
|
||||
|
||||
public static String escapeIFrameCss(String orig) {
|
||||
String rule = "";
|
||||
CSSOMParser parser = new CSSOMParser();
|
||||
try {
|
||||
List<String> rules = Lists.newArrayList();
|
||||
CSSStyleDeclaration decl = parser.parseStyleDeclaration(new InputSource(new StringReader(orig)));
|
||||
|
||||
for (int i = 0; i < decl.getLength(); i++) {
|
||||
String property = decl.item(i);
|
||||
String value = decl.getPropertyValue(property);
|
||||
if (StringUtils.isBlank(property) || StringUtils.isBlank(value)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (ALLOWED_IFRAME_CSS_RULES.contains(property) && StringUtils.containsNone(value, FORBIDDEN_CSS_RULE_CHARACTERS)) {
|
||||
rules.add(property + ":" + decl.getPropertyValue(property) + ";");
|
||||
}
|
||||
}
|
||||
rule = StringUtils.join(rules, "");
|
||||
} catch (Exception e) {
|
||||
log.error(e.getMessage(), e);
|
||||
}
|
||||
return rule;
|
||||
}
|
||||
|
||||
public static String escapeImgCss(String orig) {
|
||||
String rule = "";
|
||||
CSSOMParser parser = new CSSOMParser();
|
||||
try {
|
||||
List<String> rules = Lists.newArrayList();
|
||||
CSSStyleDeclaration decl = parser.parseStyleDeclaration(new InputSource(new StringReader(orig)));
|
||||
|
||||
for (int i = 0; i < decl.getLength(); i++) {
|
||||
String property = decl.item(i);
|
||||
String value = decl.getPropertyValue(property);
|
||||
if (StringUtils.isBlank(property) || StringUtils.isBlank(value)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (ALLOWED_IMG_CSS_RULES.contains(property) && StringUtils.containsNone(value, FORBIDDEN_CSS_RULE_CHARACTERS)) {
|
||||
rules.add(property + ":" + decl.getPropertyValue(property) + ";");
|
||||
}
|
||||
}
|
||||
rule = StringUtils.join(rules, "");
|
||||
} catch (Exception e) {
|
||||
log.error(e.getMessage(), e);
|
||||
}
|
||||
return rule;
|
||||
}
|
||||
|
||||
public static boolean isRTL(FeedEntry entry) {
|
||||
String text = entry.getContent().getContent();
|
||||
|
||||
if (StringUtils.isBlank(text)) {
|
||||
text = entry.getContent().getTitle();
|
||||
}
|
||||
|
||||
if (StringUtils.isBlank(text)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
text = Jsoup.parse(text).text();
|
||||
if (StringUtils.isBlank(text)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
Direction direction = BidiUtils.get().estimateDirection(text);
|
||||
return direction == Direction.RTL;
|
||||
}
|
||||
|
||||
public static String trimInvalidXmlCharacters(String xml) {
|
||||
if (StringUtils.isBlank(xml)) {
|
||||
return null;
|
||||
}
|
||||
StringBuilder sb = new StringBuilder();
|
||||
|
||||
boolean firstTagFound = false;
|
||||
for (int i = 0; i < xml.length(); i++) {
|
||||
char c = xml.charAt(i);
|
||||
|
||||
if (!firstTagFound) {
|
||||
if (c == '<') {
|
||||
firstTagFound = true;
|
||||
} else {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
if (c >= 32 || c == 9 || c == 10 || c == 13) {
|
||||
if (!Character.isHighSurrogate(c) && !Character.isLowSurrogate(c)) {
|
||||
sb.append(c);
|
||||
}
|
||||
}
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* When there was an error fetching the feed
|
||||
*
|
||||
*/
|
||||
public static Date buildDisabledUntil(int errorCount) {
|
||||
Date now = new Date();
|
||||
int retriesBeforeDisable = 3;
|
||||
|
||||
if (errorCount >= retriesBeforeDisable) {
|
||||
int disabledHours = errorCount - retriesBeforeDisable + 1;
|
||||
disabledHours = Math.min(24 * 7, disabledHours);
|
||||
return DateUtils.addHours(now, disabledHours);
|
||||
}
|
||||
return now;
|
||||
}
|
||||
|
||||
/**
|
||||
* When the feed was refreshed successfully
|
||||
*/
|
||||
public static Date buildDisabledUntil(Date publishedDate, Long averageEntryInterval, Date defaultRefreshInterval) {
|
||||
Date now = new Date();
|
||||
|
||||
if (publishedDate == null) {
|
||||
// feed with no entries, recheck in 24 hours
|
||||
return DateUtils.addHours(now, 24);
|
||||
} else if (publishedDate.before(DateUtils.addMonths(now, -1))) {
|
||||
// older than a month, recheck in 24 hours
|
||||
return DateUtils.addHours(now, 24);
|
||||
} else if (publishedDate.before(DateUtils.addDays(now, -14))) {
|
||||
// older than two weeks, recheck in 12 hours
|
||||
return DateUtils.addHours(now, 12);
|
||||
} else if (publishedDate.before(DateUtils.addDays(now, -7))) {
|
||||
// older than a week, recheck in 6 hours
|
||||
return DateUtils.addHours(now, 6);
|
||||
} else if (averageEntryInterval != null) {
|
||||
// use average time between entries to decide when to refresh next, divided by factor
|
||||
int factor = 2;
|
||||
|
||||
// not more than 6 hours
|
||||
long date = Math.min(DateUtils.addHours(now, 6).getTime(), now.getTime() + averageEntryInterval / factor);
|
||||
|
||||
// not less than default refresh interval
|
||||
date = Math.max(defaultRefreshInterval.getTime(), date);
|
||||
|
||||
return new Date(date);
|
||||
} else {
|
||||
// unknown case, recheck in 24 hours
|
||||
return DateUtils.addHours(now, 24);
|
||||
}
|
||||
}
|
||||
|
||||
public static Long averageTimeBetweenEntries(List<FeedEntry> entries) {
|
||||
if (entries.isEmpty() || entries.size() == 1) {
|
||||
return null;
|
||||
}
|
||||
|
||||
List<Long> timestamps = getSortedTimestamps(entries);
|
||||
|
||||
SummaryStatistics stats = new SummaryStatistics();
|
||||
for (int i = 0; i < timestamps.size() - 1; i++) {
|
||||
long diff = Math.abs(timestamps.get(i) - timestamps.get(i + 1));
|
||||
stats.addValue(diff);
|
||||
}
|
||||
return (long) stats.getMean();
|
||||
}
|
||||
|
||||
public static List<Long> getSortedTimestamps(List<FeedEntry> entries) {
|
||||
List<Long> timestamps = Lists.newArrayList();
|
||||
for (FeedEntry entry : entries) {
|
||||
timestamps.add(entry.getUpdated().getTime());
|
||||
}
|
||||
Collections.sort(timestamps);
|
||||
Collections.reverse(timestamps);
|
||||
return timestamps;
|
||||
}
|
||||
|
||||
public static String removeTrailingSlash(String url) {
|
||||
if (url.endsWith("/")) {
|
||||
url = url.substring(0, url.length() - 1);
|
||||
}
|
||||
return url;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param url
|
||||
* the url of the entry
|
||||
* @param feedLink
|
||||
* the url of the feed as described in the feed
|
||||
* @param feedUrl
|
||||
* the url of the feed that we used to fetch the feed
|
||||
* @return an absolute url pointing to the entry
|
||||
*/
|
||||
public static String toAbsoluteUrl(String url, String feedLink, String feedUrl) {
|
||||
url = StringUtils.trimToNull(StringUtils.normalizeSpace(url));
|
||||
if (url == null || url.startsWith("http")) {
|
||||
return url;
|
||||
}
|
||||
|
||||
String baseUrl = (feedLink == null || isRelative(feedLink)) ? feedUrl : feedLink;
|
||||
|
||||
if (baseUrl == null) {
|
||||
return url;
|
||||
}
|
||||
|
||||
String result = null;
|
||||
try {
|
||||
result = new URL(new URL(baseUrl), url).toString();
|
||||
} catch (MalformedURLException e) {
|
||||
log.debug("could not parse url : " + e.getMessage(), e);
|
||||
result = url;
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
public static boolean isRelative(final String url) {
|
||||
// the regex means "doesn't start with 'scheme://'"
|
||||
if ((url != null) && (url.startsWith("/") == false) && (!url.matches("^\\w+\\:\\/\\/.*")) && !(url.startsWith("#"))) {
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
public static String getFaviconUrl(FeedSubscription subscription, String publicUrl) {
|
||||
return removeTrailingSlash(publicUrl) + "/rest/feed/favicon/" + subscription.getId();
|
||||
}
|
||||
|
||||
public static String proxyImages(String content, String publicUrl, boolean proxyImages) {
|
||||
if (!proxyImages) {
|
||||
return content;
|
||||
}
|
||||
if (StringUtils.isBlank(content)) {
|
||||
return content;
|
||||
}
|
||||
|
||||
Document doc = Jsoup.parse(content);
|
||||
Elements elements = doc.select("img");
|
||||
for (Element element : elements) {
|
||||
String href = element.attr("src");
|
||||
if (href != null) {
|
||||
String proxy = removeTrailingSlash(publicUrl) + "/rest/server/proxy?u=" + imageProxyEncoder(href);
|
||||
element.attr("src", proxy);
|
||||
}
|
||||
}
|
||||
|
||||
return doc.body().html();
|
||||
}
|
||||
|
||||
public static String rot13(String msg) {
|
||||
StringBuilder message = new StringBuilder();
|
||||
|
||||
for (char c : msg.toCharArray()) {
|
||||
if (c >= 'a' && c <= 'm')
|
||||
c += 13;
|
||||
else if (c >= 'n' && c <= 'z')
|
||||
c -= 13;
|
||||
else if (c >= 'A' && c <= 'M')
|
||||
c += 13;
|
||||
else if (c >= 'N' && c <= 'Z')
|
||||
c -= 13;
|
||||
message.append(c);
|
||||
}
|
||||
|
||||
return message.toString();
|
||||
}
|
||||
|
||||
public static String imageProxyEncoder(String url) {
|
||||
return Base64.encodeBase64String(rot13(url).getBytes());
|
||||
}
|
||||
|
||||
public static String imageProxyDecoder(String code) {
|
||||
return rot13(new String(Base64.decodeBase64(code)));
|
||||
}
|
||||
|
||||
public static void removeUnwantedFromSearch(List<Entry> entries, String keywords) {
|
||||
if (StringUtils.isBlank(keywords)) {
|
||||
return;
|
||||
}
|
||||
|
||||
Iterator<Entry> it = entries.iterator();
|
||||
while (it.hasNext()) {
|
||||
Entry entry = it.next();
|
||||
boolean keep = true;
|
||||
for (String keyword : keywords.split(" ")) {
|
||||
String title = Jsoup.parse(entry.getTitle()).text();
|
||||
String content = Jsoup.parse(entry.getContent()).text();
|
||||
if (!StringUtils.containsIgnoreCase(content, keyword) && !StringUtils.containsIgnoreCase(title, keyword)) {
|
||||
keep = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!keep) {
|
||||
it.remove();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
58
src/main/java/com/commafeed/backend/feed/FetchedFeed.java
Normal file
58
src/main/java/com/commafeed/backend/feed/FetchedFeed.java
Normal file
@@ -0,0 +1,58 @@
|
||||
package com.commafeed.backend.feed;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import com.commafeed.backend.model.Feed;
|
||||
import com.commafeed.backend.model.FeedEntry;
|
||||
import com.google.common.collect.Lists;
|
||||
|
||||
public class FetchedFeed {
|
||||
|
||||
private Feed feed = new Feed();
|
||||
private List<FeedEntry> entries = Lists.newArrayList();
|
||||
|
||||
private String title;
|
||||
private String urlAfterRedirect;
|
||||
private long fetchDuration;
|
||||
|
||||
public Feed getFeed() {
|
||||
return feed;
|
||||
}
|
||||
|
||||
public void setFeed(Feed feed) {
|
||||
this.feed = feed;
|
||||
}
|
||||
|
||||
public List<FeedEntry> getEntries() {
|
||||
return entries;
|
||||
}
|
||||
|
||||
public void setEntries(List<FeedEntry> entries) {
|
||||
this.entries = entries;
|
||||
}
|
||||
|
||||
public String getTitle() {
|
||||
return title;
|
||||
}
|
||||
|
||||
public void setTitle(String title) {
|
||||
this.title = title;
|
||||
}
|
||||
|
||||
public long getFetchDuration() {
|
||||
return fetchDuration;
|
||||
}
|
||||
|
||||
public void setFetchDuration(long fetchDuration) {
|
||||
this.fetchDuration = fetchDuration;
|
||||
}
|
||||
|
||||
public String getUrlAfterRedirect() {
|
||||
return urlAfterRedirect;
|
||||
}
|
||||
|
||||
public void setUrlAfterRedirect(String urlAfterRedirect) {
|
||||
this.urlAfterRedirect = urlAfterRedirect;
|
||||
}
|
||||
|
||||
}
|
||||
266
src/main/java/com/commafeed/backend/feed/HtmlEntities.java
Normal file
266
src/main/java/com/commafeed/backend/feed/HtmlEntities.java
Normal file
@@ -0,0 +1,266 @@
|
||||
package com.commafeed.backend.feed;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.Map;
|
||||
|
||||
import com.google.gwt.thirdparty.guava.common.collect.Maps;
|
||||
|
||||
public class HtmlEntities {
|
||||
public static final Map<String, String> NUMERIC_MAPPING = Collections.unmodifiableMap(loadMap());
|
||||
|
||||
private static synchronized Map<String, String> loadMap() {
|
||||
Map<String, String> map = Maps.newLinkedHashMap();
|
||||
map.put("Á", "Á");
|
||||
map.put("á", "á");
|
||||
map.put("Â", "Â");
|
||||
map.put("â", "â");
|
||||
map.put("´", "´");
|
||||
map.put("Æ", "Æ");
|
||||
map.put("æ", "æ");
|
||||
map.put("À", "À");
|
||||
map.put("à", "à");
|
||||
map.put("ℵ", "ℵ");
|
||||
map.put("Α", "Α");
|
||||
map.put("α", "α");
|
||||
map.put("&", "&");
|
||||
map.put("∧", "∧");
|
||||
map.put("∠", "∠");
|
||||
map.put("Å", "Å");
|
||||
map.put("å", "å");
|
||||
map.put("≈", "≈");
|
||||
map.put("Ã", "Ã");
|
||||
map.put("ã", "ã");
|
||||
map.put("Ä", "Ä");
|
||||
map.put("ä", "ä");
|
||||
map.put("„", "„");
|
||||
map.put("Β", "Β");
|
||||
map.put("β", "β");
|
||||
map.put("¦", "¦");
|
||||
map.put("•", "•");
|
||||
map.put("∩", "∩");
|
||||
map.put("Ç", "Ç");
|
||||
map.put("ç", "ç");
|
||||
map.put("¸", "¸");
|
||||
map.put("¢", "¢");
|
||||
map.put("Χ", "Χ");
|
||||
map.put("χ", "χ");
|
||||
map.put("ˆ", "ˆ");
|
||||
map.put("♣", "♣");
|
||||
map.put("≅", "≅");
|
||||
map.put("©", "©");
|
||||
map.put("↵", "↵");
|
||||
map.put("∪", "∪");
|
||||
map.put("¤", "¤");
|
||||
map.put("†", "†");
|
||||
map.put("‡", "‡");
|
||||
map.put("↓", "↓");
|
||||
map.put("⇓", "⇓");
|
||||
map.put("°", "°");
|
||||
map.put("Δ", "Δ");
|
||||
map.put("δ", "δ");
|
||||
map.put("♦", "♦");
|
||||
map.put("÷", "÷");
|
||||
map.put("É", "É");
|
||||
map.put("é", "é");
|
||||
map.put("Ê", "Ê");
|
||||
map.put("ê", "ê");
|
||||
map.put("È", "È");
|
||||
map.put("è", "è");
|
||||
map.put("∅", "∅");
|
||||
map.put(" ", " ");
|
||||
map.put(" ", " ");
|
||||
map.put("Ε", "Ε");
|
||||
map.put("ε", "ε");
|
||||
map.put("≡", "≡");
|
||||
map.put("Η", "Η");
|
||||
map.put("η", "η");
|
||||
map.put("Ð", "Ð");
|
||||
map.put("ð", "ð");
|
||||
map.put("Ë", "Ë");
|
||||
map.put("ë", "ë");
|
||||
map.put("€", "€");
|
||||
map.put("∃", "∃");
|
||||
map.put("ƒ", "ƒ");
|
||||
map.put("∀", "∀");
|
||||
map.put("½", "½");
|
||||
map.put("¼", "¼");
|
||||
map.put("¾", "¾");
|
||||
map.put("⁄", "⁄");
|
||||
map.put("Γ", "Γ");
|
||||
map.put("γ", "γ");
|
||||
map.put("≥", "≥");
|
||||
map.put("↔", "↔");
|
||||
map.put("⇔", "⇔");
|
||||
map.put("♥", "♥");
|
||||
map.put("…", "…");
|
||||
map.put("Í", "Í");
|
||||
map.put("í", "í");
|
||||
map.put("Î", "Î");
|
||||
map.put("î", "î");
|
||||
map.put("¡", "¡");
|
||||
map.put("Ì", "Ì");
|
||||
map.put("ì", "ì");
|
||||
map.put("ℑ", "ℑ");
|
||||
map.put("∞", "∞");
|
||||
map.put("∫", "∫");
|
||||
map.put("Ι", "Ι");
|
||||
map.put("ι", "ι");
|
||||
map.put("¿", "¿");
|
||||
map.put("∈", "∈");
|
||||
map.put("Ï", "Ï");
|
||||
map.put("ï", "ï");
|
||||
map.put("Κ", "Κ");
|
||||
map.put("κ", "κ");
|
||||
map.put("Λ", "Λ");
|
||||
map.put("λ", "λ");
|
||||
map.put("⟨", "〈");
|
||||
map.put("«", "«");
|
||||
map.put("←", "←");
|
||||
map.put("⇐", "⇐");
|
||||
map.put("⌈", "⌈");
|
||||
map.put("“", "“");
|
||||
map.put("≤", "≤");
|
||||
map.put("⌊", "⌊");
|
||||
map.put("∗", "∗");
|
||||
map.put("◊", "◊");
|
||||
map.put("‎", "‎");
|
||||
map.put("‹", "‹");
|
||||
map.put("‘", "‘");
|
||||
map.put("¯", "¯");
|
||||
map.put("—", "—");
|
||||
map.put("µ", "µ");
|
||||
map.put("·", "·");
|
||||
map.put("−", "−");
|
||||
map.put("Μ", "Μ");
|
||||
map.put("μ", "μ");
|
||||
map.put("∇", "∇");
|
||||
map.put(" ", " ");
|
||||
map.put("–", "–");
|
||||
map.put("≠", "≠");
|
||||
map.put("∋", "∋");
|
||||
map.put("¬", "¬");
|
||||
map.put("∉", "∉");
|
||||
map.put("⊄", "⊄");
|
||||
map.put("Ñ", "Ñ");
|
||||
map.put("ñ", "ñ");
|
||||
map.put("Ν", "Ν");
|
||||
map.put("ν", "ν");
|
||||
map.put("Ó", "Ó");
|
||||
map.put("ó", "ó");
|
||||
map.put("Ô", "Ô");
|
||||
map.put("ô", "ô");
|
||||
map.put("Œ", "Œ");
|
||||
map.put("œ", "œ");
|
||||
map.put("Ò", "Ò");
|
||||
map.put("ò", "ò");
|
||||
map.put("‾", "‾");
|
||||
map.put("Ω", "Ω");
|
||||
map.put("ω", "ω");
|
||||
map.put("Ο", "Ο");
|
||||
map.put("ο", "ο");
|
||||
map.put("⊕", "⊕");
|
||||
map.put("∨", "∨");
|
||||
map.put("ª", "ª");
|
||||
map.put("º", "º");
|
||||
map.put("Ø", "Ø");
|
||||
map.put("ø", "ø");
|
||||
map.put("Õ", "Õ");
|
||||
map.put("õ", "õ");
|
||||
map.put("⊗", "⊗");
|
||||
map.put("Ö", "Ö");
|
||||
map.put("ö", "ö");
|
||||
map.put("¶", "¶");
|
||||
map.put("∂", "∂");
|
||||
map.put("‰", "‰");
|
||||
map.put("⊥", "⊥");
|
||||
map.put("Φ", "Φ");
|
||||
map.put("φ", "φ");
|
||||
map.put("Π", "Π");
|
||||
map.put("π", "π");
|
||||
map.put("ϖ", "ϖ");
|
||||
map.put("±", "±");
|
||||
map.put("£", "£");
|
||||
map.put("′", "′");
|
||||
map.put("″", "″");
|
||||
map.put("∏", "∏");
|
||||
map.put("∝", "∝");
|
||||
map.put("Ψ", "Ψ");
|
||||
map.put("ψ", "ψ");
|
||||
map.put(""", """);
|
||||
map.put("√", "√");
|
||||
map.put("⟩", "〉");
|
||||
map.put("»", "»");
|
||||
map.put("→", "→");
|
||||
map.put("⇒", "⇒");
|
||||
map.put("⌉", "⌉");
|
||||
map.put("”", "”");
|
||||
map.put("ℜ", "ℜ");
|
||||
map.put("®", "®");
|
||||
map.put("⌋", "⌋");
|
||||
map.put("Ρ", "Ρ");
|
||||
map.put("ρ", "ρ");
|
||||
map.put("‏", "‏");
|
||||
map.put("›", "›");
|
||||
map.put("’", "’");
|
||||
map.put("‚", "‚");
|
||||
map.put("Š", "Š");
|
||||
map.put("š", "š");
|
||||
map.put("⋅", "⋅");
|
||||
map.put("§", "§");
|
||||
map.put("­", "­");
|
||||
map.put("Σ", "Σ");
|
||||
map.put("σ", "σ");
|
||||
map.put("ς", "ς");
|
||||
map.put("∼", "∼");
|
||||
map.put("♠", "♠");
|
||||
map.put("⊂", "⊂");
|
||||
map.put("⊆", "⊆");
|
||||
map.put("∑", "∑");
|
||||
map.put("¹", "¹");
|
||||
map.put("²", "²");
|
||||
map.put("³", "³");
|
||||
map.put("⊃", "⊃");
|
||||
map.put("⊇", "⊇");
|
||||
map.put("ß", "ß");
|
||||
map.put("Τ", "Τ");
|
||||
map.put("τ", "τ");
|
||||
map.put("∴", "∴");
|
||||
map.put("Θ", "Θ");
|
||||
map.put("θ", "θ");
|
||||
map.put("ϑ", "ϑ");
|
||||
map.put(" ", " ");
|
||||
map.put("Þ", "Þ");
|
||||
map.put("þ", "þ");
|
||||
map.put("˜", "˜");
|
||||
map.put("×", "×");
|
||||
map.put("™", "™");
|
||||
map.put("Ú", "Ú");
|
||||
map.put("ú", "ú");
|
||||
map.put("↑", "↑");
|
||||
map.put("⇑", "⇑");
|
||||
map.put("Û", "Û");
|
||||
map.put("û", "û");
|
||||
map.put("Ù", "Ù");
|
||||
map.put("ù", "ù");
|
||||
map.put("¨", "¨");
|
||||
map.put("ϒ", "ϒ");
|
||||
map.put("Υ", "Υ");
|
||||
map.put("υ", "υ");
|
||||
map.put("Ü", "Ü");
|
||||
map.put("ü", "ü");
|
||||
map.put("℘", "℘");
|
||||
map.put("Ξ", "Ξ");
|
||||
map.put("ξ", "ξ");
|
||||
map.put("Ý", "Ý");
|
||||
map.put("ý", "ý");
|
||||
map.put("¥", "¥");
|
||||
map.put("ÿ", "ÿ");
|
||||
map.put("Ÿ", "Ÿ");
|
||||
map.put("Ζ", "Ζ");
|
||||
map.put("ζ", "ζ");
|
||||
map.put("‍", "‍");
|
||||
map.put("‌", "‌");
|
||||
|
||||
return map;
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user