removed wicket and tomee, use dropwizard instead. remove wro4j, use gulp instead

2026-03-21 21:37:29 +00:00 · 2014-08-08 16:49:02 +02:00
parent bbcd79e49f
commit 986fd25942
357 changed files with 2178 additions and 19556 deletions
--- a/src/main/java/com/commafeed/backend/feed/FeedUtils.java
+++ b/src/main/java/com/commafeed/backend/feed/FeedUtils.java
@@ -0,0 +1,524 @@
+package com.commafeed.backend.feed;
+
+import java.io.StringReader;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Date;
+import java.util.Iterator;
+import java.util.List;
+import java.util.regex.Pattern;
+
+import lombok.extern.slf4j.Slf4j;
+
+import org.apache.commons.codec.binary.Base64;
+import org.apache.commons.lang.ArrayUtils;
+import org.apache.commons.lang.StringUtils;
+import org.apache.commons.lang.time.DateUtils;
+import org.apache.commons.math.stat.descriptive.SummaryStatistics;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Document.OutputSettings;
+import org.jsoup.nodes.Element;
+import org.jsoup.nodes.Entities.EscapeMode;
+import org.jsoup.safety.Cleaner;
+import org.jsoup.safety.Whitelist;
+import org.jsoup.select.Elements;
+import org.mozilla.universalchardet.UniversalDetector;
+import org.w3c.css.sac.InputSource;
+import org.w3c.dom.css.CSSStyleDeclaration;
+
+import com.commafeed.backend.model.FeedEntry;
+import com.commafeed.backend.model.FeedSubscription;
+import com.commafeed.frontend.model.Entry;
+import com.google.common.collect.Lists;
+import com.google.gwt.i18n.client.HasDirection.Direction;
+import com.google.gwt.i18n.shared.BidiUtils;
+import com.steadystate.css.parser.CSSOMParser;
+
+import edu.uci.ics.crawler4j.url.URLCanonicalizer;
+
+/**
+ * Utility methods related to feed handling
+ * 
+ */
+@Slf4j
+public class FeedUtils {
+
+	// the literal "?" quoted so that it can be used as a regular expression
+	private static final String ESCAPED_QUESTION_MARK = Pattern.quote("?");
+
+	// css properties that are kept in the inline style of iframe and img elements
+	private static final List<String> ALLOWED_IFRAME_CSS_RULES = Arrays.asList("height", "width", "border");
+	private static final List<String> ALLOWED_IMG_CSS_RULES = Arrays.asList("display", "width", "height");
+	// a css value containing one of these characters is dropped, even for an allowed property
+	private static final char[] FORBIDDEN_CSS_RULE_CHARACTERS = new char[] { '(', ')' };
+
+	// jsoup whitelist used to clean entry content, built once when the class is initialized
+	private static final Whitelist WHITELIST = buildWhiteList();
+
+	public static String truncate(String string, int length) {
+		if (string != null) {
+			string = string.substring(0, Math.min(length, string.length()));
+		}
+		return string;
+	}
+
+	/**
+	 * Build the jsoup whitelist describing which tags, attributes and url protocols are kept when cleaning entry content. Links are
+	 * forced to open in a new window.
+	 */
+	private static synchronized Whitelist buildWhiteList() {
+		Whitelist allowed = new Whitelist();
+		allowed.addTags("a", "b", "blockquote", "br", "caption", "cite", "code", "col", "colgroup", "dd", "div", "dl", "dt", "em", "h1",
+				"h2", "h3", "h4", "h5", "h6", "i", "iframe", "img", "li", "ol", "p", "pre", "q", "small", "strike", "strong", "sub", "sup",
+				"table", "tbody", "td", "tfoot", "th", "thead", "tr", "u", "ul");
+
+		// text direction may be set on these containers
+		for (String tag : Arrays.asList("div", "pre", "code", "table", "p")) {
+			allowed.addAttributes(tag, "dir");
+		}
+
+		// links, quotes and lists
+		allowed.addAttributes("a", "href", "title");
+		allowed.addAttributes("blockquote", "cite");
+		allowed.addAttributes("q", "cite");
+		allowed.addAttributes("ol", "start", "type");
+		allowed.addAttributes("ul", "type");
+
+		// embedded media, the style attribute is filtered afterwards in handleContent()
+		allowed.addAttributes("iframe", "src", "height", "width", "allowfullscreen", "frameborder", "style");
+		allowed.addAttributes("img", "align", "alt", "height", "src", "title", "width", "style");
+
+		// tables
+		allowed.addAttributes("col", "span", "width");
+		allowed.addAttributes("colgroup", "span", "width");
+		allowed.addAttributes("table", "border", "bordercolor", "summary", "width");
+		allowed.addAttributes("td", "border", "bordercolor", "abbr", "axis", "colspan", "rowspan", "width");
+		allowed.addAttributes("th", "border", "bordercolor", "abbr", "axis", "colspan", "rowspan", "scope", "width");
+
+		// NOTE(review): no protocol restriction is registered for the src attribute of iframes, unlike img - confirm this is intended
+		allowed.addProtocols("a", "href", "ftp", "http", "https", "mailto");
+		allowed.addProtocols("blockquote", "cite", "http", "https");
+		allowed.addProtocols("img", "src", "http", "https");
+		allowed.addProtocols("q", "cite", "http", "https");
+
+		allowed.addEnforcedAttribute("a", "target", "_blank");
+		return allowed;
+	}
+
+	/**
+	 * Detect feed encoding by using the declared encoding in the xml processing instruction and by detecting the characters used in the
+	 * feed
+	 * 
+	 */
+	public static String guessEncoding(byte[] bytes) {
+		String extracted = extractDeclaredEncoding(bytes);
+		if (StringUtils.startsWithIgnoreCase(extracted, "iso-8859-")) {
+			if (StringUtils.endsWith(extracted, "1") == false) {
+				return extracted;
+			}
+		} else if (StringUtils.startsWithIgnoreCase(extracted, "windows-")) {
+			return extracted;
+		}
+		return detectEncoding(bytes);
+	}
+
+	/**
+	 * Detect the encoding by analyzing the characters in the array
+	 * 
+	 * @param bytes
+	 *            the raw content to analyze
+	 * @return the detected charset, "windows-1252" when ISO-8859-1 is detected, or "UTF-8" when nothing could be detected
+	 */
+	public static String detectEncoding(byte[] bytes) {
+		UniversalDetector detector = new UniversalDetector(null);
+		detector.handleData(bytes, 0, bytes.length);
+		detector.dataEnd();
+		// the result has to be read before the detector is reset
+		String detected = detector.getDetectedCharset();
+		detector.reset();
+
+		if (detected == null) {
+			return "UTF-8";
+		}
+		if (detected.equalsIgnoreCase("ISO-8859-1")) {
+			return "windows-1252";
+		}
+		return detected;
+	}
+
+	/**
+	 * Replace every key of HtmlEntities.NUMERIC_MAPPING found in the source with the value it is mapped to
+	 * 
+	 * @param source
+	 *            the text to convert
+	 * @return the converted text
+	 */
+	public static String replaceHtmlEntitiesWithNumericEntities(String source) {
+		String converted = source;
+		Iterator<String> names = HtmlEntities.NUMERIC_MAPPING.keySet().iterator();
+		while (names.hasNext()) {
+			String name = names.next();
+			converted = converted.replace(name, HtmlEntities.NUMERIC_MAPPING.get(name));
+		}
+		return converted;
+	}
+
+	/**
+	 * Normalize the url. The resulting url is not meant to be fetched but rather used as a mean to identify a feed and avoid duplicates
+	 */
+	public static String normalizeURL(String url) {
+		if (url == null) {
+			return null;
+		}
+		String normalized = URLCanonicalizer.getCanonicalURL(url);
+		if (normalized == null) {
+			normalized = url;
+		}
+
+		// convert to lower case, the url probably won't work in some cases
+		// after that but we don't care we just want to compare urls to avoid
+		// duplicates
+		normalized = normalized.toLowerCase();
+
+		// store all urls as http
+		if (normalized.startsWith("https")) {
+			normalized = "http" + normalized.substring(5);
+		}
+
+		// remove the www. part
+		normalized = normalized.replace("//www.", "//");
+
+		// feedproxy redirects to feedburner
+		normalized = normalized.replace("feedproxy.google.com", "feeds.feedburner.com");
+
+		// feedburner feeds have a special treatment
+		if (normalized.split(ESCAPED_QUESTION_MARK)[0].contains("feedburner.com")) {
+			normalized = normalized.replace("feeds2.feedburner.com", "feeds.feedburner.com");
+			normalized = normalized.split(ESCAPED_QUESTION_MARK)[0];
+			normalized = StringUtils.removeEnd(normalized, "/");
+		}
+
+		return normalized;
+	}
+
+	/**
+	 * Extract the declared encoding from the xml. Only the bytes up to the first '&gt;' are looked at. Both double and single quotes are
+	 * accepted around the value.
+	 * 
+	 * @param bytes
+	 *            the raw content of the feed, may be null
+	 * @return the declared encoding, or null when there is no well-formed encoding declaration before the first '&gt;'
+	 */
+	public static String extractDeclaredEncoding(byte[] bytes) {
+		if (bytes == null) {
+			return null;
+		}
+
+		int end = -1;
+		for (int i = 0; i < bytes.length; i++) {
+			if (bytes[i] == (byte) '>') {
+				end = i;
+				break;
+			}
+		}
+		if (end == -1) {
+			return null;
+		}
+
+		// ISO-8859-1 maps every byte to exactly one character, which makes the
+		// result independent from the default charset of the platform
+		String pi = new String(bytes, 0, end + 1, java.nio.charset.Charset.forName("ISO-8859-1"));
+
+		String marker = "encoding=";
+		int index = pi.indexOf(marker);
+		if (index == -1) {
+			return null;
+		}
+
+		int quoteIndex = index + marker.length();
+		if (quoteIndex >= pi.length()) {
+			return null;
+		}
+		char quote = pi.charAt(quoteIndex);
+		if (quote != '"' && quote != '\'') {
+			return null;
+		}
+
+		// a declaration without closing quote is ignored instead of failing
+		int closing = pi.indexOf(quote, quoteIndex + 1);
+		if (closing == -1) {
+			return null;
+		}
+		return pi.substring(quoteIndex + 1, closing);
+	}
+
+	/**
+	 * Clean the html content of an entry: only what the whitelist allows is kept, and the inline style of iframes and images is
+	 * filtered.
+	 * 
+	 * @param content
+	 *            the html to clean, returned untouched when blank
+	 * @param baseUri
+	 *            the base uri handed to the html parser, may be null
+	 * @param keepTextOnly
+	 *            true to return the text of the cleaned content, false to return its html
+	 * @return the cleaned content
+	 */
+	public static String handleContent(String content, String baseUri, boolean keepTextOnly) {
+		if (StringUtils.isBlank(content)) {
+			return content;
+		}
+
+		String base = StringUtils.trimToEmpty(baseUri);
+		Document sanitized = new Cleaner(WHITELIST).clean(Jsoup.parseBodyFragment(content, base));
+
+		// the whitelist lets the whole style attribute through, filter its rules here
+		for (Element iframe : sanitized.select("iframe[style]")) {
+			iframe.attr("style", escapeIFrameCss(iframe.attr("style")));
+		}
+		for (Element img : sanitized.select("img[style]")) {
+			img.attr("style", escapeImgCss(img.attr("style")));
+		}
+
+		sanitized.outputSettings(new OutputSettings().escapeMode(EscapeMode.base).prettyPrint(false));
+		Element body = sanitized.body();
+		return keepTextOnly ? body.text() : body.html();
+	}
+
+	/**
+	 * Filter the inline style of an iframe, keeping only the properties listed in ALLOWED_IFRAME_CSS_RULES
+	 * 
+	 * @param orig
+	 *            the content of the style attribute
+	 * @return the filtered style, empty when nothing is kept or when the style cannot be parsed
+	 */
+	public static String escapeIFrameCss(String orig) {
+		return keepAllowedCssRules(orig, ALLOWED_IFRAME_CSS_RULES);
+	}
+
+	/**
+	 * Filter the inline style of an image, keeping only the properties listed in ALLOWED_IMG_CSS_RULES
+	 * 
+	 * @param orig
+	 *            the content of the style attribute
+	 * @return the filtered style, empty when nothing is kept or when the style cannot be parsed
+	 */
+	public static String escapeImgCss(String orig) {
+		return keepAllowedCssRules(orig, ALLOWED_IMG_CSS_RULES);
+	}
+
+	/**
+	 * Parse a css declaration and rebuild it with the allowed properties only. A rule is also dropped when its value contains one of
+	 * FORBIDDEN_CSS_RULE_CHARACTERS.
+	 */
+	private static String keepAllowedCssRules(String css, List<String> allowedProperties) {
+		if (StringUtils.isBlank(css)) {
+			// nothing to keep, and a null value cannot be handed to the reader
+			return "";
+		}
+
+		CSSOMParser parser = new CSSOMParser();
+		try {
+			CSSStyleDeclaration decl = parser.parseStyleDeclaration(new InputSource(new StringReader(css)));
+
+			StringBuilder kept = new StringBuilder();
+			for (int i = 0; i < decl.getLength(); i++) {
+				String property = decl.item(i);
+				String value = decl.getPropertyValue(property);
+				if (StringUtils.isBlank(property) || StringUtils.isBlank(value)) {
+					continue;
+				}
+
+				if (allowedProperties.contains(property) && StringUtils.containsNone(value, FORBIDDEN_CSS_RULE_CHARACTERS)) {
+					kept.append(property).append(":").append(value).append(";");
+				}
+			}
+			return kept.toString();
+		} catch (Exception e) {
+			// best effort: a style that cannot be parsed is dropped entirely
+			log.error(e.getMessage(), e);
+			return "";
+		}
+	}
+
+	/**
+	 * Tell if an entry is written from right to left. The direction is estimated from the text of the entry content, or from its title
+	 * when the content is blank.
+	 * 
+	 * @param entry
+	 *            the entry to inspect
+	 * @return true when the estimated direction is RTL, false otherwise or when there is no text to look at
+	 */
+	public static boolean isRTL(FeedEntry entry) {
+		String candidate = entry.getContent().getContent();
+		if (StringUtils.isBlank(candidate)) {
+			candidate = entry.getContent().getTitle();
+		}
+		if (StringUtils.isBlank(candidate)) {
+			return false;
+		}
+
+		// strip the markup, only the text matters
+		String plainText = Jsoup.parse(candidate).text();
+		if (StringUtils.isBlank(plainText)) {
+			return false;
+		}
+
+		return BidiUtils.get().estimateDirection(plainText) == Direction.RTL;
+	}
+
+	/**
+	 * Remove the characters that are not wanted in the xml: everything located before the first '&lt;', control characters other than
+	 * tab, line feed and carriage return, and surrogate characters.
+	 * 
+	 * @param xml
+	 *            the xml to clean
+	 * @return the cleaned xml, empty when the input contains no '&lt;', or null when the input is blank
+	 */
+	public static String trimInvalidXmlCharacters(String xml) {
+		if (StringUtils.isBlank(xml)) {
+			return null;
+		}
+
+		int start = xml.indexOf('<');
+		if (start == -1) {
+			return "";
+		}
+
+		StringBuilder cleaned = new StringBuilder();
+		for (int i = start; i < xml.length(); i++) {
+			char c = xml.charAt(i);
+			boolean forbiddenControl = c < 32 && c != 9 && c != 10 && c != 13;
+			boolean surrogate = Character.isHighSurrogate(c) || Character.isLowSurrogate(c);
+			if (forbiddenControl || surrogate) {
+				continue;
+			}
+			cleaned.append(c);
+		}
+		return cleaned.toString();
+	}
+
+	/**
+	 * When there was an error fetching the feed. The feed is not disabled for the first errors, then it is disabled one more hour for
+	 * each additional error, up to a week.
+	 * 
+	 * @param errorCount
+	 *            the number of errors counted for the feed
+	 * @return the date until which the feed is disabled
+	 */
+	public static Date buildDisabledUntil(int errorCount) {
+		final int retriesBeforeDisable = 3;
+		Date now = new Date();
+
+		if (errorCount < retriesBeforeDisable) {
+			return now;
+		}
+
+		int disabledHours = Math.min(24 * 7, errorCount - retriesBeforeDisable + 1);
+		return DateUtils.addHours(now, disabledHours);
+	}
+
+	/**
+	 * When the feed was refreshed successfully
+	 * 
+	 * @param publishedDate
+	 *            the date that is compared to the current time to tell how recently the feed published, may be null
+	 * @param averageEntryInterval
+	 *            the average time between entries in milliseconds, may be null
+	 * @param defaultRefreshInterval
+	 *            the earliest date returned when the average interval is used
+	 * @return the date until which the feed is disabled
+	 */
+	public static Date buildDisabledUntil(Date publishedDate, Long averageEntryInterval, Date defaultRefreshInterval) {
+		Date now = new Date();
+
+		if (publishedDate == null) {
+			// feed with no entries, recheck in 24 hours
+			return DateUtils.addHours(now, 24);
+		}
+		if (publishedDate.before(DateUtils.addMonths(now, -1))) {
+			// older than a month, recheck in 24 hours
+			return DateUtils.addHours(now, 24);
+		}
+		if (publishedDate.before(DateUtils.addDays(now, -14))) {
+			// older than two weeks, recheck in 12 hours
+			return DateUtils.addHours(now, 12);
+		}
+		if (publishedDate.before(DateUtils.addDays(now, -7))) {
+			// older than a week, recheck in 6 hours
+			return DateUtils.addHours(now, 6);
+		}
+		if (averageEntryInterval == null) {
+			// unknown case, recheck in 24 hours
+			return DateUtils.addHours(now, 24);
+		}
+
+		// use average time between entries to decide when to refresh next, divided by factor
+		int factor = 2;
+		long sixHoursFromNow = DateUtils.addHours(now, 6).getTime();
+		long basedOnAverage = now.getTime() + averageEntryInterval / factor;
+
+		// not more than 6 hours, not less than default refresh interval
+		long next = Math.min(sixHoursFromNow, basedOnAverage);
+		next = Math.max(defaultRefreshInterval.getTime(), next);
+		return new Date(next);
+	}
+
+	/**
+	 * Compute the mean of the differences between the timestamps of consecutive entries, once sorted
+	 * 
+	 * @param entries
+	 *            the entries to look at
+	 * @return the mean difference truncated to a long, or null when there are less than two entries
+	 */
+	public static Long averageTimeBetweenEntries(List<FeedEntry> entries) {
+		if (entries.size() < 2) {
+			return null;
+		}
+
+		List<Long> timestamps = getSortedTimestamps(entries);
+
+		SummaryStatistics gaps = new SummaryStatistics();
+		long previous = timestamps.get(0);
+		for (int i = 1; i < timestamps.size(); i++) {
+			long current = timestamps.get(i);
+			gaps.addValue(Math.abs(previous - current));
+			previous = current;
+		}
+		return (long) gaps.getMean();
+	}
+
+	public static List<Long> getSortedTimestamps(List<FeedEntry> entries) {
+		List<Long> timestamps = Lists.newArrayList();
+		for (FeedEntry entry : entries) {
+			timestamps.add(entry.getUpdated().getTime());
+		}
+		Collections.sort(timestamps);
+		Collections.reverse(timestamps);
+		return timestamps;
+	}
+
+	/**
+	 * Remove the last character of the url when it is a slash. At most one slash is removed.
+	 * 
+	 * @param url
+	 *            the url to trim
+	 * @return the url without its trailing slash
+	 */
+	public static String removeTrailingSlash(String url) {
+		return url.endsWith("/") ? url.substring(0, url.length() - 1) : url;
+	}
+
+	/**
+	 * Resolve the url of an entry against the url of its feed. The link described in the feed is used as base, unless it is missing or
+	 * relative, in which case the url used to fetch the feed is used instead.
+	 * 
+	 * @param url
+	 *            the url of the entry
+	 * @param feedLink
+	 *            the url of the feed as described in the feed
+	 * @param feedUrl
+	 *            the url of the feed that we used to fetch the feed
+	 * @return an absolute url pointing to the entry, or the trimmed url of the entry when it cannot be resolved
+	 */
+	public static String toAbsoluteUrl(String url, String feedLink, String feedUrl) {
+		String entryUrl = StringUtils.trimToNull(StringUtils.normalizeSpace(url));
+		if (entryUrl == null || entryUrl.startsWith("http")) {
+			return entryUrl;
+		}
+
+		String baseUrl;
+		if (feedLink != null && !isRelative(feedLink)) {
+			baseUrl = feedLink;
+		} else {
+			baseUrl = feedUrl;
+		}
+		if (baseUrl == null) {
+			return entryUrl;
+		}
+
+		try {
+			return new URL(new URL(baseUrl), entryUrl).toString();
+		} catch (MalformedURLException e) {
+			log.debug("could not parse url : " + e.getMessage(), e);
+			return entryUrl;
+		}
+	}
+
+	public static boolean isRelative(final String url) {
+		// the regex means "doesn't start with 'scheme://'"
+		if ((url != null) && (url.startsWith("/") == false) && (!url.matches("^\\w+\\:\\/\\/.*")) && !(url.startsWith("#"))) {
+			return true;
+		} else {
+			return false;
+		}
+	}
+
+	public static String getFaviconUrl(FeedSubscription subscription, String publicUrl) {
+		return removeTrailingSlash(publicUrl) + "/rest/feed/favicon/" + subscription.getId();
+	}
+
+	/**
+	 * Rewrite the source of every image of the content so that it points to the image proxy of the application
+	 * 
+	 * @param content
+	 *            the html content
+	 * @param publicUrl
+	 *            the public url of the application, with or without trailing slash
+	 * @param proxyImages
+	 *            false to leave the content untouched
+	 * @return the html of the parsed body with the rewritten images, or the content itself when it is blank or when proxying is
+	 *         disabled
+	 */
+	public static String proxyImages(String content, String publicUrl, boolean proxyImages) {
+		if (!proxyImages || StringUtils.isBlank(content)) {
+			return content;
+		}
+
+		Document doc = Jsoup.parse(content);
+		Elements images = doc.select("img");
+		for (Element img : images) {
+			String src = img.attr("src");
+			if (src == null) {
+				continue;
+			}
+			// the original source travels encoded in the "u" parameter
+			img.attr("src", removeTrailingSlash(publicUrl) + "/rest/server/proxy?u=" + imageProxyEncoder(src));
+		}
+
+		return doc.body().html();
+	}
+
+	public static String rot13(String msg) {
+		StringBuilder message = new StringBuilder();
+
+		for (char c : msg.toCharArray()) {
+			if (c >= 'a' && c <= 'm')
+				c += 13;
+			else if (c >= 'n' && c <= 'z')
+				c -= 13;
+			else if (c >= 'A' && c <= 'M')
+				c += 13;
+			else if (c >= 'N' && c <= 'Z')
+				c -= 13;
+			message.append(c);
+		}
+
+		return message.toString();
+	}
+
+	/**
+	 * Encode the url of an image so that it can be handed to the image proxy: the url is rotated with rot13, then encoded in base64.
+	 * The characters are converted to bytes as UTF-8, whatever the default charset of the platform is, so that urls containing
+	 * non-ascii characters are not altered.
+	 * 
+	 * @param url
+	 *            the url of the image
+	 * @return the encoded url
+	 */
+	public static String imageProxyEncoder(String url) {
+		return Base64.encodeBase64String(rot13(url).getBytes(java.nio.charset.Charset.forName("UTF-8")));
+	}
+
+	/**
+	 * Decode an url encoded with imageProxyEncoder(), the bytes are read as UTF-8 to match the encoder
+	 * 
+	 * @param code
+	 *            the encoded url
+	 * @return the url of the image
+	 */
+	public static String imageProxyDecoder(String code) {
+		return rot13(new String(Base64.decodeBase64(code), java.nio.charset.Charset.forName("UTF-8")));
+	}
+
+	/**
+	 * Remove from the list the entries that do not contain every keyword. A keyword is looked for, ignoring case, in the text of the
+	 * content and in the text of the title of the entry, markup excluded.
+	 * 
+	 * @param entries
+	 *            the entries to filter, the list is modified in place
+	 * @param keywords
+	 *            the keywords, separated with spaces. Nothing is removed when blank
+	 */
+	public static void removeUnwantedFromSearch(List<Entry> entries, String keywords) {
+		if (StringUtils.isBlank(keywords)) {
+			return;
+		}
+
+		// the keywords are the same for every entry, split them once
+		String[] terms = keywords.split(" ");
+
+		Iterator<Entry> it = entries.iterator();
+		while (it.hasNext()) {
+			Entry entry = it.next();
+
+			// parse the html once per entry instead of once per keyword, a
+			// missing title or content is handled as an empty one
+			String title = Jsoup.parse(StringUtils.defaultString(entry.getTitle())).text();
+			String content = Jsoup.parse(StringUtils.defaultString(entry.getContent())).text();
+
+			boolean keep = true;
+			for (String term : terms) {
+				if (!StringUtils.containsIgnoreCase(content, term) && !StringUtils.containsIgnoreCase(title, term)) {
+					keep = false;
+					break;
+				}
+			}
+			if (!keep) {
+				it.remove();
+			}
+		}
+	}
+
+}