Files
Athou_commafeed/src/main/java/com/commafeed/backend/feeds/FaviconFetcher.java

165 lines
3.8 KiB
Java
Raw Normal View History

2013-06-11 07:18:04 +02:00
package com.commafeed.backend.feeds;
import java.util.Arrays;
import java.util.List;
import javax.inject.Inject;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.commafeed.backend.HttpGetter;
import com.commafeed.backend.HttpGetter.HttpResult;
/**
* Inspired/Ported from https://github.com/potatolondon/getfavicon
*
*/
public class FaviconFetcher {
private static Logger log = LoggerFactory.getLogger(FeedFetcher.class);
private static long MIN_ICON_LENGTH = 100;
private static long MAX_ICON_LENGTH = 20000;
private static int TIMEOUT = 4000;
2013-06-11 07:18:04 +02:00
protected static List<String> ICON_MIMETYPES = Arrays.asList(
"image/x-icon", "image/vnd.microsoft.icon", "image/ico",
"image/icon", "text/ico", "application/ico", "image/x-ms-bmp",
"image/x-bmp", "image/gif", "image/png", "image/jpeg");
private static List<String> ICON_MIMETYPE_BLACKLIST = Arrays.asList(
"application/xml", "text/html");
@Inject
HttpGetter getter;
public byte[] fetch(String url) {
if (url == null) {
2013-06-11 13:52:27 +02:00
log.debug("url is null");
return null;
}
int doubleSlash = url.indexOf("//");
if (doubleSlash == -1) {
doubleSlash = 0;
} else {
doubleSlash += 2;
}
int firstSlash = url.indexOf('/', doubleSlash);
if (firstSlash != -1) {
url = url.substring(0, firstSlash);
}
byte[] icon = getIconAtRoot(url);
2013-06-11 07:18:04 +02:00
if (icon == null) {
icon = getIconInPage(url);
2013-06-11 07:18:04 +02:00
}
return icon;
}
private byte[] getIconAtRoot(String url) {
2013-06-11 07:18:04 +02:00
byte[] bytes = null;
String contentType = null;
try {
url = FeedUtils.removeTrailingSlash(url) + "/favicon.ico";
2013-06-11 13:52:27 +02:00
log.debug("getting root icon at {}", url);
HttpResult result = getter.getBinary(url, TIMEOUT);
2013-06-11 07:18:04 +02:00
bytes = result.getContent();
contentType = result.getContentType();
} catch (Exception e) {
log.debug("Failed to retrieve iconAtRoot: " + e.getMessage(), e);
2013-06-11 07:18:04 +02:00
}
if (!isValidIconResponse(bytes, contentType)) {
bytes = null;
}
return bytes;
}
boolean isValidIconResponse(byte[] content, String contentType) {
2013-06-11 08:08:27 +02:00
if (content == null) {
return false;
}
2013-06-11 07:18:04 +02:00
long length = content.length;
if (StringUtils.isNotBlank(contentType)) {
2013-06-11 07:18:04 +02:00
contentType = contentType.split(";")[0];
}
if (ICON_MIMETYPE_BLACKLIST.contains(contentType)) {
2013-06-11 13:52:27 +02:00
log.debug("Content-Type {} is blacklisted", contentType);
2013-06-11 07:18:04 +02:00
return false;
}
if (length < MIN_ICON_LENGTH) {
2013-06-11 13:52:27 +02:00
log.debug("Length {} below MIN_ICON_LENGTH {}", length,
2013-06-11 07:18:04 +02:00
MIN_ICON_LENGTH);
return false;
}
if (length > MAX_ICON_LENGTH) {
2013-06-11 13:52:27 +02:00
log.debug("Length {} greater than MAX_ICON_LENGTH {}", length,
2013-06-11 07:18:04 +02:00
MAX_ICON_LENGTH);
return false;
}
return true;
}
private byte[] getIconInPage(String url) {
2013-06-11 07:18:04 +02:00
Document doc = null;
2013-06-11 07:18:04 +02:00
try {
HttpResult result = getter.getBinary(url, TIMEOUT);
doc = Jsoup.parse(new String(result.getContent()), url);
2013-06-11 07:18:04 +02:00
} catch (Exception e) {
2013-06-11 13:52:27 +02:00
log.debug("Failed to retrieve page to find icon");
2013-06-11 07:18:04 +02:00
return null;
}
Elements icons = doc
.select("link[rel~=(?i)^(shortcut|icon|shortcut icon)$]");
if (icons.isEmpty()) {
2013-06-11 13:52:27 +02:00
log.debug("No icon found in page {}", url);
2013-06-11 07:18:04 +02:00
return null;
}
String href = icons.get(0).attr("abs:href");
if (StringUtils.isBlank(href)) {
2013-06-11 13:52:27 +02:00
log.debug("No icon found in page");
2013-06-11 07:18:04 +02:00
return null;
}
2013-06-11 13:52:27 +02:00
log.debug("Found unconfirmed iconInPage at {}", href);
2013-06-11 07:18:04 +02:00
byte[] bytes = null;
String contentType = null;
try {
HttpResult result = getter.getBinary(href, TIMEOUT);
2013-06-11 07:18:04 +02:00
bytes = result.getContent();
contentType = result.getContentType();
} catch (Exception e) {
2013-06-11 13:52:27 +02:00
log.debug("Failed to retrieve icon found in page {}", href);
2013-06-11 07:18:04 +02:00
return null;
}
if (!isValidIconResponse(bytes, contentType)) {
2013-06-11 13:52:27 +02:00
log.debug("Invalid icon found for {}", href);
2013-06-11 07:18:04 +02:00
return null;
}
return bytes;
}
}