Files
Athou_commafeed/src/main/java/com/commafeed/backend/feeds/FeedFetcher.java

65 lines
1.7 KiB
Java
Raw Normal View History

2013-03-20 20:33:42 +01:00
package com.commafeed.backend.feeds;
2013-04-11 12:49:54 +02:00
import java.io.IOException;
2013-03-20 20:33:42 +01:00
import javax.inject.Inject;
2013-04-11 12:49:54 +02:00
import org.apache.commons.codec.binary.StringUtils;
import org.apache.http.client.ClientProtocolException;
2013-03-31 09:53:19 +02:00
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
2013-03-20 20:33:42 +01:00
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
2013-04-03 15:53:57 +02:00
import com.commafeed.backend.HttpGetter;
2013-03-23 16:17:19 +01:00
import com.commafeed.backend.model.Feed;
2013-03-27 09:37:15 +01:00
import com.sun.syndication.io.FeedException;
2013-03-20 20:33:42 +01:00
public class FeedFetcher {
private static Logger log = LoggerFactory.getLogger(FeedFetcher.class);
@Inject
FeedParser parser;
2013-04-03 15:53:57 +02:00
@Inject
HttpGetter getter;
2013-04-14 18:51:12 +02:00
public Feed fetch(String feedUrl, boolean extractFeedUrlFromHtml)
throws FeedException, ClientProtocolException, IOException {
2013-03-20 20:33:42 +01:00
log.debug("Fetching feed {}", feedUrl);
Feed feed = null;
2013-04-11 12:49:54 +02:00
byte[] content = getter.getBinary(feedUrl);
2013-04-14 18:51:12 +02:00
if (extractFeedUrlFromHtml) {
String extractedUrl = extractFeedUrl(StringUtils
.newStringUtf8(content));
if (extractedUrl != null) {
content = getter.getBinary(extractedUrl);
feedUrl = extractedUrl;
}
2013-03-20 20:33:42 +01:00
}
2013-04-11 12:49:54 +02:00
feed = parser.parse(feedUrl, content);
2013-03-27 09:37:15 +01:00
return feed;
2013-03-20 20:33:42 +01:00
}
2013-03-31 18:47:17 +02:00
private String extractFeedUrl(String html) {
2013-03-31 14:30:44 +02:00
String foundUrl = null;
2013-03-31 09:53:19 +02:00
2013-03-31 14:30:44 +02:00
Document doc = Jsoup.parse(html);
String root = doc.children().get(0).tagName();
if ("html".equals(root)) {
Elements rss = doc.select("link[type=application/rss+xml]");
Elements atom = doc.select("link[type=application/atom+xml]");
if (!rss.isEmpty()) {
foundUrl = rss.get(0).attr("abs:href").toString();
} else if (!atom.isEmpty()) {
foundUrl = atom.get(0).attr("abs:href").toString();
}
2013-03-31 09:53:19 +02:00
}
2013-03-31 14:30:44 +02:00
return foundUrl;
2013-03-31 09:53:19 +02:00
}
2013-03-20 20:33:42 +01:00
}