Files
Athou_commafeed/src/main/java/com/commafeed/backend/feeds/FeedParser.java

211 lines
6.4 KiB
Java
Raw Normal View History

2013-03-20 20:33:42 +01:00
package com.commafeed.backend.feeds;
import java.io.StringReader;
2013-03-20 20:33:42 +01:00
import java.util.Calendar;
import java.util.Date;
2013-03-20 20:33:42 +01:00
import java.util.List;
2013-04-14 18:28:48 +02:00
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang.SystemUtils;
2013-05-21 13:36:20 +02:00
import org.jdom.Element;
import org.jdom.Namespace;
2013-05-20 14:06:09 +02:00
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.InputSource;
2013-03-23 16:17:19 +01:00
import com.commafeed.backend.model.Feed;
import com.commafeed.backend.model.FeedEntry;
import com.commafeed.backend.model.FeedEntryContent;
import com.google.common.base.Function;
import com.google.common.collect.Collections2;
2013-04-09 13:37:00 +02:00
import com.google.common.collect.Iterables;
import com.sun.syndication.feed.synd.SyndContent;
2013-04-09 13:37:00 +02:00
import com.sun.syndication.feed.synd.SyndEnclosure;
2013-03-20 20:33:42 +01:00
import com.sun.syndication.feed.synd.SyndEntry;
import com.sun.syndication.feed.synd.SyndFeed;
2013-05-20 14:06:09 +02:00
import com.sun.syndication.feed.synd.SyndLink;
2013-05-21 13:36:20 +02:00
import com.sun.syndication.feed.synd.SyndLinkImpl;
2013-03-20 20:33:42 +01:00
import com.sun.syndication.io.FeedException;
import com.sun.syndication.io.SyndFeedInput;
public class FeedParser {
2013-05-20 14:06:09 +02:00
private static Logger log = LoggerFactory.getLogger(FeedParser.class);
2013-05-21 13:36:20 +02:00
private static final String ATOM_10_URI = "http://www.w3.org/2005/Atom";
private static final Namespace ATOM_10_NS = Namespace
.getNamespace(ATOM_10_URI);
private static final Date START = new Date(0);
private static final Date END = new Date(1000l * Integer.MAX_VALUE);
2013-04-14 18:51:12 +02:00
private static final Function<SyndContent, String> CONTENT_TO_STRING = new Function<SyndContent, String>() {
public String apply(SyndContent content) {
return content.getValue();
}
};
2013-03-20 20:33:42 +01:00
@SuppressWarnings("unchecked")
2013-04-25 12:30:21 +02:00
public FetchedFeed parse(String feedUrl, byte[] xml) throws FeedException {
FetchedFeed fetchedFeed = new FetchedFeed();
Feed feed = fetchedFeed.getFeed();
List<FeedEntry> entries = fetchedFeed.getEntries();
2013-03-20 20:33:42 +01:00
feed.setLastUpdated(Calendar.getInstance().getTime());
try {
String encoding = FeedUtils.guessEncoding(xml);
String xmlString = FeedUtils.trimInvalidXmlCharacters(new String(
xml, encoding));
2013-05-22 13:33:22 +02:00
if (xmlString == null) {
throw new FeedException("Input string is null for url "
+ feedUrl);
}
InputSource source = new InputSource(new StringReader(xmlString));
SyndFeed rss = new SyndFeedInput().build(source);
2013-05-21 13:36:20 +02:00
handleForeignMarkup(rss);
2013-04-25 12:30:21 +02:00
fetchedFeed.setTitle(rss.getTitle());
2013-05-20 14:06:09 +02:00
fetchedFeed.setHub(findHub(rss));
fetchedFeed.setTopic(findSelf(rss));
feed.setUrl(feedUrl);
2013-04-03 15:53:57 +02:00
feed.setLink(rss.getLink());
2013-03-20 20:33:42 +01:00
List<SyndEntry> items = rss.getEntries();
for (SyndEntry item : items) {
FeedEntry entry = new FeedEntry();
2013-05-28 21:53:58 +02:00
String guid = item.getUri();
if (StringUtils.isBlank(guid)) {
guid = item.getLink();
}
if (StringUtils.isBlank(guid)) {
// no guid and no link, skip entry
continue;
}
entry.setGuid(FeedUtils.truncate(guid, 2048));
entry.setGuidHash(DigestUtils.sha1Hex(guid));
entry.setUrl(FeedUtils.truncate(
FeedUtils.toAbsoluteUrl(item.getLink(), feed.getLink()),
2048));
entry.setUpdated(validateDate(getUpdateDate(item)));
entry.setAuthor(FeedUtils.truncate(item.getAuthor(), 128));
2013-03-20 20:33:42 +01:00
FeedEntryContent content = new FeedEntryContent();
content.setContent(FeedUtils.handleContent(getContent(item),
feed.getLink()));
content.setTitle(FeedUtils.truncate(FeedUtils.handleContent(
item.getTitle(), feed.getLink()), 2048));
2013-04-09 13:37:00 +02:00
SyndEnclosure enclosure = (SyndEnclosure) Iterables.getFirst(
item.getEnclosures(), null);
if (enclosure != null) {
content.setEnclosureUrl(FeedUtils.truncate(
enclosure.getUrl(), 2048));
content.setEnclosureType(enclosure.getType());
2013-04-09 13:37:00 +02:00
}
entry.setContent(content);
2013-04-09 13:37:00 +02:00
2013-04-25 12:30:21 +02:00
entries.add(entry);
2013-03-20 20:33:42 +01:00
}
Date publishedDate = null;
if (!entries.isEmpty()) {
Long timestamp = FeedUtils.getSortedTimestamps(entries).get(0);
publishedDate = new Date(timestamp);
}
2013-04-25 12:30:21 +02:00
fetchedFeed.setPublishedDate(publishedDate);
2013-03-20 20:33:42 +01:00
} catch (Exception e) {
2013-03-25 12:24:00 +01:00
throw new FeedException(String.format(
"Could not parse feed from %s : %s", feedUrl,
e.getMessage()), e);
2013-03-20 20:33:42 +01:00
}
2013-04-25 12:30:21 +02:00
return fetchedFeed;
2013-03-20 20:33:42 +01:00
}
2013-05-21 13:36:20 +02:00
/**
 * Adds atom links for rss feeds.
 * <p>
 * Scans the feed's foreign markup for {@code link} elements in the Atom 1.0
 * namespace and appends a {@link SyndLink} carrying their {@code rel} and
 * {@code href} attributes to the feed's links.
 *
 * @param feed
 *            feed whose foreign markup is inspected and whose link list is
 *            extended in place
 */
@SuppressWarnings({ "unchecked", "rawtypes" })
private void handleForeignMarkup(SyndFeed feed) {
    Object markup = feed.getForeignMarkup();
    // instanceof is false for null, so this also handles missing markup
    if (!(markup instanceof List)) {
        return;
    }

    for (Object node : (List) markup) {
        if (!(node instanceof Element)) {
            continue;
        }
        Element candidate = (Element) node;
        boolean atomLink = "link".equals(candidate.getName())
                && ATOM_10_NS.equals(candidate.getNamespace());
        if (!atomLink) {
            continue;
        }

        SyndLink syndLink = new SyndLinkImpl();
        syndLink.setRel(candidate.getAttributeValue("rel"));
        syndLink.setHref(candidate.getAttributeValue("href"));
        feed.getLinks().add(syndLink);
    }
}
/**
 * Picks the date to use for an entry: its updated date, else its published
 * date, else the current time.
 *
 * @param item
 *            entry to read the dates from
 * @return a non-null date
 */
private Date getUpdateDate(SyndEntry item) {
    Date updated = item.getUpdatedDate();
    if (updated != null) {
        return updated;
    }
    Date published = item.getPublishedDate();
    return published != null ? published : new Date();
}
private Date validateDate(Date date) {
if (date == null) {
2013-04-21 23:07:19 +02:00
return new Date();
}
if (date.before(START) || date.after(END)) {
2013-04-21 23:07:19 +02:00
return new Date();
}
return date;
}
@SuppressWarnings("unchecked")
private String getContent(SyndEntry item) {
String content = null;
if (item.getContents().isEmpty()) {
content = item.getDescription() == null ? null : item
.getDescription().getValue();
} else {
content = StringUtils.join(Collections2.transform(
2013-04-14 18:51:12 +02:00
item.getContents(), CONTENT_TO_STRING),
SystemUtils.LINE_SEPARATOR);
}
return content;
}
2013-03-23 09:13:04 +01:00
2013-05-20 14:06:09 +02:00
@SuppressWarnings("unchecked")
private String findHub(SyndFeed feed) {
for (SyndLink l : (List<SyndLink>) feed.getLinks()) {
if ("hub".equalsIgnoreCase(l.getRel())) {
log.debug("found hub {} for feed {}", l.getHref(),
feed.getLink());
2013-05-20 14:06:09 +02:00
return l.getHref();
}
}
return null;
}
@SuppressWarnings("unchecked")
private String findSelf(SyndFeed feed) {
for (SyndLink l : (List<SyndLink>) feed.getLinks()) {
if ("self".equalsIgnoreCase(l.getRel())) {
log.debug("found self {} for feed {}", l.getHref(),
feed.getLink());
2013-05-20 14:06:09 +02:00
return l.getHref();
}
}
return null;
}
2013-03-20 20:33:42 +01:00
}