2013-03-20 20:33:42 +01:00
|
|
|
package com.commafeed.backend.feeds;
|
|
|
|
|
|
2013-04-23 07:20:21 +02:00
|
|
|
import java.io.StringReader;
|
2013-03-20 20:33:42 +01:00
|
|
|
import java.util.Calendar;
|
2013-04-09 12:52:50 +02:00
|
|
|
import java.util.Date;
|
2013-03-20 20:33:42 +01:00
|
|
|
import java.util.List;
|
|
|
|
|
|
2013-04-14 18:28:48 +02:00
|
|
|
import org.apache.commons.codec.digest.DigestUtils;
|
2013-03-22 23:37:01 +01:00
|
|
|
import org.apache.commons.lang.StringUtils;
|
|
|
|
|
import org.apache.commons.lang.SystemUtils;
|
2013-05-20 14:06:09 +02:00
|
|
|
import org.slf4j.Logger;
|
|
|
|
|
import org.slf4j.LoggerFactory;
|
2013-04-05 22:38:35 +02:00
|
|
|
import org.xml.sax.InputSource;
|
2013-03-22 23:37:01 +01:00
|
|
|
|
2013-03-23 16:17:19 +01:00
|
|
|
import com.commafeed.backend.model.Feed;
|
|
|
|
|
import com.commafeed.backend.model.FeedEntry;
|
2013-04-11 10:27:20 +02:00
|
|
|
import com.commafeed.backend.model.FeedEntryContent;
|
2013-03-22 23:37:01 +01:00
|
|
|
import com.google.common.base.Function;
|
|
|
|
|
import com.google.common.collect.Collections2;
|
2013-04-09 13:37:00 +02:00
|
|
|
import com.google.common.collect.Iterables;
|
2013-03-22 23:37:01 +01:00
|
|
|
import com.sun.syndication.feed.synd.SyndContent;
|
2013-04-09 13:37:00 +02:00
|
|
|
import com.sun.syndication.feed.synd.SyndEnclosure;
|
2013-03-20 20:33:42 +01:00
|
|
|
import com.sun.syndication.feed.synd.SyndEntry;
|
|
|
|
|
import com.sun.syndication.feed.synd.SyndFeed;
|
2013-05-20 14:06:09 +02:00
|
|
|
import com.sun.syndication.feed.synd.SyndLink;
|
2013-03-20 20:33:42 +01:00
|
|
|
import com.sun.syndication.io.FeedException;
|
|
|
|
|
import com.sun.syndication.io.SyndFeedInput;
|
|
|
|
|
|
|
|
|
|
public class FeedParser {
|
|
|
|
|
|
2013-05-20 14:06:09 +02:00
|
|
|
private static Logger log = LoggerFactory.getLogger(FeedParser.class);
|
|
|
|
|
|
2013-04-21 16:56:09 +02:00
|
|
|
private static final Date START = new Date(0);
|
2013-04-22 09:05:04 +02:00
|
|
|
private static final Date END = new Date(1000l * Integer.MAX_VALUE);
|
2013-04-21 16:56:09 +02:00
|
|
|
|
2013-04-14 18:51:12 +02:00
|
|
|
private static final Function<SyndContent, String> CONTENT_TO_STRING = new Function<SyndContent, String>() {
|
|
|
|
|
public String apply(SyndContent content) {
|
|
|
|
|
return content.getValue();
|
|
|
|
|
}
|
|
|
|
|
};
|
|
|
|
|
|
2013-03-20 20:33:42 +01:00
|
|
|
@SuppressWarnings("unchecked")
|
2013-04-25 12:30:21 +02:00
|
|
|
public FetchedFeed parse(String feedUrl, byte[] xml) throws FeedException {
|
|
|
|
|
FetchedFeed fetchedFeed = new FetchedFeed();
|
|
|
|
|
Feed feed = fetchedFeed.getFeed();
|
2013-05-20 12:18:53 +02:00
|
|
|
List<FeedEntry> entries = fetchedFeed.getEntries();
|
2013-03-20 20:33:42 +01:00
|
|
|
feed.setLastUpdated(Calendar.getInstance().getTime());
|
|
|
|
|
|
|
|
|
|
try {
|
2013-04-23 07:20:21 +02:00
|
|
|
String encoding = FeedUtils.guessEncoding(xml);
|
|
|
|
|
String xmlString = FeedUtils.trimInvalidXmlCharacters(new String(
|
|
|
|
|
xml, encoding));
|
|
|
|
|
|
|
|
|
|
InputSource source = new InputSource(new StringReader(xmlString));
|
|
|
|
|
|
2013-04-06 16:54:49 +02:00
|
|
|
SyndFeed rss = new SyndFeedInput().build(source);
|
2013-04-25 12:30:21 +02:00
|
|
|
fetchedFeed.setTitle(rss.getTitle());
|
2013-05-20 14:06:09 +02:00
|
|
|
fetchedFeed.setHub(findHub(rss));
|
|
|
|
|
fetchedFeed.setTopic(findSelf(rss));
|
2013-04-01 09:16:24 +02:00
|
|
|
feed.setUrl(feedUrl);
|
2013-04-03 15:53:57 +02:00
|
|
|
feed.setLink(rss.getLink());
|
2013-03-20 20:33:42 +01:00
|
|
|
List<SyndEntry> items = rss.getEntries();
|
|
|
|
|
for (SyndEntry item : items) {
|
|
|
|
|
FeedEntry entry = new FeedEntry();
|
|
|
|
|
entry.setGuid(item.getUri());
|
2013-04-14 18:28:48 +02:00
|
|
|
entry.setGuidHash(DigestUtils.sha1Hex(item.getUri()));
|
2013-03-20 20:33:42 +01:00
|
|
|
entry.setUrl(item.getLink());
|
2013-04-21 16:56:09 +02:00
|
|
|
entry.setUpdated(validateDate(getUpdateDate(item)));
|
2013-05-19 06:47:37 +02:00
|
|
|
entry.setAuthor(item.getAuthor());
|
2013-03-20 20:33:42 +01:00
|
|
|
|
2013-04-11 10:27:20 +02:00
|
|
|
FeedEntryContent content = new FeedEntryContent();
|
2013-04-14 18:51:12 +02:00
|
|
|
content.setContent(getContent(item));
|
|
|
|
|
content.setTitle(item.getTitle());
|
2013-04-09 13:37:00 +02:00
|
|
|
SyndEnclosure enclosure = (SyndEnclosure) Iterables.getFirst(
|
|
|
|
|
item.getEnclosures(), null);
|
|
|
|
|
if (enclosure != null) {
|
2013-04-11 10:27:20 +02:00
|
|
|
content.setEnclosureUrl(enclosure.getUrl());
|
|
|
|
|
content.setEnclosureType(enclosure.getType());
|
2013-04-09 13:37:00 +02:00
|
|
|
}
|
2013-04-11 10:27:20 +02:00
|
|
|
entry.setContent(content);
|
2013-04-09 13:37:00 +02:00
|
|
|
|
2013-04-25 12:30:21 +02:00
|
|
|
entries.add(entry);
|
2013-03-20 20:33:42 +01:00
|
|
|
}
|
2013-05-20 12:18:53 +02:00
|
|
|
Date publishedDate = null;
|
|
|
|
|
if (!entries.isEmpty()) {
|
|
|
|
|
Long timestamp = FeedUtils.getSortedTimestamps(entries).get(0);
|
|
|
|
|
publishedDate = new Date(timestamp);
|
2013-04-19 11:51:40 +02:00
|
|
|
}
|
2013-04-25 12:30:21 +02:00
|
|
|
fetchedFeed.setPublishedDate(publishedDate);
|
2013-04-19 11:51:40 +02:00
|
|
|
|
2013-03-20 20:33:42 +01:00
|
|
|
} catch (Exception e) {
|
2013-03-25 12:24:00 +01:00
|
|
|
throw new FeedException(String.format(
|
|
|
|
|
"Could not parse feed from %s : %s", feedUrl,
|
|
|
|
|
e.getMessage()), e);
|
2013-03-20 20:33:42 +01:00
|
|
|
}
|
2013-04-25 12:30:21 +02:00
|
|
|
return fetchedFeed;
|
2013-03-20 20:33:42 +01:00
|
|
|
}
|
|
|
|
|
|
2013-04-09 12:52:50 +02:00
|
|
|
private Date getUpdateDate(SyndEntry item) {
|
|
|
|
|
Date date = item.getUpdatedDate();
|
|
|
|
|
if (date == null) {
|
|
|
|
|
date = item.getPublishedDate();
|
|
|
|
|
}
|
|
|
|
|
if (date == null) {
|
|
|
|
|
date = new Date();
|
|
|
|
|
}
|
|
|
|
|
return date;
|
|
|
|
|
}
|
|
|
|
|
|
2013-04-21 16:56:09 +02:00
|
|
|
private Date validateDate(Date date) {
|
|
|
|
|
if (date == null) {
|
2013-04-21 23:07:19 +02:00
|
|
|
return new Date();
|
2013-04-21 16:56:09 +02:00
|
|
|
}
|
|
|
|
|
if (date.before(START) || date.after(END)) {
|
2013-04-21 23:07:19 +02:00
|
|
|
return new Date();
|
2013-04-21 16:56:09 +02:00
|
|
|
}
|
|
|
|
|
return date;
|
|
|
|
|
}
|
|
|
|
|
|
2013-03-22 23:37:01 +01:00
|
|
|
@SuppressWarnings("unchecked")
|
|
|
|
|
private String getContent(SyndEntry item) {
|
|
|
|
|
String content = null;
|
|
|
|
|
if (item.getContents().isEmpty()) {
|
|
|
|
|
content = item.getDescription() == null ? null : item
|
|
|
|
|
.getDescription().getValue();
|
|
|
|
|
} else {
|
|
|
|
|
content = StringUtils.join(Collections2.transform(
|
2013-04-14 18:51:12 +02:00
|
|
|
item.getContents(), CONTENT_TO_STRING),
|
|
|
|
|
SystemUtils.LINE_SEPARATOR);
|
2013-03-22 23:37:01 +01:00
|
|
|
}
|
|
|
|
|
return content;
|
|
|
|
|
}
|
2013-03-23 09:13:04 +01:00
|
|
|
|
2013-05-20 14:06:09 +02:00
|
|
|
@SuppressWarnings("unchecked")
|
|
|
|
|
private String findHub(SyndFeed feed) {
|
|
|
|
|
for (SyndLink l : (List<SyndLink>) feed.getLinks()) {
|
|
|
|
|
if ("hub".equalsIgnoreCase(l.getRel())) {
|
2013-05-20 18:12:11 +02:00
|
|
|
log.debug("found hub {} for feed {}", l.getHref(),
|
|
|
|
|
feed.getLink());
|
2013-05-20 14:06:09 +02:00
|
|
|
return l.getHref();
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return null;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@SuppressWarnings("unchecked")
|
|
|
|
|
private String findSelf(SyndFeed feed) {
|
|
|
|
|
for (SyndLink l : (List<SyndLink>) feed.getLinks()) {
|
|
|
|
|
if ("self".equalsIgnoreCase(l.getRel())) {
|
2013-05-20 18:12:11 +02:00
|
|
|
log.debug("found self {} for feed {}", l.getHref(),
|
|
|
|
|
feed.getLink());
|
2013-05-20 14:06:09 +02:00
|
|
|
return l.getHref();
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return null;
|
|
|
|
|
}
|
|
|
|
|
|
2013-03-20 20:33:42 +01:00
|
|
|
}
|