Files
Athou_commafeed/src/main/java/com/commafeed/backend/feed/FeedParser.java

208 lines
6.6 KiB
Java
Raw Normal View History

package com.commafeed.backend.feed;
2013-03-20 20:33:42 +01:00
import java.io.StringReader;
2014-12-12 10:12:56 +01:00
import java.nio.charset.Charset;
2013-06-04 15:11:35 +02:00
import java.text.DateFormat;
import java.util.Date;
2013-03-20 20:33:42 +01:00
import java.util.List;
2014-12-12 10:06:23 +01:00
import java.util.stream.Collectors;
2013-03-20 20:33:42 +01:00
2014-08-17 14:16:30 +02:00
import javax.inject.Inject;
import javax.inject.Singleton;
import lombok.RequiredArgsConstructor;
2013-08-11 11:45:32 +02:00
import lombok.extern.slf4j.Slf4j;
2014-10-28 16:36:09 +01:00
import org.apache.commons.lang3.StringUtils;
2014-08-15 13:51:13 +02:00
import org.jdom2.Element;
import org.jdom2.Namespace;
import org.xml.sax.InputSource;
2013-03-23 16:17:19 +01:00
import com.commafeed.backend.model.Feed;
import com.commafeed.backend.model.FeedEntry;
import com.commafeed.backend.model.FeedEntryContent;
2013-04-09 13:37:00 +02:00
import com.google.common.collect.Iterables;
2014-08-15 13:51:13 +02:00
import com.rometools.rome.feed.synd.SyndEnclosure;
import com.rometools.rome.feed.synd.SyndEntry;
import com.rometools.rome.feed.synd.SyndFeed;
import com.rometools.rome.feed.synd.SyndLink;
import com.rometools.rome.feed.synd.SyndLinkImpl;
import com.rometools.rome.io.FeedException;
import com.rometools.rome.io.SyndFeedInput;
2013-03-20 20:33:42 +01:00
2013-08-11 11:45:32 +02:00
@Slf4j
2014-08-17 14:16:30 +02:00
@RequiredArgsConstructor(onConstructor = @__({ @Inject }))
@Singleton
2013-03-20 20:33:42 +01:00
public class FeedParser {
2013-05-21 13:36:20 +02:00
private static final String ATOM_10_URI = "http://www.w3.org/2005/Atom";
2013-07-25 09:17:33 +02:00
private static final Namespace ATOM_10_NS = Namespace.getNamespace(ATOM_10_URI);
2013-05-21 13:36:20 +02:00
private static final Date START = new Date(86400000);
2013-07-25 09:17:33 +02:00
private static final Date END = new Date(1000l * Integer.MAX_VALUE - 86400000);
2013-04-25 12:30:21 +02:00
public FetchedFeed parse(String feedUrl, byte[] xml) throws FeedException {
FetchedFeed fetchedFeed = new FetchedFeed();
Feed feed = fetchedFeed.getFeed();
List<FeedEntry> entries = fetchedFeed.getEntries();
2013-03-20 20:33:42 +01:00
try {
2014-12-12 10:12:56 +01:00
Charset encoding = FeedUtils.guessEncoding(xml);
2013-07-25 09:17:33 +02:00
String xmlString = FeedUtils.trimInvalidXmlCharacters(new String(xml, encoding));
2013-05-22 13:33:22 +02:00
if (xmlString == null) {
2013-07-25 09:17:33 +02:00
throw new FeedException("Input string is null for url " + feedUrl);
2013-05-22 13:33:22 +02:00
}
xmlString = FeedUtils.replaceHtmlEntitiesWithNumericEntities(xmlString);
InputSource source = new InputSource(new StringReader(xmlString));
SyndFeed rss = new SyndFeedInput().build(source);
2013-05-21 13:36:20 +02:00
handleForeignMarkup(rss);
2013-04-25 12:30:21 +02:00
fetchedFeed.setTitle(rss.getTitle());
feed.setPushHub(findHub(rss));
feed.setPushTopic(findSelf(rss));
feed.setUrl(feedUrl);
2013-04-03 15:53:57 +02:00
feed.setLink(rss.getLink());
2013-03-20 20:33:42 +01:00
List<SyndEntry> items = rss.getEntries();
2013-03-20 20:33:42 +01:00
for (SyndEntry item : items) {
FeedEntry entry = new FeedEntry();
2013-05-28 21:53:58 +02:00
String guid = item.getUri();
if (StringUtils.isBlank(guid)) {
guid = item.getLink();
}
if (StringUtils.isBlank(guid)) {
// no guid and no link, skip entry
continue;
}
entry.setGuid(FeedUtils.truncate(guid, 2048));
2013-07-03 07:56:52 +02:00
entry.setUpdated(validateDate(getEntryUpdateDate(item), true));
entry.setUrl(FeedUtils.truncate(FeedUtils.toAbsoluteUrl(item.getLink(), feed.getLink(), feedUrl), 2048));
2014-08-15 13:51:13 +02:00
// if link is empty but guid is used as url
if (StringUtils.isBlank(entry.getUrl()) && StringUtils.startsWith(entry.getGuid(), "http")) {
entry.setUrl(entry.getGuid());
}
2013-03-20 20:33:42 +01:00
FeedEntryContent content = new FeedEntryContent();
content.setContent(getContent(item));
content.setCategories(FeedUtils.truncate(
item.getCategories().stream().map(c -> c.getName()).collect(Collectors.joining(", ")), 4096));
2013-06-04 15:11:35 +02:00
content.setTitle(getTitle(item));
2013-07-26 08:15:23 +02:00
content.setAuthor(StringUtils.trimToNull(item.getAuthor()));
2014-08-15 13:51:13 +02:00
SyndEnclosure enclosure = Iterables.getFirst(item.getEnclosures(), null);
2013-04-09 13:37:00 +02:00
if (enclosure != null) {
2013-07-25 09:17:33 +02:00
content.setEnclosureUrl(FeedUtils.truncate(enclosure.getUrl(), 2048));
content.setEnclosureType(enclosure.getType());
2013-04-09 13:37:00 +02:00
}
entry.setContent(content);
2013-04-09 13:37:00 +02:00
2013-04-25 12:30:21 +02:00
entries.add(entry);
2013-03-20 20:33:42 +01:00
}
Date lastEntryDate = null;
2013-07-03 07:56:52 +02:00
Date publishedDate = validateDate(rss.getPublishedDate(), false);
if (!entries.isEmpty()) {
2013-07-25 09:17:33 +02:00
List<Long> sortedTimestamps = FeedUtils.getSortedTimestamps(entries);
Long timestamp = sortedTimestamps.get(0);
lastEntryDate = new Date(timestamp);
2013-07-31 12:07:29 +02:00
publishedDate = (publishedDate == null || publishedDate.before(lastEntryDate)) ? lastEntryDate : publishedDate;
}
2013-07-31 12:07:29 +02:00
feed.setLastPublishedDate(publishedDate);
2013-07-25 09:17:33 +02:00
feed.setAverageEntryInterval(FeedUtils.averageTimeBetweenEntries(entries));
feed.setLastEntryDate(lastEntryDate);
2013-03-20 20:33:42 +01:00
} catch (Exception e) {
2013-07-25 09:17:33 +02:00
throw new FeedException(String.format("Could not parse feed from %s : %s", feedUrl, e.getMessage()), e);
2013-03-20 20:33:42 +01:00
}
2013-04-25 12:30:21 +02:00
return fetchedFeed;
2013-03-20 20:33:42 +01:00
}
2013-05-21 13:36:20 +02:00
/**
* Adds atom links for rss feeds
*/
private void handleForeignMarkup(SyndFeed feed) {
2014-08-15 13:51:13 +02:00
List<Element> foreignMarkup = feed.getForeignMarkup();
2013-05-21 13:36:20 +02:00
if (foreignMarkup == null) {
return;
}
2014-08-15 13:51:13 +02:00
for (Element element : foreignMarkup) {
if ("link".equals(element.getName()) && ATOM_10_NS.equals(element.getNamespace())) {
SyndLink link = new SyndLinkImpl();
link.setRel(element.getAttributeValue("rel"));
link.setHref(element.getAttributeValue("href"));
feed.getLinks().add(link);
2013-05-21 13:36:20 +02:00
}
}
}
2013-05-21 13:36:20 +02:00
private Date getEntryUpdateDate(SyndEntry item) {
Date date = item.getUpdatedDate();
if (date == null) {
date = item.getPublishedDate();
}
if (date == null) {
date = new Date();
}
return date;
}
2013-07-03 07:56:52 +02:00
private Date validateDate(Date date, boolean nullToNow) {
Date now = new Date();
if (date == null) {
2013-07-03 07:56:52 +02:00
return nullToNow ? now : null;
}
if (date.before(START) || date.after(END)) {
2013-06-08 22:00:22 +02:00
return now;
}
if (date.after(now)) {
return now;
}
return date;
}
private String getContent(SyndEntry item) {
String content = null;
if (item.getContents().isEmpty()) {
2013-07-25 09:17:33 +02:00
content = item.getDescription() == null ? null : item.getDescription().getValue();
} else {
2014-12-12 10:06:23 +01:00
content = item.getContents().stream().map(c -> c.getValue()).collect(Collectors.joining(System.lineSeparator()));
}
2013-07-26 08:15:23 +02:00
return StringUtils.trimToNull(content);
}
2013-03-23 09:13:04 +01:00
2013-06-04 15:11:35 +02:00
private String getTitle(SyndEntry item) {
String title = item.getTitle();
if (StringUtils.isBlank(title)) {
Date date = item.getPublishedDate();
if (date != null) {
title = DateFormat.getInstance().format(date);
} else {
title = "(no title)";
}
}
2013-07-26 08:15:23 +02:00
return StringUtils.trimToNull(title);
2013-06-04 15:11:35 +02:00
}
2013-05-20 14:06:09 +02:00
private String findHub(SyndFeed feed) {
2014-08-15 13:51:13 +02:00
for (SyndLink l : feed.getLinks()) {
2013-05-20 14:06:09 +02:00
if ("hub".equalsIgnoreCase(l.getRel())) {
2013-07-25 09:17:33 +02:00
log.debug("found hub {} for feed {}", l.getHref(), feed.getLink());
2013-05-20 14:06:09 +02:00
return l.getHref();
}
}
return null;
}
private String findSelf(SyndFeed feed) {
2014-08-15 13:51:13 +02:00
for (SyndLink l : feed.getLinks()) {
2013-05-20 14:06:09 +02:00
if ("self".equalsIgnoreCase(l.getRel())) {
2013-07-25 09:17:33 +02:00
log.debug("found self {} for feed {}", l.getHref(), feed.getLink());
2013-05-20 14:06:09 +02:00
return l.getHref();
}
}
return null;
}
2013-03-20 20:33:42 +01:00
}