2013-03-20 20:33:42 +01:00
|
|
|
package com.commafeed.backend.feeds;
|
|
|
|
|
|
2013-04-05 22:38:35 +02:00
|
|
|
import java.io.ByteArrayInputStream;
|
2013-03-20 20:33:42 +01:00
|
|
|
import java.util.Calendar;
|
|
|
|
|
import java.util.List;
|
|
|
|
|
|
2013-03-22 23:37:01 +01:00
|
|
|
import org.apache.commons.lang.StringUtils;
|
|
|
|
|
import org.apache.commons.lang.SystemUtils;
|
2013-03-23 09:13:04 +01:00
|
|
|
import org.jsoup.Jsoup;
|
2013-03-27 16:42:05 +01:00
|
|
|
import org.jsoup.safety.Whitelist;
|
2013-04-05 22:38:35 +02:00
|
|
|
import org.xml.sax.InputSource;
|
2013-03-22 23:37:01 +01:00
|
|
|
|
2013-03-23 16:17:19 +01:00
|
|
|
import com.commafeed.backend.model.Feed;
|
|
|
|
|
import com.commafeed.backend.model.FeedEntry;
|
2013-03-22 23:37:01 +01:00
|
|
|
import com.google.common.base.Function;
|
|
|
|
|
import com.google.common.collect.Collections2;
|
|
|
|
|
import com.sun.syndication.feed.synd.SyndContent;
|
2013-03-20 20:33:42 +01:00
|
|
|
import com.sun.syndication.feed.synd.SyndEntry;
|
|
|
|
|
import com.sun.syndication.feed.synd.SyndFeed;
|
|
|
|
|
import com.sun.syndication.io.FeedException;
|
|
|
|
|
import com.sun.syndication.io.SyndFeedInput;
|
|
|
|
|
|
|
|
|
|
public class FeedParser {
|
|
|
|
|
|
|
|
|
|
@SuppressWarnings("unchecked")
|
2013-04-05 22:38:35 +02:00
|
|
|
public Feed parse(String feedUrl, byte[] xml) throws FeedException {
|
2013-03-20 20:33:42 +01:00
|
|
|
Feed feed = new Feed();
|
|
|
|
|
feed.setLastUpdated(Calendar.getInstance().getTime());
|
|
|
|
|
|
|
|
|
|
try {
|
2013-04-05 22:38:35 +02:00
|
|
|
SyndFeed rss = new SyndFeedInput().build(new InputSource(
|
|
|
|
|
new ByteArrayInputStream(xml)));
|
2013-04-01 09:16:24 +02:00
|
|
|
feed.setUrl(feedUrl);
|
2013-03-30 20:51:51 +01:00
|
|
|
feed.setTitle(rss.getTitle());
|
2013-04-03 15:53:57 +02:00
|
|
|
feed.setLink(rss.getLink());
|
2013-03-20 20:33:42 +01:00
|
|
|
List<SyndEntry> items = rss.getEntries();
|
|
|
|
|
for (SyndEntry item : items) {
|
|
|
|
|
FeedEntry entry = new FeedEntry();
|
|
|
|
|
entry.setGuid(item.getUri());
|
2013-03-28 13:58:20 +01:00
|
|
|
entry.setTitle(handleContent(item.getTitle()));
|
2013-03-22 23:37:01 +01:00
|
|
|
entry.setContent(getContent(item));
|
2013-03-20 20:33:42 +01:00
|
|
|
entry.setUrl(item.getLink());
|
|
|
|
|
entry.setUpdated(item.getUpdatedDate() != null ? item
|
|
|
|
|
.getUpdatedDate() : item.getPublishedDate());
|
|
|
|
|
|
|
|
|
|
feed.getEntries().add(entry);
|
|
|
|
|
}
|
|
|
|
|
} catch (Exception e) {
|
2013-03-25 12:24:00 +01:00
|
|
|
throw new FeedException(String.format(
|
|
|
|
|
"Could not parse feed from %s : %s", feedUrl,
|
|
|
|
|
e.getMessage()), e);
|
2013-03-20 20:33:42 +01:00
|
|
|
}
|
|
|
|
|
return feed;
|
|
|
|
|
}
|
|
|
|
|
|
2013-03-22 23:37:01 +01:00
|
|
|
@SuppressWarnings("unchecked")
|
|
|
|
|
private String getContent(SyndEntry item) {
|
|
|
|
|
String content = null;
|
|
|
|
|
if (item.getContents().isEmpty()) {
|
|
|
|
|
content = item.getDescription() == null ? null : item
|
|
|
|
|
.getDescription().getValue();
|
|
|
|
|
} else {
|
|
|
|
|
content = StringUtils.join(Collections2.transform(
|
|
|
|
|
item.getContents(), new Function<SyndContent, String>() {
|
|
|
|
|
public String apply(SyndContent content) {
|
|
|
|
|
return content.getValue();
|
|
|
|
|
}
|
|
|
|
|
}), SystemUtils.LINE_SEPARATOR);
|
|
|
|
|
}
|
2013-03-23 09:13:04 +01:00
|
|
|
content = handleContent(content);
|
2013-03-22 23:37:01 +01:00
|
|
|
return content;
|
|
|
|
|
}
|
2013-03-23 09:13:04 +01:00
|
|
|
|
|
|
|
|
private String handleContent(String content) {
|
2013-04-05 22:38:35 +02:00
|
|
|
if (StringUtils.isNotBlank(content)) {
|
|
|
|
|
Whitelist whitelist = Whitelist.relaxed();
|
|
|
|
|
whitelist.addEnforcedAttribute("a", "target", "_blank");
|
2013-03-27 16:42:05 +01:00
|
|
|
|
2013-04-05 22:38:35 +02:00
|
|
|
// TODO evaluate potential security issues
|
|
|
|
|
whitelist.addTags("iframe");
|
|
|
|
|
whitelist.addAttributes("iframe", "src", "height", "width",
|
|
|
|
|
"allowfullscreen", "frameborder");
|
2013-03-27 16:42:05 +01:00
|
|
|
|
2013-04-05 22:38:35 +02:00
|
|
|
content = Jsoup.clean(content, whitelist);
|
|
|
|
|
}
|
|
|
|
|
return content;
|
2013-03-23 09:13:04 +01:00
|
|
|
}
|
2013-03-20 20:33:42 +01:00
|
|
|
}
|