store normalized urls

This commit is contained in:
Athou
2013-07-02 14:33:53 +02:00
parent d8a9022c97
commit f230ad74b1
6 changed files with 67 additions and 1 deletions

View File

@@ -14,6 +14,7 @@ import javax.persistence.criteria.SetJoin;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.lang.StringUtils;
import com.commafeed.backend.feeds.FeedUtils;
import com.commafeed.backend.model.Feed;
import com.commafeed.backend.model.FeedSubscription;
import com.commafeed.backend.model.FeedSubscription_;
@@ -74,6 +75,14 @@ public class FeedDAO extends GenericDAO<Feed> {
if (feed != null && StringUtils.equals(url, feed.getUrl())) {
return feed;
}
String normalized = FeedUtils.normalizeURL(url);
feeds = findByField(Feed_.normalizedUrlHash, DigestUtils.sha1Hex(normalized));
feed = Iterables.getFirst(feeds, null);
if (feed != null && StringUtils.equals(normalized, feed.getNormalizedUrl())) {
return feed;
}
return null;
}

View File

@@ -12,6 +12,7 @@ import javax.annotation.PreDestroy;
import javax.enterprise.context.ApplicationScoped;
import javax.inject.Inject;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.lang.time.DateUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -158,6 +159,9 @@ public class FeedRefreshTaskGiver {
}
/**
 * Recomputes the normalized url of the given feed from its current url,
 * stores it on the feed together with its SHA-1 hex digest, and then adds
 * the feed to the give-back queue.
 *
 * @param feed
 *            the feed being handed back
 */
public void giveBack(Feed feed) {
    final String normalizedUrl = FeedUtils.normalizeURL(feed.getUrl());
    feed.setNormalizedUrl(normalizedUrl);

    // the hash is always derived from the normalized url stored just above
    final String normalizedUrlHash = DigestUtils.sha1Hex(normalizedUrl);
    feed.setNormalizedUrlHash(normalizedUrlHash);

    giveBackQueue.add(feed);
}

View File

@@ -87,11 +87,25 @@ public class FeedUtils {
return encoding;
}
/**
 * Normalize the url. The resulting url is not meant to be used but rather
 * as a means to identify a feed and avoid duplicates.
 *
 * @param url
 *            the url to normalize, may be null
 * @return null if url is null; the url itself, unchanged, if
 *         URLCanonicalizer yields no canonical form for it; otherwise the
 *         canonical url in lower case, with a leading https scheme
 *         replaced by http and feeds2.feedburner.com replaced by
 *         feeds.feedburner.com
 */
public static String normalizeURL(String url) {
    if (url == null) {
        return null;
    }
    String normalized = URLCanonicalizer.getCanonicalURL(url);
    if (normalized == null) {
        // no canonical form available, fall back to the url as given
        return url;
    }
    // Locale.ROOT keeps the result (and the hash derived from it)
    // independent of the default locale of the JVM
    normalized = normalized.toLowerCase(java.util.Locale.ROOT);
    // http and https variants of the same feed must share one identifier
    if (normalized.startsWith("https://")) {
        normalized = "http://" + normalized.substring("https://".length());
    }
    normalized = normalized.replace("feeds2.feedburner.com", "feeds.feedburner.com");
    return normalized;
}
/**

View File

@@ -34,6 +34,12 @@ public class Feed extends AbstractModel {
@Column(length = 40, nullable = false)
private String urlHash;
/**
 * Normalized form of the feed url, stored next to the raw url. Callers
 * populate it with the result of FeedUtils.normalizeURL(...).
 */
@Column(length = 2048, nullable = false)
private String normalizedUrl;
/**
 * SHA-1 hex digest of the normalized url (40 characters, matching the
 * column length), used to look a feed up by its normalized url.
 */
@Column(length = 40, nullable = false)
private String normalizedUrlHash;
/**
* The url of the website, extracted from the feed
@@ -315,4 +321,20 @@ public class Feed extends AbstractModel {
this.urgent = urgent;
}
/**
 * @return the normalized url stored on this feed
 */
public String getNormalizedUrl() {
    return this.normalizedUrl;
}
/**
 * @param normalizedUrl
 *            the normalized url to store on this feed
 */
public void setNormalizedUrl(final String normalizedUrl) {
    this.normalizedUrl = normalizedUrl;
}
/**
 * @return the hash of the normalized url stored on this feed
 */
public String getNormalizedUrlHash() {
    return this.normalizedUrlHash;
}
/**
 * @param normalizedUrlHash
 *            the hash of the normalized url to store on this feed
 */
public void setNormalizedUrlHash(final String normalizedUrlHash) {
    this.normalizedUrlHash = normalizedUrlHash;
}
}

View File

@@ -8,6 +8,7 @@ import javax.inject.Inject;
import org.apache.commons.codec.digest.DigestUtils;
import com.commafeed.backend.dao.FeedDAO;
import com.commafeed.backend.feeds.FeedUtils;
import com.commafeed.backend.model.Feed;
@Singleton
@@ -20,9 +21,12 @@ public class FeedService {
public Feed findOrCreate(String url) {
Feed feed = feedDAO.findByUrl(url);
if (feed == null) {
String normalized = FeedUtils.normalizeURL(url);
feed = new Feed();
feed.setUrl(url);
feed.setUrlHash(DigestUtils.sha1Hex(url));
feed.setNormalizedUrl(normalized);
feed.setNormalizedUrlHash(DigestUtils.sha1Hex(normalized));
feedDAO.saveOrUpdate(feed);
}
return feed;