From f230ad74b15a24a7bd215dd03cc4bfbd52d80b7c Mon Sep 17 00:00:00 2001 From: Athou Date: Tue, 2 Jul 2013 14:33:53 +0200 Subject: [PATCH] store normalized urls --- .../com/commafeed/backend/dao/FeedDAO.java | 9 ++++++++ .../backend/feeds/FeedRefreshTaskGiver.java | 4 ++++ .../commafeed/backend/feeds/FeedUtils.java | 16 +++++++++++++- .../com/commafeed/backend/model/Feed.java | 22 +++++++++++++++++++ .../backend/services/FeedService.java | 4 ++++ .../resources/changelogs/db.changelog-1.1.xml | 13 +++++++++++ 6 files changed, 67 insertions(+), 1 deletion(-) diff --git a/src/main/java/com/commafeed/backend/dao/FeedDAO.java b/src/main/java/com/commafeed/backend/dao/FeedDAO.java index dd93e76a..5f0fdefc 100644 --- a/src/main/java/com/commafeed/backend/dao/FeedDAO.java +++ b/src/main/java/com/commafeed/backend/dao/FeedDAO.java @@ -14,6 +14,7 @@ import javax.persistence.criteria.SetJoin; import org.apache.commons.codec.digest.DigestUtils; import org.apache.commons.lang.StringUtils; +import com.commafeed.backend.feeds.FeedUtils; import com.commafeed.backend.model.Feed; import com.commafeed.backend.model.FeedSubscription; import com.commafeed.backend.model.FeedSubscription_; @@ -74,6 +75,14 @@ public class FeedDAO extends GenericDAO { if (feed != null && StringUtils.equals(url, feed.getUrl())) { return feed; } + + String normalized = FeedUtils.normalizeURL(url); + feeds = findByField(Feed_.normalizedUrlHash, DigestUtils.sha1Hex(normalized)); + feed = Iterables.getFirst(feeds, null); + if (feed != null && StringUtils.equals(normalized, feed.getNormalizedUrl())) { + return feed; + } + return null; } diff --git a/src/main/java/com/commafeed/backend/feeds/FeedRefreshTaskGiver.java b/src/main/java/com/commafeed/backend/feeds/FeedRefreshTaskGiver.java index 5c035393..333366d7 100644 --- a/src/main/java/com/commafeed/backend/feeds/FeedRefreshTaskGiver.java +++ b/src/main/java/com/commafeed/backend/feeds/FeedRefreshTaskGiver.java @@ -12,6 +12,7 @@ import javax.annotation.PreDestroy; import javax.enterprise.context.ApplicationScoped; import javax.inject.Inject; +import org.apache.commons.codec.digest.DigestUtils; import org.apache.commons.lang.time.DateUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -158,6 +159,9 @@ public class FeedRefreshTaskGiver { } public void giveBack(Feed feed) { + String normalized = FeedUtils.normalizeURL(feed.getUrl()); + feed.setNormalizedUrl(normalized); + feed.setNormalizedUrlHash(DigestUtils.sha1Hex(normalized)); giveBackQueue.add(feed); } diff --git a/src/main/java/com/commafeed/backend/feeds/FeedUtils.java b/src/main/java/com/commafeed/backend/feeds/FeedUtils.java index 2c379d94..3531f2e8 100644 --- a/src/main/java/com/commafeed/backend/feeds/FeedUtils.java +++ b/src/main/java/com/commafeed/backend/feeds/FeedUtils.java @@ -87,11 +87,25 @@ public class FeedUtils { return encoding; } + /** + * Normalize the url. The resulting url is not meant to be used but rather + * as a mean to identify a feed and avoid duplicates + */ public static String normalizeURL(String url) { if (url == null) { return null; } - return URLCanonicalizer.getCanonicalURL(url); + String normalized = URLCanonicalizer.getCanonicalURL(url); + if (normalized == null) { + return url; + } + normalized = normalized.toLowerCase(); + + if (normalized.startsWith("https")) { + normalized = "http" + normalized.substring(5); + } + normalized = normalized.replace("feeds2.feedburner.com", "feeds.feedburner.com"); + return normalized; } /** diff --git a/src/main/java/com/commafeed/backend/model/Feed.java b/src/main/java/com/commafeed/backend/model/Feed.java index 924b2759..ed7f16d9 100644 --- a/src/main/java/com/commafeed/backend/model/Feed.java +++ b/src/main/java/com/commafeed/backend/model/Feed.java @@ -34,6 +34,12 @@ public class Feed extends AbstractModel { @Column(length = 40, nullable = false) private String urlHash; + + @Column(length = 2048, nullable = false) + private String normalizedUrl; + + @Column(length = 40, nullable = false) + private String normalizedUrlHash; /** * The url of the website, extracted from the feed @@ -315,4 +321,20 @@ public class Feed extends AbstractModel { this.urgent = urgent; } + public String getNormalizedUrl() { + return normalizedUrl; + } + + public void setNormalizedUrl(String normalizedUrl) { + this.normalizedUrl = normalizedUrl; + } + + public String getNormalizedUrlHash() { + return normalizedUrlHash; + } + + public void setNormalizedUrlHash(String normalizedUrlHash) { + this.normalizedUrlHash = normalizedUrlHash; + } + } diff --git a/src/main/java/com/commafeed/backend/services/FeedService.java b/src/main/java/com/commafeed/backend/services/FeedService.java index 146ea613..f7300f9b 100644 --- a/src/main/java/com/commafeed/backend/services/FeedService.java +++ b/src/main/java/com/commafeed/backend/services/FeedService.java @@ -8,6 +8,7 @@ import javax.inject.Inject; import org.apache.commons.codec.digest.DigestUtils; import com.commafeed.backend.dao.FeedDAO; +import com.commafeed.backend.feeds.FeedUtils; import com.commafeed.backend.model.Feed; @Singleton @@ -20,9 +21,12 @@ public class FeedService { public Feed findOrCreate(String url) { Feed feed = feedDAO.findByUrl(url); if (feed == null) { + String normalized = FeedUtils.normalizeURL(url); feed = new Feed(); feed.setUrl(url); feed.setUrlHash(DigestUtils.sha1Hex(url)); + feed.setNormalizedUrl(normalized); + feed.setNormalizedUrlHash(DigestUtils.sha1Hex(normalized)); feedDAO.saveOrUpdate(feed); } return feed; diff --git a/src/main/resources/changelogs/db.changelog-1.1.xml b/src/main/resources/changelogs/db.changelog-1.1.xml index e00d1cba..24d58c7c 100644 --- a/src/main/resources/changelogs/db.changelog-1.1.xml +++ b/src/main/resources/changelogs/db.changelog-1.1.xml @@ -243,5 +243,18 @@ + + + + + + + + + + + +