mirror of
https://github.com/Athou/commafeed.git
synced 2026-03-21 21:37:29 +00:00
store normalized urls
This commit is contained in:
@@ -14,6 +14,7 @@ import javax.persistence.criteria.SetJoin;
|
|||||||
import org.apache.commons.codec.digest.DigestUtils;
|
import org.apache.commons.codec.digest.DigestUtils;
|
||||||
import org.apache.commons.lang.StringUtils;
|
import org.apache.commons.lang.StringUtils;
|
||||||
|
|
||||||
|
import com.commafeed.backend.feeds.FeedUtils;
|
||||||
import com.commafeed.backend.model.Feed;
|
import com.commafeed.backend.model.Feed;
|
||||||
import com.commafeed.backend.model.FeedSubscription;
|
import com.commafeed.backend.model.FeedSubscription;
|
||||||
import com.commafeed.backend.model.FeedSubscription_;
|
import com.commafeed.backend.model.FeedSubscription_;
|
||||||
@@ -74,6 +75,14 @@ public class FeedDAO extends GenericDAO<Feed> {
|
|||||||
if (feed != null && StringUtils.equals(url, feed.getUrl())) {
|
if (feed != null && StringUtils.equals(url, feed.getUrl())) {
|
||||||
return feed;
|
return feed;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
String normalized = FeedUtils.normalizeURL(url);
|
||||||
|
feeds = findByField(Feed_.normalizedUrlHash, DigestUtils.sha1Hex(normalized));
|
||||||
|
feed = Iterables.getFirst(feeds, null);
|
||||||
|
if (feed != null && StringUtils.equals(normalized, feed.getNormalizedUrl())) {
|
||||||
|
return feed;
|
||||||
|
}
|
||||||
|
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -12,6 +12,7 @@ import javax.annotation.PreDestroy;
|
|||||||
import javax.enterprise.context.ApplicationScoped;
|
import javax.enterprise.context.ApplicationScoped;
|
||||||
import javax.inject.Inject;
|
import javax.inject.Inject;
|
||||||
|
|
||||||
|
import org.apache.commons.codec.digest.DigestUtils;
|
||||||
import org.apache.commons.lang.time.DateUtils;
|
import org.apache.commons.lang.time.DateUtils;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
@@ -158,6 +159,9 @@ public class FeedRefreshTaskGiver {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public void giveBack(Feed feed) {
|
public void giveBack(Feed feed) {
|
||||||
|
String normalized = FeedUtils.normalizeURL(feed.getUrl());
|
||||||
|
feed.setNormalizedUrl(normalized);
|
||||||
|
feed.setNormalizedUrlHash(DigestUtils.sha1Hex(normalized));
|
||||||
giveBackQueue.add(feed);
|
giveBackQueue.add(feed);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -87,11 +87,25 @@ public class FeedUtils {
|
|||||||
return encoding;
|
return encoding;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Normalize the url. The resulting url is not meant to be used but rather
|
||||||
|
* as a means to identify a feed and avoid duplicates
|
||||||
|
*/
|
||||||
public static String normalizeURL(String url) {
|
public static String normalizeURL(String url) {
|
||||||
if (url == null) {
|
if (url == null) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
return URLCanonicalizer.getCanonicalURL(url);
|
String normalized = URLCanonicalizer.getCanonicalURL(url);
|
||||||
|
if (normalized == null) {
|
||||||
|
return url;
|
||||||
|
}
|
||||||
|
normalized = normalized.toLowerCase();
|
||||||
|
|
||||||
|
if (normalized.startsWith("https")) {
|
||||||
|
normalized = "http" + normalized.substring(5);
|
||||||
|
}
|
||||||
|
normalized = normalized.replace("feeds2.feedburner.com", "feeds.feedburner.com");
|
||||||
|
return normalized;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
@@ -34,6 +34,12 @@ public class Feed extends AbstractModel {
|
|||||||
|
|
||||||
@Column(length = 40, nullable = false)
|
@Column(length = 40, nullable = false)
|
||||||
private String urlHash;
|
private String urlHash;
|
||||||
|
|
||||||
|
@Column(length = 2048, nullable = false)
|
||||||
|
private String normalizedUrl;
|
||||||
|
|
||||||
|
@Column(length = 40, nullable = false)
|
||||||
|
private String normalizedUrlHash;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* The url of the website, extracted from the feed
|
* The url of the website, extracted from the feed
|
||||||
@@ -315,4 +321,20 @@ public class Feed extends AbstractModel {
|
|||||||
this.urgent = urgent;
|
this.urgent = urgent;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public String getNormalizedUrl() {
|
||||||
|
return normalizedUrl;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setNormalizedUrl(String normalizedUrl) {
|
||||||
|
this.normalizedUrl = normalizedUrl;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getNormalizedUrlHash() {
|
||||||
|
return normalizedUrlHash;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setNormalizedUrlHash(String normalizedUrlHash) {
|
||||||
|
this.normalizedUrlHash = normalizedUrlHash;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -8,6 +8,7 @@ import javax.inject.Inject;
|
|||||||
import org.apache.commons.codec.digest.DigestUtils;
|
import org.apache.commons.codec.digest.DigestUtils;
|
||||||
|
|
||||||
import com.commafeed.backend.dao.FeedDAO;
|
import com.commafeed.backend.dao.FeedDAO;
|
||||||
|
import com.commafeed.backend.feeds.FeedUtils;
|
||||||
import com.commafeed.backend.model.Feed;
|
import com.commafeed.backend.model.Feed;
|
||||||
|
|
||||||
@Singleton
|
@Singleton
|
||||||
@@ -20,9 +21,12 @@ public class FeedService {
|
|||||||
public Feed findOrCreate(String url) {
|
public Feed findOrCreate(String url) {
|
||||||
Feed feed = feedDAO.findByUrl(url);
|
Feed feed = feedDAO.findByUrl(url);
|
||||||
if (feed == null) {
|
if (feed == null) {
|
||||||
|
String normalized = FeedUtils.normalizeURL(url);
|
||||||
feed = new Feed();
|
feed = new Feed();
|
||||||
feed.setUrl(url);
|
feed.setUrl(url);
|
||||||
feed.setUrlHash(DigestUtils.sha1Hex(url));
|
feed.setUrlHash(DigestUtils.sha1Hex(url));
|
||||||
|
feed.setNormalizedUrl(normalized);
|
||||||
|
feed.setNormalizedUrlHash(DigestUtils.sha1Hex(normalized));
|
||||||
feedDAO.saveOrUpdate(feed);
|
feedDAO.saveOrUpdate(feed);
|
||||||
}
|
}
|
||||||
return feed;
|
return feed;
|
||||||
|
|||||||
@@ -243,5 +243,18 @@
|
|||||||
<column name="queryTimeout" valueNumeric="0"></column>
|
<column name="queryTimeout" valueNumeric="0"></column>
|
||||||
</update>
|
</update>
|
||||||
</changeSet>
|
</changeSet>
|
||||||
|
|
||||||
|
<changeSet author="athou" id="add-normalized-url">
|
||||||
|
<addColumn tableName="FEEDS">
|
||||||
|
<column name="normalizedUrl" type="VARCHAR(2048)" />
|
||||||
|
</addColumn>
|
||||||
|
<addColumn tableName="FEEDS">
|
||||||
|
<column name="normalizedUrlHash" type="VARCHAR(40)" />
|
||||||
|
</addColumn>
|
||||||
|
<createIndex indexName="norm_url_hash_index" tableName="FEEDS"
|
||||||
|
unique="false">
|
||||||
|
<column name="normalizedUrlHash" />
|
||||||
|
</createIndex>
|
||||||
|
</changeSet>
|
||||||
|
|
||||||
</databaseChangeLog>
|
</databaseChangeLog>
|
||||||
|
|||||||
Reference in New Issue
Block a user