index title hash

This commit is contained in:
Athou
2013-07-26 08:15:23 +02:00
parent 6f29af1710
commit 9cdc364fde
6 changed files with 38 additions and 25 deletions

View File

@@ -9,8 +9,6 @@ import javax.persistence.criteria.JoinType;
import javax.persistence.criteria.Predicate;
import javax.persistence.criteria.Root;
import org.apache.commons.codec.digest.DigestUtils;
import com.commafeed.backend.model.FeedEntry;
import com.commafeed.backend.model.FeedEntryContent;
import com.commafeed.backend.model.FeedEntryContent_;
@@ -19,27 +17,15 @@ import com.google.common.collect.Iterables;
public class FeedEntryContentDAO extends GenericDAO<FeedEntryContent> {
public FeedEntryContent findExisting(FeedEntryContent content) {
public FeedEntryContent findExisting(String contentHash, String titleHash) {
CriteriaQuery<FeedEntryContent> query = builder.createQuery(getType());
Root<FeedEntryContent> root = query.from(getType());
Predicate p1 = builder.equal(root.get(FeedEntryContent_.contentHash), DigestUtils.sha1Hex(content.getContent()));
Predicate p2 = null;
if (content.getTitle() == null) {
p2 = builder.isNull(root.get(FeedEntryContent_.title));
} else {
p2 = builder.equal(root.get(FeedEntryContent_.title), content.getTitle());
}
Predicate p1 = builder.equal(root.get(FeedEntryContent_.contentHash), contentHash);
Predicate p2 = builder.equal(root.get(FeedEntryContent_.titleHash), titleHash);
Predicate p3 = null;
if (content.getAuthor() == null) {
p3 = builder.isNull(root.get(FeedEntryContent_.author));
} else {
p3 = builder.equal(root.get(FeedEntryContent_.author), content.getAuthor());
}
query.where(p1, p2, p3);
query.where(p1, p2);
TypedQuery<FeedEntryContent> q = em.createQuery(query);
return Iterables.getFirst(q.getResultList(), null);

View File

@@ -90,7 +90,7 @@ public class FeedParser {
FeedEntryContent content = new FeedEntryContent();
content.setContent(getContent(item));
content.setTitle(getTitle(item));
content.setAuthor(item.getAuthor());
content.setAuthor(StringUtils.trimToNull(item.getAuthor()));
SyndEnclosure enclosure = (SyndEnclosure) Iterables.getFirst(item.getEnclosures(), null);
if (enclosure != null) {
content.setEnclosureUrl(FeedUtils.truncate(enclosure.getUrl(), 2048));
@@ -187,7 +187,7 @@ public class FeedParser {
} else {
content = StringUtils.join(Collections2.transform(item.getContents(), CONTENT_TO_STRING), SystemUtils.LINE_SEPARATOR);
}
return StringUtils.trimToEmpty(content);
return StringUtils.trimToNull(content);
}
private String getTitle(SyndEntry item) {
@@ -200,7 +200,7 @@ public class FeedParser {
title = "(no title)";
}
}
return title;
return StringUtils.trimToNull(title);
}
@SuppressWarnings("unchecked")

View File

@@ -151,7 +151,7 @@ public class FeedRefreshUpdater {
// lock on content, make sure we are not updating the same entry
// twice at the same time
FeedEntryContent content = entry.getContent();
String key2 = DigestUtils.sha1Hex(content.getContent() + content.getTitle() + content.getAuthor());
String key2 = DigestUtils.sha1Hex(StringUtils.trimToEmpty(content.getContent() + content.getTitle()));
Iterator<Lock> iterator = locks.bulkGet(Arrays.asList(key1, key2)).iterator();
Lock lock1 = iterator.next();

View File

@@ -21,6 +21,9 @@ public class FeedEntryContent extends AbstractModel {
@Column(length = 2048)
private String title;
@Column(length = 40)
private String titleHash;
@Lob
@Column(length = Integer.MAX_VALUE)
@@ -97,4 +100,12 @@ public class FeedEntryContent extends AbstractModel {
this.entries = entries;
}
public String getTitleHash() {
return titleHash;
}
public void setTitleHash(String titleHash) {
this.titleHash = titleHash;
}
}

View File

@@ -3,6 +3,7 @@ package com.commafeed.backend.services;
import javax.inject.Inject;
import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.lang.StringUtils;
import com.commafeed.backend.dao.FeedEntryContentDAO;
import com.commafeed.backend.feeds.FeedUtils;
@@ -17,12 +18,16 @@ public class FeedEntryContentService {
* this is NOT thread-safe
*/
public FeedEntryContent findOrCreate(FeedEntryContent content, String baseUrl) {
FeedEntryContent existing = feedEntryContentDAO.findExisting(content);
String contentHash = DigestUtils.sha1Hex(StringUtils.trimToEmpty(content.getContent()));
String titleHash = DigestUtils.sha1Hex(StringUtils.trimToEmpty(content.getTitle()));
FeedEntryContent existing = feedEntryContentDAO.findExisting(contentHash, titleHash);
if (existing == null) {
content.setContentHash(contentHash);
content.setTitleHash(titleHash);
content.setAuthor(FeedUtils.truncate(FeedUtils.handleContent(content.getAuthor(), baseUrl, true), 128));
content.setTitle(FeedUtils.truncate(FeedUtils.handleContent(content.getTitle(), baseUrl, true), 2048));
content.setContentHash(DigestUtils.sha1Hex(content.getContent()));
content.setContent(FeedUtils.handleContent(content.getContent(), baseUrl, false));
existing = content;
feedEntryContentDAO.saveOrUpdate(existing);