index title hash

This commit is contained in:
Athou
2013-07-26 08:15:23 +02:00
parent 6f29af1710
commit 9cdc364fde
6 changed files with 38 additions and 25 deletions

View File

@@ -9,8 +9,6 @@ import javax.persistence.criteria.JoinType;
import javax.persistence.criteria.Predicate; import javax.persistence.criteria.Predicate;
import javax.persistence.criteria.Root; import javax.persistence.criteria.Root;
import org.apache.commons.codec.digest.DigestUtils;
import com.commafeed.backend.model.FeedEntry; import com.commafeed.backend.model.FeedEntry;
import com.commafeed.backend.model.FeedEntryContent; import com.commafeed.backend.model.FeedEntryContent;
import com.commafeed.backend.model.FeedEntryContent_; import com.commafeed.backend.model.FeedEntryContent_;
@@ -19,27 +17,15 @@ import com.google.common.collect.Iterables;
public class FeedEntryContentDAO extends GenericDAO<FeedEntryContent> { public class FeedEntryContentDAO extends GenericDAO<FeedEntryContent> {
public FeedEntryContent findExisting(FeedEntryContent content) { public FeedEntryContent findExisting(String contentHash, String titleHash) {
CriteriaQuery<FeedEntryContent> query = builder.createQuery(getType()); CriteriaQuery<FeedEntryContent> query = builder.createQuery(getType());
Root<FeedEntryContent> root = query.from(getType()); Root<FeedEntryContent> root = query.from(getType());
Predicate p1 = builder.equal(root.get(FeedEntryContent_.contentHash), DigestUtils.sha1Hex(content.getContent())); Predicate p1 = builder.equal(root.get(FeedEntryContent_.contentHash), contentHash);
Predicate p2 = null; Predicate p2 = builder.equal(root.get(FeedEntryContent_.titleHash), titleHash);
if (content.getTitle() == null) {
p2 = builder.isNull(root.get(FeedEntryContent_.title));
} else {
p2 = builder.equal(root.get(FeedEntryContent_.title), content.getTitle());
}
Predicate p3 = null; query.where(p1, p2);
if (content.getAuthor() == null) {
p3 = builder.isNull(root.get(FeedEntryContent_.author));
} else {
p3 = builder.equal(root.get(FeedEntryContent_.author), content.getAuthor());
}
query.where(p1, p2, p3);
TypedQuery<FeedEntryContent> q = em.createQuery(query); TypedQuery<FeedEntryContent> q = em.createQuery(query);
return Iterables.getFirst(q.getResultList(), null); return Iterables.getFirst(q.getResultList(), null);

View File

@@ -90,7 +90,7 @@ public class FeedParser {
FeedEntryContent content = new FeedEntryContent(); FeedEntryContent content = new FeedEntryContent();
content.setContent(getContent(item)); content.setContent(getContent(item));
content.setTitle(getTitle(item)); content.setTitle(getTitle(item));
content.setAuthor(item.getAuthor()); content.setAuthor(StringUtils.trimToNull(item.getAuthor()));
SyndEnclosure enclosure = (SyndEnclosure) Iterables.getFirst(item.getEnclosures(), null); SyndEnclosure enclosure = (SyndEnclosure) Iterables.getFirst(item.getEnclosures(), null);
if (enclosure != null) { if (enclosure != null) {
content.setEnclosureUrl(FeedUtils.truncate(enclosure.getUrl(), 2048)); content.setEnclosureUrl(FeedUtils.truncate(enclosure.getUrl(), 2048));
@@ -187,7 +187,7 @@ public class FeedParser {
} else { } else {
content = StringUtils.join(Collections2.transform(item.getContents(), CONTENT_TO_STRING), SystemUtils.LINE_SEPARATOR); content = StringUtils.join(Collections2.transform(item.getContents(), CONTENT_TO_STRING), SystemUtils.LINE_SEPARATOR);
} }
return StringUtils.trimToEmpty(content); return StringUtils.trimToNull(content);
} }
private String getTitle(SyndEntry item) { private String getTitle(SyndEntry item) {
@@ -200,7 +200,7 @@ public class FeedParser {
title = "(no title)"; title = "(no title)";
} }
} }
return title; return StringUtils.trimToNull(title);
} }
@SuppressWarnings("unchecked") @SuppressWarnings("unchecked")

View File

@@ -151,7 +151,7 @@ public class FeedRefreshUpdater {
// lock on content, make sure we are not updating the same entry // lock on content, make sure we are not updating the same entry
// twice at the same time // twice at the same time
FeedEntryContent content = entry.getContent(); FeedEntryContent content = entry.getContent();
String key2 = DigestUtils.sha1Hex(content.getContent() + content.getTitle() + content.getAuthor()); String key2 = DigestUtils.sha1Hex(StringUtils.trimToEmpty(content.getContent() + content.getTitle()));
Iterator<Lock> iterator = locks.bulkGet(Arrays.asList(key1, key2)).iterator(); Iterator<Lock> iterator = locks.bulkGet(Arrays.asList(key1, key2)).iterator();
Lock lock1 = iterator.next(); Lock lock1 = iterator.next();

View File

@@ -22,6 +22,9 @@ public class FeedEntryContent extends AbstractModel {
@Column(length = 2048) @Column(length = 2048)
private String title; private String title;
@Column(length = 40)
private String titleHash;
@Lob @Lob
@Column(length = Integer.MAX_VALUE) @Column(length = Integer.MAX_VALUE)
private String content; private String content;
@@ -97,4 +100,12 @@ public class FeedEntryContent extends AbstractModel {
this.entries = entries; this.entries = entries;
} }
/**
 * Returns the stored hash of this content's title.
 *
 * @return the value of the {@code titleHash} field; may be {@code null} if it
 *         was never set on this instance
 */
public String getTitleHash() {
return titleHash;
}
/**
 * Stores the hash of this content's title.
 *
 * @param titleHash the hash to store; the backing column is declared with
 *                  length 40, so longer values will not fit
 */
public void setTitleHash(String titleHash) {
this.titleHash = titleHash;
}
} }

View File

@@ -3,6 +3,7 @@ package com.commafeed.backend.services;
import javax.inject.Inject; import javax.inject.Inject;
import org.apache.commons.codec.digest.DigestUtils; import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.lang.StringUtils;
import com.commafeed.backend.dao.FeedEntryContentDAO; import com.commafeed.backend.dao.FeedEntryContentDAO;
import com.commafeed.backend.feeds.FeedUtils; import com.commafeed.backend.feeds.FeedUtils;
@@ -18,11 +19,15 @@ public class FeedEntryContentService {
*/ */
public FeedEntryContent findOrCreate(FeedEntryContent content, String baseUrl) { public FeedEntryContent findOrCreate(FeedEntryContent content, String baseUrl) {
FeedEntryContent existing = feedEntryContentDAO.findExisting(content); String contentHash = DigestUtils.sha1Hex(StringUtils.trimToEmpty(content.getContent()));
String titleHash = DigestUtils.sha1Hex(StringUtils.trimToEmpty(content.getTitle()));
FeedEntryContent existing = feedEntryContentDAO.findExisting(contentHash, titleHash);
if (existing == null) { if (existing == null) {
content.setContentHash(contentHash);
content.setTitleHash(titleHash);
content.setAuthor(FeedUtils.truncate(FeedUtils.handleContent(content.getAuthor(), baseUrl, true), 128)); content.setAuthor(FeedUtils.truncate(FeedUtils.handleContent(content.getAuthor(), baseUrl, true), 128));
content.setTitle(FeedUtils.truncate(FeedUtils.handleContent(content.getTitle(), baseUrl, true), 2048)); content.setTitle(FeedUtils.truncate(FeedUtils.handleContent(content.getTitle(), baseUrl, true), 2048));
content.setContentHash(DigestUtils.sha1Hex(content.getContent()));
content.setContent(FeedUtils.handleContent(content.getContent(), baseUrl, false)); content.setContent(FeedUtils.handleContent(content.getContent(), baseUrl, false));
existing = content; existing = content;
feedEntryContentDAO.saveOrUpdate(existing); feedEntryContentDAO.saveOrUpdate(existing);

View File

@@ -90,4 +90,15 @@
</createIndex> </createIndex>
</changeSet> </changeSet>
<!--
  Adds a nullable titleHash VARCHAR(40) column to FEEDENTRYCONTENTS, creates the
  composite index content_title_index on (contentHash, titleHash), then drops
  the older content_hash_index.
  NOTE(review): this changeSet does not populate titleHash for rows that already
  exist, so those rows presumably keep a NULL titleHash and will not match an
  equality lookup on that column. Confirm whether a backfill is needed.
-->
<changeSet author="athou" id="add-title-hashes">
<addColumn tableName="FEEDENTRYCONTENTS">
<column name="titleHash" type="VARCHAR(40)" />
</addColumn>
<createIndex tableName="FEEDENTRYCONTENTS" indexName="content_title_index">
<column name="contentHash" />
<column name="titleHash" />
</createIndex>
<dropIndex tableName="FEEDENTRYCONTENTS" indexName="content_hash_index" />
</changeSet>
</databaseChangeLog> </databaseChangeLog>