compare feed entry content after cleanup because that's what is saved in the database

This commit is contained in:
Athou
2024-01-07 14:57:40 +01:00
parent ed45746f52
commit 789857b09f
2 changed files with 34 additions and 38 deletions

View File

@@ -3,6 +3,7 @@ package com.commafeed.backend.model;
import java.sql.Types; import java.sql.Types;
import java.util.Set; import java.util.Set;
import org.apache.commons.lang3.builder.EqualsBuilder;
import org.hibernate.annotations.JdbcTypeCode; import org.hibernate.annotations.JdbcTypeCode;
import jakarta.persistence.Column; import jakarta.persistence.Column;
@@ -60,4 +61,22 @@ public class FeedEntryContent extends AbstractModel {
@OneToMany(mappedBy = "content") @OneToMany(mappedBy = "content")
private Set<FeedEntry> entries; private Set<FeedEntry> entries;
/**
 * Tells whether this content holds the same data as another content.
 * <p>
 * The comparison covers title, content, author, categories, the enclosure url and type, and the media description and
 * thumbnail url, width and height. Nothing else is compared.
 *
 * @param c
 *            the content to compare with, may be null
 * @return true when every compared field is equal, false when {@code c} is null or at least one field differs
 */
public boolean equivalentTo(FeedEntryContent c) {
    if (c == null) {
        return false;
    }

    EqualsBuilder comparison = new EqualsBuilder();

    // textual data
    comparison.append(title, c.title);
    comparison.append(content, c.content);
    comparison.append(author, c.author);
    comparison.append(categories, c.categories);

    // enclosure
    comparison.append(enclosureUrl, c.enclosureUrl);
    comparison.append(enclosureType, c.enclosureType);

    // media
    comparison.append(mediaDescription, c.mediaDescription);
    comparison.append(mediaThumbnailUrl, c.mediaThumbnailUrl);
    comparison.append(mediaThumbnailWidth, c.mediaThumbnailWidth);
    comparison.append(mediaThumbnailHeight, c.mediaThumbnailHeight);

    return comparison.isEquals();
}
} }

View File

@@ -1,11 +1,9 @@
package com.commafeed.backend.service; package com.commafeed.backend.service;
import java.util.List;
import java.util.Optional; import java.util.Optional;
import org.apache.commons.codec.digest.DigestUtils; import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.builder.EqualsBuilder;
import com.commafeed.backend.dao.FeedEntryContentDAO; import com.commafeed.backend.dao.FeedEntryContentDAO;
import com.commafeed.backend.feed.FeedUtils; import com.commafeed.backend.feed.FeedUtils;
@@ -29,25 +27,25 @@ public class FeedEntryContentService {
* this is NOT thread-safe * this is NOT thread-safe
*/ */
public FeedEntryContent findOrCreate(Content content, String baseUrl) { public FeedEntryContent findOrCreate(Content content, String baseUrl) {
String title = FeedUtils.truncate(cleaningService.clean(content.title(), baseUrl, true), 2048); FeedEntryContent entryContent = buildContent(content, baseUrl);
String titleHash = DigestUtils.sha1Hex(StringUtils.trimToEmpty(title)); Optional<FeedEntryContent> existing = feedEntryContentDAO.findExisting(entryContent.getContentHash(), entryContent.getTitleHash())
.stream()
String contentString = cleaningService.clean(content.content(), baseUrl, false); .filter(entryContent::equivalentTo)
String contentHash = DigestUtils.sha1Hex(StringUtils.trimToEmpty(contentString));
List<FeedEntryContent> existing = feedEntryContentDAO.findExisting(contentHash, titleHash);
Optional<FeedEntryContent> equivalentContent = existing.stream()
.filter(c -> isEquivalent(c, content, title, contentString))
.findFirst(); .findFirst();
if (equivalentContent.isPresent()) { if (existing.isPresent()) {
return equivalentContent.get(); return existing.get();
} else {
feedEntryContentDAO.saveOrUpdate(entryContent);
return entryContent;
} }
}
private FeedEntryContent buildContent(Content content, String baseUrl) {
FeedEntryContent entryContent = new FeedEntryContent(); FeedEntryContent entryContent = new FeedEntryContent();
entryContent.setTitle(title); entryContent.setTitleHash(DigestUtils.sha1Hex(StringUtils.trimToEmpty(content.title())));
entryContent.setTitleHash(titleHash); entryContent.setContentHash(DigestUtils.sha1Hex(StringUtils.trimToEmpty(content.content())));
entryContent.setContent(contentString); entryContent.setTitle(FeedUtils.truncate(cleaningService.clean(content.title(), baseUrl, true), 2048));
entryContent.setContentHash(contentHash); entryContent.setContent(cleaningService.clean(content.content(), baseUrl, false));
entryContent.setAuthor(FeedUtils.truncate(cleaningService.clean(content.author(), baseUrl, true), 128)); entryContent.setAuthor(FeedUtils.truncate(cleaningService.clean(content.author(), baseUrl, true), 128));
entryContent.setCategories(FeedUtils.truncate(content.categories(), 4096)); entryContent.setCategories(FeedUtils.truncate(content.categories(), 4096));
@@ -65,28 +63,7 @@ public class FeedEntryContentService {
entryContent.setMediaThumbnailHeight(media.thumbnailHeight()); entryContent.setMediaThumbnailHeight(media.thumbnailHeight());
} }
feedEntryContentDAO.saveOrUpdate(entryContent);
return entryContent; return entryContent;
} }
/**
 * Checks whether a stored {@link FeedEntryContent} matches the data of an incoming feed entry.
 * <p>
 * Title and content are compared against the values supplied by the caller, author and categories against the values
 * read directly from {@code c}. Enclosure and media fields are only compared when {@code c} carries an enclosure or a
 * media object.
 * <p>
 * NOTE(review): the code that builds a new entry stores the author after cleaning and truncating it to 128 characters
 * and the categories after truncating them to 4096 characters, while this method compares the stored values with the
 * untouched {@code c.author()} and {@code c.categories()}. Whenever cleaning or truncation changes those values, the
 * comparison presumably fails for content that would otherwise be stored identically.
 *
 * @param content
 *            the stored content to check
 * @param c
 *            the incoming feed entry data
 * @param title
 *            the title to compare with the stored title
 * @param contentString
 *            the content to compare with the stored content
 * @return true when all compared fields are equal
 */
private boolean isEquivalent(FeedEntryContent content, Content c, String title, String contentString) {
EqualsBuilder builder = new EqualsBuilder().append(content.getTitle(), title)
.append(content.getContent(), contentString)
// author and categories come straight from the incoming entry, see the review note above
.append(content.getAuthor(), c.author())
.append(content.getCategories(), c.categories());
// when the incoming entry has no enclosure, the stored enclosure fields are not compared at all
if (c.enclosure() != null) {
builder.append(content.getEnclosureUrl(), c.enclosure().url()).append(content.getEnclosureType(), c.enclosure().type());
}
// same for media: stored media fields are ignored when the incoming entry has no media
if (c.media() != null) {
builder.append(content.getMediaDescription(), c.media().description())
.append(content.getMediaThumbnailUrl(), c.media().thumbnailUrl())
.append(content.getMediaThumbnailWidth(), c.media().thumbnailWidth())
.append(content.getMediaThumbnailHeight(), c.media().thumbnailHeight());
}
return builder.build();
}
} }