only compute rtl once by storing it in the database on fetch

This commit is contained in:
Athou
2024-09-13 22:22:41 +02:00
parent 9a89b39b62
commit f4a43e9950
6 changed files with 42 additions and 20 deletions

View File

@@ -17,7 +17,6 @@ import org.netpreserve.urlcanon.ParsedUrl;
import com.commafeed.backend.feed.FeedEntryKeyword.Mode;
import com.commafeed.backend.feed.parser.TextDirectionDetector;
import com.commafeed.backend.model.FeedEntry;
import com.commafeed.backend.model.FeedSubscription;
import com.commafeed.frontend.model.Entry;
@@ -92,24 +91,18 @@ public class FeedUtils {
return normalized;
}
public static boolean isRTL(FeedEntry entry) {
String text = entry.getContent().getContent();
if (StringUtils.isBlank(text)) {
text = entry.getContent().getTitle();
}
public static boolean isRTL(String title, String content) {
String text = StringUtils.isNotBlank(content) ? content : title;
if (StringUtils.isBlank(text)) {
return false;
}
text = Jsoup.parse(text).text();
if (StringUtils.isBlank(text)) {
String stripped = Jsoup.parse(text).text();
if (StringUtils.isBlank(stripped)) {
return false;
}
TextDirectionDetector.Direction direction = TextDirectionDetector.detect(text);
return direction == TextDirectionDetector.Direction.RIGHT_TO_LEFT;
return TextDirectionDetector.detect(stripped) == TextDirectionDetector.Direction.RIGHT_TO_LEFT;
}
public static String removeTrailingSlash(String url) {

View File

@@ -1,7 +1,6 @@
package com.commafeed.backend.feed.parser;
import java.text.Bidi;
import java.util.concurrent.atomic.AtomicLong;
import java.util.regex.Pattern;
import org.apache.commons.lang3.math.NumberUtils;
@@ -22,8 +21,8 @@ public class TextDirectionDetector {
return Direction.LEFT_TO_RIGHT;
}
AtomicLong rtl = new AtomicLong();
AtomicLong total = new AtomicLong();
long rtl = 0;
long total = 0;
for (String token : WORDS_PATTERN.split(input)) {
// skip urls
if (URL_PATTERN.matcher(token).matches()) {
@@ -39,18 +38,18 @@ public class TextDirectionDetector {
if (requiresBidi) {
Bidi bidi = new Bidi(token, Bidi.DIRECTION_DEFAULT_LEFT_TO_RIGHT);
if (bidi.getBaseLevel() == 1) {
rtl.incrementAndGet();
rtl++;
}
}
total.incrementAndGet();
total++;
}
if (total.longValue() == 0) {
if (total == 0) {
return Direction.LEFT_TO_RIGHT;
}
double ratio = rtl.doubleValue() / total.doubleValue();
double ratio = (double) rtl / total;
return ratio > RTL_THRESHOLD ? Direction.RIGHT_TO_LEFT : Direction.LEFT_TO_RIGHT;
}

View File

@@ -6,8 +6,12 @@ import java.util.Set;
import org.apache.commons.lang3.builder.EqualsBuilder;
import org.hibernate.annotations.JdbcTypeCode;
import com.commafeed.backend.feed.FeedUtils;
import jakarta.persistence.Column;
import jakarta.persistence.Entity;
import jakarta.persistence.EnumType;
import jakarta.persistence.Enumerated;
import jakarta.persistence.Lob;
import jakarta.persistence.OneToMany;
import jakarta.persistence.Table;
@@ -21,6 +25,10 @@ import lombok.Setter;
@Setter
public class FeedEntryContent extends AbstractModel {
/**
 * Text direction recorded for a piece of entry content.
 */
public enum Direction {
	/** the content was detected as left-to-right */
	ltr,

	/** the content was detected as right-to-left */
	rtl,

	/** no direction has been recorded for the content */
	unknown
}
@Column(length = 2048)
private String title;
@@ -58,6 +66,10 @@ public class FeedEntryContent extends AbstractModel {
@Column(length = 4096)
private String categories;
@Column
@Enumerated(EnumType.STRING)
private Direction direction = Direction.unknown;
@OneToMany(mappedBy = "content")
private Set<FeedEntry> entries;
@@ -79,4 +91,14 @@ public class FeedEntryContent extends AbstractModel {
.build();
}
/**
 * Tells whether this content is right-to-left.
 *
 * @return {@code true} when the stored direction is {@code rtl}, {@code false} when it is {@code ltr}; for any other
 *         stored value the result of {@link FeedUtils#isRTL(String, String)} applied to the title and content
 */
public boolean isRTL() {
	boolean directionStored = direction == Direction.rtl || direction == Direction.ltr;
	if (!directionStored) {
		// detect on the fly for content that was inserted before the direction field was added
		return FeedUtils.isRTL(title, content);
	}
	return direction == Direction.rtl;
}
}

View File

@@ -47,6 +47,8 @@ public class FeedEntryContentService {
entryContent.setContent(cleaningService.clean(content.content(), baseUrl, false));
entryContent.setAuthor(FeedUtils.truncate(cleaningService.clean(content.author(), baseUrl, true), 128));
entryContent.setCategories(FeedUtils.truncate(content.categories(), 4096));
entryContent.setDirection(
FeedUtils.isRTL(content.title(), content.content()) ? FeedEntryContent.Direction.rtl : FeedEntryContent.Direction.ltr);
Enclosure enclosure = content.enclosure();
if (enclosure != null) {

View File

@@ -128,7 +128,7 @@ public class Entry implements Serializable {
entry.setTags(status.getTags().stream().map(FeedEntryTag::getName).toList());
if (content != null) {
entry.setRtl(FeedUtils.isRTL(feedEntry));
entry.setRtl(content.isRTL());
entry.setTitle(content.getTitle());
entry.setContent(proxyImages ? FeedUtils.proxyImages(content.getContent()) : content.getContent());
entry.setAuthor(content.getAuthor());

View File

@@ -10,4 +10,10 @@
</column>
</addColumn>
</changeSet>
<!-- Adds a "direction" column of type varchar(16) to the FEEDENTRYCONTENTS table. -->
<!-- No default value or constraint is declared here. -->
<changeSet id="content-direction" author="athou">
<addColumn tableName="FEEDENTRYCONTENTS">
<column name="direction" type="varchar(16)" />
</addColumn>
</changeSet>
</databaseChangeLog>