only compute rtl once by storing it in the database on fetch

This commit is contained in:
Athou
2024-09-13 22:22:41 +02:00
parent 9a89b39b62
commit f4a43e9950
6 changed files with 42 additions and 20 deletions

View File

@@ -17,7 +17,6 @@ import org.netpreserve.urlcanon.ParsedUrl;
import com.commafeed.backend.feed.FeedEntryKeyword.Mode; import com.commafeed.backend.feed.FeedEntryKeyword.Mode;
import com.commafeed.backend.feed.parser.TextDirectionDetector; import com.commafeed.backend.feed.parser.TextDirectionDetector;
import com.commafeed.backend.model.FeedEntry;
import com.commafeed.backend.model.FeedSubscription; import com.commafeed.backend.model.FeedSubscription;
import com.commafeed.frontend.model.Entry; import com.commafeed.frontend.model.Entry;
@@ -92,24 +91,18 @@ public class FeedUtils {
return normalized; return normalized;
} }
public static boolean isRTL(FeedEntry entry) { public static boolean isRTL(String title, String content) {
String text = entry.getContent().getContent(); String text = StringUtils.isNotBlank(content) ? content : title;
if (StringUtils.isBlank(text)) {
text = entry.getContent().getTitle();
}
if (StringUtils.isBlank(text)) { if (StringUtils.isBlank(text)) {
return false; return false;
} }
text = Jsoup.parse(text).text(); String stripped = Jsoup.parse(text).text();
if (StringUtils.isBlank(text)) { if (StringUtils.isBlank(stripped)) {
return false; return false;
} }
TextDirectionDetector.Direction direction = TextDirectionDetector.detect(text); return TextDirectionDetector.detect(stripped) == TextDirectionDetector.Direction.RIGHT_TO_LEFT;
return direction == TextDirectionDetector.Direction.RIGHT_TO_LEFT;
} }
public static String removeTrailingSlash(String url) { public static String removeTrailingSlash(String url) {

View File

@@ -1,7 +1,6 @@
package com.commafeed.backend.feed.parser; package com.commafeed.backend.feed.parser;
import java.text.Bidi; import java.text.Bidi;
import java.util.concurrent.atomic.AtomicLong;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import org.apache.commons.lang3.math.NumberUtils; import org.apache.commons.lang3.math.NumberUtils;
@@ -22,8 +21,8 @@ public class TextDirectionDetector {
return Direction.LEFT_TO_RIGHT; return Direction.LEFT_TO_RIGHT;
} }
AtomicLong rtl = new AtomicLong(); long rtl = 0;
AtomicLong total = new AtomicLong(); long total = 0;
for (String token : WORDS_PATTERN.split(input)) { for (String token : WORDS_PATTERN.split(input)) {
// skip urls // skip urls
if (URL_PATTERN.matcher(token).matches()) { if (URL_PATTERN.matcher(token).matches()) {
@@ -39,18 +38,18 @@ public class TextDirectionDetector {
if (requiresBidi) { if (requiresBidi) {
Bidi bidi = new Bidi(token, Bidi.DIRECTION_DEFAULT_LEFT_TO_RIGHT); Bidi bidi = new Bidi(token, Bidi.DIRECTION_DEFAULT_LEFT_TO_RIGHT);
if (bidi.getBaseLevel() == 1) { if (bidi.getBaseLevel() == 1) {
rtl.incrementAndGet(); rtl++;
} }
} }
total.incrementAndGet(); total++;
} }
if (total.longValue() == 0) { if (total == 0) {
return Direction.LEFT_TO_RIGHT; return Direction.LEFT_TO_RIGHT;
} }
double ratio = rtl.doubleValue() / total.doubleValue(); double ratio = (double) rtl / total;
return ratio > RTL_THRESHOLD ? Direction.RIGHT_TO_LEFT : Direction.LEFT_TO_RIGHT; return ratio > RTL_THRESHOLD ? Direction.RIGHT_TO_LEFT : Direction.LEFT_TO_RIGHT;
} }

View File

@@ -6,8 +6,12 @@ import java.util.Set;
import org.apache.commons.lang3.builder.EqualsBuilder; import org.apache.commons.lang3.builder.EqualsBuilder;
import org.hibernate.annotations.JdbcTypeCode; import org.hibernate.annotations.JdbcTypeCode;
import com.commafeed.backend.feed.FeedUtils;
import jakarta.persistence.Column; import jakarta.persistence.Column;
import jakarta.persistence.Entity; import jakarta.persistence.Entity;
import jakarta.persistence.EnumType;
import jakarta.persistence.Enumerated;
import jakarta.persistence.Lob; import jakarta.persistence.Lob;
import jakarta.persistence.OneToMany; import jakarta.persistence.OneToMany;
import jakarta.persistence.Table; import jakarta.persistence.Table;
@@ -21,6 +25,10 @@ import lombok.Setter;
@Setter @Setter
public class FeedEntryContent extends AbstractModel { public class FeedEntryContent extends AbstractModel {
/**
 * Text direction recorded for a piece of entry content.
 * <p>
 * The constants are persisted by name (the {@code direction} field is mapped with {@code EnumType.STRING}), so
 * renaming one changes what is written to the database. {@code unknown} is the initial value of the
 * {@code direction} field; {@code FeedEntryContent#isRTL()} falls back to detecting the direction from the title
 * and content for any value other than {@code rtl} or {@code ltr}.
 */
public enum Direction {
ltr, rtl, unknown
}
@Column(length = 2048) @Column(length = 2048)
private String title; private String title;
@@ -58,6 +66,10 @@ public class FeedEntryContent extends AbstractModel {
@Column(length = 4096) @Column(length = 4096)
private String categories; private String categories;
/**
 * Stored text direction of this content, written as the enum constant's name. New instances start as
 * {@code unknown}; {@code isRTL()} detects the direction on the fly when the value is neither {@code rtl} nor
 * {@code ltr}.
 */
@Column
@Enumerated(EnumType.STRING)
private Direction direction = Direction.unknown;
@OneToMany(mappedBy = "content") @OneToMany(mappedBy = "content")
private Set<FeedEntry> entries; private Set<FeedEntry> entries;
@@ -79,4 +91,14 @@ public class FeedEntryContent extends AbstractModel {
.build(); .build();
} }
/**
 * Tells whether this content should be rendered right-to-left.
 * <p>
 * A stored direction of {@code rtl} or {@code ltr} is trusted as-is. For any other value the direction is detected
 * on the fly from the title and content, which covers content that was inserted before the direction field was
 * added.
 *
 * @return {@code true} if the stored or detected direction is right-to-left
 */
public boolean isRTL() {
	boolean directionIsStored = direction == Direction.rtl || direction == Direction.ltr;
	if (!directionIsStored) {
		// no usable stored value: fall back to detection
		return FeedUtils.isRTL(title, content);
	}
	return direction == Direction.rtl;
}
} }

View File

@@ -47,6 +47,8 @@ public class FeedEntryContentService {
entryContent.setContent(cleaningService.clean(content.content(), baseUrl, false)); entryContent.setContent(cleaningService.clean(content.content(), baseUrl, false));
entryContent.setAuthor(FeedUtils.truncate(cleaningService.clean(content.author(), baseUrl, true), 128)); entryContent.setAuthor(FeedUtils.truncate(cleaningService.clean(content.author(), baseUrl, true), 128));
entryContent.setCategories(FeedUtils.truncate(content.categories(), 4096)); entryContent.setCategories(FeedUtils.truncate(content.categories(), 4096));
entryContent.setDirection(
FeedUtils.isRTL(content.title(), content.content()) ? FeedEntryContent.Direction.rtl : FeedEntryContent.Direction.ltr);
Enclosure enclosure = content.enclosure(); Enclosure enclosure = content.enclosure();
if (enclosure != null) { if (enclosure != null) {

View File

@@ -128,7 +128,7 @@ public class Entry implements Serializable {
entry.setTags(status.getTags().stream().map(FeedEntryTag::getName).toList()); entry.setTags(status.getTags().stream().map(FeedEntryTag::getName).toList());
if (content != null) { if (content != null) {
entry.setRtl(FeedUtils.isRTL(feedEntry)); entry.setRtl(content.isRTL());
entry.setTitle(content.getTitle()); entry.setTitle(content.getTitle());
entry.setContent(proxyImages ? FeedUtils.proxyImages(content.getContent()) : content.getContent()); entry.setContent(proxyImages ? FeedUtils.proxyImages(content.getContent()) : content.getContent());
entry.setAuthor(content.getAuthor()); entry.setAuthor(content.getAuthor());

View File

@@ -10,4 +10,10 @@
</column> </column>
</addColumn> </addColumn>
</changeSet> </changeSet>
<!-- Adds a "direction" varchar(16) column to FEEDENTRYCONTENTS. No default value or constraints are declared
     here. NOTE(review): presumably backs FeedEntryContent.direction (enum stored by name); confirm the mapping. -->
<changeSet id="content-direction" author="athou">
<addColumn tableName="FEEDENTRYCONTENTS">
<column name="direction" type="varchar(16)" />
</addColumn>
</changeSet>
</databaseChangeLog> </databaseChangeLog>