NOTE(review): the pom.xml hunks below lost their XML markup and their tag-only
context lines before this patch text was captured; only the surviving tokens are
shown. Regenerate those hunks from the repository before applying this patch.

diff --git a/pom.xml b/pom.xml
index 59450d66..322b1814 100644
--- a/pom.xml
+++ b/pom.xml
@@ -116,7 +116,7 @@
-
+
 com.commafeed.CommaFeedApplication
@@ -279,11 +279,11 @@
-
- io.dropwizard.metrics
- metrics-graphite
- 3.1.2
-
+
+ io.dropwizard.metrics
+ metrics-graphite
+ 3.1.2
+
 org.apache.httpcomponents
@@ -386,7 +386,11 @@
 jdom2
 2.0.6
-
+
+ org.ahocorasick
+ ahocorasick
+ 0.4.0
+
 org.jsoup
 jsoup
diff --git a/src/main/java/com/commafeed/backend/feed/FeedUtils.java b/src/main/java/com/commafeed/backend/feed/FeedUtils.java
index e0e90c52..7b8ab930 100644
--- a/src/main/java/com/commafeed/backend/feed/FeedUtils.java
+++ b/src/main/java/com/commafeed/backend/feed/FeedUtils.java
@@ -6,6 +6,7 @@ import java.net.URL;
 import java.nio.charset.Charset;
 import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.Collection;
 import java.util.Collections;
 import java.util.Date;
 import java.util.Iterator;
@@ -13,6 +14,9 @@ import java.util.List;
 import java.util.regex.Pattern;
 import java.util.stream.Collectors;
 
+import org.ahocorasick.trie.Emit;
+import org.ahocorasick.trie.Trie;
+import org.ahocorasick.trie.Trie.TrieBuilder;
 import org.apache.commons.codec.binary.Base64;
 import org.apache.commons.lang3.ArrayUtils;
 import org.apache.commons.lang3.StringUtils;
@@ -132,7 +136,59 @@ public class FeedUtils {
 	}
 
+	/**
+	 * Matcher for every keyword in {@link HtmlEntities#HTML_ENTITIES}. It is built once so that the keywords are not
+	 * re-added and the trie is not rebuilt on every call.
+	 * <p>
+	 * NOTE(review): shared by all callers; assumes parseText() does not mutate a built trie (ahocorasick 0.4.0) - confirm.
+	 */
+	private static final Trie HTML_ENTITY_TRIE = buildHtmlEntityTrie();
+
+	/** Builds the trie holding all keys of {@link HtmlEntities#HTML_ENTITIES}, configured to ignore overlapping matches. */
+	private static Trie buildHtmlEntityTrie() {
+		TrieBuilder builder = Trie.builder();
+		builder.ignoreOverlaps();
+
+		for (String key : HtmlEntities.HTML_ENTITIES) {
+			builder.addKeyword(key);
+		}
+
+		return builder.build();
+	}
+
+	/**
+	 * Replaces every occurrence of a key of {@link HtmlEntities#HTML_TO_NUMERIC_MAP} in {@code source} with the value
+	 * mapped to it.
+	 *
+	 * @param source the text to convert, may be {@code null}
+	 * @return the converted text, or {@code source} itself when it is {@code null} or contains no known key
+	 */
 	public static String replaceHtmlEntitiesWithNumericEntities(String source) {
-		return StringUtils.replaceEach(source, HtmlEntities.HTML_ENTITIES, HtmlEntities.NUMERIC_ENTITIES);
+		if (source == null) {
+			// The StringUtils.replaceEach call this code replaces returned null for null input; keep doing so.
+			return null;
+		}
+
+		Collection<Emit> emits = HTML_ENTITY_TRIE.parseText(source);
+		if (emits.isEmpty()) {
+			return source;
+		}
+
+		// Create a buffer sufficiently large that re-allocations are minimized.
+		StringBuilder sb = new StringBuilder(source.length() << 1);
+
+		int prevIndex = 0;
+		for (Emit emit : emits) {
+			// Copy the unmatched text that precedes this match, then the replacement for the match.
+			sb.append(source, prevIndex, emit.getStart());
+			sb.append(HtmlEntities.HTML_TO_NUMERIC_MAP.get(emit.getKeyword()));
+			// getEnd() is used as the index of the last matched character, so resume one position past it.
+			prevIndex = emit.getEnd() + 1;
+		}
+
+		// Add the remainder of the string (contains no more matches).
+		sb.append(source, prevIndex, source.length());
+
+		return sb.toString();
 	}
 
 	/**
diff --git a/src/main/java/com/commafeed/backend/feed/HtmlEntities.java b/src/main/java/com/commafeed/backend/feed/HtmlEntities.java
index 3e927c67..b6735671 100644
--- a/src/main/java/com/commafeed/backend/feed/HtmlEntities.java
+++ b/src/main/java/com/commafeed/backend/feed/HtmlEntities.java
@@ -1,9 +1,11 @@
 package com.commafeed.backend.feed;
 
+import java.util.Collections;
 import java.util.LinkedHashMap;
 import java.util.Map;
 
 public class HtmlEntities {
+	public static final Map<String, String> HTML_TO_NUMERIC_MAP;
 	public static final String[] HTML_ENTITIES;
 	public static final String[] NUMERIC_ENTITIES;
 
@@ -260,6 +262,7 @@ public class HtmlEntities {
 		map.put("&zwj;", "&#8205;");
 		map.put("&zwnj;", "&#8204;");
 
+		HTML_TO_NUMERIC_MAP = Collections.unmodifiableMap(map);
 		HTML_ENTITIES = map.keySet().toArray(new String[map.size()]);
 		NUMERIC_ENTITIES = map.values().toArray(new String[map.size()]);
 	}