faster replace for large feeds (#881)

This commit is contained in:
Athou
2018-07-11 17:07:55 +02:00
parent 72083b7e87
commit b3ce43eaf7
3 changed files with 44 additions and 8 deletions

View File

@@ -6,6 +6,7 @@ import java.net.URL;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Date;
import java.util.Iterator;
@@ -13,6 +14,9 @@ import java.util.List;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import org.ahocorasick.trie.Emit;
import org.ahocorasick.trie.Trie;
import org.ahocorasick.trie.Trie.TrieBuilder;
import org.apache.commons.codec.binary.Base64;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.StringUtils;
@@ -132,7 +136,32 @@ public class FeedUtils {
}
public static String replaceHtmlEntitiesWithNumericEntities(String source) {
return StringUtils.replaceEach(source, HtmlEntities.HTML_ENTITIES, HtmlEntities.NUMERIC_ENTITIES);
// Create a buffer sufficiently large that re-allocations are minimized.
StringBuilder sb = new StringBuilder(source.length() << 1);
TrieBuilder builder = Trie.builder();
builder.ignoreOverlaps();
for (String key : HtmlEntities.HTML_ENTITIES) {
builder.addKeyword(key);
}
Trie trie = builder.build();
Collection<Emit> emits = trie.parseText(source);
int prevIndex = 0;
for (Emit emit : emits) {
int matchIndex = emit.getStart();
sb.append(source.substring(prevIndex, matchIndex));
sb.append(HtmlEntities.HTML_TO_NUMERIC_MAP.get(emit.getKeyword()));
prevIndex = emit.getEnd() + 1;
}
// Add the remainder of the string (contains no more matches).
sb.append(source.substring(prevIndex));
return sb.toString();
}
/**

View File

@@ -1,9 +1,11 @@
package com.commafeed.backend.feed;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.Map;
public class HtmlEntities {
public static final Map<String, String> HTML_TO_NUMERIC_MAP;
public static final String[] HTML_ENTITIES;
public static final String[] NUMERIC_ENTITIES;
@@ -260,6 +262,7 @@ public class HtmlEntities {
map.put("&zwj;", "&#8205;");
map.put("&zwnj;", "&#8204;");
HTML_TO_NUMERIC_MAP = Collections.unmodifiableMap(map);
HTML_ENTITIES = map.keySet().toArray(new String[map.size()]);
NUMERIC_ENTITIES = map.values().toArray(new String[map.size()]);
}