forked from Archives/Athou_commafeed
faster replace for large feeds (#881)
This commit is contained in:
18
pom.xml
18
pom.xml
@@ -116,7 +116,7 @@
|
|||||||
</goals>
|
</goals>
|
||||||
<configuration>
|
<configuration>
|
||||||
<transformers>
|
<transformers>
|
||||||
<transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
|
<transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer" />
|
||||||
<transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
|
<transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
|
||||||
<mainClass>com.commafeed.CommaFeedApplication</mainClass>
|
<mainClass>com.commafeed.CommaFeedApplication</mainClass>
|
||||||
</transformer>
|
</transformer>
|
||||||
@@ -279,11 +279,11 @@
|
|||||||
</exclusion>
|
</exclusion>
|
||||||
</exclusions>
|
</exclusions>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>io.dropwizard.metrics</groupId>
|
<groupId>io.dropwizard.metrics</groupId>
|
||||||
<artifactId>metrics-graphite</artifactId>
|
<artifactId>metrics-graphite</artifactId>
|
||||||
<version>3.1.2</version>
|
<version>3.1.2</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.httpcomponents</groupId>
|
<groupId>org.apache.httpcomponents</groupId>
|
||||||
@@ -386,7 +386,11 @@
|
|||||||
<artifactId>jdom2</artifactId>
|
<artifactId>jdom2</artifactId>
|
||||||
<version>2.0.6</version>
|
<version>2.0.6</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.ahocorasick</groupId>
|
||||||
|
<artifactId>ahocorasick</artifactId>
|
||||||
|
<version>0.4.0</version>
|
||||||
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.jsoup</groupId>
|
<groupId>org.jsoup</groupId>
|
||||||
<artifactId>jsoup</artifactId>
|
<artifactId>jsoup</artifactId>
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ import java.net.URL;
|
|||||||
import java.nio.charset.Charset;
|
import java.nio.charset.Charset;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
|
import java.util.Collection;
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
import java.util.Date;
|
import java.util.Date;
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
@@ -13,6 +14,9 @@ import java.util.List;
|
|||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import org.ahocorasick.trie.Emit;
|
||||||
|
import org.ahocorasick.trie.Trie;
|
||||||
|
import org.ahocorasick.trie.Trie.TrieBuilder;
|
||||||
import org.apache.commons.codec.binary.Base64;
|
import org.apache.commons.codec.binary.Base64;
|
||||||
import org.apache.commons.lang3.ArrayUtils;
|
import org.apache.commons.lang3.ArrayUtils;
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
@@ -132,7 +136,32 @@ public class FeedUtils {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public static String replaceHtmlEntitiesWithNumericEntities(String source) {
|
public static String replaceHtmlEntitiesWithNumericEntities(String source) {
|
||||||
return StringUtils.replaceEach(source, HtmlEntities.HTML_ENTITIES, HtmlEntities.NUMERIC_ENTITIES);
|
// Create a buffer sufficiently large that re-allocations are minimized.
|
||||||
|
StringBuilder sb = new StringBuilder(source.length() << 1);
|
||||||
|
|
||||||
|
TrieBuilder builder = Trie.builder();
|
||||||
|
builder.ignoreOverlaps();
|
||||||
|
|
||||||
|
for (String key : HtmlEntities.HTML_ENTITIES) {
|
||||||
|
builder.addKeyword(key);
|
||||||
|
}
|
||||||
|
|
||||||
|
Trie trie = builder.build();
|
||||||
|
Collection<Emit> emits = trie.parseText(source);
|
||||||
|
|
||||||
|
int prevIndex = 0;
|
||||||
|
for (Emit emit : emits) {
|
||||||
|
int matchIndex = emit.getStart();
|
||||||
|
|
||||||
|
sb.append(source.substring(prevIndex, matchIndex));
|
||||||
|
sb.append(HtmlEntities.HTML_TO_NUMERIC_MAP.get(emit.getKeyword()));
|
||||||
|
prevIndex = emit.getEnd() + 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Add the remainder of the string (contains no more matches).
|
||||||
|
sb.append(source.substring(prevIndex));
|
||||||
|
|
||||||
|
return sb.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
@@ -1,9 +1,11 @@
|
|||||||
package com.commafeed.backend.feed;
|
package com.commafeed.backend.feed;
|
||||||
|
|
||||||
|
import java.util.Collections;
|
||||||
import java.util.LinkedHashMap;
|
import java.util.LinkedHashMap;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
public class HtmlEntities {
|
public class HtmlEntities {
|
||||||
|
public static final Map<String, String> HTML_TO_NUMERIC_MAP;
|
||||||
public static final String[] HTML_ENTITIES;
|
public static final String[] HTML_ENTITIES;
|
||||||
public static final String[] NUMERIC_ENTITIES;
|
public static final String[] NUMERIC_ENTITIES;
|
||||||
|
|
||||||
@@ -260,6 +262,7 @@ public class HtmlEntities {
|
|||||||
map.put("&zwj;", "&#8205;");
|
map.put("&zwj;", "&#8205;");
|
||||||
map.put("&zwnj;", "&#8204;");
|
map.put("&zwnj;", "&#8204;");
|
||||||
|
|
||||||
|
HTML_TO_NUMERIC_MAP = Collections.unmodifiableMap(map);
|
||||||
HTML_ENTITIES = map.keySet().toArray(new String[map.size()]);
|
HTML_ENTITIES = map.keySet().toArray(new String[map.size()]);
|
||||||
NUMERIC_ENTITIES = map.values().toArray(new String[map.size()]);
|
NUMERIC_ENTITIES = map.values().toArray(new String[map.size()]);
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user