From 1239725083aa039118a181b4f1f681ac03565a8c Mon Sep 17 00:00:00 2001
From: Athou
Date: Wed, 19 Jun 2013 15:23:40 +0200
Subject: [PATCH] fix character detection for eastern europe languages (fix #138)

---
 .../commafeed/backend/feeds/FeedUtils.java    | 63 ++++++++++++++++++-
 1 file changed, 61 insertions(+), 2 deletions(-)

diff --git a/src/main/java/com/commafeed/backend/feeds/FeedUtils.java b/src/main/java/com/commafeed/backend/feeds/FeedUtils.java
index 2ebeb323..09a3d96c 100644
--- a/src/main/java/com/commafeed/backend/feeds/FeedUtils.java
+++ b/src/main/java/com/commafeed/backend/feeds/FeedUtils.java
@@ -9,6 +9,7 @@ import java.util.Collections;
 import java.util.Date;
 import java.util.List;
 
+import org.apache.commons.lang.ArrayUtils;
 import org.apache.commons.lang.ObjectUtils;
 import org.apache.commons.lang.StringUtils;
 import org.apache.commons.lang3.time.DateUtils;
@@ -49,7 +50,29 @@ public class FeedUtils {
 		return string;
 	}
 
+	/**
+	 * Guess the feed encoding. A declared iso-8859-x encoding is trusted,
+	 * except iso-8859-1 which, like every other case, is left to
+	 * {@link #detectEncoding(byte[])}.
+	 *
+	 * @param bytes
+	 *            the raw feed content
+	 * @return the encoding name
+	 */
 	public static String guessEncoding(byte[] bytes) {
+		String extracted = extractDeclaredEncoding(bytes);
+		// exact comparison: a suffix test on "1" would also reject iso-8859-11
+		if (StringUtils.startsWithIgnoreCase(extracted, "iso-8859-")
+				&& !StringUtils.equalsIgnoreCase(extracted, "iso-8859-1")) {
+			return extracted;
+		}
+		return detectEncoding(bytes);
+	}
+
+	/**
+	 * Detect encoding by analyzing characters in the array
+	 */
+	public static String detectEncoding(byte[] bytes) {
 		String DEFAULT_ENCODING = "UTF-8";
 		UniversalDetector detector = new UniversalDetector(null);
 		detector.handleData(bytes, 0, bytes.length);
@@ -58,13 +81,49 @@ public class FeedUtils {
 		detector.reset();
 		if (encoding == null) {
 			encoding = DEFAULT_ENCODING;
-		} else if (encoding.equalsIgnoreCase("ISO-8859-1")
-				|| encoding.equalsIgnoreCase("ISO-8859-2")) {
+		} else if (encoding.equalsIgnoreCase("ISO-8859-1")) {
 			encoding = "windows-1252";
 		}
 		return encoding;
 	}
 
+	/**
+	 * Extract the encoding declared in the xml, looking no further than the
+	 * first '&gt;' character. Both quote characters are accepted.
+	 *
+	 * @param bytes
+	 *            the raw feed content, may be null
+	 * @return the declared encoding, or null if there is no complete
+	 *         declaration
+	 */
+	public static String extractDeclaredEncoding(byte[] bytes) {
+		int index = ArrayUtils.indexOf(bytes, (byte) '>');
+		if (index == -1) {
+			return null;
+		}
+
+		// decode one char per byte instead of relying on the platform charset
+		String pi = new String(bytes, 0, index + 1,
+				java.nio.charset.Charset.forName("ISO-8859-1"));
+		String attribute = "encoding=";
+		int start = pi.indexOf(attribute);
+		if (start == -1) {
+			return null;
+		}
+		// pi ends with '>', so there is always a char after the attribute
+		start += attribute.length();
+		char quote = pi.charAt(start);
+		if (quote != '"' && quote != '\'') {
+			return null;
+		}
+		int end = pi.indexOf(quote, start + 1);
+		if (end == -1) {
+			// unterminated value
+			return null;
+		}
+		return pi.substring(start + 1, end);
+	}
+
 	public static String handleContent(String content, String baseUri) {
 		if (StringUtils.isNotBlank(content)) {
 			baseUri = StringUtils.trimToEmpty(baseUri);