From 35bc63c66eb43352af738c7a3e8c19b3a3b5c603 Mon Sep 17 00:00:00 2001
From: William Ting <io@williamting.com>
Date: Tue, 7 Jan 2014 11:44:44 -0600
Subject: [PATCH] Fix encoding issues.

The original implementation used str.encode() on input and str.decode() on
output. However this would cause UnicodeDecodeError since certain characters
can't be encoded / decoded in ASCII.

The new solution is to use unicode() on all input strings and output UTF-8
encoded strings. This makes the assumption that the shell can handle UTF-8
strings.
---
 Makefile                     |  3 ++-
 bin/autojump                 | 26 ++++++++++-----------
 bin/autojump_data.py         |  7 ++----
 bin/autojump_utils.py        | 44 ++++++++++++++++--------------------
 tests/autojump_utils_test.py | 34 ++++++++++++++++++++++++----
 5 files changed, 66 insertions(+), 48 deletions(-)

diff --git a/Makefile b/Makefile
index 5eb2329..3337a2f 100644
--- a/Makefile
+++ b/Makefile
@@ -39,4 +39,5 @@ tar:
 	sha1sum autojump_v$(VERSION).tar.gz
 
 test:
-	testify -v tests
+	@find . -type f -iname "*.pyc" -delete
+	testify -v tests -x disabled
diff --git a/bin/autojump b/bin/autojump
index 82a2bb3..df716c8 100755
--- a/bin/autojump
+++ b/bin/autojump
@@ -45,8 +45,6 @@ from autojump_data import entriefy
 from autojump_data import Entry
 from autojump_data import load
 from autojump_data import save
-from autojump_utils import decode
-from autojump_utils import encode_local
 from autojump_utils import first
 from autojump_utils import get_tab_entry_info
 from autojump_utils import get_pwd
@@ -54,9 +52,11 @@ from autojump_utils import has_uppercase
 from autojump_utils import is_osx
 from autojump_utils import last
 from autojump_utils import print_entry
+from autojump_utils import print_local
 from autojump_utils import print_tab_menu
 from autojump_utils import sanitize
 from autojump_utils import take
+from autojump_utils import unico
 
 VERSION = '22.0.0-alpha'
 FUZZY_MATCH_THRESHOLD = 0.6
@@ -131,7 +131,7 @@ def add_path(data, path, weight=10):
     with resulting duplicate entries in the database than a single canonical
     path.
     """
-    path = decode(path).rstrip(os.sep)
+    path = unico(path).rstrip(os.sep)
     if path == os.path.expanduser('~'):
         return data, Entry(path, 0)
 
@@ -142,7 +142,7 @@ def add_path(data, path, weight=10):
 
 def decrease_path(data, path, weight=15):
     """Decrease or zero out a path."""
-    path = decode(path).rstrip(os.sep)
+    path = unico(path).rstrip(os.sep)
     data[path] = max(0, data.get(path, 0) - weight)
     return data, Entry(path, data[path])
 
@@ -189,11 +189,10 @@ def handle_tab_completion(needle, entries):
     tab_needle, tab_index, tab_path = get_tab_entry_info(needle, TAB_SEPARATOR)
 
     if tab_path:
-        print(encode_local(tab_path))
+        print_local(tab_path)
     elif tab_index:
         get_ith_path = lambda i, iterable: last(take(i, iterable)).path
-        print(encode_local(
-            get_ith_path(tab_index, find_matches(entries, tab_needle))))
+        print_local(get_ith_path(tab_index, find_matches(entries, tab_needle)))
     elif tab_needle:
         # found partial tab completion entry
         print_tab_menu(
@@ -326,7 +325,8 @@ def print_stats(data, data_path):
     print("%d:\t number of entries" % len(data))
 
     try:
-        print("%.2f:\t current directory weight" % data.get(os.getcwdu(), 0))
+        print_local(
+                "%.2f:\t current directory weight" % data.get(os.getcwdu(), 0))
     except OSError:
         # current directory no longer exists
         pass
@@ -362,7 +362,7 @@ def main(args):  # noqa
     elif not args.directory:
         # default return value so calling shell functions have an argument
         # to `cd` to
-        print(encode_local('.'))
+        print_local('.')
     else:
         entries = entriefy(load(config))
         needles = sanitize(args.directory)
@@ -370,13 +370,13 @@ def main(args):  # noqa
             get_tab_entry_info(first(needles), TAB_SEPARATOR)
 
         if tab_path:
-            print(encode_local(tab_path))
+            print_local(tab_path)
         elif tab_index:
             get_ith_path = lambda i, iterable: last(take(i, iterable)).path
-            print(encode_local(
-                get_ith_path(tab_index, find_matches(entries, tab_needle))))
+            print_local(
+                    get_ith_path(tab_index, find_matches(entries, tab_needle)))
         else:
-            print(encode_local(first(find_matches(entries, needles)).path))
+            print_local(first(find_matches(entries, needles)).path)
 
     return 0
 
diff --git a/bin/autojump_data.py b/bin/autojump_data.py
index 9b806a6..21ec70d 100644
--- a/bin/autojump_data.py
+++ b/bin/autojump_data.py
@@ -17,6 +17,7 @@ else:
     from itertools import imap
 
 from autojump_utils import create_dir
+from autojump_utils import unico
 from autojump_utils import is_osx
 from autojump_utils import is_python3
 from autojump_utils import move_file
@@ -124,11 +125,7 @@ def save(config, data):
                 encoding='utf-8',
                 errors='replace') as f:
             for path, weight in data.items():
-                if is_python3():
-                    f.write(("%s\t%s\n" % (weight, path)))
-                else:
-                    f.write(unicode(
-                        "%s\t%s\n" % (weight, path)).encode('utf-8'))
+                f.write(unico("%s\t%s\n" % (weight, path)))
 
             f.flush()
             os.fsync(f)
diff --git a/bin/autojump_utils.py b/bin/autojump_utils.py
index 1d88e1d..48f2185 100644
--- a/bin/autojump_utils.py
+++ b/bin/autojump_utils.py
@@ -28,27 +28,9 @@ def create_dir(path):
             raise
 
 
-def decode(string):
-    """Converts byte string to Unicode string."""
-    if is_python2():
-        # Python 2.6 does not support kwargs
-        return string.decode('utf-8', 'replace')
-    return string
-
-
-def encode(string):
-    """Converts Unicode string to byte string."""
-    if is_python2():
-        # Python 2.6 does not support kwargs
-        return string.encode('utf-8', 'replace')
-    return string
-
-
-def encode_local(string, encoding=None):
-    """Converts string into local filesystem encoding."""
-    if is_python2():
-        return decode(string).encode(encoding or sys.getfilesystemencoding())
-    return string
+def encode_local(string):
+    """Converts string into user's preferred encoding."""
+    return string.encode(sys.getfilesystemencoding() or 'utf-8')
 
 
 def first(xs):
@@ -153,7 +135,11 @@ def move_file(src, dst):
 
 
 def print_entry(entry):
-    print(encode_local("%.1f:\t%s" % (entry.weight, entry.path)))
+    print_local("%.1f:\t%s" % (entry.weight, entry.path))
+
+
+def print_local(string):
+    print(encode_local(string))
 
 
 def print_tab_menu(needle, tab_entries, separator):
@@ -166,17 +152,18 @@ def print_tab_menu(needle, tab_entries, separator):
     on subsequent calls.
     """
     for i, entry in enumerate(tab_entries):
-        print(encode_local(
+        print_local(
             '%s%s%d%s%s' % (
                 needle,
                 separator,
                 i + 1,
                 separator,
-                entry.path)))
+                entry.path))
 
 
 def sanitize(directories):
-    clean = lambda x: decode(x) if len(x) == 1 else decode(x).rstrip(os.sep)
+    # edge case to allow '/' as a valid path
+    clean = lambda x: unico(x) if x == os.sep else unico(x).rstrip(os.sep)
     return list(imap(clean, directories))
 
 
@@ -203,3 +190,10 @@ def surround_quotes(string):
 def take(n, iterable):
     """Return first n items of an iterable."""
     return islice(iterable, n)
+
+
+def unico(string):
+    """Converts into Unicode string."""
+    if is_python2() and not isinstance(string, unicode):
+        return unicode(string, encoding='utf-8', errors='replace')
+    return string
diff --git a/tests/autojump_utils_test.py b/tests/autojump_utils_test.py
index 235d5d9..a07e4e3 100644
--- a/tests/autojump_utils_test.py
+++ b/tests/autojump_utils_test.py
@@ -5,6 +5,7 @@ from shutil import rmtree
 from tempfile import gettempdir
 from tempfile import mkdtemp
 import os
+import sys
 
 import mock
 from testify import TestCase
@@ -16,11 +17,12 @@ from testify import class_setup
 from testify import class_teardown
 from testify import run
 from testify import setup
+from testify import suite
 from testify import teardown
 
 import autojump_utils
 from autojump_utils import create_dir
-from autojump_utils import decode
+from autojump_utils import encode_local
 from autojump_utils import first
 from autojump_utils import get_pwd
 from autojump_utils import get_tab_entry_info
@@ -32,12 +34,31 @@ from autojump_utils import sanitize
 from autojump_utils import second
 from autojump_utils import surround_quotes
 from autojump_utils import take
+from autojump_utils import unico
 
 
 class StringUnitTests(TestCase):
-    def test_decode(self):
-        assert_equal(decode(r'blah'), u'blah')
-        assert_equal(decode(r'日本語'), u'日本語')
+    @mock.patch.object(sys, 'getfilesystemencoding', return_value='ascii')
+    def test_encode_local_ascii(self, _):
+        assert_equal(encode_local(u'foo'), b'foo')
+
+    @suite('disabled', reason='#246')
+    def test_encode_local_ascii_fails(self):
+        with assert_raises(UnicodeDecodeError):
+            with mock.patch.object(
+                    sys,
+                    'getfilesystemencoding',
+                    return_value='ascii'):
+                encode_local(u'日本語')
+
+    @mock.patch.object(sys, 'getfilesystemencoding', return_value=None)
+    def test_encode_local_empty(self, _):
+        assert_equal(encode_local(b'foo'), u'foo')
+
+    @mock.patch.object(sys, 'getfilesystemencoding', return_value='utf-8')
+    def test_encode_local_unicode(self, _):
+        assert_equal(encode_local(b'foo'), u'foo')
+        assert_equal(encode_local(u'foo'), u'foo')
 
     def test_has_uppercase(self):
         assert_true(has_uppercase('Foo'))
@@ -57,6 +78,11 @@ class StringUnitTests(TestCase):
         assert_equal(sanitize([]), [])
         assert_equal(sanitize([r'/foo/bar/', r'/']), [u'/foo/bar', u'/'])
 
+    def test_unico(self):
+        assert_equal(unico(b'blah'), u'blah')
+        assert_equal(unico(b'日本語'), u'日本語')
+        assert_equal(unico(u'でもおれは中国人だ。'), u'でもおれは中国人だ。')
+
 
 class IterationUnitTests(TestCase):
     def test_first(self):