From 35bc63c66eb43352af738c7a3e8c19b3a3b5c603 Mon Sep 17 00:00:00 2001 From: William Ting Date: Tue, 7 Jan 2014 11:44:44 -0600 Subject: [PATCH] Fix encoding issues. The original implementation used str.encode() on input and str.decode() on output. However this would cause UnicodeDecodeError since certain characters can't be encoded / decoded in ASCII. The new solution is to use unicode() on all input strings and output UTF-8 encoded strings. This makes the assumption that the shell can handle UTF-8 strings. --- Makefile | 3 ++- bin/autojump | 26 ++++++++++----------- bin/autojump_data.py | 7 ++---- bin/autojump_utils.py | 44 ++++++++++++++++-------------------- tests/autojump_utils_test.py | 34 ++++++++++++++++++++++++---- 5 files changed, 66 insertions(+), 48 deletions(-) diff --git a/Makefile b/Makefile index 5eb2329..3337a2f 100644 --- a/Makefile +++ b/Makefile @@ -39,4 +39,5 @@ tar: sha1sum autojump_v$(VERSION).tar.gz test: - testify -v tests + @find . -type f -iname "*.pyc" -delete + testify -v tests -x disabled diff --git a/bin/autojump b/bin/autojump index 82a2bb3..df716c8 100755 --- a/bin/autojump +++ b/bin/autojump @@ -45,8 +45,6 @@ from autojump_data import entriefy from autojump_data import Entry from autojump_data import load from autojump_data import save -from autojump_utils import decode -from autojump_utils import encode_local from autojump_utils import first from autojump_utils import get_tab_entry_info from autojump_utils import get_pwd @@ -54,9 +52,11 @@ from autojump_utils import has_uppercase from autojump_utils import is_osx from autojump_utils import last from autojump_utils import print_entry +from autojump_utils import print_local from autojump_utils import print_tab_menu from autojump_utils import sanitize from autojump_utils import take +from autojump_utils import unico VERSION = '22.0.0-alpha' FUZZY_MATCH_THRESHOLD = 0.6 @@ -131,7 +131,7 @@ def add_path(data, path, weight=10): with resulting duplicate entries in the database than a single canonical path. """ - path = decode(path).rstrip(os.sep) + path = unico(path).rstrip(os.sep) if path == os.path.expanduser('~'): return data, Entry(path, 0) @@ -142,7 +142,7 @@ def add_path(data, path, weight=10): def decrease_path(data, path, weight=15): """Decrease or zero out a path.""" - path = decode(path).rstrip(os.sep) + path = unico(path).rstrip(os.sep) data[path] = max(0, data.get(path, 0) - weight) return data, Entry(path, data[path]) @@ -189,11 +189,10 @@ def handle_tab_completion(needle, entries): tab_needle, tab_index, tab_path = get_tab_entry_info(needle, TAB_SEPARATOR) if tab_path: - print(encode_local(tab_path)) + print_local(tab_path) elif tab_index: get_ith_path = lambda i, iterable: last(take(i, iterable)).path - print(encode_local( - get_ith_path(tab_index, find_matches(entries, tab_needle)))) + print_local(get_ith_path(tab_index, find_matches(entries, tab_needle))) elif tab_needle: # found partial tab completion entry print_tab_menu( @@ -326,7 +325,8 @@ def print_stats(data, data_path): print("%d:\t number of entries" % len(data)) try: - print("%.2f:\t current directory weight" % data.get(os.getcwdu(), 0)) + print_local( + "%.2f:\t current directory weight" % data.get(os.getcwdu(), 0)) except OSError: # current directory no longer exists pass @@ -362,7 +362,7 @@ def main(args): # noqa elif not args.directory: # default return value so calling shell functions have an argument # to `cd` to - print(encode_local('.')) + print_local('.') else: entries = entriefy(load(config)) needles = sanitize(args.directory) @@ -370,13 +370,13 @@ def main(args): # noqa get_tab_entry_info(first(needles), TAB_SEPARATOR) if tab_path: - print(encode_local(tab_path)) + print_local(tab_path) elif tab_index: get_ith_path = lambda i, iterable: last(take(i, iterable)).path - print(encode_local( - get_ith_path(tab_index, find_matches(entries, tab_needle)))) + print_local( + get_ith_path(tab_index, find_matches(entries, tab_needle))) else: - print(encode_local(first(find_matches(entries, needles)).path)) + print_local(first(find_matches(entries, needles)).path) return 0 diff --git a/bin/autojump_data.py b/bin/autojump_data.py index 9b806a6..21ec70d 100644 --- a/bin/autojump_data.py +++ b/bin/autojump_data.py @@ -17,6 +17,7 @@ else: from itertools import imap from autojump_utils import create_dir +from autojump_utils import unico from autojump_utils import is_osx from autojump_utils import is_python3 from autojump_utils import move_file @@ -124,11 +125,7 @@ def save(config, data): encoding='utf-8', errors='replace') as f: for path, weight in data.items(): - if is_python3(): - f.write(("%s\t%s\n" % (weight, path))) - else: - f.write(unicode( - "%s\t%s\n" % (weight, path)).encode('utf-8')) + f.write(unico("%s\t%s\n" % (weight, path))) f.flush() os.fsync(f) diff --git a/bin/autojump_utils.py b/bin/autojump_utils.py index 1d88e1d..48f2185 100644 --- a/bin/autojump_utils.py +++ b/bin/autojump_utils.py @@ -28,27 +28,9 @@ def create_dir(path): raise -def decode(string): - """Converts byte string to Unicode string.""" - if is_python2(): - # Python 2.6 does not support kwargs - return string.decode('utf-8', 'replace') - return string - - -def encode(string): - """Converts Unicode string to byte string.""" - if is_python2(): - # Python 2.6 does not support kwargs - return string.encode('utf-8', 'replace') - return string - - -def encode_local(string, encoding=None): - """Converts string into local filesystem encoding.""" - if is_python2(): - return decode(string).encode(encoding or sys.getfilesystemencoding()) - return string +def encode_local(string): + """Converts string into user's preferred encoding.""" + return string.encode(sys.getfilesystemencoding() or 'utf-8') def first(xs): @@ -153,7 +135,11 @@ def move_file(src, dst): def print_entry(entry): - print(encode_local("%.1f:\t%s" % (entry.weight, entry.path))) + print_local("%.1f:\t%s" % (entry.weight, entry.path)) + + +def print_local(string): + print(encode_local(string)) def print_tab_menu(needle, tab_entries, separator): @@ -166,17 +152,18 @@ def print_tab_menu(needle, tab_entries, separator): on subsequent calls. """ for i, entry in enumerate(tab_entries): - print(encode_local( + print_local( '%s%s%d%s%s' % ( needle, separator, i + 1, separator, - entry.path))) + entry.path)) def sanitize(directories): - clean = lambda x: decode(x) if len(x) == 1 else decode(x).rstrip(os.sep) + # edge case to allow '/' as a valid path + clean = lambda x: unico(x) if x == os.sep else unico(x).rstrip(os.sep) return list(imap(clean, directories)) @@ -203,3 +190,10 @@ def surround_quotes(string): def take(n, iterable): """Return first n items of an iterable.""" return islice(iterable, n) + + +def unico(string): + """Converts into Unicode string.""" + if is_python2() and not isinstance(string, unicode): + return unicode(string, encoding='utf-8', errors='replace') + return string diff --git a/tests/autojump_utils_test.py b/tests/autojump_utils_test.py index 235d5d9..a07e4e3 100644 --- a/tests/autojump_utils_test.py +++ b/tests/autojump_utils_test.py @@ -5,6 +5,7 @@ from shutil import rmtree from tempfile import gettempdir from tempfile import mkdtemp import os +import sys import mock from testify import TestCase @@ -16,11 +17,12 @@ from testify import class_setup from testify import class_teardown from testify import run from testify import setup +from testify import suite from testify import teardown import autojump_utils from autojump_utils import create_dir -from autojump_utils import decode +from autojump_utils import encode_local from autojump_utils import first from autojump_utils import get_pwd from autojump_utils import get_tab_entry_info @@ -32,12 +34,31 @@ from autojump_utils import sanitize from autojump_utils import second from autojump_utils import surround_quotes from autojump_utils import take +from autojump_utils import unico class StringUnitTests(TestCase): - def test_decode(self): - assert_equal(decode(r'blah'), u'blah') - assert_equal(decode(r'日本語'), u'日本語') + @mock.patch.object(sys, 'getfilesystemencoding', return_value='ascii') + def test_encode_local_ascii(self, _): + assert_equal(encode_local(u'foo'), b'foo') + + @suite('disabled', reason='#246') + def test_encode_local_ascii_fails(self): + with assert_raises(UnicodeDecodeError): + with mock.patch.object( + sys, + 'getfilesystemencoding', + return_value='ascii'): + encode_local(u'日本語') + + @mock.patch.object(sys, 'getfilesystemencoding', return_value=None) + def test_encode_local_empty(self, _): + assert_equal(encode_local(b'foo'), u'foo') + + @mock.patch.object(sys, 'getfilesystemencoding', return_value='utf-8') + def test_encode_local_unicode(self, _): + assert_equal(encode_local(b'foo'), u'foo') + assert_equal(encode_local(u'foo'), u'foo') def test_has_uppercase(self): assert_true(has_uppercase('Foo')) @@ -57,6 +78,11 @@ class StringUnitTests(TestCase): assert_equal(sanitize([]), []) assert_equal(sanitize([r'/foo/bar/', r'/']), [u'/foo/bar', u'/']) + def test_unico(self): + assert_equal(unico(b'blah'), u'blah') + assert_equal(unico(b'日本語'), u'日本語') + assert_equal(unico(u'でもおれは中国人だ。'), u'でもおれは中国人だ。') + class IterationUnitTests(TestCase): def test_first(self):