Fix encoding issues.

The original implementation used str.decode() on input and str.encode() on
output. However, this could raise UnicodeDecodeError, since certain characters
can't be encoded or decoded in ASCII.

The new solution is to call unicode() on all input strings and to output
UTF-8-encoded strings. This assumes that the shell can handle UTF-8
strings.
pull/252/head
William Ting 11 years ago
parent 3f460fb3e9
commit 35bc63c66e

@ -39,4 +39,5 @@ tar:
sha1sum autojump_v$(VERSION).tar.gz sha1sum autojump_v$(VERSION).tar.gz
test: test:
testify -v tests @find . -type f -iname "*.pyc" -delete
testify -v tests -x disabled

@ -45,8 +45,6 @@ from autojump_data import entriefy
from autojump_data import Entry from autojump_data import Entry
from autojump_data import load from autojump_data import load
from autojump_data import save from autojump_data import save
from autojump_utils import decode
from autojump_utils import encode_local
from autojump_utils import first from autojump_utils import first
from autojump_utils import get_tab_entry_info from autojump_utils import get_tab_entry_info
from autojump_utils import get_pwd from autojump_utils import get_pwd
@ -54,9 +52,11 @@ from autojump_utils import has_uppercase
from autojump_utils import is_osx from autojump_utils import is_osx
from autojump_utils import last from autojump_utils import last
from autojump_utils import print_entry from autojump_utils import print_entry
from autojump_utils import print_local
from autojump_utils import print_tab_menu from autojump_utils import print_tab_menu
from autojump_utils import sanitize from autojump_utils import sanitize
from autojump_utils import take from autojump_utils import take
from autojump_utils import unico
VERSION = '22.0.0-alpha' VERSION = '22.0.0-alpha'
FUZZY_MATCH_THRESHOLD = 0.6 FUZZY_MATCH_THRESHOLD = 0.6
@ -131,7 +131,7 @@ def add_path(data, path, weight=10):
with resulting duplicate entries in the database than a single canonical with resulting duplicate entries in the database than a single canonical
path. path.
""" """
path = decode(path).rstrip(os.sep) path = unico(path).rstrip(os.sep)
if path == os.path.expanduser('~'): if path == os.path.expanduser('~'):
return data, Entry(path, 0) return data, Entry(path, 0)
@ -142,7 +142,7 @@ def add_path(data, path, weight=10):
def decrease_path(data, path, weight=15): def decrease_path(data, path, weight=15):
"""Decrease or zero out a path.""" """Decrease or zero out a path."""
path = decode(path).rstrip(os.sep) path = unico(path).rstrip(os.sep)
data[path] = max(0, data.get(path, 0) - weight) data[path] = max(0, data.get(path, 0) - weight)
return data, Entry(path, data[path]) return data, Entry(path, data[path])
@ -189,11 +189,10 @@ def handle_tab_completion(needle, entries):
tab_needle, tab_index, tab_path = get_tab_entry_info(needle, TAB_SEPARATOR) tab_needle, tab_index, tab_path = get_tab_entry_info(needle, TAB_SEPARATOR)
if tab_path: if tab_path:
print(encode_local(tab_path)) print_local(tab_path)
elif tab_index: elif tab_index:
get_ith_path = lambda i, iterable: last(take(i, iterable)).path get_ith_path = lambda i, iterable: last(take(i, iterable)).path
print(encode_local( print_local(get_ith_path(tab_index, find_matches(entries, tab_needle)))
get_ith_path(tab_index, find_matches(entries, tab_needle))))
elif tab_needle: elif tab_needle:
# found partial tab completion entry # found partial tab completion entry
print_tab_menu( print_tab_menu(
@ -326,7 +325,8 @@ def print_stats(data, data_path):
print("%d:\t number of entries" % len(data)) print("%d:\t number of entries" % len(data))
try: try:
print("%.2f:\t current directory weight" % data.get(os.getcwdu(), 0)) print_local(
"%.2f:\t current directory weight" % data.get(os.getcwdu(), 0))
except OSError: except OSError:
# current directory no longer exists # current directory no longer exists
pass pass
@ -362,7 +362,7 @@ def main(args): # noqa
elif not args.directory: elif not args.directory:
# default return value so calling shell functions have an argument # default return value so calling shell functions have an argument
# to `cd` to # to `cd` to
print(encode_local('.')) print_local('.')
else: else:
entries = entriefy(load(config)) entries = entriefy(load(config))
needles = sanitize(args.directory) needles = sanitize(args.directory)
@ -370,13 +370,13 @@ def main(args): # noqa
get_tab_entry_info(first(needles), TAB_SEPARATOR) get_tab_entry_info(first(needles), TAB_SEPARATOR)
if tab_path: if tab_path:
print(encode_local(tab_path)) print_local(tab_path)
elif tab_index: elif tab_index:
get_ith_path = lambda i, iterable: last(take(i, iterable)).path get_ith_path = lambda i, iterable: last(take(i, iterable)).path
print(encode_local( print_local(
get_ith_path(tab_index, find_matches(entries, tab_needle)))) get_ith_path(tab_index, find_matches(entries, tab_needle)))
else: else:
print(encode_local(first(find_matches(entries, needles)).path)) print_local(first(find_matches(entries, needles)).path)
return 0 return 0

@ -17,6 +17,7 @@ else:
from itertools import imap from itertools import imap
from autojump_utils import create_dir from autojump_utils import create_dir
from autojump_utils import unico
from autojump_utils import is_osx from autojump_utils import is_osx
from autojump_utils import is_python3 from autojump_utils import is_python3
from autojump_utils import move_file from autojump_utils import move_file
@ -124,11 +125,7 @@ def save(config, data):
encoding='utf-8', encoding='utf-8',
errors='replace') as f: errors='replace') as f:
for path, weight in data.items(): for path, weight in data.items():
if is_python3(): f.write(unico("%s\t%s\n" % (weight, path)))
f.write(("%s\t%s\n" % (weight, path)))
else:
f.write(unicode(
"%s\t%s\n" % (weight, path)).encode('utf-8'))
f.flush() f.flush()
os.fsync(f) os.fsync(f)

@ -28,27 +28,9 @@ def create_dir(path):
raise raise
def decode(string): def encode_local(string):
"""Converts byte string to Unicode string.""" """Converts string into user's preferred encoding."""
if is_python2(): return string.encode(sys.getfilesystemencoding() or 'utf-8')
# Python 2.6 does not support kwargs
return string.decode('utf-8', 'replace')
return string
def encode(string):
"""Converts Unicode string to byte string."""
if is_python2():
# Python 2.6 does not support kwargs
return string.encode('utf-8', 'replace')
return string
def encode_local(string, encoding=None):
"""Converts string into local filesystem encoding."""
if is_python2():
return decode(string).encode(encoding or sys.getfilesystemencoding())
return string
def first(xs): def first(xs):
@ -153,7 +135,11 @@ def move_file(src, dst):
def print_entry(entry): def print_entry(entry):
print(encode_local("%.1f:\t%s" % (entry.weight, entry.path))) print_local("%.1f:\t%s" % (entry.weight, entry.path))
def print_local(string):
    """Print ``string`` after passing it through ``encode_local``."""
    encoded = encode_local(string)
    print(encoded)
def print_tab_menu(needle, tab_entries, separator): def print_tab_menu(needle, tab_entries, separator):
@ -166,17 +152,18 @@ def print_tab_menu(needle, tab_entries, separator):
on subsequent calls. on subsequent calls.
""" """
for i, entry in enumerate(tab_entries): for i, entry in enumerate(tab_entries):
print(encode_local( print_local(
'%s%s%d%s%s' % ( '%s%s%d%s%s' % (
needle, needle,
separator, separator,
i + 1, i + 1,
separator, separator,
entry.path))) entry.path))
def sanitize(directories): def sanitize(directories):
clean = lambda x: decode(x) if len(x) == 1 else decode(x).rstrip(os.sep) # edge case to allow '/' as a valid path
clean = lambda x: unico(x) if x == os.sep else unico(x).rstrip(os.sep)
return list(imap(clean, directories)) return list(imap(clean, directories))
@ -203,3 +190,10 @@ def surround_quotes(string):
def take(n, iterable): def take(n, iterable):
"""Return first n items of an iterable.""" """Return first n items of an iterable."""
return islice(iterable, n) return islice(iterable, n)
def unico(string):
    """Converts into Unicode string.

    When ``is_python2()`` is true and ``string`` is not already a
    ``unicode`` instance, it is decoded as UTF-8 with undecodable bytes
    replaced. In every other case the argument is returned unchanged.
    """
    if not is_python2():
        # `unicode` is only referenced on the Python 2 path below.
        return string
    if isinstance(string, unicode):
        return string
    return unicode(string, encoding='utf-8', errors='replace')

@ -5,6 +5,7 @@ from shutil import rmtree
from tempfile import gettempdir from tempfile import gettempdir
from tempfile import mkdtemp from tempfile import mkdtemp
import os import os
import sys
import mock import mock
from testify import TestCase from testify import TestCase
@ -16,11 +17,12 @@ from testify import class_setup
from testify import class_teardown from testify import class_teardown
from testify import run from testify import run
from testify import setup from testify import setup
from testify import suite
from testify import teardown from testify import teardown
import autojump_utils import autojump_utils
from autojump_utils import create_dir from autojump_utils import create_dir
from autojump_utils import decode from autojump_utils import encode_local
from autojump_utils import first from autojump_utils import first
from autojump_utils import get_pwd from autojump_utils import get_pwd
from autojump_utils import get_tab_entry_info from autojump_utils import get_tab_entry_info
@ -32,12 +34,31 @@ from autojump_utils import sanitize
from autojump_utils import second from autojump_utils import second
from autojump_utils import surround_quotes from autojump_utils import surround_quotes
from autojump_utils import take from autojump_utils import take
from autojump_utils import unico
class StringUnitTests(TestCase): class StringUnitTests(TestCase):
def test_decode(self): @mock.patch.object(sys, 'getfilesystemencoding', return_value='ascii')
assert_equal(decode(r'blah'), u'blah') def test_encode_local_ascii(self, _):
assert_equal(decode(r'日本語'), u'日本語') assert_equal(encode_local(u'foo'), b'foo')
@suite('disabled', reason='#246')
def test_encode_local_ascii_fails(self):
with assert_raises(UnicodeDecodeError):
with mock.patch.object(
sys,
'getfilesystemencoding',
return_value='ascii'):
encode_local(u'日本語')
@mock.patch.object(sys, 'getfilesystemencoding', return_value=None)
def test_encode_local_empty(self, _):
assert_equal(encode_local(b'foo'), u'foo')
@mock.patch.object(sys, 'getfilesystemencoding', return_value='utf-8')
def test_encode_local_unicode(self, _):
assert_equal(encode_local(b'foo'), u'foo')
assert_equal(encode_local(u'foo'), u'foo')
def test_has_uppercase(self): def test_has_uppercase(self):
assert_true(has_uppercase('Foo')) assert_true(has_uppercase('Foo'))
@ -57,6 +78,11 @@ class StringUnitTests(TestCase):
assert_equal(sanitize([]), []) assert_equal(sanitize([]), [])
assert_equal(sanitize([r'/foo/bar/', r'/']), [u'/foo/bar', u'/']) assert_equal(sanitize([r'/foo/bar/', r'/']), [u'/foo/bar', u'/'])
def test_unico(self):
assert_equal(unico(b'blah'), u'blah')
assert_equal(unico(b'日本語'), u'日本語')
assert_equal(unico(u'でもおれは中国人だ。'), u'でもおれは中国人だ。')
class IterationUnitTests(TestCase): class IterationUnitTests(TestCase):
def test_first(self): def test_first(self):

Loading…
Cancel
Save