From 7a3b8a8e7f2a57a77b923d49850a2ca6a07c2d64 Mon Sep 17 00:00:00 2001 From: jez Date: Thu, 26 May 2011 18:22:00 +0000 Subject: [PATCH 1/8] Add profiling code. --- profile/profile.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 profile/profile.py diff --git a/profile/profile.py b/profile/profile.py new file mode 100644 index 0000000..cc7c4c0 --- /dev/null +++ b/profile/profile.py @@ -0,0 +1,26 @@ +from __future__ import division, print_function +import cProfile, sys, imp, os, pstats +autojump = imp.load_source('autojump', 'autojump') + +"""Profile the total time taken for autojump to generate completions as a +function of pattern length. This file must be run from the project root.""" + +if os.path.exists('./profile/autojump_py'): + autojump.CONFIG_DIR = './profile' + +if len(sys.argv) > 1: + outfile = open(sys.argv[1], 'w') +else: + outfile = open('profile_results', 'w') +outfile.write('Pattern length\tTime taken/s\n') + +# For maximum running time, we don't want to match any files. +test_search = '#' * 10 +for i in range(0, 10): + autojump.argv = ['', '--completion', test_search[:i+1]] + cProfile.run('autojump.shell_utility()', 'shellprof') + p = pstats.Stats('shellprof') + outfile.write("%s\t%s\n"% (i + 1, p.total_tt)) +p.sort_stats('time') +p.print_stats(10) + From 2a93e3c570c6a70bea5dcafec3dc12e650c6b418 Mon Sep 17 00:00:00 2001 From: jez Date: Fri, 27 May 2011 07:42:06 +0000 Subject: [PATCH 2/8] Factor out current directory check for speedup. --- autojump | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/autojump b/autojump index 2118056..f0c72ed 100755 --- a/autojump +++ b/autojump @@ -90,13 +90,6 @@ def clean_dict(sorted_dirs, path_dict): def match(path, pattern, ignore_case=False, only_end=False): """Check whether a path matches a particular pattern""" - try: - if os.path.realpath(os.curdir) == path : - return False - #Sometimes the current path doesn't exist anymore. 
- #In that case, jump if possible. - except OSError: - pass if only_end: match_string = "/".join(path.split('/')[-1-pattern.count('/'):]) else: @@ -200,7 +193,14 @@ def shell_utility(): endmatch = re.match("(.*)"+COMPLETION_SEPARATOR, patterns[-1]) if endmatch: patterns[-1] = endmatch.group(1) - dirs = list(path_dict.items()) + try: + cwd = os.path.realpath(os.curdir) + #Sometimes the current path doesn't exist anymore. + #In that case, jump if possible. + except OSError: + cwd = None + dirs = list((path, count) for path, count in path_dict.items() + if path != cwd) dirs.sort(key=itemgetter(1), reverse=True) if completion or userchoice != -1: max_matches = 9 From 9b977379eb96dc13186439f3181898ceec79eafa Mon Sep 17 00:00:00 2001 From: jez Date: Fri, 27 May 2011 07:56:48 +0000 Subject: [PATCH 3/8] Implement approximate matching via Levenshtein distance. --- autojump | 82 +++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 60 insertions(+), 22 deletions(-) diff --git a/autojump b/autojump index f0c72ed..928b5b5 100755 --- a/autojump +++ b/autojump @@ -30,6 +30,7 @@ import getopt from sys import argv, stderr, version_info, exit from tempfile import NamedTemporaryFile from operator import itemgetter +from copy import copy import os MAX_KEYWEIGHT = 1000 MAX_STORED_PATHS = 600 @@ -88,30 +89,62 @@ def clean_dict(sorted_dirs, path_dict): return True else: return False -def match(path, pattern, ignore_case=False, only_end=False): - """Check whether a path matches a particular pattern""" - if only_end: - match_string = "/".join(path.split('/')[-1-pattern.count('/'):]) - else: - match_string = path - if ignore_case: - does_match = (match_string.lower().find(pattern.lower()) != -1) - else: - does_match = (match_string.find(pattern) != -1) - #return True if there is a match and the path exists - #(useful in the case of external drives, for example) - return does_match and os.path.exists(path) +def approximatch(pat, text): + prev_col = list(range(0, 
len(pat)+1)) + col = [0] * (len(pat) + 1) + errors = len(pat) + for char1 in text: + col[0] = 0 + for i, char2 in enumerate(pat): + if char1 == char2: + col[i+1] = prev_col[i] + else: + col[i+1] = 1 + min(col[i], prev_col[i+1], prev_col[i]) + prev_col = copy(col) + errors = min(errors, col[-1]) + return errors -def find_matches(dirs, patterns, result_list, ignore_case, max_matches): +def find_matches(dirs, patterns, result_list, ignore_case, approx, max_matches): """Find max_matches paths that match the pattern, and add them to the result_list""" - for path, count in dirs: - if len(result_list) >= max_matches : - break + + def get_pattern_and_match(patterns, path): #For the last pattern, only match the end of the pattern - if all(match(path, p, ignore_case, - only_end=(n == len(patterns)-1)) for n, p in enumerate(patterns)): - uniqadd(result_list, path) + for n, pattern in enumerate(patterns): + if n == len(patterns) - 1: + match_string = "/".join(path.split('/')[-1-pattern.count('/'):]) + else: + match_string = path + if ignore_case: + pattern = pattern.lower() + match_string = match_string.lower() + yield (pattern, match_string) + + if approx: + one_error_paths = [] + two_error_paths = [] + for path, count in dirs: + if len(one_error_paths) >= max_matches: + break + errors = sum(approximatch(pattern, match_string) + for pattern, match_string in get_pattern_and_match(patterns, path)) + #Verify that the path exists + #(useful in the case of external drives, for example) + if errors <= 2 and os.path.exists(path): + if errors == 1: + uniqadd(one_error_paths, path) + elif errors == 2: + uniqadd(two_error_paths, path) + result_list.extend(one_error_paths) + result_list.extend(two_error_paths[:max_matches-len(one_error_paths)]) + else: + for path, count in dirs: + if len(result_list) >= max_matches: + break + if all(match_string.find(pattern) != -1 + for pattern, match_string in + get_pattern_and_match(patterns, path)) and os.path.exists(path): + uniqadd(result_list, 
path) def open_dic(dic_file, error_recovery=False): """Try hard to open the database file, recovering @@ -206,12 +239,17 @@ def shell_utility(): max_matches = 9 else: max_matches = 1 - find_matches(dirs, patterns, results, False, max_matches) + find_matches(dirs, patterns, results, False, False, max_matches) # If not found, try ignoring case. # On completion always show all results if completion or not results: find_matches(dirs, patterns, results, - ignore_case=True, max_matches=max_matches) + ignore_case=True, approx=False, max_matches=max_matches) + + if not results: + find_matches(dirs, patterns, results, + ignore_case=True, approx=True, max_matches=max_matches) + # Keep the database to a reasonable size if not completion and clean_dict(dirs, path_dict): save(path_dict, dic_file) From 544aefa17839c3eeb45a4a1d1e18f7fbe3665285 Mon Sep 17 00:00:00 2001 From: jez Date: Fri, 27 May 2011 08:16:09 +0000 Subject: [PATCH 4/8] Errors should not equal length of string. Otherwise a match is always possible. 
--- autojump | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/autojump b/autojump index 928b5b5..74b0cd9 100755 --- a/autojump +++ b/autojump @@ -126,14 +126,22 @@ def find_matches(dirs, patterns, result_list, ignore_case, approx, max_matches): for path, count in dirs: if len(one_error_paths) >= max_matches: break - errors = sum(approximatch(pattern, match_string) - for pattern, match_string in get_pattern_and_match(patterns, path)) + total_errors = 0 + bad_match = False + for pattern, match_string in get_pattern_and_match(patterns, path): + errors = approximatch(pattern, match_string) + if errors >= len(pattern) or errors >= len(match_string): + bad_match = True + break + total_errors += errors + if bad_match: + continue #Verify that the path exists #(useful in the case of external drives, for example) - if errors <= 2 and os.path.exists(path): - if errors == 1: + if total_errors <= 2 and os.path.exists(path): + if total_errors == 1: uniqadd(one_error_paths, path) - elif errors == 2: + elif total_errors == 2: uniqadd(two_error_paths, path) result_list.extend(one_error_paths) result_list.extend(two_error_paths[:max_matches-len(one_error_paths)]) From 4c2517dafa4f410e740bc5a07ec193689175aab6 Mon Sep 17 00:00:00 2001 From: jez Date: Wed, 1 Jun 2011 01:38:04 +0000 Subject: [PATCH 5/8] Implement Damerau-Levenshtein distance. 
--- autojump | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/autojump b/autojump index 74b0cd9..21e632a 100755 --- a/autojump +++ b/autojump @@ -90,18 +90,25 @@ def clean_dict(sorted_dirs, path_dict): else: return False def approximatch(pat, text): - prev_col = list(range(0, len(pat)+1)) - col = [0] * (len(pat) + 1) + cols = [list(range(0, len(pat)+1))] + cols.extend(copy(col) for col in [[0] * (len(pat) + 1)] * (len(text) + 1)) errors = len(pat) - for char1 in text: - col[0] = 0 - for i, char2 in enumerate(pat): + last_seen_in_text = {} + for i, char1 in enumerate(text): + cols[i+1][0] = 0 + last_seen_in_pat = 0 + for j, char2 in enumerate(pat): + i1 = last_seen_in_text[char2] if char2 in last_seen_in_text else 0 + j1 = last_seen_in_pat if char1 == char2: - col[i+1] = prev_col[i] + cols[i+1][j+1] = cols[i][j] + last_seen_in_pat = j + 1 else: - col[i+1] = 1 + min(col[i], prev_col[i+1], prev_col[i]) - prev_col = copy(col) - errors = min(errors, col[-1]) + cols[i+1][j+1] = 1 + min(cols[i+1][j], cols[i][j+1], cols[i][j]) + if i1 and j1: + cols[i+1][j+1] = min(cols[i+1][j+1], 1 + (i - i1) + (j - j1) + cols[i1-1][j1-1]) + errors = min(errors, cols[i+1][-1]) + last_seen_in_text[char1] = i + 1 return errors def find_matches(dirs, patterns, result_list, ignore_case, approx, max_matches): From 7ffb81d08e0a9cdb43d44f5b937842000f4fc16f Mon Sep 17 00:00:00 2001 From: jez Date: Tue, 31 May 2011 04:48:05 +0000 Subject: [PATCH 6/8] Optimize: Use append() instead of copy(). 
--- autojump | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/autojump b/autojump index 21e632a..b2ac7a4 100755 --- a/autojump +++ b/autojump @@ -30,7 +30,6 @@ import getopt from sys import argv, stderr, version_info, exit from tempfile import NamedTemporaryFile from operator import itemgetter -from copy import copy import os MAX_KEYWEIGHT = 1000 MAX_STORED_PATHS = 600 @@ -91,7 +90,7 @@ def clean_dict(sorted_dirs, path_dict): def approximatch(pat, text): cols = [list(range(0, len(pat)+1))] - cols.extend(copy(col) for col in [[0] * (len(pat) + 1)] * (len(text) + 1)) + for i in range(0, len(text)): cols.append([0] * (len(pat) + 1)) errors = len(pat) last_seen_in_text = {} for i, char1 in enumerate(text): From d6a92e4582f5a9e1f19ae78b3ce001c87b5d2921 Mon Sep 17 00:00:00 2001 From: jez Date: Tue, 31 May 2011 17:01:09 +0000 Subject: [PATCH 7/8] Implement Ukkonen's cut-off heuristic. --- autojump | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/autojump b/autojump index b2ac7a4..e56400f 100755 --- a/autojump +++ b/autojump @@ -88,10 +88,11 @@ def clean_dict(sorted_dirs, path_dict): return True else: return False -def approximatch(pat, text): +def approximatch(pat, text, max_errors): cols = [list(range(0, len(pat)+1))] - for i in range(0, len(text)): cols.append([0] * (len(pat) + 1)) errors = len(pat) + for i in range(0, len(text)): cols.append([errors] * (len(pat) + 1)) + last_active = min(max_errors, len(pat)) last_seen_in_text = {} for i, char1 in enumerate(text): cols[i+1][0] = 0 @@ -106,8 +107,17 @@ def approximatch(pat, text): cols[i+1][j+1] = 1 + min(cols[i+1][j], cols[i][j+1], cols[i][j]) if i1 and j1: cols[i+1][j+1] = min(cols[i+1][j+1], 1 + (i - i1) + (j - j1) + cols[i1-1][j1-1]) - errors = min(errors, cols[i+1][-1]) + + if j + 1 == len(pat): + errors = min(errors, cols[i+1][j+1]) + elif j + 1 == last_active + 1: + break + last_seen_in_text[char1] = i + 1 + + if last_active < len(pat): last_active += 1 + 
while cols[i+1][last_active] > max_errors: last_active -= 1 + return errors def find_matches(dirs, patterns, result_list, ignore_case, approx, max_matches): @@ -135,7 +145,7 @@ def find_matches(dirs, patterns, result_list, ignore_case, approx, max_matches): total_errors = 0 bad_match = False for pattern, match_string in get_pattern_and_match(patterns, path): - errors = approximatch(pattern, match_string) + errors = approximatch(pattern, match_string, 2) if errors >= len(pattern) or errors >= len(match_string): bad_match = True break From 2ce85ddc9a9c3c091afff1aa608ac30e413da452 Mon Sep 17 00:00:00 2001 From: jez Date: Wed, 1 Jun 2011 01:22:15 +0000 Subject: [PATCH 8/8] Add comments. --- autojump | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/autojump b/autojump index e56400f..f18eb7c 100755 --- a/autojump +++ b/autojump @@ -89,6 +89,11 @@ def clean_dict(sorted_dirs, path_dict): else: return False def approximatch(pat, text, max_errors): + """Calculate the Damerau-Levenshtein distance between :pat and :text, + minimized over all possible positions of :pat within :text. As an + optimization, this distance is only accurate if it is <= :max_errors. + Return values greater than :max_errors indicate that the distance is _at + least_ that much. Runs in O(:max_errors * len(:text)) time.""" cols = [list(range(0, len(pat)+1))] errors = len(pat) for i in range(0, len(text)): cols.append([errors] * (len(pat) + 1)) @@ -108,6 +113,9 @@ def approximatch(pat, text, max_errors): if i1 and j1: cols[i+1][j+1] = min(cols[i+1][j+1], 1 + (i - i1) + (j - j1) + cols[i1-1][j1-1]) + #Ukkonen's cut-off heuristic. See 'Theoretical and Empirical + #Comparisons of Approximate String Matching Algorithms' by Chang and + #Lampe for details. 
if j + 1 == len(pat): errors = min(errors, cols[i+1][j+1]) elif j + 1 == last_active + 1: @@ -146,6 +154,8 @@ def find_matches(dirs, patterns, result_list, ignore_case, approx, max_matches): bad_match = False for pattern, match_string in get_pattern_and_match(patterns, path): errors = approximatch(pattern, match_string, 2) + #If the number of errors is >= the string length, then a + #match is always possible, so this result is useless. if errors >= len(pattern) or errors >= len(match_string): bad_match = True break