From 7a3b8a8e7f2a57a77b923d49850a2ca6a07c2d64 Mon Sep 17 00:00:00 2001
From: jez <jezreel@gmail.com>
Date: Thu, 26 May 2011 18:22:00 +0000
Subject: [PATCH 1/8] Add profiling code.

---
 profile/profile.py | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)
 create mode 100644 profile/profile.py

diff --git a/profile/profile.py b/profile/profile.py
new file mode 100644
index 0000000..cc7c4c0
--- /dev/null
+++ b/profile/profile.py
@@ -0,0 +1,26 @@
+from __future__ import division, print_function
+import cProfile, sys, imp, os, pstats
+autojump = imp.load_source('autojump', 'autojump')
+
+"""Profile the total time taken for autojump to generate completions as a
+function of pattern length. This file must be run from the project root."""
+
+if os.path.exists('./profile/autojump_py'):
+    autojump.CONFIG_DIR = './profile'
+
+if len(sys.argv) > 1:
+    outfile = open(sys.argv[1], 'w')
+else:
+    outfile = open('profile_results', 'w')
+outfile.write('Pattern length\tTime taken/s\n')
+
+# For maximum running time, we don't want to match any paths.
+test_search = '#' * 10
+for i in range(0, 10):
+    autojump.argv = ['', '--completion', test_search[:i+1]]
+    cProfile.run('autojump.shell_utility()', 'shellprof')
+    p = pstats.Stats('shellprof')
+    outfile.write("%s\t%s\n"% (i + 1, p.total_tt))
+p.sort_stats('time')
+p.print_stats(10)
+

From 2a93e3c570c6a70bea5dcafec3dc12e650c6b418 Mon Sep 17 00:00:00 2001
From: jez <jezreel@gmail.com>
Date: Fri, 27 May 2011 07:42:06 +0000
Subject: [PATCH 2/8] Factor out current directory check for speedup.

---
 autojump | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/autojump b/autojump
index 2118056..f0c72ed 100755
--- a/autojump
+++ b/autojump
@@ -90,13 +90,6 @@ def clean_dict(sorted_dirs, path_dict):
 
 def match(path, pattern, ignore_case=False, only_end=False):
     """Check whether a path matches a particular pattern"""
-    try:
-        if os.path.realpath(os.curdir) == path :
-            return False
-    #Sometimes the current path doesn't exist anymore.
-    #In that case, jump if possible.
-    except OSError:
-        pass
     if only_end:
         match_string = "/".join(path.split('/')[-1-pattern.count('/'):])
     else:
@@ -200,7 +193,14 @@ def shell_utility():
                 endmatch = re.match("(.*)"+COMPLETION_SEPARATOR, patterns[-1])
                 if endmatch: patterns[-1] = endmatch.group(1)
 
-            dirs = list(path_dict.items())
+            try:
+                cwd = os.path.realpath(os.curdir)
+            #Sometimes the current path doesn't exist anymore.
+            #In that case, jump if possible.
+            except OSError:
+                cwd = None
+            dirs = list((path, count) for path, count in path_dict.items()
+                    if path != cwd)
             dirs.sort(key=itemgetter(1), reverse=True)
             if completion or userchoice != -1:
                 max_matches = 9

From 9b977379eb96dc13186439f3181898ceec79eafa Mon Sep 17 00:00:00 2001
From: jez <jezreel@gmail.com>
Date: Fri, 27 May 2011 07:56:48 +0000
Subject: [PATCH 3/8] Implement approximate matching via Levenshtein distance.

---
 autojump | 82 +++++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 60 insertions(+), 22 deletions(-)

diff --git a/autojump b/autojump
index f0c72ed..928b5b5 100755
--- a/autojump
+++ b/autojump
@@ -30,6 +30,7 @@ import getopt
 from sys import argv, stderr, version_info, exit
 from tempfile import NamedTemporaryFile
 from operator import itemgetter
+from copy import copy
 import os
 MAX_KEYWEIGHT = 1000
 MAX_STORED_PATHS = 600
@@ -88,30 +89,62 @@ def clean_dict(sorted_dirs, path_dict):
         return True
     else: return False
 
-def match(path, pattern, ignore_case=False, only_end=False):
-    """Check whether a path matches a particular pattern"""
-    if only_end:
-        match_string = "/".join(path.split('/')[-1-pattern.count('/'):])
-    else:
-        match_string = path
-    if ignore_case:
-        does_match = (match_string.lower().find(pattern.lower()) != -1)
-    else:
-        does_match = (match_string.find(pattern) != -1)
-    #return True if there is a match and the path exists 
-    #(useful in the case of external drives, for example)
-    return does_match and os.path.exists(path) 
+def approximatch(pat, text):
+    prev_col = list(range(0, len(pat)+1))
+    col = [0] * (len(pat) + 1)
+    errors = len(pat)
+    for char1 in text:
+        col[0] = 0
+        for i, char2 in enumerate(pat):
+            if char1 == char2:
+                col[i+1] = prev_col[i]
+            else:
+                col[i+1] = 1 + min(col[i], prev_col[i+1], prev_col[i])
+        prev_col = copy(col)
+        errors = min(errors, col[-1])
+    return errors
 
-def find_matches(dirs, patterns, result_list, ignore_case, max_matches):
+def find_matches(dirs, patterns, result_list, ignore_case, approx, max_matches):
     """Find max_matches paths that match the pattern, 
     and add them to the result_list"""
-    for path, count in dirs:
-        if len(result_list) >= max_matches :
-            break
+
+    def get_pattern_and_match(patterns, path):
         #For the last pattern, only match the end of the pattern
-        if all(match(path, p, ignore_case,
-            only_end=(n == len(patterns)-1)) for n, p in enumerate(patterns)):
-            uniqadd(result_list, path)
+        for n, pattern in enumerate(patterns):
+            if n == len(patterns) - 1:
+                match_string = "/".join(path.split('/')[-1-pattern.count('/'):])
+            else:
+                match_string = path
+            if ignore_case:
+                pattern = pattern.lower()
+                match_string = match_string.lower()
+            yield (pattern, match_string)
+
+    if approx:
+        one_error_paths = []
+        two_error_paths = []
+        for path, count in dirs:
+            if len(one_error_paths) >= max_matches:
+                break
+            errors = sum(approximatch(pattern, match_string)
+                    for pattern, match_string in get_pattern_and_match(patterns, path))
+            #Verify that the path exists 
+            #(useful in the case of external drives, for example)
+            if errors <= 2 and os.path.exists(path):
+                if errors == 1:
+                    uniqadd(one_error_paths, path)
+                elif errors == 2:
+                    uniqadd(two_error_paths, path)
+        result_list.extend(one_error_paths)
+        result_list.extend(two_error_paths[:max_matches-len(one_error_paths)])
+    else:
+        for path, count in dirs:
+            if len(result_list) >= max_matches:
+                break
+            if all(match_string.find(pattern) != -1
+                    for pattern, match_string in
+                    get_pattern_and_match(patterns, path)) and os.path.exists(path):
+                uniqadd(result_list, path)
 
 def open_dic(dic_file, error_recovery=False):
     """Try hard to open the database file, recovering
@@ -206,12 +239,17 @@ def shell_utility():
                 max_matches = 9
             else:
                 max_matches = 1
-            find_matches(dirs, patterns, results, False, max_matches)
+            find_matches(dirs, patterns, results, False, False, max_matches)
             # If not found, try ignoring case.
             # On completion always show all results
             if completion or not results: 
                 find_matches(dirs, patterns, results,
-                        ignore_case=True, max_matches=max_matches) 
+                        ignore_case=True, approx=False, max_matches=max_matches) 
+
+            if not results:
+                find_matches(dirs, patterns, results,
+                        ignore_case=True, approx=True, max_matches=max_matches) 
+
             # Keep the database to a reasonable size
             if not completion and clean_dict(dirs, path_dict):
                 save(path_dict, dic_file)

From 544aefa17839c3eeb45a4a1d1e18f7fbe3665285 Mon Sep 17 00:00:00 2001
From: jez <jezreel@gmail.com>
Date: Fri, 27 May 2011 08:16:09 +0000
Subject: [PATCH 4/8] Errors should not equal length of string.

Otherwise a match is always possible, since every character may be an error.
---
 autojump | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/autojump b/autojump
index 928b5b5..74b0cd9 100755
--- a/autojump
+++ b/autojump
@@ -126,14 +126,22 @@ def find_matches(dirs, patterns, result_list, ignore_case, approx, max_matches):
         for path, count in dirs:
             if len(one_error_paths) >= max_matches:
                 break
-            errors = sum(approximatch(pattern, match_string)
-                    for pattern, match_string in get_pattern_and_match(patterns, path))
+            total_errors = 0
+            bad_match = False
+            for pattern, match_string in get_pattern_and_match(patterns, path):
+                errors = approximatch(pattern, match_string)
+                if errors >= len(pattern) or errors >= len(match_string):
+                    bad_match = True
+                    break
+                total_errors += errors
+            if bad_match:
+                continue
             #Verify that the path exists 
             #(useful in the case of external drives, for example)
-            if errors <= 2 and os.path.exists(path):
-                if errors == 1:
+            if total_errors <= 2 and os.path.exists(path):
+                if total_errors == 1:
                     uniqadd(one_error_paths, path)
-                elif errors == 2:
+                elif total_errors == 2:
                     uniqadd(two_error_paths, path)
         result_list.extend(one_error_paths)
         result_list.extend(two_error_paths[:max_matches-len(one_error_paths)])

From 4c2517dafa4f410e740bc5a07ec193689175aab6 Mon Sep 17 00:00:00 2001
From: jez <jezreel@gmail.com>
Date: Wed, 1 Jun 2011 01:38:04 +0000
Subject: [PATCH 5/8] Implement Damerau-Levenshtein distance.

---
 autojump | 25 ++++++++++++++++---------
 1 file changed, 16 insertions(+), 9 deletions(-)

diff --git a/autojump b/autojump
index 74b0cd9..21e632a 100755
--- a/autojump
+++ b/autojump
@@ -90,18 +90,25 @@ def clean_dict(sorted_dirs, path_dict):
     else: return False
 
 def approximatch(pat, text):
-    prev_col = list(range(0, len(pat)+1))
-    col = [0] * (len(pat) + 1)
+    cols = [list(range(0, len(pat)+1))]
+    cols.extend(copy(col) for col in [[0] * (len(pat) + 1)] * (len(text) + 1))
     errors = len(pat)
-    for char1 in text:
-        col[0] = 0
-        for i, char2 in enumerate(pat):
+    last_seen_in_text = {}
+    for i, char1 in enumerate(text):
+        cols[i+1][0] = 0
+        last_seen_in_pat = 0
+        for j, char2 in enumerate(pat):
+            i1 = last_seen_in_text[char2] if char2 in last_seen_in_text else 0
+            j1 = last_seen_in_pat
             if char1 == char2:
-                col[i+1] = prev_col[i]
+                cols[i+1][j+1] = cols[i][j]
+                last_seen_in_pat = j + 1
             else:
-                col[i+1] = 1 + min(col[i], prev_col[i+1], prev_col[i])
-        prev_col = copy(col)
-        errors = min(errors, col[-1])
+                cols[i+1][j+1] = 1 + min(cols[i+1][j], cols[i][j+1], cols[i][j])
+            if i1 and j1:
+                cols[i+1][j+1] = min(cols[i+1][j+1], 1 + (i - i1) + (j - j1) + cols[i1-1][j1-1])
+        errors = min(errors, cols[i+1][-1])
+        last_seen_in_text[char1] = i + 1
     return errors
 
 def find_matches(dirs, patterns, result_list, ignore_case, approx, max_matches):

From 7ffb81d08e0a9cdb43d44f5b937842000f4fc16f Mon Sep 17 00:00:00 2001
From: jez <jezreel@gmail.com>
Date: Tue, 31 May 2011 04:48:05 +0000
Subject: [PATCH 6/8] Optimize: Use append() instead of copy().

---
 autojump | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/autojump b/autojump
index 21e632a..b2ac7a4 100755
--- a/autojump
+++ b/autojump
@@ -30,7 +30,6 @@ import getopt
 from sys import argv, stderr, version_info, exit
 from tempfile import NamedTemporaryFile
 from operator import itemgetter
-from copy import copy
 import os
 MAX_KEYWEIGHT = 1000
 MAX_STORED_PATHS = 600
@@ -91,7 +90,7 @@ def clean_dict(sorted_dirs, path_dict):
 
 def approximatch(pat, text):
     cols = [list(range(0, len(pat)+1))]
-    cols.extend(copy(col) for col in [[0] * (len(pat) + 1)] * (len(text) + 1))
+    for i in range(0, len(text)): cols.append([0] * (len(pat) + 1))
     errors = len(pat)
     last_seen_in_text = {}
     for i, char1 in enumerate(text):

From d6a92e4582f5a9e1f19ae78b3ce001c87b5d2921 Mon Sep 17 00:00:00 2001
From: jez <jezreel@gmail.com>
Date: Tue, 31 May 2011 17:01:09 +0000
Subject: [PATCH 7/8] Implement Ukkonen's cut-off heuristic.

---
 autojump | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/autojump b/autojump
index b2ac7a4..e56400f 100755
--- a/autojump
+++ b/autojump
@@ -88,10 +88,11 @@ def clean_dict(sorted_dirs, path_dict):
         return True
     else: return False
 
-def approximatch(pat, text):
+def approximatch(pat, text, max_errors):
     cols = [list(range(0, len(pat)+1))]
-    for i in range(0, len(text)): cols.append([0] * (len(pat) + 1))
     errors = len(pat)
+    for i in range(0, len(text)): cols.append([errors] * (len(pat) + 1))
+    last_active = min(max_errors, len(pat))
     last_seen_in_text = {}
     for i, char1 in enumerate(text):
         cols[i+1][0] = 0
@@ -106,8 +107,17 @@ def approximatch(pat, text):
                 cols[i+1][j+1] = 1 + min(cols[i+1][j], cols[i][j+1], cols[i][j])
             if i1 and j1:
                 cols[i+1][j+1] = min(cols[i+1][j+1], 1 + (i - i1) + (j - j1) + cols[i1-1][j1-1])
-        errors = min(errors, cols[i+1][-1])
+
+            if j + 1 == len(pat):
+                errors = min(errors, cols[i+1][j+1])
+            elif j + 1 == last_active + 1:
+                break
+
         last_seen_in_text[char1] = i + 1
+
+        if last_active < len(pat): last_active += 1
+        while cols[i+1][last_active] > max_errors: last_active -= 1
+
     return errors
 
 def find_matches(dirs, patterns, result_list, ignore_case, approx, max_matches):
@@ -135,7 +145,7 @@ def find_matches(dirs, patterns, result_list, ignore_case, approx, max_matches):
             total_errors = 0
             bad_match = False
             for pattern, match_string in get_pattern_and_match(patterns, path):
-                errors = approximatch(pattern, match_string)
+                errors = approximatch(pattern, match_string, 2)
                 if errors >= len(pattern) or errors >= len(match_string):
                     bad_match = True
                     break

From 2ce85ddc9a9c3c091afff1aa608ac30e413da452 Mon Sep 17 00:00:00 2001
From: jez <jezreel@gmail.com>
Date: Wed, 1 Jun 2011 01:22:15 +0000
Subject: [PATCH 8/8] Add comments.

---
 autojump | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/autojump b/autojump
index e56400f..f18eb7c 100755
--- a/autojump
+++ b/autojump
@@ -89,6 +89,11 @@ def clean_dict(sorted_dirs, path_dict):
     else: return False
 
 def approximatch(pat, text, max_errors):
+    """Calculate the Damerau-Levenshtein distance between :pat and :text,
+    minimized over all possible positions of :pat within :text. As an
+    optimization, this distance is only accurate if it is <= :max_errors.
+    Return values greater than :max_errors indicate that the distance is _at
+    least_ that much. Runs in O(:max_errors * len(:text)) time."""
     cols = [list(range(0, len(pat)+1))]
     errors = len(pat)
     for i in range(0, len(text)): cols.append([errors] * (len(pat) + 1))
@@ -108,6 +113,9 @@ def approximatch(pat, text, max_errors):
             if i1 and j1:
                 cols[i+1][j+1] = min(cols[i+1][j+1], 1 + (i - i1) + (j - j1) + cols[i1-1][j1-1])
 
+            #Ukkonen's cut-off heuristic. See 'Theoretical and Empirical
+            #Comparisons of Approximate String Matching Algorithms' by Chang
+            #and Lampe for details.
             if j + 1 == len(pat):
                 errors = min(errors, cols[i+1][j+1])
             elif j + 1 == last_active + 1:
@@ -146,6 +154,8 @@ def find_matches(dirs, patterns, result_list, ignore_case, approx, max_matches):
             bad_match = False
             for pattern, match_string in get_pattern_and_match(patterns, path):
                 errors = approximatch(pattern, match_string, 2)
+                #If the number of errors is >= the length of either string, a
+                #match is always possible, so this result is useless.
                 if errors >= len(pattern) or errors >= len(match_string):
                     bad_match = True
                     break