From fb09b7afa0e5d695fa8b594734a54778bceda3e7 Mon Sep 17 00:00:00 2001
From: Alex Hall <alex.mojaki@gmail.com>
Date: Thu, 26 Oct 2023 15:14:14 +0200
Subject: [PATCH] (core) Avoid quadratic time complexity in fetch_table with
 query

Summary: While looking at webhooks code, I noticed that it calls `ActiveDoc.fetchQuery` (which I think typically leads to the sandbox method `fetch_table`) with a query on the `id` column, where the values could potentially be all row IDs in the table (e.g. if a new column was added, as in a recent discussion about webhooks gone wrong). This suggests that `fetch_table` should try to avoid using a list for values when possible. In practice this only starts to become an issue at about 10k rows, so I don't know if this has caused any real problems, but it seemed like something worth fixing.

Test Plan:
Extended unit tests for correctness. Tested performance manually. Made a doc with this formula:

```
import time
row_ids = list(Table2.table.row_ids)
query = {"id": row_ids}
start = time.time()
result = table.table._engine.fetch_table('Table2', query=query)
end = time.time()
assert result[1] == row_ids
end - start, len(row_ids)
```

Then put a bunch of rows in `Table2`. This diff made the returned elapsed time much less.

Reviewers: georgegevoian

Reviewed By: georgegevoian

Subscribers: jarek

Differential Revision: https://phab.getgrist.com/D4092
---
 sandbox/grist/engine.py      | 24 +++++++++++++++++++++---
 sandbox/grist/test_engine.py | 24 ++++++++++++++++++++++++
 2 files changed, 45 insertions(+), 3 deletions(-)

diff --git a/sandbox/grist/engine.py b/sandbox/grist/engine.py
index 1e2dae2e..b9cf0b21 100644
--- a/sandbox/grist/engine.py
+++ b/sandbox/grist/engine.py
@@ -388,9 +388,27 @@ class Engine(object):
 
     query_cols = []
     if query:
-      query_cols = [(table.get_column(col_id), values) for (col_id, values) in six.iteritems(query)]
-    row_ids = [r for r in table.row_ids
-               if all((c.raw_get(r) in values) for (c, values) in query_cols)]
+      for col_id, values in six.iteritems(query):
+        col = table.get_column(col_id)
+        try:
+          # Try to use a set for speed.
+          values = set(values)
+        except TypeError:
+          # Values contains an unhashable value, leave it as a list.
+          pass
+        query_cols.append((col, values))
+    row_ids = []
+    for r in table.row_ids:
+      for (c, values) in query_cols:
+        try:
+          if c.raw_get(r) not in values:
+            break
+        except TypeError:
+          # values is a set but c.raw_get(r) is unhashable, so it's definitely not in values
+          break
+      else:
+        # No break, i.e. all columns matched
+        row_ids.append(r)
 
     for c in six.itervalues(table.all_columns):
       # pylint: disable=too-many-boolean-expressions
diff --git a/sandbox/grist/test_engine.py b/sandbox/grist/test_engine.py
index c99647b0..ae3a5756 100644
--- a/sandbox/grist/test_engine.py
+++ b/sandbox/grist/test_engine.py
@@ -557,6 +557,30 @@ class TestEngine(EngineTestCase):
     self.assertEqualDocData({'Address': data},
         {'Address': testutil.table_data_from_rows('Address', col_names, [])})
 
+    # Test unhashable values in the column and in the query
+    self.add_column('Address', 'list', type='Any', isFormula=True,
+                    formula='[1] if $id == 21 else 2')
+    col_names.append('list')
+
+    data = self.engine.fetch_table('Address', query={'list': [[1]]})
+    self.assertEqualDocData({'Address': data},
+        {'Address': testutil.table_data_from_rows('Address', col_names, [
+          [ 21,   "New York", "NY"   , 1, [1]],
+        ])})
+
+    data = self.engine.fetch_table('Address', query={'list': [2]})
+    self.assertEqualDocData({'Address': data},
+        {'Address': testutil.table_data_from_rows('Address', col_names, [
+          [ 22,   "Albany",   "NY"   , 2, 2],
+        ])})
+
+    data = self.engine.fetch_table('Address', query={'list': [[1], 2]})
+    self.assertEqualDocData({'Address': data},
+        {'Address': testutil.table_data_from_rows('Address', col_names, [
+          [ 21,   "New York", "NY"   , 1, [1]],
+          [ 22,   "Albany",   "NY"   , 2, 2],
+        ])})
+
   def test_schema_restore_on_error(self):
     # Simulate an error inside a DocAction, and make sure we restore the schema (don't leave it in
     # inconsistent with metadata).