(core) Avoid quadratic time complexity in fetch_table with query

Summary: While looking at webhooks code, I noticed that it calls `ActiveDoc.fetchQuery` (which I think typically leads to the sandbox method `fetch_table`) with a query on the `id` column, where the values could potentially be all row IDs in the table (e.g. if a new column was added, as in a recent discussion about webhooks gone wrong). This suggests that `fetch_table` should try to avoid using a list for values when possible. In practice this only starts to become an issue at about 10k rows, so I don't know if this has caused any real problems, but it seemed like something worth fixing.
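To illustrate the complexity issue (a standalone sketch with made-up data, not Grist code): testing each row ID against a plain list of query values costs O(rows × values), which is quadratic when the query contains every row ID, while converting the values to a set first makes each membership test O(1).

```python
# Standalone sketch (assumed data, not Grist code) of why filtering row IDs
# against a list of query values is quadratic, while a set is linear.
import timeit

row_ids = list(range(5000))      # pretend table with 5k rows
values_list = list(row_ids)      # query values as a list: O(len) per `in` test
values_set = set(values_list)    # same values as a set: O(1) per `in` test

list_time = timeit.timeit(
    lambda: [r for r in row_ids if r in values_list], number=1)
set_time = timeit.timeit(
    lambda: [r for r in row_ids if r in values_set], number=1)

# Both scans select every row; only the set-based scan avoids the O(n^2) blowup.
assert [r for r in row_ids if r in values_set] == row_ids
assert set_time < list_time
```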

Test Plan:
Extended unit tests for correctness. Tested performance manually. Made a doc with this formula:

```python
import time
row_ids = list(Table2.table.row_ids)
query = {"id": row_ids}
start = time.time()
result = table.table._engine.fetch_table('Table2', query=query)
end = time.time()
assert result[1] == row_ids
end - start, len(row_ids)
```

Then put a bunch of rows in `Table2`. With this diff, the returned elapsed time dropped sharply.

Reviewers: georgegevoian

Reviewed By: georgegevoian

Subscribers: jarek

Differential Revision: https://phab.getgrist.com/D4092
pull/723/head
Alex Hall 7 months ago
parent 7a85aaa7a1
commit fb09b7afa0

```diff
@@ -388,9 +388,27 @@ class Engine(object):
     query_cols = []
     if query:
-      query_cols = [(table.get_column(col_id), values) for (col_id, values) in six.iteritems(query)]
-    row_ids = [r for r in table.row_ids
-               if all((c.raw_get(r) in values) for (c, values) in query_cols)]
+      for col_id, values in six.iteritems(query):
+        col = table.get_column(col_id)
+        try:
+          # Try to use a set for speed.
+          values = set(values)
+        except TypeError:
+          # Values contains an unhashable value, leave it as a list.
+          pass
+        query_cols.append((col, values))
+    row_ids = []
+    for r in table.row_ids:
+      for (c, values) in query_cols:
+        try:
+          if c.raw_get(r) not in values:
+            break
+        except TypeError:
+          # values is a set but c.raw_get(r) is unhashable, so it's definitely not in values
+          break
+      else:
+        # No break, i.e. all columns matched
+        row_ids.append(r)
     for c in six.itervalues(table.all_columns):
       # pylint: disable=too-many-boolean-expressions
```
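A minimal standalone model of the new code path above (the helper names and the dict-based row store are mine, not Grist's): query values become a set when every value is hashable, and an unhashable row value is treated as a guaranteed non-match against a set, relying on Python's `for`/`else` to collect rows that matched every column.

```python
# Hypothetical standalone model of the set-or-list fallback in the diff above.
def normalize_values(values):
  """Return values as a set when every value is hashable, else keep the list."""
  try:
    return set(values)    # fast O(1) membership tests
  except TypeError:
    return values         # some value (e.g. a list) is unhashable

def filter_rows(rows, query_cols):
  """Return the ids of rows whose value matches the query in every column."""
  row_ids = []
  for r, row_values in rows.items():
    for (col, values) in query_cols:
      try:
        if row_values[col] not in values:
          break
      except TypeError:
        # values is a set but the row value is unhashable: definitely no match.
        break
    else:
      # No break, i.e. all columns matched.
      row_ids.append(r)
  return row_ids

# Mirrors the unhashable-value cases exercised by the unit tests below.
rows = {21: {'list': [1]}, 22: {'list': 2}}
assert filter_rows(rows, [('list', normalize_values([[1]]))]) == [21]
assert filter_rows(rows, [('list', normalize_values([2]))]) == [22]
assert filter_rows(rows, [('list', normalize_values([[1], 2]))]) == [21, 22]
```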

```diff
@@ -557,6 +557,30 @@ class TestEngine(EngineTestCase):
     self.assertEqualDocData({'Address': data},
                             {'Address': testutil.table_data_from_rows('Address', col_names, [])})
 
+    # Test unhashable values in the column and in the query
+    self.add_column('Address', 'list', type='Any', isFormula=True,
+                    formula='[1] if $id == 21 else 2')
+    col_names.append('list')
+
+    data = self.engine.fetch_table('Address', query={'list': [[1]]})
+    self.assertEqualDocData({'Address': data},
+                            {'Address': testutil.table_data_from_rows('Address', col_names, [
+                              [ 21, "New York", "NY" , 1, [1]],
+                            ])})
+
+    data = self.engine.fetch_table('Address', query={'list': [2]})
+    self.assertEqualDocData({'Address': data},
+                            {'Address': testutil.table_data_from_rows('Address', col_names, [
+                              [ 22, "Albany", "NY" , 2, 2],
+                            ])})
+
+    data = self.engine.fetch_table('Address', query={'list': [[1], 2]})
+    self.assertEqualDocData({'Address': data},
+                            {'Address': testutil.table_data_from_rows('Address', col_names, [
+                              [ 21, "New York", "NY" , 1, [1]],
+                              [ 22, "Albany", "NY" , 2, 2],
+                            ])})
 
   def test_schema_restore_on_error(self):
     # Simulate an error inside a DocAction, and make sure we restore the schema (don't leave it
     # inconsistent with metadata).
```
