From fb09b7afa0e5d695fa8b594734a54778bceda3e7 Mon Sep 17 00:00:00 2001 From: Alex Hall Date: Thu, 26 Oct 2023 15:14:14 +0200 Subject: [PATCH] (core) Avoid quadratic time complexity in fetch_table with query Summary: While looking at webhooks code, I noticed that it calls `ActiveDoc.fetchQuery` (which I think typically leads to the sandbox method `fetch_table`) with a query on the `id` column, where the values could potentially be all row IDs in the table (e.g. if a new column was added, as in a recent discussion about webhooks gone wrong). This suggests that `fetch_table` should try to avoid using a list for values when possible. In practice this only starts to become an issue at about 10k rows, so I don't know if this has caused any real problems, but it seemed like something worth fixing. Test Plan: Extended unit tests for correctness. Tested performance manually. Made a doc with this formula: ``` import time row_ids = list(Table2.table.row_ids) query = {"id": row_ids} start = time.time() result = table.table._engine.fetch_table('Table2', query=query) end = time.time() assert result[1] == row_ids end - start, len(row_ids) ``` Then put a bunch of rows in `Table2`. This diff made the returned elapsed time much less. Reviewers: georgegevoian Reviewed By: georgegevoian Subscribers: jarek Differential Revision: https://phab.getgrist.com/D4092 --- sandbox/grist/engine.py | 24 +++++++++++++++++++++--- sandbox/grist/test_engine.py | 24 ++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 3 deletions(-) diff --git a/sandbox/grist/engine.py b/sandbox/grist/engine.py index 1e2dae2e..b9cf0b21 100644 --- a/sandbox/grist/engine.py +++ b/sandbox/grist/engine.py @@ -388,9 +388,27 @@ class Engine(object): query_cols = [] if query: - query_cols = [(table.get_column(col_id), values) for (col_id, values) in six.iteritems(query)] - row_ids = [r for r in table.row_ids - if all((c.raw_get(r) in values) for (c, values) in query_cols)] + for col_id, values in six.iteritems(query): + col = table.get_column(col_id) + try: + # Try to use a set for speed. + values = set(values) + except TypeError: + # Values contains an unhashable value, leave it as a list. + pass + query_cols.append((col, values)) + row_ids = [] + for r in table.row_ids: + for (c, values) in query_cols: + try: + if c.raw_get(r) not in values: + break + except TypeError: + # values is a set but c.raw_get(r) is unhashable, so it's definitely not in values + break + else: + # No break, i.e. all columns matched + row_ids.append(r) for c in six.itervalues(table.all_columns): # pylint: disable=too-many-boolean-expressions diff --git a/sandbox/grist/test_engine.py b/sandbox/grist/test_engine.py index c99647b0..ae3a5756 100644 --- a/sandbox/grist/test_engine.py +++ b/sandbox/grist/test_engine.py @@ -557,6 +557,30 @@ class TestEngine(EngineTestCase): self.assertEqualDocData({'Address': data}, {'Address': testutil.table_data_from_rows('Address', col_names, [])}) + # Test unhashable values in the column and in the query + self.add_column('Address', 'list', type='Any', isFormula=True, + formula='[1] if $id == 21 else 2') + col_names.append('list') + + data = self.engine.fetch_table('Address', query={'list': [[1]]}) + self.assertEqualDocData({'Address': data}, + {'Address': testutil.table_data_from_rows('Address', col_names, [ + [ 21, "New York", "NY" , 1, [1]], + ])}) + + data = self.engine.fetch_table('Address', query={'list': [2]}) + self.assertEqualDocData({'Address': data}, + {'Address': testutil.table_data_from_rows('Address', col_names, [ + [ 22, "Albany", "NY" , 2, 2], + ])}) + + data = self.engine.fetch_table('Address', query={'list': [[1], 2]}) + self.assertEqualDocData({'Address': data}, + {'Address': testutil.table_data_from_rows('Address', col_names, [ + [ 21, "New York", "NY" , 1, [1]], + [ 22, "Albany", "NY" , 2, 2], + ])}) + def test_schema_restore_on_error(self): # Simulate an error inside a DocAction, and make sure we restore the schema (don't leave it in # inconsistent with metadata).