(core) Avoid quadratic time complexity in fetch_table with query

Summary: While looking at the webhooks code, I noticed that it calls `ActiveDoc.fetchQuery` (which I think typically leads to the sandbox method `fetch_table`) with a query on the `id` column, where the values could potentially be all row IDs in the table (e.g. if a new column was added, as in a recent discussion about webhooks gone wrong). Since `fetch_table` tests each row's value against the query values with `in`, keeping the values in a list makes each test O(n), so filtering all rows becomes quadratic. This suggests that `fetch_table` should avoid using a list for the values when possible and use a set instead. In practice this only starts to become an issue at around 10k rows, so I don't know if it has caused any real problems, but it seemed worth fixing.
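
To make the cost concrete, here is a minimal standalone sketch (plain Python for illustration, not Grist code; `match_rows` and the row count are made up) showing why list membership per row is quadratic while a set keeps it roughly linear:

```python
import time

def match_rows(row_ids, values):
  # Simulates the fetch_table filter: keep rows whose id is among the query values.
  return [r for r in row_ids if r in values]

row_ids = list(range(50000))
query_values = list(row_ids)       # a query on the `id` column listing every row ID

start = time.time()
match_rows(row_ids, query_values)  # list membership: O(n) per row -> O(n^2) total
print("list:", time.time() - start)

start = time.time()
match_rows(row_ids, set(query_values))  # set membership: O(1) per row -> O(n) total
print("set: ", time.time() - start)
```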

Test Plan:
Extended unit tests for correctness. Tested performance manually. Made a doc with this formula:

```
import time
row_ids = list(Table2.table.row_ids)
query = {"id": row_ids}
start = time.time()
result = table.table._engine.fetch_table('Table2', query=query)
end = time.time()
assert result[1] == row_ids
end - start, len(row_ids)
```

Then put a large number of rows in `Table2`. With this diff, the returned elapsed time was much lower.

Reviewers: georgegevoian

Reviewed By: georgegevoian

Subscribers: jarek

Differential Revision: https://phab.getgrist.com/D4092
Author: Alex Hall, 2023-10-26 15:14:14 +02:00
Commit: fb09b7afa0 (parent 7a85aaa7a1)
2 changed files with 45 additions and 3 deletions


@@ -388,9 +388,27 @@ class Engine(object):
     query_cols = []
     if query:
-      query_cols = [(table.get_column(col_id), values) for (col_id, values) in six.iteritems(query)]
-    row_ids = [r for r in table.row_ids
-               if all((c.raw_get(r) in values) for (c, values) in query_cols)]
+      for col_id, values in six.iteritems(query):
+        col = table.get_column(col_id)
+        try:
+          # Try to use a set for speed.
+          values = set(values)
+        except TypeError:
+          # Values contains an unhashable value, leave it as a list.
+          pass
+        query_cols.append((col, values))
+    row_ids = []
+    for r in table.row_ids:
+      for (c, values) in query_cols:
+        try:
+          if c.raw_get(r) not in values:
+            break
+        except TypeError:
+          # values is a set but c.raw_get(r) is unhashable, so it's definitely not in values
+          break
+      else:
+        # No break, i.e. all columns matched
+        row_ids.append(r)
     for c in six.itervalues(table.all_columns):
       # pylint: disable=too-many-boolean-expressions
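
For context on the `TypeError` handling above, here is a small plain-Python illustration (not part of the diff) of the two cases it covers: `set()` failing when the query values contain something unhashable, and a set membership test failing when the cell value itself is unhashable:

```python
# Two TypeError cases the new fetch_table logic accounts for (plain Python illustration).

try:
  values = set([[1], 2])            # query values contain an unhashable list
except TypeError as e:
  values = [[1], 2]                 # fall back to keeping the values as a list
  print("set(values) failed:", e)

hashable_values = {1, 2}            # all query values were hashable, so a set is used
try:
  print([1] in hashable_values)     # the cell value is an unhashable list
except TypeError as e:
  # An unhashable value can't be in a set of hashable ones, so the row is skipped.
  print("membership test failed:", e)
```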


@@ -557,6 +557,30 @@ class TestEngine(EngineTestCase):
     self.assertEqualDocData({'Address': data},
       {'Address': testutil.table_data_from_rows('Address', col_names, [])})
 
+    # Test unhashable values in the column and in the query
+    self.add_column('Address', 'list', type='Any', isFormula=True,
+                    formula='[1] if $id == 21 else 2')
+    col_names.append('list')
+
+    data = self.engine.fetch_table('Address', query={'list': [[1]]})
+    self.assertEqualDocData({'Address': data},
+      {'Address': testutil.table_data_from_rows('Address', col_names, [
+        [ 21, "New York", "NY", 1, [1]],
+      ])})
+
+    data = self.engine.fetch_table('Address', query={'list': [2]})
+    self.assertEqualDocData({'Address': data},
+      {'Address': testutil.table_data_from_rows('Address', col_names, [
+        [ 22, "Albany", "NY", 2, 2],
+      ])})
+
+    data = self.engine.fetch_table('Address', query={'list': [[1], 2]})
+    self.assertEqualDocData({'Address': data},
+      {'Address': testutil.table_data_from_rows('Address', col_names, [
+        [ 21, "New York", "NY", 1, [1]],
+        [ 22, "Albany", "NY", 2, 2],
+      ])})
+
   def test_schema_restore_on_error(self):
     # Simulate an error inside a DocAction, and make sure we restore the schema (don't leave it
     # inconsistent with metadata).