(core) Python optimizations to speed up data engine

Summary:
- A bunch of optimizations guided by python profiling (esp. py-spy)
- Big one is optimizing Record/RecordSet attribute access
- Adds tracemalloc printout when running test_replay with PYTHONTRACEMALLOC=1 (on PY3)
  (but memory size is barely affected by these changes)

- Testing with RECORD_SANDBOX_BUFFERS_DIR, loading and calculating a particular
  very large doc (CRM), time taken improved from 73.9s to 54.8s (26% faster)

Test Plan: No behavior changes intended; relying on existing tests to verify that.

Reviewers: georgegevoian

Reviewed By: georgegevoian

Differential Revision: https://phab.getgrist.com/D3781
This commit is contained in:
Dmitry S
2023-02-04 11:20:13 -05:00
parent 7c448d746f
commit 9d4eeda480
8 changed files with 144 additions and 84 deletions

View File

@@ -11,7 +11,7 @@ import docmodel
import functions
import logger
import lookup
import records
from records import adjust_record, Record as BaseRecord, RecordSet as BaseRecordSet
import relation as relation_module # "relation" is used too much as a variable name below.
import usertypes
@@ -64,7 +64,8 @@ class UserTable(object):
"""
Name: lookupRecords
Usage: UserTable.__lookupRecords__(Field_In_Lookup_Table=value, ...)
Returns a [RecordSet](#recordset) matching the given field=value arguments. The value may be any expression,
Returns a [RecordSet](#recordset) matching the given field=value arguments. The value may be
any expression,
most commonly a field in the current row (e.g. `$SomeField`) or a constant (e.g. a quoted string
like `"Some Value"`) (examples below).
If `sort_by=field` is given, sort the results by that field.
@@ -88,7 +89,8 @@ class UserTable(object):
"""
Name: lookupOne
Usage: UserTable.__lookupOne__(Field_In_Lookup_Table=value, ...)
Returns a [Record](#record) matching the given field=value arguments. The value may be any expression,
Returns a [Record](#record) matching the given field=value arguments. The value may be any
expression,
most commonly a field in the current row (e.g. `$SomeField`) or a constant (e.g. a quoted string
like `"Some Value"`). If multiple records match, returns one of them. If none match, returns the
special empty record.
@@ -222,23 +224,24 @@ class Table(object):
# which are 'flattened' so source records may appear in multiple groups
self._summary_simple = None
# Add Record and RecordSet subclasses with correct `_table` attribute, which will also hold a
# field attribute for each column.
class Record(BaseRecord):
__slots__ = ()
_table = self
class RecordSet(BaseRecordSet):
__slots__ = ()
_table = self
self.Record = Record
self.RecordSet = RecordSet
# For use in _num_rows. The attribute isn't strictly needed,
# but it makes _num_rows slightly faster, and only creating the lookup map when _num_rows
# is called seems to be too late, at least for unit tests.
self._empty_lookup_column = self._get_lookup_map(())
# Add Record and RecordSet subclasses which fill in this table as the first argument
class Record(records.Record):
def __init__(inner_self, *args, **kwargs): # pylint: disable=no-self-argument
super(Record, inner_self).__init__(self, *args, **kwargs)
class RecordSet(records.RecordSet):
def __init__(inner_self, *args, **kwargs): # pylint: disable=no-self-argument
super(RecordSet, inner_self).__init__(self, *args, **kwargs)
self.Record = Record
self.RecordSet = RecordSet
def _num_rows(self):
"""
Similar to `len(self.lookup_records())` but faster and doesn't create dependencies.
@@ -301,6 +304,8 @@ class Table(object):
# column changes should stay the same. These get removed when unneeded using other means.
new_cols.update(sorted(six.iteritems(self._special_cols)))
self._update_record_classes(self.all_columns, new_cols)
# Set the new columns.
self.all_columns = new_cols
@@ -411,8 +416,7 @@ class Table(object):
if type(self.get_column(col_id).type_obj) != type(_updateSummary.grist_type):
self.delete_column(self.get_column(col_id))
col_obj = self._create_or_update_col(col_id, _updateSummary)
self._special_cols[col_id] = col_obj
self.all_columns[col_id] = col_obj
self._add_special_col(col_obj)
def get_helper_columns(self):
"""
@@ -509,9 +513,10 @@ class Table(object):
raise TypeError("sort_by must be a column ID (string)")
reverse = sort_by.startswith("-")
sort_col = sort_by.lstrip("-")
sort_col_obj = self.all_columns[sort_col]
row_ids = sorted(
row_id_set,
key=lambda r: column.SafeSortKey(self._get_col_value(sort_col, r, rel)),
key=lambda r: column.SafeSortKey(self._get_col_obj_value(sort_col_obj, r, rel)),
reverse=reverse,
)
else:
@@ -542,14 +547,20 @@ class Table(object):
else:
column_class = lookup.SimpleLookupMapColumn
lmap = column_class(self, lookup_col_id, col_ids_tuple)
self._special_cols[lookup_col_id] = lmap
self.all_columns[lookup_col_id] = lmap
self._add_special_col(lmap)
return lmap
def delete_column(self, col_obj):
assert col_obj.table_id == self.table_id
self._special_cols.pop(col_obj.col_id, None)
self.all_columns.pop(col_obj.col_id, None)
self._remove_field_from_record_classes(col_obj.col_id)
def _add_special_col(self, col_obj):
assert col_obj.table_id == self.table_id
self._special_cols[col_obj.col_id] = col_obj
self.all_columns[col_obj.col_id] = col_obj
self._add_field_to_record_classes(col_obj)
def lookupOrAddDerived(self, **kwargs):
record = self.lookup_one_record(**kwargs)
@@ -629,39 +640,73 @@ class Table(object):
# TODO: document everything here.
# Called when record.foo is accessed
def _get_col_value(self, col_id, row_id, relation):
[value] = self._get_col_subset_raw(col_id, [row_id], relation)
return records.adjust_record(relation, value)
# Equivalent to accessing record.foo, but only used in very limited cases now (field accessor is
# more optimized).
def _get_col_obj_value(self, col_obj, row_id, relation):
# creates a dependency and brings formula columns up-to-date.
self._engine._use_node(col_obj.node, relation, (row_id,))
value = col_obj.get_cell_value(row_id)
return adjust_record(relation, value)
def _attribute_error(self, col_id, relation):
self._engine._use_node(self._new_columns_node, relation)
raise AttributeError("Table '%s' has no column '%s'" % (self.table_id, col_id))
# Called when record_set.foo is accessed
def _get_col_subset(self, col_id, row_ids, relation):
values = self._get_col_subset_raw(col_id, row_ids, relation)
def _get_col_obj_subset(self, col_obj, row_ids, relation):
self._engine._use_node(col_obj.node, relation, row_ids)
values = [col_obj.get_cell_value(row_id) for row_id in row_ids]
# When all the values are the same type of Record (i.e. all references to the same table)
# combine them into a single RecordSet for that table instead of a list
# so that more attribute accesses can be chained,
# e.g. record_set.foo.bar where `foo` is a Reference column.
value_types = list(set(map(type, values)))
if len(value_types) == 1 and issubclass(value_types[0], records.Record):
return records.RecordSet(
values[0]._table,
if len(value_types) == 1 and issubclass(value_types[0], BaseRecord):
return values[0]._table.RecordSet(
# This is different from row_ids: these are the row IDs referenced by these Records,
# whereas row_ids are where the values were being stored.
[val._row_id for val in values],
relation.compose(values[0]._source_relation),
)
else:
return [records.adjust_record(relation, value) for value in values]
return [adjust_record(relation, value) for value in values]
# Internal helper to optimise _get_col_value
# so that it doesn't make a singleton RecordSet just to immediately unpack it
def _get_col_subset_raw(self, col_id, row_ids, relation):
col = self.all_columns[col_id]
# creates a dependency and brings formula columns up-to-date.
self._engine._use_node(col.node, relation, row_ids)
return [col.get_cell_value(row_id) for row_id in row_ids]
#----------------------------------------
def _update_record_classes(self, old_columns, new_columns):
for col_id in old_columns:
if col_id not in new_columns:
self._remove_field_from_record_classes(col_id)
for col_id, col_obj in six.iteritems(new_columns):
if col_obj != old_columns.get(col_id):
self._add_field_to_record_classes(col_obj)
def _add_field_to_record_classes(self, col_obj):
node = col_obj.node
use_node = self._engine._use_node
@property
def record_field(rec):
# This is equivalent to _get_col_obj_value(), but is extra-optimized with _get_col_obj_value()
# and adjust_record() inlined, since this is particularly hot code, called on every access of
# any data field in a formula.
use_node(node, rec._source_relation, (rec._row_id,))
value = col_obj.get_cell_value(rec._row_id)
if isinstance(value, (BaseRecord, BaseRecordSet)):
return value._clone_with_relation(rec._source_relation)
return value
@property
def recordset_field(recset):
return self._get_col_obj_subset(col_obj, recset._row_ids, recset._source_relation)
setattr(self.Record, col_obj.col_id, record_field)
setattr(self.RecordSet, col_obj.col_id, recordset_field)
def _remove_field_from_record_classes(self, col_id):
if hasattr(self.Record, col_id):
delattr(self.Record, col_id)
if hasattr(self.RecordSet, col_id):
delattr(self.RecordSet, col_id)