mirror of
https://github.com/gristlabs/grist-core.git
synced 2026-03-02 04:09:24 +00:00
(core) Python optimizations to speed up data engine
Summary: - A bunch of optimizations guided by python profiling (esp. py-spy) - Big one is optimizing Record/RecordSet attribute access - Adds tracemalloc printout when running test_replay with PYTHONTRACEMALLOC=1 (on PY3) (but memory size is barely affected by these changes) - Testing with RECORD_SANDBOX_BUFFERS_DIR, loading and calculating a particular very large doc (CRM), time taken improved from 73.9s to 54.8s (26% faster) Test Plan: No behavior changes intended; relying on existing tests to verify that. Reviewers: georgegevoian Reviewed By: georgegevoian Differential Revision: https://phab.getgrist.com/D3781
This commit is contained in:
@@ -11,7 +11,7 @@ import docmodel
|
||||
import functions
|
||||
import logger
|
||||
import lookup
|
||||
import records
|
||||
from records import adjust_record, Record as BaseRecord, RecordSet as BaseRecordSet
|
||||
import relation as relation_module # "relation" is used too much as a variable name below.
|
||||
import usertypes
|
||||
|
||||
@@ -64,7 +64,8 @@ class UserTable(object):
|
||||
"""
|
||||
Name: lookupRecords
|
||||
Usage: UserTable.__lookupRecords__(Field_In_Lookup_Table=value, ...)
|
||||
Returns a [RecordSet](#recordset) matching the given field=value arguments. The value may be any expression,
|
||||
Returns a [RecordSet](#recordset) matching the given field=value arguments. The value may be
|
||||
any expression,
|
||||
most commonly a field in the current row (e.g. `$SomeField`) or a constant (e.g. a quoted string
|
||||
like `"Some Value"`) (examples below).
|
||||
If `sort_by=field` is given, sort the results by that field.
|
||||
@@ -88,7 +89,8 @@ class UserTable(object):
|
||||
"""
|
||||
Name: lookupOne
|
||||
Usage: UserTable.__lookupOne__(Field_In_Lookup_Table=value, ...)
|
||||
Returns a [Record](#record) matching the given field=value arguments. The value may be any expression,
|
||||
Returns a [Record](#record) matching the given field=value arguments. The value may be any
|
||||
expression,
|
||||
most commonly a field in the current row (e.g. `$SomeField`) or a constant (e.g. a quoted string
|
||||
like `"Some Value"`). If multiple records match, returns one of them. If none match, returns the
|
||||
special empty record.
|
||||
@@ -222,23 +224,24 @@ class Table(object):
|
||||
# which are 'flattened' so source records may appear in multiple groups
|
||||
self._summary_simple = None
|
||||
|
||||
# Add Record and RecordSet subclasses with correct `_table` attribute, which will also hold a
|
||||
# field attribute for each column.
|
||||
class Record(BaseRecord):
|
||||
__slots__ = ()
|
||||
_table = self
|
||||
|
||||
class RecordSet(BaseRecordSet):
|
||||
__slots__ = ()
|
||||
_table = self
|
||||
|
||||
self.Record = Record
|
||||
self.RecordSet = RecordSet
|
||||
|
||||
# For use in _num_rows. The attribute isn't strictly needed,
|
||||
# but it makes _num_rows slightly faster, and only creating the lookup map when _num_rows
|
||||
# is called seems to be too late, at least for unit tests.
|
||||
self._empty_lookup_column = self._get_lookup_map(())
|
||||
|
||||
# Add Record and RecordSet subclasses which fill in this table as the first argument
|
||||
class Record(records.Record):
|
||||
def __init__(inner_self, *args, **kwargs): # pylint: disable=no-self-argument
|
||||
super(Record, inner_self).__init__(self, *args, **kwargs)
|
||||
|
||||
class RecordSet(records.RecordSet):
|
||||
def __init__(inner_self, *args, **kwargs): # pylint: disable=no-self-argument
|
||||
super(RecordSet, inner_self).__init__(self, *args, **kwargs)
|
||||
|
||||
self.Record = Record
|
||||
self.RecordSet = RecordSet
|
||||
|
||||
def _num_rows(self):
|
||||
"""
|
||||
Similar to `len(self.lookup_records())` but faster and doesn't create dependencies.
|
||||
@@ -301,6 +304,8 @@ class Table(object):
|
||||
# column changes should stay the same. These get removed when unneeded using other means.
|
||||
new_cols.update(sorted(six.iteritems(self._special_cols)))
|
||||
|
||||
self._update_record_classes(self.all_columns, new_cols)
|
||||
|
||||
# Set the new columns.
|
||||
self.all_columns = new_cols
|
||||
|
||||
@@ -411,8 +416,7 @@ class Table(object):
|
||||
if type(self.get_column(col_id).type_obj) != type(_updateSummary.grist_type):
|
||||
self.delete_column(self.get_column(col_id))
|
||||
col_obj = self._create_or_update_col(col_id, _updateSummary)
|
||||
self._special_cols[col_id] = col_obj
|
||||
self.all_columns[col_id] = col_obj
|
||||
self._add_special_col(col_obj)
|
||||
|
||||
def get_helper_columns(self):
|
||||
"""
|
||||
@@ -509,9 +513,10 @@ class Table(object):
|
||||
raise TypeError("sort_by must be a column ID (string)")
|
||||
reverse = sort_by.startswith("-")
|
||||
sort_col = sort_by.lstrip("-")
|
||||
sort_col_obj = self.all_columns[sort_col]
|
||||
row_ids = sorted(
|
||||
row_id_set,
|
||||
key=lambda r: column.SafeSortKey(self._get_col_value(sort_col, r, rel)),
|
||||
key=lambda r: column.SafeSortKey(self._get_col_obj_value(sort_col_obj, r, rel)),
|
||||
reverse=reverse,
|
||||
)
|
||||
else:
|
||||
@@ -542,14 +547,20 @@ class Table(object):
|
||||
else:
|
||||
column_class = lookup.SimpleLookupMapColumn
|
||||
lmap = column_class(self, lookup_col_id, col_ids_tuple)
|
||||
self._special_cols[lookup_col_id] = lmap
|
||||
self.all_columns[lookup_col_id] = lmap
|
||||
self._add_special_col(lmap)
|
||||
return lmap
|
||||
|
||||
def delete_column(self, col_obj):
|
||||
assert col_obj.table_id == self.table_id
|
||||
self._special_cols.pop(col_obj.col_id, None)
|
||||
self.all_columns.pop(col_obj.col_id, None)
|
||||
self._remove_field_from_record_classes(col_obj.col_id)
|
||||
|
||||
def _add_special_col(self, col_obj):
|
||||
assert col_obj.table_id == self.table_id
|
||||
self._special_cols[col_obj.col_id] = col_obj
|
||||
self.all_columns[col_obj.col_id] = col_obj
|
||||
self._add_field_to_record_classes(col_obj)
|
||||
|
||||
def lookupOrAddDerived(self, **kwargs):
|
||||
record = self.lookup_one_record(**kwargs)
|
||||
@@ -629,39 +640,73 @@ class Table(object):
|
||||
|
||||
# TODO: document everything here.
|
||||
|
||||
# Called when record.foo is accessed
|
||||
def _get_col_value(self, col_id, row_id, relation):
|
||||
[value] = self._get_col_subset_raw(col_id, [row_id], relation)
|
||||
return records.adjust_record(relation, value)
|
||||
# Equivalent to accessing record.foo, but only used in very limited cases now (field accessor is
|
||||
# more optimized).
|
||||
def _get_col_obj_value(self, col_obj, row_id, relation):
|
||||
# creates a dependency and brings formula columns up-to-date.
|
||||
self._engine._use_node(col_obj.node, relation, (row_id,))
|
||||
value = col_obj.get_cell_value(row_id)
|
||||
return adjust_record(relation, value)
|
||||
|
||||
def _attribute_error(self, col_id, relation):
|
||||
self._engine._use_node(self._new_columns_node, relation)
|
||||
raise AttributeError("Table '%s' has no column '%s'" % (self.table_id, col_id))
|
||||
|
||||
# Called when record_set.foo is accessed
|
||||
def _get_col_subset(self, col_id, row_ids, relation):
|
||||
values = self._get_col_subset_raw(col_id, row_ids, relation)
|
||||
def _get_col_obj_subset(self, col_obj, row_ids, relation):
|
||||
self._engine._use_node(col_obj.node, relation, row_ids)
|
||||
values = [col_obj.get_cell_value(row_id) for row_id in row_ids]
|
||||
|
||||
# When all the values are the same type of Record (i.e. all references to the same table)
|
||||
# combine them into a single RecordSet for that table instead of a list
|
||||
# so that more attribute accesses can be chained,
|
||||
# e.g. record_set.foo.bar where `foo` is a Reference column.
|
||||
value_types = list(set(map(type, values)))
|
||||
if len(value_types) == 1 and issubclass(value_types[0], records.Record):
|
||||
return records.RecordSet(
|
||||
values[0]._table,
|
||||
if len(value_types) == 1 and issubclass(value_types[0], BaseRecord):
|
||||
return values[0]._table.RecordSet(
|
||||
# This is different from row_ids: these are the row IDs referenced by these Records,
|
||||
# whereas row_ids are where the values were being stored.
|
||||
[val._row_id for val in values],
|
||||
relation.compose(values[0]._source_relation),
|
||||
)
|
||||
else:
|
||||
return [records.adjust_record(relation, value) for value in values]
|
||||
return [adjust_record(relation, value) for value in values]
|
||||
|
||||
# Internal helper to optimise _get_col_value
|
||||
# so that it doesn't make a singleton RecordSet just to immediately unpack it
|
||||
def _get_col_subset_raw(self, col_id, row_ids, relation):
|
||||
col = self.all_columns[col_id]
|
||||
# creates a dependency and brings formula columns up-to-date.
|
||||
self._engine._use_node(col.node, relation, row_ids)
|
||||
return [col.get_cell_value(row_id) for row_id in row_ids]
|
||||
#----------------------------------------
|
||||
|
||||
def _update_record_classes(self, old_columns, new_columns):
|
||||
for col_id in old_columns:
|
||||
if col_id not in new_columns:
|
||||
self._remove_field_from_record_classes(col_id)
|
||||
|
||||
for col_id, col_obj in six.iteritems(new_columns):
|
||||
if col_obj != old_columns.get(col_id):
|
||||
self._add_field_to_record_classes(col_obj)
|
||||
|
||||
def _add_field_to_record_classes(self, col_obj):
|
||||
node = col_obj.node
|
||||
use_node = self._engine._use_node
|
||||
|
||||
@property
|
||||
def record_field(rec):
|
||||
# This is equivalent to _get_col_obj_value(), but is extra-optimized with _get_col_obj_value()
|
||||
# and adjust_record() inlined, since this is particularly hot code, called on every access of
|
||||
# any data field in a formula.
|
||||
use_node(node, rec._source_relation, (rec._row_id,))
|
||||
value = col_obj.get_cell_value(rec._row_id)
|
||||
if isinstance(value, (BaseRecord, BaseRecordSet)):
|
||||
return value._clone_with_relation(rec._source_relation)
|
||||
return value
|
||||
|
||||
@property
|
||||
def recordset_field(recset):
|
||||
return self._get_col_obj_subset(col_obj, recset._row_ids, recset._source_relation)
|
||||
|
||||
setattr(self.Record, col_obj.col_id, record_field)
|
||||
setattr(self.RecordSet, col_obj.col_id, recordset_field)
|
||||
|
||||
def _remove_field_from_record_classes(self, col_id):
|
||||
if hasattr(self.Record, col_id):
|
||||
delattr(self.Record, col_id)
|
||||
if hasattr(self.RecordSet, col_id):
|
||||
delattr(self.RecordSet, col_id)
|
||||
|
||||
Reference in New Issue
Block a user