From 9d4eeda48094a7ca2408110149fe3d27fccf165f Mon Sep 17 00:00:00 2001 From: Dmitry S Date: Sat, 4 Feb 2023 11:20:13 -0500 Subject: [PATCH] (core) Python optimizations to speed up data engine Summary: - A bunch of optimizations guided by python profiling (esp. py-spy) - Big one is optimizing Record/RecordSet attribute access - Adds tracemalloc printout when running test_replay with PYTHONTRACEMALLOC=1 (on PY3) (but memory size is barely affected by these changes) - Testing with RECORD_SANDBOX_BUFFERS_DIR, loading and calculating a particular very large doc (CRM), time taken improved from 73.9s to 54.8s (26% faster) Test Plan: No behavior changes intended; relying on existing tests to verify that. Reviewers: georgegevoian Reviewed By: georgegevoian Differential Revision: https://phab.getgrist.com/D3781 --- sandbox/grist/column.py | 9 ++- sandbox/grist/lookup.py | 8 +-- sandbox/grist/objtypes.py | 4 +- sandbox/grist/records.py | 43 ++++++------- sandbox/grist/table.py | 121 ++++++++++++++++++++++++----------- sandbox/grist/test_engine.py | 5 +- sandbox/grist/test_replay.py | 12 +++- sandbox/grist/usertypes.py | 28 ++++---- 8 files changed, 145 insertions(+), 85 deletions(-) diff --git a/sandbox/grist/column.py b/sandbox/grist/column.py index d4432c82..c0ba8906 100644 --- a/sandbox/grist/column.py +++ b/sandbox/grist/column.py @@ -60,6 +60,7 @@ class BaseColumn(object): """ def __init__(self, table, col_id, col_info): self.type_obj = col_info.type_obj + self._is_right_type = self.type_obj.is_right_type self._data = [] self.col_id = col_id self.table_id = table.table_id @@ -154,10 +155,14 @@ class BaseColumn(object): else: raise objtypes.CellError(self.table_id, self.col_id, row_id, raw.error) - return self._convert_raw_value(raw) + # Inline _convert_raw_value here because this is particularly hot code, called on every access + # of any data field in a formula. + if self._is_right_type(raw): + return self._make_rich_value(raw) + return self._alt_text(raw) def _convert_raw_value(self, raw): - if self.type_obj.is_right_type(raw): + if self._is_right_type(raw): return self._make_rich_value(raw) return self._alt_text(raw) diff --git a/sandbox/grist/lookup.py b/sandbox/grist/lookup.py index ebd289fe..96683d8d 100644 --- a/sandbox/grist/lookup.py +++ b/sandbox/grist/lookup.py @@ -161,9 +161,9 @@ class SimpleLookupMapColumn(BaseLookupMapColumn): def _recalc_rec_method(self, rec, table): old_key = self._row_key_map.lookup_left(rec._row_id) - # Note that rec._get_col(_col_id) is what creates the correct dependency, as well as ensures + # Note that getattr(rec, _col_id) is what creates the correct dependency, as well as ensures # that the columns used to index by are brought up-to-date (in case they are formula columns). - new_key = tuple(_extract(rec._get_col(_col_id)) for _col_id in self._col_ids_tuple) + new_key = tuple(_extract(getattr(rec, _col_id)) for _col_id in self._col_ids_tuple) try: self._row_key_map.insert(rec._row_id, new_key) @@ -188,9 +188,9 @@ class ContainsLookupMapColumn(BaseLookupMapColumn): # looked up with CONTAINS() new_keys_groups = [] for col_id in self._col_ids_tuple: - # Note that _get_col is what creates the correct dependency, as well as ensures + # Note that getattr() is what creates the correct dependency, as well as ensures # that the columns used to index by are brought up-to-date (in case they are formula columns). - group = rec._get_col(extract_column_id(col_id)) + group = getattr(rec, extract_column_id(col_id)) if isinstance(col_id, _Contains): # Check that the cell targeted by CONTAINS() has an appropriate type. diff --git a/sandbox/grist/objtypes.py b/sandbox/grist/objtypes.py index cfd9d261..d19894e5 100644 --- a/sandbox/grist/objtypes.py +++ b/sandbox/grist/objtypes.py @@ -121,10 +121,8 @@ class CensoredValue(object): _censored_sentinel = CensoredValue() -_max_js_int = 1<<31 - def is_int_short(value): - return -_max_js_int <= value < _max_js_int + return -(1<<31) <= value < (1<<31) def safe_shift(arg, default=None): value = arg.pop(0) if arg else None diff --git a/sandbox/grist/records.py b/sandbox/grist/records.py index 3f5732dc..e2b124a0 100644 --- a/sandbox/grist/records.py +++ b/sandbox/grist/records.py @@ -39,8 +39,8 @@ class Record(object): Usage: __$group__ In a [summary table](summary-tables.md), `$group` is a special field - containing the list of Records that are summarized by the current summary line. E.g. the formula - `len($group)` counts the number of those records being summarized in each row. + containing the list of Records that are summarized by the current summary line. E.g. the + formula `len($group)` counts the number of those records being summarized in each row. See [RecordSet](#recordset) for useful properties offered by the returned object. @@ -54,9 +54,15 @@ class Record(object): """ ) + # Slots are an optimization to avoid the need for a per-object __dict__. + __slots__ = ('_row_id', '_source_relation') + + # Per-table derived classes override this and set it to the appropriate Table object. + _table = None + # Record is always a thin class, containing essentially a reference to a row in the table. The # properties to access individual fields of a row are provided in per-table derived classes. - def __init__(self, table, row_id, relation=None): + def __init__(self, row_id, relation=None): """ Creates a Record object. table - Table object, in which this record lives. @@ -69,20 +75,12 @@ class Record(object): which provides the table argument automatically. """ - self._table = table self._row_id = row_id - self._source_relation = relation or table._identity_relation + self._source_relation = relation or self._table._identity_relation - def _get_col(self, col_id): - return self._table._get_col_value(col_id, self._row_id, self._source_relation) - - # Look up a property of the record. Internal properties are simple. - # For columns, we explicitly check that we have them before attempting to access. - # Otherwise AttributeError is ambiguous - it could be because we don't have the - # column, or because the column threw an AttributeError when evaluated. + # Existing fields are added as @property methods in table.py. When no field is found, raise a + # more informative AttributeError. def __getattr__(self, name): - if name in self._table.all_columns: - return self._get_col(name) return self._table._attribute_error(name, self._source_relation) def __hash__(self): @@ -134,17 +132,23 @@ class RecordSet(object): You can get the number of records in a RecordSet using `len`, e.g. `len($group)`. """ + + # Slots are an optimization to avoid the need for a per-object __dict__. + __slots__ = ('_row_ids', '_source_relation', '_group_by', '_sort_by') + + # Per-table derived classes override this and set it to the appropriate Table object. + _table = None + # Methods should be named with a leading underscore to avoid interfering with access to # user-defined fields. - def __init__(self, table, row_ids, relation=None, group_by=None, sort_by=None): + def __init__(self, row_ids, relation=None, group_by=None, sort_by=None): """ group_by may be a dictionary mapping column names to values that are all the same for the given RecordSet. sort_by may be the column name used for sorting this record set. Both are set by lookupRecords, and used when using RecordSet to insert new records. """ - self._table = table self._row_ids = row_ids - self._source_relation = relation or table._identity_relation + self._source_relation = relation or self._table._identity_relation # If row_ids is itself a RecordList, default to its _group_by and _sort_by properties. self._group_by = group_by or getattr(row_ids, '_group_by', None) self._sort_by = sort_by or getattr(row_ids, '_sort_by', None) @@ -188,12 +192,7 @@ class RecordSet(object): row_id = min(self._row_ids) return self._table.Record(row_id, self._source_relation) - def _get_col(self, col_id): - return self._table._get_col_subset(col_id, self._row_ids, self._source_relation) - def __getattr__(self, name): - if name in self._table.all_columns: - return self._get_col(name) return self._table._attribute_error(name, self._source_relation) def __repr__(self): diff --git a/sandbox/grist/table.py b/sandbox/grist/table.py index 1e786427..060625ac 100644 --- a/sandbox/grist/table.py +++ b/sandbox/grist/table.py @@ -11,7 +11,7 @@ import docmodel import functions import logger import lookup -import records +from records import adjust_record, Record as BaseRecord, RecordSet as BaseRecordSet import relation as relation_module # "relation" is used too much as a variable name below. import usertypes @@ -64,7 +64,8 @@ class UserTable(object): """ Name: lookupRecords Usage: UserTable.__lookupRecords__(Field_In_Lookup_Table=value, ...) - Returns a [RecordSet](#recordset) matching the given field=value arguments. The value may be any expression, + Returns a [RecordSet](#recordset) matching the given field=value arguments. The value may be + any expression, most commonly a field in the current row (e.g. `$SomeField`) or a constant (e.g. a quoted string like `"Some Value"`) (examples below). If `sort_by=field` is given, sort the results by that field. @@ -88,7 +89,8 @@ class UserTable(object): """ Name: lookupOne Usage: UserTable.__lookupOne__(Field_In_Lookup_Table=value, ...) - Returns a [Record](#record) matching the given field=value arguments. The value may be any expression, + Returns a [Record](#record) matching the given field=value arguments. The value may be any + expression, most commonly a field in the current row (e.g. `$SomeField`) or a constant (e.g. a quoted string like `"Some Value"`). If multiple records match, returns one of them. If none match, returns the special empty record. @@ -222,23 +224,24 @@ class Table(object): # which are 'flattened' so source records may appear in multiple groups self._summary_simple = None - # For use in _num_rows. The attribute isn't strictly needed, - # but it makes _num_rows slightly faster, and only creating the lookup map when _num_rows - # is called seems to be too late, at least for unit tests. - self._empty_lookup_column = self._get_lookup_map(()) + # Add Record and RecordSet subclasses with correct `_table` attribute, which will also hold a + # field attribute for each column. + class Record(BaseRecord): + __slots__ = () + _table = self - # Add Record and RecordSet subclasses which fill in this table as the first argument - class Record(records.Record): - def __init__(inner_self, *args, **kwargs): # pylint: disable=no-self-argument - super(Record, inner_self).__init__(self, *args, **kwargs) - - class RecordSet(records.RecordSet): - def __init__(inner_self, *args, **kwargs): # pylint: disable=no-self-argument - super(RecordSet, inner_self).__init__(self, *args, **kwargs) + class RecordSet(BaseRecordSet): + __slots__ = () + _table = self self.Record = Record self.RecordSet = RecordSet + # For use in _num_rows. The attribute isn't strictly needed, + # but it makes _num_rows slightly faster, and only creating the lookup map when _num_rows + # is called seems to be too late, at least for unit tests. + self._empty_lookup_column = self._get_lookup_map(()) + def _num_rows(self): """ Similar to `len(self.lookup_records())` but faster and doesn't create dependencies. @@ -301,6 +304,8 @@ class Table(object): # column changes should stay the same. These get removed when unneeded using other means. new_cols.update(sorted(six.iteritems(self._special_cols))) + self._update_record_classes(self.all_columns, new_cols) + # Set the new columns. self.all_columns = new_cols @@ -411,8 +416,7 @@ class Table(object): if type(self.get_column(col_id).type_obj) != type(_updateSummary.grist_type): self.delete_column(self.get_column(col_id)) col_obj = self._create_or_update_col(col_id, _updateSummary) - self._special_cols[col_id] = col_obj - self.all_columns[col_id] = col_obj + self._add_special_col(col_obj) def get_helper_columns(self): """ @@ -509,9 +513,10 @@ class Table(object): raise TypeError("sort_by must be a column ID (string)") reverse = sort_by.startswith("-") sort_col = sort_by.lstrip("-") + sort_col_obj = self.all_columns[sort_col] row_ids = sorted( row_id_set, - key=lambda r: column.SafeSortKey(self._get_col_value(sort_col, r, rel)), + key=lambda r: column.SafeSortKey(self._get_col_obj_value(sort_col_obj, r, rel)), reverse=reverse, ) else: @@ -542,14 +547,20 @@ class Table(object): else: column_class = lookup.SimpleLookupMapColumn lmap = column_class(self, lookup_col_id, col_ids_tuple) - self._special_cols[lookup_col_id] = lmap - self.all_columns[lookup_col_id] = lmap + self._add_special_col(lmap) return lmap def delete_column(self, col_obj): assert col_obj.table_id == self.table_id self._special_cols.pop(col_obj.col_id, None) self.all_columns.pop(col_obj.col_id, None) + self._remove_field_from_record_classes(col_obj.col_id) + + def _add_special_col(self, col_obj): + assert col_obj.table_id == self.table_id + self._special_cols[col_obj.col_id] = col_obj + self.all_columns[col_obj.col_id] = col_obj + self._add_field_to_record_classes(col_obj) def lookupOrAddDerived(self, **kwargs): record = self.lookup_one_record(**kwargs) @@ -629,39 +640,73 @@ class Table(object): # TODO: document everything here. - # Called when record.foo is accessed - def _get_col_value(self, col_id, row_id, relation): - [value] = self._get_col_subset_raw(col_id, [row_id], relation) - return records.adjust_record(relation, value) + # Equivalent to accessing record.foo, but only used in very limited cases now (field accessor is + # more optimized). + def _get_col_obj_value(self, col_obj, row_id, relation): + # creates a dependency and brings formula columns up-to-date. + self._engine._use_node(col_obj.node, relation, (row_id,)) + value = col_obj.get_cell_value(row_id) + return adjust_record(relation, value) def _attribute_error(self, col_id, relation): self._engine._use_node(self._new_columns_node, relation) raise AttributeError("Table '%s' has no column '%s'" % (self.table_id, col_id)) # Called when record_set.foo is accessed - def _get_col_subset(self, col_id, row_ids, relation): - values = self._get_col_subset_raw(col_id, row_ids, relation) + def _get_col_obj_subset(self, col_obj, row_ids, relation): + self._engine._use_node(col_obj.node, relation, row_ids) + values = [col_obj.get_cell_value(row_id) for row_id in row_ids] # When all the values are the same type of Record (i.e. all references to the same table) # combine them into a single RecordSet for that table instead of a list # so that more attribute accesses can be chained, # e.g. record_set.foo.bar where `foo` is a Reference column. value_types = list(set(map(type, values))) - if len(value_types) == 1 and issubclass(value_types[0], records.Record): - return records.RecordSet( - values[0]._table, + if len(value_types) == 1 and issubclass(value_types[0], BaseRecord): + return values[0]._table.RecordSet( # This is different from row_ids: these are the row IDs referenced by these Records, # whereas row_ids are where the values were being stored. [val._row_id for val in values], relation.compose(values[0]._source_relation), ) else: - return [records.adjust_record(relation, value) for value in values] - - # Internal helper to optimise _get_col_value - # so that it doesn't make a singleton RecordSet just to immediately unpack it - def _get_col_subset_raw(self, col_id, row_ids, relation): - col = self.all_columns[col_id] - # creates a dependency and brings formula columns up-to-date. - self._engine._use_node(col.node, relation, row_ids) - return [col.get_cell_value(row_id) for row_id in row_ids] + return [adjust_record(relation, value) for value in values] + + #---------------------------------------- + + def _update_record_classes(self, old_columns, new_columns): + for col_id in old_columns: + if col_id not in new_columns: + self._remove_field_from_record_classes(col_id) + + for col_id, col_obj in six.iteritems(new_columns): + if col_obj != old_columns.get(col_id): + self._add_field_to_record_classes(col_obj) + + def _add_field_to_record_classes(self, col_obj): + node = col_obj.node + use_node = self._engine._use_node + + @property + def record_field(rec): + # This is equivalent to _get_col_obj_value(), but is extra-optimized with _get_col_obj_value() + # and adjust_record() inlined, since this is particularly hot code, called on every access of + # any data field in a formula. + use_node(node, rec._source_relation, (rec._row_id,)) + value = col_obj.get_cell_value(rec._row_id) + if isinstance(value, (BaseRecord, BaseRecordSet)): + return value._clone_with_relation(rec._source_relation) + return value + + @property + def recordset_field(recset): + return self._get_col_obj_subset(col_obj, recset._row_ids, recset._source_relation) + + setattr(self.Record, col_obj.col_id, record_field) + setattr(self.RecordSet, col_obj.col_id, recordset_field) + + def _remove_field_from_record_classes(self, col_id): + if hasattr(self.Record, col_id): + delattr(self.Record, col_id) + if hasattr(self.RecordSet, col_id): + delattr(self.RecordSet, col_id) diff --git a/sandbox/grist/test_engine.py b/sandbox/grist/test_engine.py index 7a523f5c..357e3c9d 100644 --- a/sandbox/grist/test_engine.py +++ b/sandbox/grist/test_engine.py @@ -26,8 +26,9 @@ View = namedtuple('View', 'id sections') Section = namedtuple('Section', 'id parentKey tableRef fields') Field = namedtuple('Field', 'id colRef') -unittest.TestCase.assertRaisesRegex = unittest.TestCase.assertRaisesRegexp -unittest.TestCase.assertRegex = unittest.TestCase.assertRegexpMatches +if six.PY2: + unittest.TestCase.assertRaisesRegex = unittest.TestCase.assertRaisesRegexp + unittest.TestCase.assertRegex = unittest.TestCase.assertRegexpMatches class EngineTestCase(unittest.TestCase): """ diff --git a/sandbox/grist/test_replay.py b/sandbox/grist/test_replay.py index 1ad2bc8e..85f005f1 100644 --- a/sandbox/grist/test_replay.py +++ b/sandbox/grist/test_replay.py @@ -44,7 +44,7 @@ import unittest from main import run from sandbox import Sandbox - +import six def marshal_load_all(path): result = [] @@ -65,6 +65,7 @@ class TestReplay(unittest.TestCase): root = os.environ.get("RECORD_SANDBOX_BUFFERS_DIR") if not root: self.skipTest("RECORD_SANDBOX_BUFFERS_DIR not set") + for dirpath, dirnames, filenames in os.walk(root): if "input" not in filenames: continue @@ -76,9 +77,18 @@ class TestReplay(unittest.TestCase): new_output_path = os.path.join(dirpath, "new_output") with open(input_path, "rb") as external_input: with open(new_output_path, "wb") as external_output: + if six.PY3: + import tracemalloc # pylint: disable=import-error + tracemalloc.reset_peak() + sandbox = Sandbox(external_input, external_output) run(sandbox) + # Run with env PYTHONTRACEMALLOC=1 to trace and print peak memory (runs much slower). + if six.PY3 and tracemalloc.is_tracing(): + mem_size, mem_peak = tracemalloc.get_traced_memory() + print("mem_size {}, mem_peak {}".format(mem_size, mem_peak)) + original_output = marshal_load_all(output_path) # _send_to_js does two layers of marshalling, diff --git a/sandbox/grist/usertypes.py b/sandbox/grist/usertypes.py index c2899cfd..64c17fd5 100644 --- a/sandbox/grist/usertypes.py +++ b/sandbox/grist/usertypes.py @@ -11,14 +11,16 @@ Python's array.array. However, at least on the Python side, it means that we nee data structure for values of the wrong type, and the memory savings aren't that great to be worth the extra complexity. """ +# pylint: disable=unidiomatic-typecheck import csv import datetime import json import math import six +from six import integer_types import objtypes -from objtypes import AltText +from objtypes import AltText, is_int_short import moment import logger from records import Record, RecordSet @@ -69,6 +71,8 @@ def ifError(value, value_if_error): # formulas, but it's unclear how to make that work. return value_if_error if isinstance(value, AltText) else value +_numeric_types = (float,) + six.integer_types +_numeric_or_none = (float, NoneType) + six.integer_types # Unique sentinel object to tell BaseColumnType constructor to use get_type_default(). _use_type_default = object() @@ -203,7 +207,7 @@ class Bool(BaseColumnType): # recognize. Everything else will result in alttext. if not value: return False - if isinstance(value, (float, six.integer_types)): + if isinstance(value, _numeric_types): return True if isinstance(value, AltText): value = six.text_type(value) @@ -229,14 +233,13 @@ class Int(BaseColumnType): return None # Convert to float first, since python does not allow casting strings with decimals to int ret = int(float(value)) - if not objtypes.is_int_short(ret): + if not is_int_short(ret): raise OverflowError("Integer value too large") return ret @classmethod def is_right_type(cls, value): - return value is None or (isinstance(value, six.integer_types) and not isinstance(value, bool) and - objtypes.is_int_short(value)) + return value is None or (type(value) in integer_types and is_int_short(value)) class Numeric(BaseColumnType): @@ -252,7 +255,7 @@ class Numeric(BaseColumnType): # TODO: Python distinguishes ints from floats, while JS only has floats. A value that can be # interpreted as an int will upon being entered have type 'float', but after database reload # will have type 'int'. - return isinstance(value, (float, six.integer_types, NoneType)) and not isinstance(value, bool) + return type(value) in _numeric_or_none class Date(Numeric): @@ -267,7 +270,7 @@ class Date(Numeric): return moment.date_to_ts(value.date()) elif isinstance(value, datetime.date): return moment.date_to_ts(value) - elif isinstance(value, (float, six.integer_types)): + elif isinstance(value, _numeric_types): return float(value) elif isinstance(value, six.string_types): # We also accept a date in ISO format (YYYY-MM-DD), the time portion is optional and ignored @@ -277,7 +280,7 @@ class Date(Numeric): @classmethod def is_right_type(cls, value): - return isinstance(value, (float, six.integer_types, NoneType)) + return isinstance(value, _numeric_or_none) class DateTime(Date): @@ -299,7 +302,7 @@ class DateTime(Date): return moment.dt_to_ts(value, self.timezone) elif isinstance(value, datetime.date): return moment.date_to_ts(value, self.timezone) - elif isinstance(value, (float, six.integer_types)): + elif isinstance(value, _numeric_types): return float(value) elif isinstance(value, six.string_types): # We also accept a datetime in ISO format (YYYY-MM-DD[T]HH:mm:ss) @@ -365,7 +368,7 @@ class PositionNumber(BaseColumnType): @classmethod def is_right_type(cls, value): # Same as Numeric, but does not support None. - return isinstance(value, (float, six.integer_types)) and not isinstance(value, bool) + return type(value) in _numeric_types class ManualSortPos(PositionNumber): @@ -387,14 +390,13 @@ class Id(BaseColumnType): if not isinstance(value, (int, Record)): raise TypeError("Cannot convert to Id type") ret = int(value) - if not objtypes.is_int_short(ret): + if not is_int_short(ret): raise OverflowError("Integer value too large") return ret @classmethod def is_right_type(cls, value): - return (isinstance(value, six.integer_types) and not isinstance(value, bool) and - objtypes.is_int_short(value)) + return (type(value) in integer_types and is_int_short(value)) class Reference(Id):