(core) Python optimizations to speed up data engine

Summary: - A bunch of optimizations guided by python profiling (esp. py-spy) - Big one is optimizing Record/RecordSet attribute access - Adds tracemalloc printout when running test_replay with PYTHONTRACEMALLOC=1 (on PY3) (but memory size is barely affected by these changes) - Testing with RECORD_SANDBOX_BUFFERS_DIR, loading and calculating a particular very large doc (CRM), time taken improved from 73.9s to 54.8s (26% faster) Test Plan: No behavior changes intended; relying on existing tests to verify that. Reviewers: georgegevoian Reviewed By: georgegevoian Differential Revision: https://phab.getgrist.com/D3781
2026-03-02 04:09:24 +00:00 · 2023-02-04 11:20:13 -05:00
parent 7c448d746f
commit 9d4eeda480
8 changed files with 144 additions and 84 deletions
--- a/sandbox/grist/table.py
+++ b/sandbox/grist/table.py
@@ -11,7 +11,7 @@ import docmodel
 import functions
 import logger
 import lookup
-import records
+from records import adjust_record, Record as BaseRecord, RecordSet as BaseRecordSet
 import relation as relation_module    # "relation" is used too much as a variable name below.
 import usertypes

@@ -64,7 +64,8 @@ class UserTable(object):
    """
    Name: lookupRecords
    Usage: UserTable.__lookupRecords__(Field_In_Lookup_Table=value, ...)
-    Returns a [RecordSet](#recordset) matching the given field=value arguments. The value may be any expression,
+    Returns a [RecordSet](#recordset) matching the given field=value arguments. The value may be
+    any expression,
    most commonly a field in the current row (e.g. `$SomeField`) or a constant (e.g. a quoted string
    like `"Some Value"`) (examples below).
    If `sort_by=field` is given, sort the results by that field.
@@ -88,7 +89,8 @@ class UserTable(object):
    """
    Name: lookupOne
    Usage: UserTable.__lookupOne__(Field_In_Lookup_Table=value, ...)
-    Returns a [Record](#record) matching the given field=value arguments. The value may be any expression,
+    Returns a [Record](#record) matching the given field=value arguments. The value may be any
+    expression,
    most commonly a field in the current row (e.g. `$SomeField`) or a constant (e.g. a quoted string
    like `"Some Value"`). If multiple records match, returns one of them. If none match, returns the
    special empty record.
@@ -222,23 +224,24 @@ class Table(object):
    # which are 'flattened' so source records may appear in multiple groups
    self._summary_simple = None

+    # Add Record and RecordSet subclasses with correct `_table` attribute, which will also hold a
+    # field attribute for each column.
+    class Record(BaseRecord):
+      __slots__ = ()
+      _table = self
+
+    class RecordSet(BaseRecordSet):
+      __slots__ = ()
+      _table = self
+
+    self.Record = Record
+    self.RecordSet = RecordSet
+
    # For use in _num_rows. The attribute isn't strictly needed,
    # but it makes _num_rows slightly faster, and only creating the lookup map when _num_rows
    # is called seems to be too late, at least for unit tests.
    self._empty_lookup_column = self._get_lookup_map(())

-    # Add Record and RecordSet subclasses which fill in this table as the first argument
-    class Record(records.Record):
-      def __init__(inner_self, *args, **kwargs):  # pylint: disable=no-self-argument
-        super(Record, inner_self).__init__(self, *args, **kwargs)
-
-    class RecordSet(records.RecordSet):
-      def __init__(inner_self, *args, **kwargs):  # pylint: disable=no-self-argument
-        super(RecordSet, inner_self).__init__(self, *args, **kwargs)
-
-    self.Record = Record
-    self.RecordSet = RecordSet
-
  def _num_rows(self):
    """
    Similar to `len(self.lookup_records())` but faster and doesn't create dependencies.
@@ -301,6 +304,8 @@ class Table(object):
    # column changes should stay the same. These get removed when unneeded using other means.
    new_cols.update(sorted(six.iteritems(self._special_cols)))

+    self._update_record_classes(self.all_columns, new_cols)
+
    # Set the new columns.
    self.all_columns = new_cols

@@ -411,8 +416,7 @@ class Table(object):
      if type(self.get_column(col_id).type_obj) != type(_updateSummary.grist_type):
        self.delete_column(self.get_column(col_id))
    col_obj = self._create_or_update_col(col_id, _updateSummary)
-    self._special_cols[col_id] = col_obj
-    self.all_columns[col_id] = col_obj
+    self._add_special_col(col_obj)

  def get_helper_columns(self):
    """
@@ -509,9 +513,10 @@ class Table(object):
        raise TypeError("sort_by must be a column ID (string)")
      reverse = sort_by.startswith("-")
      sort_col = sort_by.lstrip("-")
+      sort_col_obj = self.all_columns[sort_col]
      row_ids = sorted(
        row_id_set,
-        key=lambda r: column.SafeSortKey(self._get_col_value(sort_col, r, rel)),
+        key=lambda r: column.SafeSortKey(self._get_col_obj_value(sort_col_obj, r, rel)),
        reverse=reverse,
      )
    else:
@@ -542,14 +547,20 @@ class Table(object):
      else:
        column_class = lookup.SimpleLookupMapColumn
      lmap = column_class(self, lookup_col_id, col_ids_tuple)
-      self._special_cols[lookup_col_id] = lmap
-      self.all_columns[lookup_col_id] = lmap
+      self._add_special_col(lmap)
    return lmap

  def delete_column(self, col_obj):
    assert col_obj.table_id == self.table_id
    self._special_cols.pop(col_obj.col_id, None)
    self.all_columns.pop(col_obj.col_id, None)
+    self._remove_field_from_record_classes(col_obj.col_id)
+
+  def _add_special_col(self, col_obj):
+    assert col_obj.table_id == self.table_id
+    self._special_cols[col_obj.col_id] = col_obj
+    self.all_columns[col_obj.col_id] = col_obj
+    self._add_field_to_record_classes(col_obj)

  def lookupOrAddDerived(self, **kwargs):
    record = self.lookup_one_record(**kwargs)
@@ -629,39 +640,73 @@ class Table(object):

  # TODO: document everything here.

-  # Called when record.foo is accessed
-  def _get_col_value(self, col_id, row_id, relation):
-    [value] = self._get_col_subset_raw(col_id, [row_id], relation)
-    return records.adjust_record(relation, value)
+  # Equivalent to accessing record.foo, but only used in very limited cases now (field accessor is
+  # more optimized).
+  def _get_col_obj_value(self, col_obj, row_id, relation):
+    # creates a dependency and brings formula columns up-to-date.
+    self._engine._use_node(col_obj.node, relation, (row_id,))
+    value = col_obj.get_cell_value(row_id)
+    return adjust_record(relation, value)

  def _attribute_error(self, col_id, relation):
    self._engine._use_node(self._new_columns_node, relation)
    raise AttributeError("Table '%s' has no column '%s'" % (self.table_id, col_id))

  # Called when record_set.foo is accessed
-  def _get_col_subset(self, col_id, row_ids, relation):
-    values = self._get_col_subset_raw(col_id, row_ids, relation)
+  def _get_col_obj_subset(self, col_obj, row_ids, relation):
+    self._engine._use_node(col_obj.node, relation, row_ids)
+    values = [col_obj.get_cell_value(row_id) for row_id in row_ids]

    # When all the values are the same type of Record (i.e. all references to the same table)
    # combine them into a single RecordSet for that table instead of a list
    # so that more attribute accesses can be chained,
    # e.g. record_set.foo.bar where `foo` is a Reference column.
    value_types = list(set(map(type, values)))
-    if len(value_types) == 1 and issubclass(value_types[0], records.Record):
-      return records.RecordSet(
-        values[0]._table,
+    if len(value_types) == 1 and issubclass(value_types[0], BaseRecord):
+      return values[0]._table.RecordSet(
        # This is different from row_ids: these are the row IDs referenced by these Records,
        # whereas row_ids are where the values were being stored.
        [val._row_id for val in values],
        relation.compose(values[0]._source_relation),
      )
    else:
-      return [records.adjust_record(relation, value) for value in values]
+      return [adjust_record(relation, value) for value in values]

-  # Internal helper to optimise _get_col_value
-  # so that it doesn't make a singleton RecordSet just to immediately unpack it
-  def _get_col_subset_raw(self, col_id, row_ids, relation):
-    col = self.all_columns[col_id]
-    # creates a dependency and brings formula columns up-to-date.
-    self._engine._use_node(col.node, relation, row_ids)
-    return [col.get_cell_value(row_id) for row_id in row_ids]
+  #----------------------------------------
+
+  def _update_record_classes(self, old_columns, new_columns):
+    for col_id in old_columns:
+      if col_id not in new_columns:
+        self._remove_field_from_record_classes(col_id)
+
+    for col_id, col_obj in six.iteritems(new_columns):
+      if col_obj != old_columns.get(col_id):
+        self._add_field_to_record_classes(col_obj)
+
+  def _add_field_to_record_classes(self, col_obj):
+    node = col_obj.node
+    use_node = self._engine._use_node
+
+    @property
+    def record_field(rec):
+      # This is equivalent to _get_col_obj_value(), but is extra-optimized with _get_col_obj_value()
+      # and adjust_record() inlined, since this is particularly hot code, called on every access of
+      # any data field in a formula.
+      use_node(node, rec._source_relation, (rec._row_id,))
+      value = col_obj.get_cell_value(rec._row_id)
+      if isinstance(value, (BaseRecord, BaseRecordSet)):
+        return value._clone_with_relation(rec._source_relation)
+      return value
+
+    @property
+    def recordset_field(recset):
+      return self._get_col_obj_subset(col_obj, recset._row_ids, recset._source_relation)
+
+    setattr(self.Record, col_obj.col_id, record_field)
+    setattr(self.RecordSet, col_obj.col_id, recordset_field)
+
+  def _remove_field_from_record_classes(self, col_id):
+    if hasattr(self.Record, col_id):
+      delattr(self.Record, col_id)
+    if hasattr(self.RecordSet, col_id):
+      delattr(self.RecordSet, col_id)