(core) Python optimizations to speed up data engine

Summary: - A bunch of optimizations guided by python profiling (esp. py-spy) - Big one is optimizing Record/RecordSet attribute access - Adds tracemalloc printout when running test_replay with PYTHONTRACEMALLOC=1 (on PY3) (but memory size is barely affected by these changes) - Testing with RECORD_SANDBOX_BUFFERS_DIR, loading and calculating a particular very large doc (CRM), time taken improved from 73.9s to 54.8s (26% faster) Test Plan: No behavior changes intended; relying on existing tests to verify that. Reviewers: georgegevoian Reviewed By: georgegevoian Differential Revision: https://phab.getgrist.com/D3781
2024-10-27 20:44:07 +00:00 · 2023-02-04 11:20:13 -05:00 · 2023-02-04 11:20:13 -05:00 · 9d4eeda480
commit 9d4eeda480
parent 7c448d746f
8 changed files with 144 additions and 84 deletions
--- a/sandbox/grist/column.py
+++ b/sandbox/grist/column.py
@ -60,6 +60,7 @@ class BaseColumn(object):
  """
  def __init__(self, table, col_id, col_info):
    self.type_obj = col_info.type_obj
+    self._is_right_type = self.type_obj.is_right_type
    self._data = []
    self.col_id = col_id
    self.table_id = table.table_id
@ -154,10 +155,14 @@ class BaseColumn(object):
      else:
        raise objtypes.CellError(self.table_id, self.col_id, row_id, raw.error)

-    return self._convert_raw_value(raw)
+    # Inline _convert_raw_value here because this is particularly hot code, called on every access
+    # of any data field in a formula.
+    if self._is_right_type(raw):
+      return self._make_rich_value(raw)
+    return self._alt_text(raw)

  def _convert_raw_value(self, raw):
-    if self.type_obj.is_right_type(raw):
+    if self._is_right_type(raw):
      return self._make_rich_value(raw)
    return self._alt_text(raw)

--- a/sandbox/grist/lookup.py
+++ b/sandbox/grist/lookup.py
@ -161,9 +161,9 @@ class SimpleLookupMapColumn(BaseLookupMapColumn):
  def _recalc_rec_method(self, rec, table):
    old_key = self._row_key_map.lookup_left(rec._row_id)

-    # Note that rec._get_col(_col_id) is what creates the correct dependency, as well as ensures
+    # Note that getattr(rec, _col_id) is what creates the correct dependency, as well as ensures
    # that the columns used to index by are brought up-to-date (in case they are formula columns).
-    new_key = tuple(_extract(rec._get_col(_col_id)) for _col_id in self._col_ids_tuple)
+    new_key = tuple(_extract(getattr(rec, _col_id)) for _col_id in self._col_ids_tuple)

    try:
      self._row_key_map.insert(rec._row_id, new_key)
@ -188,9 +188,9 @@ class ContainsLookupMapColumn(BaseLookupMapColumn):
    # looked up with CONTAINS()
    new_keys_groups = []
    for col_id in self._col_ids_tuple:
-      # Note that _get_col is what creates the correct dependency, as well as ensures
+      # Note that getattr() is what creates the correct dependency, as well as ensures
      # that the columns used to index by are brought up-to-date (in case they are formula columns).
-      group = rec._get_col(extract_column_id(col_id))
+      group = getattr(rec, extract_column_id(col_id))

      if isinstance(col_id, _Contains):
        # Check that the cell targeted by CONTAINS() has an appropriate type.
--- a/sandbox/grist/objtypes.py
+++ b/sandbox/grist/objtypes.py
@ -121,10 +121,8 @@ class CensoredValue(object):
 _censored_sentinel = CensoredValue()


-_max_js_int = 1<<31
-
 def is_int_short(value):
-  return -_max_js_int <= value < _max_js_int
+  return -(1<<31) <= value < (1<<31)

 def safe_shift(arg, default=None):
  value = arg.pop(0) if arg else None
--- a/sandbox/grist/records.py
+++ b/sandbox/grist/records.py
@ -39,8 +39,8 @@ class Record(object):
    Usage: __$group__

    In a [summary table](summary-tables.md), `$group` is a special field
-    containing the list of Records that are summarized by the current summary line.  E.g. the formula
-    `len($group)` counts the number of those records being summarized in each row.
+    containing the list of Records that are summarized by the current summary line.  E.g. the
+    formula `len($group)` counts the number of those records being summarized in each row.

    See [RecordSet](#recordset) for useful properties offered by the returned object.

@ -54,9 +54,15 @@ class Record(object):
    """
  )

+  # Slots are an optimization to avoid the need for a per-object __dict__.
+  __slots__ = ('_row_id', '_source_relation')
+
+  # Per-table derived classes override this and set it to the appropriate Table object.
+  _table = None
+
  # Record is always a thin class, containing essentially a reference to a row in the table. The
  # properties to access individual fields of a row are provided in per-table derived classes.
-  def __init__(self, table, row_id, relation=None):
+  def __init__(self, row_id, relation=None):
    """
    Creates a Record object.
      table - Table object, in which this record lives.
@ -69,20 +75,12 @@ class Record(object):

    which provides the table argument automatically.
    """
-    self._table = table
    self._row_id = row_id
-    self._source_relation = relation or table._identity_relation
+    self._source_relation = relation or self._table._identity_relation

-  def _get_col(self, col_id):
-    return self._table._get_col_value(col_id, self._row_id, self._source_relation)
-
-  # Look up a property of the record.  Internal properties are simple.
-  # For columns, we explicitly check that we have them before attempting to access.
-  # Otherwise AttributeError is ambiguous - it could be because we don't have the
-  # column, or because the column threw an AttributeError when evaluated.
+  # Existing fields are added as @property methods in table.py. When no field is found, raise a
+  # more informative AttributeError.
  def __getattr__(self, name):
-    if name in self._table.all_columns:
-      return self._get_col(name)
    return self._table._attribute_error(name, self._source_relation)

  def __hash__(self):
@ -134,17 +132,23 @@ class RecordSet(object):

  You can get the number of records in a RecordSet using `len`, e.g. `len($group)`.
  """
+
+  # Slots are an optimization to avoid the need for a per-object __dict__.
+  __slots__ = ('_row_ids', '_source_relation', '_group_by', '_sort_by')
+
+  # Per-table derived classes override this and set it to the appropriate Table object.
+  _table = None
+
  # Methods should be named with a leading underscore to avoid interfering with access to
  # user-defined fields.
-  def __init__(self, table, row_ids, relation=None, group_by=None, sort_by=None):
+  def __init__(self, row_ids, relation=None, group_by=None, sort_by=None):
    """
    group_by may be a dictionary mapping column names to values that are all the same for the given
    RecordSet. sort_by may be the column name used for sorting this record set. Both are set by
    lookupRecords, and used when using RecordSet to insert new records.
    """
-    self._table = table
    self._row_ids = row_ids
-    self._source_relation = relation or table._identity_relation
+    self._source_relation = relation or self._table._identity_relation
    # If row_ids is itself a RecordList, default to its _group_by and _sort_by properties.
    self._group_by = group_by or getattr(row_ids, '_group_by', None)
    self._sort_by = sort_by or getattr(row_ids, '_sort_by', None)
@ -188,12 +192,7 @@ class RecordSet(object):
      row_id = min(self._row_ids)
    return self._table.Record(row_id, self._source_relation)

-  def _get_col(self, col_id):
-    return self._table._get_col_subset(col_id, self._row_ids, self._source_relation)
-
  def __getattr__(self, name):
-    if name in self._table.all_columns:
-      return self._get_col(name)
    return self._table._attribute_error(name, self._source_relation)

  def __repr__(self):
--- a/sandbox/grist/table.py
+++ b/sandbox/grist/table.py
@ -11,7 +11,7 @@ import docmodel
 import functions
 import logger
 import lookup
-import records
+from records import adjust_record, Record as BaseRecord, RecordSet as BaseRecordSet
 import relation as relation_module    # "relation" is used too much as a variable name below.
 import usertypes

@ -64,7 +64,8 @@ class UserTable(object):
    """
    Name: lookupRecords
    Usage: UserTable.__lookupRecords__(Field_In_Lookup_Table=value, ...)
-    Returns a [RecordSet](#recordset) matching the given field=value arguments. The value may be any expression,
+    Returns a [RecordSet](#recordset) matching the given field=value arguments. The value may be
+    any expression,
    most commonly a field in the current row (e.g. `$SomeField`) or a constant (e.g. a quoted string
    like `"Some Value"`) (examples below).
    If `sort_by=field` is given, sort the results by that field.
@ -88,7 +89,8 @@ class UserTable(object):
    """
    Name: lookupOne
    Usage: UserTable.__lookupOne__(Field_In_Lookup_Table=value, ...)
-    Returns a [Record](#record) matching the given field=value arguments. The value may be any expression,
+    Returns a [Record](#record) matching the given field=value arguments. The value may be any
+    expression,
    most commonly a field in the current row (e.g. `$SomeField`) or a constant (e.g. a quoted string
    like `"Some Value"`). If multiple records match, returns one of them. If none match, returns the
    special empty record.
@ -222,23 +224,24 @@ class Table(object):
    # which are 'flattened' so source records may appear in multiple groups
    self._summary_simple = None

+    # Add Record and RecordSet subclasses with correct `_table` attribute, which will also hold a
+    # field attribute for each column.
+    class Record(BaseRecord):
+      __slots__ = ()
+      _table = self
+
+    class RecordSet(BaseRecordSet):
+      __slots__ = ()
+      _table = self
+
+    self.Record = Record
+    self.RecordSet = RecordSet
+
    # For use in _num_rows. The attribute isn't strictly needed,
    # but it makes _num_rows slightly faster, and only creating the lookup map when _num_rows
    # is called seems to be too late, at least for unit tests.
    self._empty_lookup_column = self._get_lookup_map(())

-    # Add Record and RecordSet subclasses which fill in this table as the first argument
-    class Record(records.Record):
-      def __init__(inner_self, *args, **kwargs):  # pylint: disable=no-self-argument
-        super(Record, inner_self).__init__(self, *args, **kwargs)
-
-    class RecordSet(records.RecordSet):
-      def __init__(inner_self, *args, **kwargs):  # pylint: disable=no-self-argument
-        super(RecordSet, inner_self).__init__(self, *args, **kwargs)
-
-    self.Record = Record
-    self.RecordSet = RecordSet
-
  def _num_rows(self):
    """
    Similar to `len(self.lookup_records())` but faster and doesn't create dependencies.
@ -301,6 +304,8 @@ class Table(object):
    # column changes should stay the same. These get removed when unneeded using other means.
    new_cols.update(sorted(six.iteritems(self._special_cols)))

+    self._update_record_classes(self.all_columns, new_cols)
+
    # Set the new columns.
    self.all_columns = new_cols

@ -411,8 +416,7 @@ class Table(object):
      if type(self.get_column(col_id).type_obj) != type(_updateSummary.grist_type):
        self.delete_column(self.get_column(col_id))
    col_obj = self._create_or_update_col(col_id, _updateSummary)
-    self._special_cols[col_id] = col_obj
-    self.all_columns[col_id] = col_obj
+    self._add_special_col(col_obj)

  def get_helper_columns(self):
    """
@ -509,9 +513,10 @@ class Table(object):
        raise TypeError("sort_by must be a column ID (string)")
      reverse = sort_by.startswith("-")
      sort_col = sort_by.lstrip("-")
+      sort_col_obj = self.all_columns[sort_col]
      row_ids = sorted(
        row_id_set,
-        key=lambda r: column.SafeSortKey(self._get_col_value(sort_col, r, rel)),
+        key=lambda r: column.SafeSortKey(self._get_col_obj_value(sort_col_obj, r, rel)),
        reverse=reverse,
      )
    else:
@ -542,14 +547,20 @@ class Table(object):
      else:
        column_class = lookup.SimpleLookupMapColumn
      lmap = column_class(self, lookup_col_id, col_ids_tuple)
-      self._special_cols[lookup_col_id] = lmap
-      self.all_columns[lookup_col_id] = lmap
+      self._add_special_col(lmap)
    return lmap

  def delete_column(self, col_obj):
    assert col_obj.table_id == self.table_id
    self._special_cols.pop(col_obj.col_id, None)
    self.all_columns.pop(col_obj.col_id, None)
+    self._remove_field_from_record_classes(col_obj.col_id)
+
+  def _add_special_col(self, col_obj):
+    assert col_obj.table_id == self.table_id
+    self._special_cols[col_obj.col_id] = col_obj
+    self.all_columns[col_obj.col_id] = col_obj
+    self._add_field_to_record_classes(col_obj)

  def lookupOrAddDerived(self, **kwargs):
    record = self.lookup_one_record(**kwargs)
@ -629,39 +640,73 @@ class Table(object):

  # TODO: document everything here.

-  # Called when record.foo is accessed
-  def _get_col_value(self, col_id, row_id, relation):
-    [value] = self._get_col_subset_raw(col_id, [row_id], relation)
-    return records.adjust_record(relation, value)
+  # Equivalent to accessing record.foo, but only used in very limited cases now (field accessor is
+  # more optimized).
+  def _get_col_obj_value(self, col_obj, row_id, relation):
+    # creates a dependency and brings formula columns up-to-date.
+    self._engine._use_node(col_obj.node, relation, (row_id,))
+    value = col_obj.get_cell_value(row_id)
+    return adjust_record(relation, value)

  def _attribute_error(self, col_id, relation):
    self._engine._use_node(self._new_columns_node, relation)
    raise AttributeError("Table '%s' has no column '%s'" % (self.table_id, col_id))

  # Called when record_set.foo is accessed
-  def _get_col_subset(self, col_id, row_ids, relation):
-    values = self._get_col_subset_raw(col_id, row_ids, relation)
+  def _get_col_obj_subset(self, col_obj, row_ids, relation):
+    self._engine._use_node(col_obj.node, relation, row_ids)
+    values = [col_obj.get_cell_value(row_id) for row_id in row_ids]

    # When all the values are the same type of Record (i.e. all references to the same table)
    # combine them into a single RecordSet for that table instead of a list
    # so that more attribute accesses can be chained,
    # e.g. record_set.foo.bar where `foo` is a Reference column.
    value_types = list(set(map(type, values)))
-    if len(value_types) == 1 and issubclass(value_types[0], records.Record):
-      return records.RecordSet(
-        values[0]._table,
+    if len(value_types) == 1 and issubclass(value_types[0], BaseRecord):
+      return values[0]._table.RecordSet(
        # This is different from row_ids: these are the row IDs referenced by these Records,
        # whereas row_ids are where the values were being stored.
        [val._row_id for val in values],
        relation.compose(values[0]._source_relation),
      )
    else:
-      return [records.adjust_record(relation, value) for value in values]
+      return [adjust_record(relation, value) for value in values]

-  # Internal helper to optimise _get_col_value
-  # so that it doesn't make a singleton RecordSet just to immediately unpack it
-  def _get_col_subset_raw(self, col_id, row_ids, relation):
-    col = self.all_columns[col_id]
-    # creates a dependency and brings formula columns up-to-date.
-    self._engine._use_node(col.node, relation, row_ids)
-    return [col.get_cell_value(row_id) for row_id in row_ids]
+  #----------------------------------------
+
+  def _update_record_classes(self, old_columns, new_columns):
+    for col_id in old_columns:
+      if col_id not in new_columns:
+        self._remove_field_from_record_classes(col_id)
+
+    for col_id, col_obj in six.iteritems(new_columns):
+      if col_obj != old_columns.get(col_id):
+        self._add_field_to_record_classes(col_obj)
+
+  def _add_field_to_record_classes(self, col_obj):
+    node = col_obj.node
+    use_node = self._engine._use_node
+
+    @property
+    def record_field(rec):
+      # This is equivalent to _get_col_obj_value(), but is extra-optimized with _get_col_obj_value()
+      # and adjust_record() inlined, since this is particularly hot code, called on every access of
+      # any data field in a formula.
+      use_node(node, rec._source_relation, (rec._row_id,))
+      value = col_obj.get_cell_value(rec._row_id)
+      if isinstance(value, (BaseRecord, BaseRecordSet)):
+        return value._clone_with_relation(rec._source_relation)
+      return value
+
+    @property
+    def recordset_field(recset):
+      return self._get_col_obj_subset(col_obj, recset._row_ids, recset._source_relation)
+
+    setattr(self.Record, col_obj.col_id, record_field)
+    setattr(self.RecordSet, col_obj.col_id, recordset_field)
+
+  def _remove_field_from_record_classes(self, col_id):
+    if hasattr(self.Record, col_id):
+      delattr(self.Record, col_id)
+    if hasattr(self.RecordSet, col_id):
+      delattr(self.RecordSet, col_id)
--- a/sandbox/grist/test_engine.py
+++ b/sandbox/grist/test_engine.py
@ -26,8 +26,9 @@ View = namedtuple('View', 'id sections')
 Section = namedtuple('Section', 'id parentKey tableRef fields')
 Field = namedtuple('Field', 'id colRef')

-unittest.TestCase.assertRaisesRegex = unittest.TestCase.assertRaisesRegexp
-unittest.TestCase.assertRegex = unittest.TestCase.assertRegexpMatches
+if six.PY2:
+  unittest.TestCase.assertRaisesRegex = unittest.TestCase.assertRaisesRegexp
+  unittest.TestCase.assertRegex = unittest.TestCase.assertRegexpMatches

 class EngineTestCase(unittest.TestCase):
  """
--- a/sandbox/grist/test_replay.py
+++ b/sandbox/grist/test_replay.py
@ -44,7 +44,7 @@ import unittest

 from main import run
 from sandbox import Sandbox
-
+import six

 def marshal_load_all(path):
  result = []
@ -65,6 +65,7 @@ class TestReplay(unittest.TestCase):
    root = os.environ.get("RECORD_SANDBOX_BUFFERS_DIR")
    if not root:
      self.skipTest("RECORD_SANDBOX_BUFFERS_DIR not set")
+
    for dirpath, dirnames, filenames in os.walk(root):
      if "input" not in filenames:
        continue
@ -76,9 +77,18 @@ class TestReplay(unittest.TestCase):
      new_output_path = os.path.join(dirpath, "new_output")
      with open(input_path, "rb") as external_input:
        with open(new_output_path, "wb") as external_output:
+          if six.PY3:
+            import tracemalloc          # pylint: disable=import-error
+            tracemalloc.reset_peak()
+
          sandbox = Sandbox(external_input, external_output)
          run(sandbox)

+          # Run with env PYTHONTRACEMALLOC=1 to trace and print peak memory (runs much slower).
+          if six.PY3 and tracemalloc.is_tracing():
+            mem_size, mem_peak = tracemalloc.get_traced_memory()
+            print("mem_size {}, mem_peak {}".format(mem_size, mem_peak))
+
      original_output = marshal_load_all(output_path)

      # _send_to_js does two layers of marshalling,
--- a/sandbox/grist/usertypes.py
+++ b/sandbox/grist/usertypes.py
@ -11,14 +11,16 @@ Python's array.array. However, at least on the Python side, it means that we nee
 data structure for values of the wrong type, and the memory savings aren't that great to be worth
 the extra complexity.
 """
+# pylint: disable=unidiomatic-typecheck
 import csv
 import datetime
 import json
 import math

 import six
+from six import integer_types
 import objtypes
-from objtypes import AltText
+from objtypes import AltText, is_int_short
 import moment
 import logger
 from records import Record, RecordSet
@ -69,6 +71,8 @@ def ifError(value, value_if_error):
  # formulas, but it's unclear how to make that work.
  return value_if_error if isinstance(value, AltText) else value

+_numeric_types = (float,) + six.integer_types
+_numeric_or_none = (float, NoneType) + six.integer_types

 # Unique sentinel object to tell BaseColumnType constructor to use get_type_default().
 _use_type_default = object()
@ -203,7 +207,7 @@ class Bool(BaseColumnType):
    # recognize. Everything else will result in alttext.
    if not value:
      return False
-    if isinstance(value, (float, six.integer_types)):
+    if isinstance(value, _numeric_types):
      return True
    if isinstance(value, AltText):
      value = six.text_type(value)
@ -229,14 +233,13 @@ class Int(BaseColumnType):
      return None
    # Convert to float first, since python does not allow casting strings with decimals to int
    ret = int(float(value))
-    if not objtypes.is_int_short(ret):
+    if not is_int_short(ret):
      raise OverflowError("Integer value too large")
    return ret

  @classmethod
  def is_right_type(cls, value):
-    return value is None or (isinstance(value, six.integer_types) and not isinstance(value, bool) and
-      objtypes.is_int_short(value))
+    return value is None or (type(value) in integer_types and is_int_short(value))


 class Numeric(BaseColumnType):
@ -252,7 +255,7 @@ class Numeric(BaseColumnType):
    # TODO: Python distinguishes ints from floats, while JS only has floats. A value that can be
    # interpreted as an int will upon being entered have type 'float', but after database reload
    # will have type 'int'.
-    return isinstance(value, (float, six.integer_types, NoneType)) and not isinstance(value, bool)
+    return type(value) in _numeric_or_none


 class Date(Numeric):
@ -267,7 +270,7 @@ class Date(Numeric):
      return moment.date_to_ts(value.date())
    elif isinstance(value, datetime.date):
      return moment.date_to_ts(value)
-    elif isinstance(value, (float, six.integer_types)):
+    elif isinstance(value, _numeric_types):
      return float(value)
    elif isinstance(value, six.string_types):
      # We also accept a date in ISO format (YYYY-MM-DD), the time portion is optional and ignored
@ -277,7 +280,7 @@ class Date(Numeric):

  @classmethod
  def is_right_type(cls, value):
-    return isinstance(value, (float, six.integer_types, NoneType))
+    return isinstance(value, _numeric_or_none)


 class DateTime(Date):
@ -299,7 +302,7 @@ class DateTime(Date):
      return moment.dt_to_ts(value, self.timezone)
    elif isinstance(value, datetime.date):
      return moment.date_to_ts(value, self.timezone)
-    elif isinstance(value, (float, six.integer_types)):
+    elif isinstance(value, _numeric_types):
      return float(value)
    elif isinstance(value, six.string_types):
      # We also accept a datetime in ISO format (YYYY-MM-DD[T]HH:mm:ss)
@ -365,7 +368,7 @@ class PositionNumber(BaseColumnType):
  @classmethod
  def is_right_type(cls, value):
    # Same as Numeric, but does not support None.
-    return isinstance(value, (float, six.integer_types)) and not isinstance(value, bool)
+    return type(value) in _numeric_types


 class ManualSortPos(PositionNumber):
@ -387,14 +390,13 @@ class Id(BaseColumnType):
    if not isinstance(value, (int, Record)):
      raise TypeError("Cannot convert to Id type")
    ret = int(value)
-    if not objtypes.is_int_short(ret):
+    if not is_int_short(ret):
      raise OverflowError("Integer value too large")
    return ret

  @classmethod
  def is_right_type(cls, value):
-    return (isinstance(value, six.integer_types) and not isinstance(value, bool) and
-      objtypes.is_int_short(value))
+    return (type(value) in integer_types and is_int_short(value))


 class Reference(Id):