(core) move data engine code to core

Summary: this moves sandbox/grist to core, and adds a requirements.txt file for reconstructing the content of sandbox/thirdparty. Test Plan: existing tests pass. Tested core functionality manually. Tested docker build manually. Reviewers: dsagal Reviewed By: dsagal Differential Revision: https://phab.getgrist.com/D2563
2026-03-02 04:09:24 +00:00 · 2020-07-27 14:57:36 -04:00
parent 2399baaca2
commit b82eec714a
97 changed files with 29551 additions and 2 deletions
--- a/sandbox/grist/depend.py
+++ b/sandbox/grist/depend.py
@@ -0,0 +1,155 @@
+"""
+depend.py provides classes and functions to manage the dependency graph for grist formulas.
+
+Conceptually, all dependency relationships are the Edges (Node1, Relation, Node2), meaning that
+Node1 depends on Node2. Each Node represents a column in a particular table (could be a derived
+table, such as for subtotals). The Relation determines the row mapping, i.e. which rows in Node1
+column need to be recomputed when a row changes in Node2 column.
+
+When a formula is evaluated, the Record and RecordSet objects maintain a reference to the Relation
+in use, while property access determines which Nodes (or columns) depend on one another.
+"""
+
+# Note: this is partly inspired by the implementation of the ninja build system, see
+# https://github.com/martine/ninja/blob/master/src/graph.h
+
+# Idea for the future: we can consider the concept from ninja of "order-only deps", which are
+# needed before we can build the outputs, but which don't cause the outputs to rebuild. Support
+# for this (with computed values properly persisted) could allow some cool use cases, like columns
+# that recompute manually rather than automatically.
+
+from collections import namedtuple
+from sortedcontainers import SortedSet
+
+class Node(namedtuple('Node', ('table_id', 'col_id'))):
+  """
+  Each Node in the dependency graph represents a column in a table.
+  """
+  __slots__ = ()    # This is a memory-saving device to keep these objects small
+
+  def __str__(self):
+    return '[%s.%s]' % (self.table_id, self.col_id)
+
+
+class Edge(namedtuple('Edge', ('out_node', 'in_node', 'relation'))):
+  """
+  Each Edge connects two Nodes using a Relation. It says that out_node depends on in_node, so that
+  a change to in_node should trigger a recomputation of out_node.
+  """
+  __slots__ = ()    # This is a memory-saving device to keep these objects small
+
+  def __str__(self):
+    return '[%s.%s: %s.%s @ %s]' % (self.out_node.table_id, self.out_node.col_id,
+                                    self.in_node.table_id, self.in_node.col_id, self.relation)
+
+
+class CircularRefError(RuntimeError):
+  """
+  Exception thrown when a formula column references itself, directly or indirectly.
+  """
+  pass
+
+
+class _AllRows(object):
+  """
+  Special constant that indicates to `invalidate_deps` that all rows are affected and an entire
+  column is to be invalidated.
+  """
+  pass
+
+ALL_ROWS = _AllRows()
+
+class Graph(object):
+  """
+  Represents the dependency graph for all data in a grist document.
+  """
+  def __init__(self):
+    # The set of all Edges, i.e. the complete dependency graph.
+    self._all_edges = set()
+
+    # Map from node to the set of edges having it as the in_node (i.e. edges to dependents).
+    self._in_node_map = {}
+
+    # Map from node to the set of edges having it as the out_node (i.e. edges to dependencies).
+    self._out_node_map = {}
+
+  def dump_graph(self):
+    """
+    Print out the graph to stdout, for debugging.
+    """
+    print "Dependency graph (%d edges):" % len(self._all_edges)
+    for edge in sorted(self._all_edges):
+      print "  %s" % (edge,)
+
+  def add_edge(self, out_node, in_node, relation):
+    """
+    Adds an edge to the global dependency graph: out_node depends on in_node, i.e. a change to
+    in_node should trigger a recomputation of out_node.
+    """
+    edge = Edge(out_node, in_node, relation)
+    self._all_edges.add(edge)
+    self._in_node_map.setdefault(edge.in_node, set()).add(edge)
+    self._out_node_map.setdefault(edge.out_node, set()).add(edge)
+
+  def clear_dependencies(self, out_node):
+    """
+    Removes all edges which affect the given out_node, i.e. all of its dependencies.
+    """
+    remove_edges = self._out_node_map.pop(out_node, ())
+    for edge in remove_edges:
+      self._all_edges.remove(edge)
+      self._in_node_map.get(edge.in_node, set()).remove(edge)
+      edge.relation.reset_all()
+
+  def reset_dependencies(self, node, dirty_rows):
+    """
+    For edges the given node depends on, reset the given output rows. This is called just before
+    the rows get recomputed, to allow the relations to clear out state for those rows if needed.
+    """
+    in_edges = self._out_node_map.get(node, ())
+    for edge in in_edges:
+      edge.relation.reset_rows(dirty_rows)
+
+  def remove_node_if_unused(self, node):
+    """
+    Removes the given node if it has no dependents. Returns True if the node is gone, False if the
+    node has dependents.
+    """
+    if self._in_node_map.get(node, None):
+      return False
+    self.clear_dependencies(node)
+    self._in_node_map.pop(node, None)
+    return True
+
+  def invalidate_deps(self, dirty_node, dirty_rows, recompute_map, include_self=True):
+    """
+    Invalidates the given rows in the given node, and all of its dependents, i.e. all the nodes
+    that recursively depend on dirty_node. If include_self is False, then skips the given node
+    (e.g. if the node is raw data rather than formula). Results are added to recompute_map, which
+    is a dict mapping Nodes to sets of rows that need to be recomputed.
+
+    If dirty_rows is ALL_ROWS, the whole column is affected, and dependencies get recomputed from
+    scratch. ALL_ROWS propagates to all dependent columns, so those also get recomputed in full.
+    """
+    if include_self:
+      if recompute_map.get(dirty_node) == ALL_ROWS:
+        return
+      if dirty_rows == ALL_ROWS:
+        recompute_map[dirty_node] = ALL_ROWS
+        # If all rows are being recomputed, clear the dependencies of the affected column. (We add
+        # dependencies in the course of recomputing, but we can only start from an empty set of
+        # dependencies if we are about to recompute all rows.)
+        self.clear_dependencies(dirty_node)
+      else:
+        out_rows = recompute_map.setdefault(dirty_node, SortedSet())
+        prev_count = len(out_rows)
+        out_rows.update(dirty_rows)
+        # Don't bother recursing into dependencies if we didn't actually update anything.
+        if len(out_rows) <= prev_count:
+          return
+
+    # Iterate through a copy of _in_node_map, because recursive clear_dependencies may modify it.
+    for edge in list(self._in_node_map.get(dirty_node, ())):
+      affected_rows = (ALL_ROWS if dirty_rows == ALL_ROWS else
+                       edge.relation.get_affected_rows(dirty_rows))
+      self.invalidate_deps(edge.out_node, affected_rows, recompute_map, include_self=True)