gristlabs_grist-core/sandbox/grist/depend.py

"""
depend.py provides classes and functions to manage the dependency graph for grist formulas.

Conceptually, all dependency relationships are the Edges (Node1, Relation, Node2), meaning that
Node1 depends on Node2. Each Node represents a column in a particular table (could be a derived
table, such as for subtotals). The Relation determines the row mapping, i.e. which rows in Node1
column need to be recomputed when a row changes in Node2 column.

When a formula is evaluated, the Record and RecordSet objects maintain a reference to the Relation
in use, while property access determines which Nodes (or columns) depend on one another.
"""

# Note: this is partly inspired by the implementation of the ninja build system, see
# https://github.com/martine/ninja/blob/master/src/graph.h

# Idea for the future: we can consider the concept from ninja of "order-only deps", which are
# needed before we can build the outputs, but which don't cause the outputs to rebuild. Support
# for this (with computed values properly persisted) could allow some cool use cases, like columns
# that recompute manually rather than automatically.

from collections import namedtuple
from sortedcontainers import SortedSet

class Node(namedtuple('Node', ('table_id', 'col_id'))):
  """
  Each Node in the dependency graph represents a column in a table.
  """
  __slots__ = ()    # This is a memory-saving device to keep these objects small

  def __str__(self):
    return '[%s.%s]' % (self.table_id, self.col_id)


class Edge(namedtuple('Edge', ('out_node', 'in_node', 'relation'))):
  """
  Each Edge connects two Nodes using a Relation. It says that out_node depends on in_node, so that
  a change to in_node should trigger a recomputation of out_node.
  """
  __slots__ = ()    # This is a memory-saving device to keep these objects small

  def __str__(self):
    return '[%s.%s: %s.%s @ %s]' % (self.out_node.table_id, self.out_node.col_id,
                                    self.in_node.table_id, self.in_node.col_id, self.relation)


class CircularRefError(RuntimeError):
  """
  Exception thrown when a formula column references itself, directly or indirectly.
  """
  pass


class _AllRows(object):
  """
  Special constant that indicates to `invalidate_deps` that all rows are affected and an entire
  column is to be invalidated.
  """
  pass

ALL_ROWS = _AllRows()

class Graph(object):
  """
  Represents the dependency graph for all data in a grist document.
  """
  def __init__(self):
    # The set of all Edges, i.e. the complete dependency graph.
    self._all_edges = set()

    # Map from node to the set of edges having it as the in_node (i.e. edges to dependents).
    self._in_node_map = {}

    # Map from node to the set of edges having it as the out_node (i.e. edges to dependencies).
    self._out_node_map = {}

  def dump_graph(self):
    """
    Print out the graph to stdout, for debugging.
    """
    print "Dependency graph (%d edges):" % len(self._all_edges)
    for edge in sorted(self._all_edges):
      print "  %s" % (edge,)

  def add_edge(self, out_node, in_node, relation):
    """
    Adds an edge to the global dependency graph: out_node depends on in_node, i.e. a change to
    in_node should trigger a recomputation of out_node.
    """
    edge = Edge(out_node, in_node, relation)
    self._all_edges.add(edge)
    self._in_node_map.setdefault(edge.in_node, set()).add(edge)
    self._out_node_map.setdefault(edge.out_node, set()).add(edge)

  def clear_dependencies(self, out_node):
    """
    Removes all edges which affect the given out_node, i.e. all of its dependencies.
    """
    remove_edges = self._out_node_map.pop(out_node, ())
    for edge in remove_edges:
      self._all_edges.remove(edge)
      self._in_node_map.get(edge.in_node, set()).remove(edge)
      edge.relation.reset_all()

  def reset_dependencies(self, node, dirty_rows):
    """
    For edges the given node depends on, reset the given output rows. This is called just before
    the rows get recomputed, to allow the relations to clear out state for those rows if needed.
    """
    in_edges = self._out_node_map.get(node, ())
    for edge in in_edges:
      edge.relation.reset_rows(dirty_rows)

  def remove_node_if_unused(self, node):
    """
    Removes the given node if it has no dependents. Returns True if the node is gone, False if the
    node has dependents.
    """
    if self._in_node_map.get(node, None):
      return False
    self.clear_dependencies(node)
    self._in_node_map.pop(node, None)
    return True

  def invalidate_deps(self, dirty_node, dirty_rows, recompute_map, include_self=True):
    """
    Invalidates the given rows in the given node, and all of its dependents, i.e. all the nodes
    that recursively depend on dirty_node. If include_self is False, then skips the given node
    (e.g. if the node is raw data rather than formula). Results are added to recompute_map, which
    is a dict mapping Nodes to sets of rows that need to be recomputed.

    If dirty_rows is ALL_ROWS, the whole column is affected, and dependencies get recomputed from
    scratch. ALL_ROWS propagates to all dependent columns, so those also get recomputed in full.
    """
    if include_self:
      if recompute_map.get(dirty_node) == ALL_ROWS:
        return
      if dirty_rows == ALL_ROWS:
        recompute_map[dirty_node] = ALL_ROWS
        # If all rows are being recomputed, clear the dependencies of the affected column. (We add
        # dependencies in the course of recomputing, but we can only start from an empty set of
        # dependencies if we are about to recompute all rows.)
        self.clear_dependencies(dirty_node)
      else:
        out_rows = recompute_map.setdefault(dirty_node, SortedSet())
        prev_count = len(out_rows)
        out_rows.update(dirty_rows)
        # Don't bother recursing into dependencies if we didn't actually update anything.
        if len(out_rows) <= prev_count:
          return

    # Iterate through a copy of _in_node_map, because recursive clear_dependencies may modify it.
    for edge in list(self._in_node_map.get(dirty_node, ())):
      affected_rows = (ALL_ROWS if dirty_rows == ALL_ROWS else
                       edge.relation.get_affected_rows(dirty_rows))
      self.invalidate_deps(edge.out_node, affected_rows, recompute_map, include_self=True)
(core) move data engine code to core Summary: this moves sandbox/grist to core, and adds a requirements.txt file for reconstructing the content of sandbox/thirdparty. Test Plan: existing tests pass. Tested core functionality manually. Tested docker build manually. Reviewers: dsagal Reviewed By: dsagal Differential Revision: https://phab.getgrist.com/D2563 2020-07-27 18:57:36 +00:00			`"""`
			`depend.py provides classes and functions to manage the dependency graph for grist formulas.`

			`Conceptually, all dependency relationships are the Edges (Node1, Relation, Node2), meaning that`
			`Node1 depends on Node2. Each Node represents a column in a particular table (could be a derived`
			`table, such as for subtotals). The Relation determines the row mapping, i.e. which rows in Node1`
			`column need to be recomputed when a row changes in Node2 column.`

			`When a formula is evaluated, the Record and RecordSet objects maintain a reference to the Relation`
			`in use, while property access determines which Nodes (or columns) depend on one another.`
			`"""`

			`# Note: this is partly inspired by the implementation of the ninja build system, see`
			`# https://github.com/martine/ninja/blob/master/src/graph.h`

			`# Idea for the future: we can consider the concept from ninja of "order-only deps", which are`
			`# needed before we can build the outputs, but which don't cause the outputs to rebuild. Support`
			`# for this (with computed values properly persisted) could allow some cool use cases, like columns`
			`# that recompute manually rather than automatically.`

			`from collections import namedtuple`
			`from sortedcontainers import SortedSet`

			`class Node(namedtuple('Node', ('table_id', 'col_id'))):`
			`"""`
			`Each Node in the dependency graph represents a column in a table.`
			`"""`
			`__slots__ = () # This is a memory-saving device to keep these objects small`

			`def __str__(self):`
			`return '[%s.%s]' % (self.table_id, self.col_id)`


			`class Edge(namedtuple('Edge', ('out_node', 'in_node', 'relation'))):`
			`"""`
			`Each Edge connects two Nodes using a Relation. It says that out_node depends on in_node, so that`
			`a change to in_node should trigger a recomputation of out_node.`
			`"""`
			`__slots__ = () # This is a memory-saving device to keep these objects small`

			`def __str__(self):`
			`return '[%s.%s: %s.%s @ %s]' % (self.out_node.table_id, self.out_node.col_id,`
			`self.in_node.table_id, self.in_node.col_id, self.relation)`


			`class CircularRefError(RuntimeError):`
			`"""`
			`Exception thrown when a formula column references itself, directly or indirectly.`
			`"""`
			`pass`


			`class _AllRows(object):`
			`"""`
			Special constant that indicates to `invalidate_deps` that all rows are affected and an entire
			`column is to be invalidated.`
			`"""`
			`pass`

			`ALL_ROWS = _AllRows()`

			`class Graph(object):`
			`"""`
			`Represents the dependency graph for all data in a grist document.`
			`"""`
			`def __init__(self):`
			`# The set of all Edges, i.e. the complete dependency graph.`
			`self._all_edges = set()`

			`# Map from node to the set of edges having it as the in_node (i.e. edges to dependents).`
			`self._in_node_map = {}`

			`# Map from node to the set of edges having it as the out_node (i.e. edges to dependencies).`
			`self._out_node_map = {}`

			`def dump_graph(self):`
			`"""`
			`Print out the graph to stdout, for debugging.`
			`"""`
			`print "Dependency graph (%d edges):" % len(self._all_edges)`
			`for edge in sorted(self._all_edges):`
			`print " %s" % (edge,)`

			`def add_edge(self, out_node, in_node, relation):`
			`"""`
			`Adds an edge to the global dependency graph: out_node depends on in_node, i.e. a change to`
			`in_node should trigger a recomputation of out_node.`
			`"""`
			`edge = Edge(out_node, in_node, relation)`
			`self._all_edges.add(edge)`
			`self._in_node_map.setdefault(edge.in_node, set()).add(edge)`
			`self._out_node_map.setdefault(edge.out_node, set()).add(edge)`

			`def clear_dependencies(self, out_node):`
			`"""`
			`Removes all edges which affect the given out_node, i.e. all of its dependencies.`
			`"""`
			`remove_edges = self._out_node_map.pop(out_node, ())`
			`for edge in remove_edges:`
			`self._all_edges.remove(edge)`
			`self._in_node_map.get(edge.in_node, set()).remove(edge)`
			`edge.relation.reset_all()`

			`def reset_dependencies(self, node, dirty_rows):`
			`"""`
			`For edges the given node depends on, reset the given output rows. This is called just before`
			`the rows get recomputed, to allow the relations to clear out state for those rows if needed.`
			`"""`
			`in_edges = self._out_node_map.get(node, ())`
			`for edge in in_edges:`
			`edge.relation.reset_rows(dirty_rows)`

			`def remove_node_if_unused(self, node):`
			`"""`
			`Removes the given node if it has no dependents. Returns True if the node is gone, False if the`
			`node has dependents.`
			`"""`
			`if self._in_node_map.get(node, None):`
			`return False`
			`self.clear_dependencies(node)`
			`self._in_node_map.pop(node, None)`
			`return True`

			`def invalidate_deps(self, dirty_node, dirty_rows, recompute_map, include_self=True):`
			`"""`
			`Invalidates the given rows in the given node, and all of its dependents, i.e. all the nodes`
			`that recursively depend on dirty_node. If include_self is False, then skips the given node`
			`(e.g. if the node is raw data rather than formula). Results are added to recompute_map, which`
			`is a dict mapping Nodes to sets of rows that need to be recomputed.`

			`If dirty_rows is ALL_ROWS, the whole column is affected, and dependencies get recomputed from`
			`scratch. ALL_ROWS propagates to all dependent columns, so those also get recomputed in full.`
			`"""`
			`if include_self:`
			`if recompute_map.get(dirty_node) == ALL_ROWS:`
			`return`
			`if dirty_rows == ALL_ROWS:`
			`recompute_map[dirty_node] = ALL_ROWS`
			`# If all rows are being recomputed, clear the dependencies of the affected column. (We add`
			`# dependencies in the course of recomputing, but we can only start from an empty set of`
			`# dependencies if we are about to recompute all rows.)`
			`self.clear_dependencies(dirty_node)`
			`else:`
			`out_rows = recompute_map.setdefault(dirty_node, SortedSet())`
			`prev_count = len(out_rows)`
			`out_rows.update(dirty_rows)`
			`# Don't bother recursing into dependencies if we didn't actually update anything.`
			`if len(out_rows) <= prev_count:`
			`return`

			`# Iterate through a copy of _in_node_map, because recursive clear_dependencies may modify it.`
			`for edge in list(self._in_node_map.get(dirty_node, ())):`
			`affected_rows = (ALL_ROWS if dirty_rows == ALL_ROWS else`
			`edge.relation.get_affected_rows(dirty_rows))`
			`self.invalidate_deps(edge.out_node, affected_rows, recompute_map, include_self=True)`