gristlabs_grist-core/sandbox/grist/records.py
Dmitry S f0d0a07295 (core) Implement PREVIOUS/NEXT/RANK and lookupRecords().find.* methods.
Summary:
- `lookupRecords()` now allows efficient search in sorted results, with
  the syntax  `lookupRecords(..., order_by="-Date").find.le($Date)`. This will find the record with the nearest date that's <= `$Date`.
- The `find.*` methods are `le`, `lt`, `ge`, `gt`, and `eq`. All have O(log N) performance.
- `PREVIOUS(rec, group_by=..., order_by=...)` finds the previous record to rec, according to `group_by` / `order_by`, in amortized O(log N) time. For example, `PREVIOUS(rec, group_by="Account", order_by="Date")`.
- `PREVIOUS(rec, order_by=None)` finds the previous record in the full table, sorted by the `manualSort` column, to match the order visible in the unsorted table.
- `NEXT(...)` is just like `PREVIOUS(...)` but finds the next record.
- `RANK(rec, group_by=..., order_by=..., order="asc")` returns the rank of the record within the group, starting with 1. Order can be `"asc"` (default) or `"desc"`.
- The `order_by` argument in `lookupRecords` and in the new functions now supports tuples, as well as the "-" prefix to reverse order, e.g. `("Category", "-Date")`.
- New functions are only available in Python3, for a minor reason (to support keyword-only arguments for `group_by` and `order_by`) and also as a nudge to Python2 users to update.

- Includes fixes for several situations related to lookups that used to cause quadratic complexity.

Test Plan:
- New performance check that sorted lookups don't add quadratic complexity.
- Tests added for lookup find.* methods, and for PREVIOUS/NEXT/RANK.
- Tests added that renaming columns updates `order_by` and `group_by` arguments, and attributes on results (e.g. `PREVIOUS(...).ColId`) appropriately.
- Python3 tests can now produce verbose output when VERBOSE=1 and -v are given.

Reviewers: jarek, georgegevoian

Reviewed By: jarek, georgegevoian

Subscribers: paulfitz, jarek

Differential Revision: https://phab.getgrist.com/D4265
2024-07-17 12:00:55 -04:00

368 lines
13 KiB
Python

"""
Implements the base classes for Record and RecordSet objects used to represent records in Grist
tables. Individual tables use derived versions of these, which add per-column properties.
"""
from bisect import bisect_left, bisect_right
import functools
import sys
import six
@functools.total_ordering
class Record(object):
    """
    Name: Record, rec

    A Record represents a record of data. It is the primary means of accessing values in
    formulas. A Record for a particular table has a property for each data and formula column in
    the table.

    In a formula, `$field` is translated to `rec.field`, where `rec` is the Record for which the
    formula is being evaluated.

    For example:
    ```
    def Full_Name(rec, table):
        return rec.First_Name + ' ' + rec.Last_Name

    def Name_Length(rec, table):
        return len(rec.Full_Name)
    ```
    """

    # Some documentation for method-like parts of Record, which aren't actually methods.
    _DOC_EXTRA = (
        """
        Name: $Field, rec.Field
        Usage: __$__*Field* or __rec__*.Field*
        Access the field named "Field" of the current record. E.g. `$First_Name` or `rec.First_Name`.
        """,
        """
        Name: $group, rec.group
        Usage: __$group__
        In a [summary table](summary-tables.md), `$group` is a special field
        containing the list of Records that are summarized by the current summary line. E.g. the
        formula `len($group)` counts the number of those records being summarized in each row.
        See [RecordSet](#recordset) for useful properties offered by the returned object.
        Examples:
        ```
        sum($group.Amount) # Sum of the Amount field in the matching records
        sum(r.Amount for r in $group) # Same as sum($group.Amount)
        sum(r.Amount for r in $group if r.Amount > 0) # Sum of only the positive amounts
        sum(r.Shares * r.Price for r in $group) # Sum of shares * price products
        ```
        """
    )

    # Slots are an optimization to avoid the need for a per-object __dict__.
    __slots__ = ('_row_id', '_source_relation')

    # Per-table derived classes override this and set it to the appropriate Table object.
    _table = None

    # Record is always a thin class, containing essentially a reference to a row in the table. The
    # properties to access individual fields of a row are provided in per-table derived classes.
    def __init__(self, row_id, relation=None):
        """
        Creates a Record object.
          row_id   - The ID of the record within the table.
          relation - Relation object for how this record was obtained; used in dependency
                     tracking. When omitted (or falsy), the table's identity relation is used.

        In general you shouldn't call this constructor directly, but rather:
          table.Record(row_id, relation)
        which goes through the per-table derived class, whose `_table` is already set.
        """
        self._row_id = row_id
        self._source_relation = relation or self._table._identity_relation

    # Existing fields are added as @property methods in table.py. When no field is found, raise a
    # more informative AttributeError.
    def __getattr__(self, name):
        return self._table._attribute_error(name, self._source_relation)

    def __hash__(self):
        # Hash on the same (table, row id) pair that __eq__ compares.
        return hash((self._table, self._row_id))

    def __eq__(self, other):
        return (isinstance(other, Record) and
                (self._table, self._row_id) == (other._table, other._row_id))

    def __ne__(self, other):
        return not self.__eq__(other)

    def __lt__(self, other):
        """
        Orders records by (table_id, row_id). functools.total_ordering derives the remaining
        ordering comparisons from this method and __eq__.
        """
        # Decline to compare with anything that isn't a Record, so that Python can try the
        # reflected operation or raise its standard TypeError, rather than failing with an
        # AttributeError on `other._table`.
        if not isinstance(other, Record):
            return NotImplemented
        return (self._table.table_id, self._row_id) < (other._table.table_id, other._row_id)

    def __int__(self):
        return self._row_id

    def __nonzero__(self):
        # A record with row id 0 (the empty/sample record) is falsy.
        return bool(self._row_id)

    __bool__ = __nonzero__

    def __repr__(self):
        return "%s[%s]" % (self._table.table_id, self._row_id)

    def _clone_with_relation(self, src_relation):
        """
        Returns a Record for the same row whose source relation is the composition of
        src_relation with this record's own source relation.
        """
        return self._table.Record(self._row_id,
                                  relation=src_relation.compose(self._source_relation))
class RecordSet(object):
    """
    A RecordSet represents a collection of records, as returned by `Table.lookupRecords()` or
    `$group` property in summary views.

    A RecordSet allows iterating through the records:
    ```
    sum(r.Amount for r in Students.lookupRecords(First_Name="John", Last_Name="Doe"))
    min(r.DueDate for r in Tasks.lookupRecords(Owner="Bob"))
    ```

    RecordSets also provide a convenient way to access the list of values for a particular field
    for all the records, as `record_set.Field`. For example, the examples above are equivalent to:
    ```
    sum(Students.lookupRecords(First_Name="John", Last_Name="Doe").Amount)
    min(Tasks.lookupRecords(Owner="Bob").DueDate)
    ```

    You can get the number of records in a RecordSet using `len`, e.g. `len($group)`.
    """

    # Slots are an optimization to avoid the need for a per-object __dict__.
    __slots__ = ('_row_ids', '_source_relation', '_group_by', '_sort_by', '_sort_key')

    # Per-table derived classes override this and set it to the appropriate Table object.
    _table = None

    # Methods should be named with a leading underscore to avoid interfering with access to
    # user-defined fields.
    def __init__(self, row_ids, relation=None, group_by=None, sort_by=None, sort_key=None):
        """
        group_by may be a dictionary mapping column names to values that are all the same for the
        given RecordSet. sort_by may be the column name used for sorting this record set. Both are
        set by lookupRecords, and used when using RecordSet to insert new records.

        sort_key, when given, is the key function the `find` methods use to bisect row_ids; it is
        called both as `sort_key(row_id)` and as `sort_key(row_id, values)`.
        """
        self._row_ids = row_ids
        self._source_relation = relation or self._table._identity_relation

        # If row_ids is itself a RecordList, default to its _group_by, _sort_by, _sort_key
        # properties.
        self._group_by = group_by or getattr(row_ids, '_group_by', None)
        self._sort_by = sort_by or getattr(row_ids, '_sort_by', None)
        self._sort_key = sort_key or getattr(row_ids, '_sort_key', None)

    def __len__(self):
        return len(self._row_ids)

    def __nonzero__(self):
        return bool(self._row_ids)

    __bool__ = __nonzero__

    def __eq__(self, other):
        return (isinstance(other, RecordSet) and
                (self._table, self._row_ids) == (other._table, other._row_ids))

    def __ne__(self, other):
        return not self.__eq__(other)

    def __iter__(self):
        for row_id in self._row_ids:
            yield self._table.Record(row_id, self._source_relation)

    def __contains__(self, item):
        """item may be a Record or its row_id."""
        if isinstance(item, int):
            return item in self._row_ids
        if isinstance(item, Record) and item._table == self._table:
            return int(item) in self._row_ids
        return False

    def get_one(self):
        # Pick the first record in the sorted order, or empty/sample record for empty RecordSet
        row_id = self._row_ids[0] if self._row_ids else 0
        return self._table.Record(row_id, self._source_relation)

    def __getitem__(self, index):
        """
        Allows subscripting a RecordSet as r[0] or r[-1], which returns a Record, and slicing it
        as r[1:3], which returns a RecordSet of the selected rows.
        """
        selected = self._row_ids[index]
        if not isinstance(index, slice):
            return self._table.Record(selected, self._source_relation)

        # A slice selects several row ids, so it must be wrapped in a RecordSet rather than
        # passed to Record as if it were one row id. Rows taken with a positive step keep their
        # relative order, so the sort metadata still applies; a negative step reverses the rows,
        # so the sort metadata is not passed on.
        in_order = index.step is None or index.step > 0
        return self._table.RecordSet(selected,
                                     relation=self._source_relation,
                                     group_by=self._group_by,
                                     sort_by=self._sort_by if in_order else None,
                                     sort_key=self._sort_key if in_order else None)

    def __getattr__(self, name):
        # Per-column properties live on per-table derived classes; anything that reaches here is
        # an unknown attribute, reported through the table.
        return self._table._attribute_error(name, self._source_relation)

    def __repr__(self):
        return "%s[%s]" % (self._table.table_id, self._row_ids)

    def _at(self, index):
        """
        Returns element of RecordSet at the given index when the index is valid and non-negative.
        Otherwise returns the empty/sample record.
        """
        row_id = self._row_ids[index] if (0 <= index < len(self._row_ids)) else 0
        return self._table.Record(row_id, self._source_relation)

    def _clone_with_relation(self, src_relation):
        """
        Returns a RecordSet over the same rows and with the same grouping/sorting metadata, whose
        source relation is the composition of src_relation with this set's own source relation.
        """
        return self._table.RecordSet(self._row_ids,
                                     relation=src_relation.compose(self._source_relation),
                                     group_by=self._group_by,
                                     sort_by=self._sort_by,
                                     sort_key=self._sort_key)

    def _get_encodable_row_ids(self):
        """
        Returns stored rowIds as a simple list or tuple type, even if actually stored as
        RecordList.
        """
        # pylint: disable=unidiomatic-typecheck
        if type(self._row_ids) in (list, tuple):
            return self._row_ids
        else:
            return list(self._row_ids)

    def _get_sort_key(self):
        """
        Returns the sort key function of this RecordSet. Raises ValueError when there is none,
        i.e. when this RecordSet can't be searched with the `find` methods.
        """
        if not self._sort_key:
            if self._sort_by:
                raise ValueError("Sorted by %s but no sort_key" % (self._sort_by,))
            raise ValueError("Can only use 'find' methods in a sorted reference list")
        return self._sort_key

    def _to_local_row_id(self, item):
        """
        Converts item, which may be a row id or a Record of this set's table, to a row id. Raises
        ValueError for anything else.
        """
        if isinstance(item, int):
            return item
        if isinstance(item, Record) and item._table == self._table:
            return int(item)
        raise ValueError("unexpected search item")    # Need better error

    @property
    def find(self):
        """
        A set of methods for finding values in sorted set of records. For example:
        ```
        Transactions.lookupRecords(..., order_by="Date").find.lt($Date)
        Table.lookupRecords(..., order_by=("Foo", "Bar")).find.le(foo, bar)
        ```

        If the `find` method is shadowed by a same-named user column, you may use `_find` instead.

        The methods available are:

        - `lt`: (less than) find nearest record with sort values < the given values
        - `le`: (less than or equal to) find nearest record with sort values <= the given values
        - `gt`: (greater than) find nearest record with sort values > the given values
        - `ge`: (greater than or equal to) find nearest record with sort values >= the given values
        - `eq`: (equal to) find nearest record with sort values == the given values

        Example from https://templates.getgrist.com/5pHLanQNThxk/Payroll. Each person has a history
        of pay rates, in the Rates table. To find a rate applicable on a certain date, here is how
        you can do it old-style:
        ```
        # Get all the rates for the Person and Role in this row.
        rates = Rates.lookupRecords(Person=$Person, Role=$Role)
        # Pick out only those rates whose Rate_Start is on or before this row's Date.
        past_rates = [r for r in rates if r.Rate_Start <= $Date]
        # Select the latest of past_rates, i.e. maximum by Rate_Start.
        rate = max(past_rates, key=lambda r: r.Rate_Start)
        # Return the Hourly_Rate from the relevant Rates record.
        return rate.Hourly_Rate
        ```

        With the new methods, it is much simpler:
        ```
        rate = Rates.lookupRecords(Person=$Person, Role=$Role, order_by="Rate_Start").find.le($Date)
        return rate.Hourly_Rate
        ```

        Note that this is also much faster when there are many rates for the same Person and Role.
        """
        return FindOps(self)

    @property
    def _find(self):
        # Alias of `find`, for tables where a user column named "find" shadows it.
        return FindOps(self)

    def _find_eq(self, *values):
        """
        Returns the first record whose sort values equal the given values, or the empty/sample
        record when there is no such record.
        """
        found = self._bisect_find(bisect_left, 0, _min_row_id, values)
        if found:
            # 'found' means that we found a row that's greater-than-or-equal-to the values we are
            # looking for. To check if the row is actually "equal", it remains to check if it is
            # strictly greater than the passed-in values.
            key = self._get_sort_key()
            if key(found._row_id, values) < key(found._row_id):
                return self._table.Record(0, self._source_relation)
        return found

    def _bisect_index(self, bisect_func, search_row_id, search_values=None):
        """
        Returns the index in the sorted row ids at which bisect_func (bisect_left or bisect_right)
        places the key built by the sort key from search_row_id and search_values.
        """
        key = self._get_sort_key()
        # Note that 'key' argument is only available from Python 3.10.
        return bisect_func(self._row_ids, key(search_row_id, search_values), key=key)

    def _bisect_find(self, bisect_func, shift, search_row_id, search_values=None):
        """
        Returns the record at the bisected index offset by shift, or the empty/sample record when
        that position falls outside of the RecordSet.
        """
        i = self._bisect_index(bisect_func, search_row_id, search_values=search_values)
        return self._at(i + shift)
# Sentinel values used in place of a row id when bisecting a sorted RecordSet by values alone:
# the largest finite float and its negation.
# NOTE(review): presumably the row id acts as the final tie-breaker in the sort key, so these
# sentinels sort before / after every real row with equal sort values -- confirm against the
# sort_key implementation.
_max_row_id = sys.float_info.max
_min_row_id = -_max_row_id
if six.PY3:
    class FindOps(object):
        """
        Search operations on a sorted RecordSet, as exposed by `RecordSet.find`. Every operation
        bisects the RecordSet and returns a record, or the empty/sample record when the position
        found falls outside of the set (except `rank`, which returns a number).
        """

        def __init__(self, record_set):
            self._rset = record_set

        # --- Searches relative to an existing row (a Record or a row id). ---

        def previous(self, row):
            """Returns the record just before the position bisect_left finds for row."""
            rset = self._rset
            return rset._bisect_find(bisect_left, -1, rset._to_local_row_id(row))

        def next(self, row):
            """Returns the record at the position bisect_right finds for row."""
            rset = self._rset
            return rset._bisect_find(bisect_right, 0, rset._to_local_row_id(row))

        def rank(self, row, order="asc"):
            """
            Returns the rank of row: its bisect_left index plus 1 for order "asc", or the size of
            the set minus that index for order "desc". Raises ValueError for any other order.
            """
            rset = self._rset
            position = rset._bisect_index(bisect_left, rset._to_local_row_id(row))
            if order == "desc":
                return len(rset) - position
            if order != "asc":
                raise ValueError("The 'order' parameter must be \"asc\" (default) or \"desc\"")
            return position + 1

        # --- Searches by sort values. The row-id sentinel is paired with the values to build the
        # search key; bisect side and shift select which neighbor is returned. ---

        def lt(self, *values):
            """Nearest record with sort values < the given values."""
            return self._rset._bisect_find(bisect_left, -1, _min_row_id, values)

        def le(self, *values):
            """Nearest record with sort values <= the given values."""
            return self._rset._bisect_find(bisect_right, -1, _max_row_id, values)

        def gt(self, *values):
            """Nearest record with sort values > the given values."""
            return self._rset._bisect_find(bisect_right, 0, _max_row_id, values)

        def ge(self, *values):
            """Nearest record with sort values >= the given values."""
            return self._rset._bisect_find(bisect_left, 0, _min_row_id, values)

        def eq(self, *values):
            """Nearest record with sort values == the given values."""
            return self._rset._find_eq(*values)

else:
    class FindOps(object):
        """Placeholder for Python 2, where the `find` methods are not supported."""

        def __init__(self, record_set):
            raise NotImplementedError("Update engine to Python3 to use lookupRecords().find")
def adjust_record(relation, value):
    """
    Helper to adjust a Record's source relation to be the composition with the given relation.

    This is used to wrap values like `foo.bar`: if `bar` is a Record (or a RecordSet), then its
    source relation should be the composition of the source relation of `foo` and the relation
    associated with `bar`. Any other kind of value is returned unchanged.
    """
    if not isinstance(value, (Record, RecordSet)):
        return value
    return value._clone_with_relation(relation)