gristlabs_grist-core/sandbox/grist/test_lookup_perf.py

import math
import time
import testutil
import test_engine

class TestLookupPerformance(test_engine.EngineTestCase):
  """
  Performance regression test for lookups: when many cells call the same lookupRecords() or
  lookupOne(), recalculating all rows should take time linear in the number of rows.
  """

  def test_non_quadratic(self):
    # This test measures performance which depends on other stuff running on the machine, which
    # makes it inherently flaky. But if it fails legitimately, it should fail every time. So we
    # run multiple times (3), and fail only if all of those times fail.
    for i in range(2):
      try:
        return self._do_test_non_quadratic()
      except Exception as e:    # pylint: disable=broad-except
        # Report what went wrong, so that flaky attempts can still be diagnosed from the output.
        print("FAIL #%d: %s: %s" % (i + 1, type(e).__name__, e))
    # Last attempt: any failure here propagates and fails the test.
    self._do_test_non_quadratic()

  @staticmethod
  def _fit_log_log(counts, durations):
    """
    Fits a least-squares line to log(duration) as a function of log(count).

    Both arguments are equal-length sequences of positive numbers. Returns the tuple
    (slope, r_squared) of the fitted line. Raises ZeroDivisionError if there are fewer than two
    distinct counts, or if all durations are equal; raises ValueError for non-positive values.
    """
    # Doing this manually so that it works in Python2 too.
    # Otherwise, we could just use statistics.linear_regression()
    log_counts = [math.log(c) for c in counts]
    log_times = [math.log(t) for t in durations]

    # Calculate slope and intercept using the least squares method.
    n = len(log_counts)
    sum_x = sum(log_counts)
    sum_y = sum(log_times)
    sum_xx = sum(x * x for x in log_counts)
    sum_xy = sum(x * y for x, y in zip(log_counts, log_times))
    slope = (n * sum_xy - sum_x * sum_y) / (n * sum_xx - sum_x * sum_x)
    intercept = (sum_y - slope * sum_x) / n

    # Calculate R-squared
    mean_y = sum_y / n
    ss_tot = sum((y - mean_y) ** 2 for y in log_times)
    ss_res = sum((y - (slope * x + intercept)) ** 2 for x, y in zip(log_counts, log_times))
    r_squared = 1 - (ss_res / ss_tot)
    return slope, r_squared

  def _do_test_non_quadratic(self):
    # If the same lookupRecords is called by many cells, it should reuse calculations, not lead to
    # quadratic complexity. (Actually making use of the result would often still be O(N) in each
    # cell, but here we check that just doing the lookup is O(1) amortized.)

    # Table1 has columns: Date and Status, each will have just two distinct values.
    # We add a bunch of formulas that should take constant time outside of the lookup.

    # The way we test for quadratic complexity is by timing "BulkAddRecord" action that causes all
    # rows to recalculate for a geometrically growing sequence of row counts. Then we
    # log-transform the data and do linear regression on it. It should produce data that fits
    # closely a line of slope 1.

    self.setUp()    # Repeat setup because this test case gets called multiple times.
    self.load_sample(testutil.parse_test_sample({
      "SCHEMA": [
        [1, "Table1", [
          [1, "Date", "Date", False, "", "", ""],
          [2, "Status", "Text", False, "", "", ""],
          [3, "lookup_1a", "Any", True, "len(Table1.all)", "", ""],
          [4, "lookup_2a", "Any", True, "len(Table1.lookupRecords(order_by='-Date'))", "", ""],
          [5, "lookup_3a", "Any", True,
            "len(Table1.lookupRecords(Status=$Status, order_by=('-Date', '-id')))", "", ""],
          [6, "lookup_1b", "Any", True, "Table1.lookupOne().id", "", ""],
          # Keep one legacy sort_by example (it shares implementation, so should work similarly)
          [7, "lookup_2b", "Any", True, "Table1.lookupOne(sort_by='-Date').id", "", ""],
          [8, "lookup_3b", "Any", True,
            "Table1.lookupOne(Status=$Status, order_by=('-Date', '-id')).id", "", ""],
        ]]
      ],
      "DATA": {}
    }))

    # Prefer a monotonic high-resolution clock for measuring durations; time.perf_counter is not
    # available in Python2, so fall back to time.time there.
    timer = getattr(time, "perf_counter", time.time)

    num_records = 0

    def add_records(count):
      # Adds `count` rows (repeating a fixed pattern of 4 rows), checks the formula values of the
      # first row, and returns the new total number of rows.
      assert count % 4 == 0, "Call add_records with multiples of 4 here"
      self.add_records("Table1", ["Date", "Status"], [
        [ "2024-01-01",  "Green" ],
        [ "2024-01-01",  "Green" ],
        [ "2024-02-01",  "Blue" ],
        [ "2000-01-01",  "Blue" ],
      ] * (count // 4))

      # Reads the enclosing num_records, i.e. the total before this call.
      N = num_records + count

      # Row 1 is "Green", as are half of all rows. The latest date first appears in row 3, and
      # the last "Green" row is the second in the final group of 4, i.e. N - 2.
      self.assertTableData(
        "Table1", cols="subset", rows="subset", data=[
          ["id", "lookup_1a", "lookup_2a", "lookup_3a", "lookup_1b", "lookup_2b", "lookup_3b"],
          [1,    N,           N,           N // 2,      1,           3,           N - 2],
        ])
      return N

    # Add records in a geometric sequence
    times = {}
    start_time = timer()
    last_time = start_time
    count_add = 20
    while last_time < start_time + 2:       # Stop once we've spent 2 seconds
      add_time = timer()
      num_records = add_records(count_add)
      last_time = timer()
      elapsed = last_time - add_time
      # A coarse clock may report no elapsed time for a small batch; such a sample can't be
      # log-transformed, so leave it out of the regression.
      if elapsed > 0:
        times[num_records] = elapsed
      count_add *= 2

    # A line can't be fitted through fewer than two points; fail with a clear message.
    self.assertGreaterEqual(len(times), 2,
        "Too few timing samples for regression: {}".format(times))

    count_array = sorted(times)
    times_array = [times[r] for r in count_array]

    # Perform linear regression on log-transformed data
    slope, r_squared = self._fit_log_log(count_array, times_array)

    # Check that the slope is close to 1. For log-transformed data, this means a linear
    # relationship (a quadratic term would make the slope 2).
    # In practice, we see slope even less than 1 (because there is a non-trivial constant term),
    # so we can assert things a bit lower than 1: 0.86 to 1.04.
    err_msg = "Time is non-linear: slope {} R^2 {}".format(slope, r_squared)
    self.assertAlmostEqual(slope, 0.95, delta=0.09, msg=err_msg)

    # Check that R^2 is close to 1, meaning that data is very close to that line (of slope ~1).
    self.assertAlmostEqual(r_squared, 1, delta=0.08, msg=err_msg)
(core) Implement PREVIOUS/NEXT/RANK and lookupRecords().find.* methods.

Summary:
- `lookupRecords()` now allows efficient search in sorted results, with the syntax `lookupRecords(..., order_by="-Date").find.le($Date)`. This will find the record with the nearest date that's <= `$Date`.
- The `find.` methods are `le`, `lt`, `ge`, `gt`, and `eq`. All have O(log N) performance.
- `PREVIOUS(rec, group_by=..., order_by=...)` finds the previous record to rec, according to `group_by` / `order_by`, in amortized O(log N) time. For example, `PREVIOUS(rec, group_by="Account", order_by="Date")`.
- `PREVIOUS(rec, order_by=None)` finds the previous record in the full table, sorted by the `manualSort` column, to match the order visible in the unsorted table.
- `NEXT(...)` is just like `PREVIOUS(...)` but finds the next record.
- `RANK(rec, group_by=..., order_by=..., order="asc")` returns the rank of the record within the group, starting with 1. Order can be `"asc"` (default) or `"desc"`.
- The `order_by` argument in `lookupRecords` and in the new functions now supports tuples, as well as the "-" prefix to reverse order, e.g. `("Category", "-Date")`.
- New functions are only available in Python3, for a minor reason (to support keyword-only arguments for `group_by` and `order_by`) and also as a nudge to Python2 users to update.
- Includes fixes for several situations related to lookups that used to cause quadratic complexity.

Test Plan:
- New performance check that sorted lookups don't add quadratic complexity.
- Tests added for lookup find. methods, and for PREVIOUS/NEXT/RANK.
- Tests added that renaming columns updates `order_by` and `group_by` arguments, and attributes on results (e.g. `PREVIOUS(...).ColId`) appropriately.
- Python3 tests can now produce verbose output when VERBOSE=1 and -v are given.
Reviewers: jarek, georgegevoian Reviewed By: jarek, georgegevoian Subscribers: paulfitz, jarek Differential Revision: https://phab.getgrist.com/D4265 2024-07-17 03:43:53 +00:00			`import math`
			`import time`
			`import testutil`
			`import test_engine`

			`class TestLookupPerformance(test_engine.EngineTestCase):`
			`def test_non_quadratic(self):`
			`# This test measures performance which depends on other stuff running on the machine, which`
			`# makes it inherently flaky. But if it fails legitimately, it should fail every time. So we`
			`# run multiple times (3), and fail only if all of those times fail.`
			`for i in range(2):`
			`try:`
			`return self._do_test_non_quadratic()`
			`except Exception as e:`
			`print("FAIL #%d" % (i + 1))`
			`self._do_test_non_quadratic()`

			`def _do_test_non_quadratic(self):`
			`# If the same lookupRecords is called by many cells, it should reuse calculations, not lead to`
			`# quadratic complexity. (Actually making use of the result would often still be O(N) in each`
			`# cell, but here we check that just doing the lookup is O(1) amortized.)`

			`# Table1 has columns: Date and Status, each will have just two distinct values.`
			`# We add a bunch of formulas that should take constant time outside of the lookup.`

			`# The way we test for quadratic complexity is by timing "BulkAddRecord" action that causes all`
			`# rows to recalculate for a geometrically growing sequence of row counts. Then we`
			`# log-transform the data and do linear regression on it. It should produce data that fits`
			`# closely a line of slope 1.`

			`self.setUp() # Repeat setup because this test case gets called multiple times.`
			`self.load_sample(testutil.parse_test_sample({`
			`"SCHEMA": [`
			`[1, "Table1", [`
			`[1, "Date", "Date", False, "", "", ""],`
			`[2, "Status", "Text", False, "", "", ""],`
			`[3, "lookup_1a", "Any", True, "len(Table1.all)", "", ""],`
			`[4, "lookup_2a", "Any", True, "len(Table1.lookupRecords(order_by='-Date'))", "", ""],`
			`[5, "lookup_3a", "Any", True,`
			`"len(Table1.lookupRecords(Status=$Status, order_by=('-Date', '-id')))", "", ""],`
			`[6, "lookup_1b", "Any", True, "Table1.lookupOne().id", "", ""],`
			`# Keep one legacy sort_by example (it shares implementation, so should work similarly)`
			`[7, "lookup_2b", "Any", True, "Table1.lookupOne(sort_by='-Date').id", "", ""],`
			`[8, "lookup_3b", "Any", True,`
			`"Table1.lookupOne(Status=$Status, order_by=('-Date', '-id')).id", "", ""],`
			`]]`
			`],`
			`"DATA": {}`
			`}))`

			`num_records = 0`

			`def add_records(count):`
			`assert count % 4 == 0, "Call add_records with multiples of 4 here"`
			`self.add_records("Table1", ["Date", "Status"], [`
			`[ "2024-01-01", "Green" ],`
			`[ "2024-01-01", "Green" ],`
			`[ "2024-02-01", "Blue" ],`
			`[ "2000-01-01", "Blue" ],`
			`] * (count // 4))`

			`N = num_records + count`
			`self.assertTableData(`
			`"Table1", cols="subset", rows="subset", data=[`
			`["id", "lookup_1a", "lookup_2a", "lookup_3a", "lookup_1b", "lookup_2b", "lookup_3b"],`
			`[1, N, N, N // 2, 1, 3, N - 2],`
			`])`
			`return N`

			`# Add records in a geometric sequence`
			`times = {}`
			`start_time = time.time()`
			`last_time = start_time`
			`count_add = 20`
			`while last_time < start_time + 2: # Stop once we've spent 2 seconds`
			`add_time = time.time()`
			`num_records = add_records(count_add)`
			`last_time = time.time()`
			`times[num_records] = last_time - add_time`
			`count_add *= 2`

			`count_array = sorted(times.keys())`
			`times_array = [times[r] for r in count_array]`

			`# Perform linear regression on log-transformed data`
			`log_count_array = [math.log(x) for x in count_array]`
			`log_times_array = [math.log(x) for x in times_array]`

			`# Calculate slope and intercept using the least squares method.`
			`# Doing this manually so that it works in Python2 too.`
			`# Otherwise, we could just use statistics.linear_regression()`
			`n = len(log_count_array)`
			`sum_x = sum(log_count_array)`
			`sum_y = sum(log_times_array)`
			`sum_xx = sum(x * x for x in log_count_array)`
			`sum_xy = sum(x * y for x, y in zip(log_count_array, log_times_array))`
			`slope = (n * sum_xy - sum_x * sum_y) / (n * sum_xx - sum_x * sum_x)`
			`intercept = (sum_y - slope * sum_x) / n`

			`# Calculate R-squared`
			`mean_y = sum_y / n`
			`ss_tot = sum((y - mean_y) ** 2 for y in log_times_array)`
			`ss_res = sum((y - (slope * x + intercept)) ** 2`
			`for x, y in zip(log_count_array, log_times_array))`
			`r_squared = 1 - (ss_res / ss_tot)`

			`# Check that the slope is close to 1. For log-transformed data, this means a linear`
			`# relationship (a quadratic term would make the slope 2).`
			`# In practice, we see slope even less 1 (because there is a non-trivial constant term), so we`
			`# can assert things a bit lower than 1: 0.86 to 1.04.`
			`err_msg = "Time is non-linear: slope {} R^2 {}".format(slope, r_squared)`
			`self.assertAlmostEqual(slope, 0.95, delta=0.09, msg=err_msg)`

			`# Check that R^2 is close to 1, meaning that data is very close to that line (of slope ~1).`
			`self.assertAlmostEqual(r_squared, 1, delta=0.08, msg=err_msg)`