(core) move data engine code to core

Summary: this moves sandbox/grist to core, and adds a requirements.txt file for reconstructing the content of sandbox/thirdparty. Test Plan: existing tests pass. Tested core functionality manually. Tested docker build manually. Reviewers: dsagal Reviewed By: dsagal Differential Revision: https://phab.getgrist.com/D2563
2026-03-02 04:09:24 +00:00 · 2020-07-27 14:57:36 -04:00
parent 2399baaca2
commit b82eec714a
97 changed files with 29551 additions and 2 deletions
--- a/sandbox/grist/csv_patch.py
+++ b/sandbox/grist/csv_patch.py
@@ -0,0 +1,84 @@
+import re
+import csv
+
+# Monkey-patch csv.Sniffer class, in which the quote/delimiter detection has silly bugs in the
+# regexp that it uses. It also seems poorly-implemented in other ways. We can probably do better
+# by not using csv.Sniffer at all.
+# The method below is a modified copy of the same-named method in the standard csv.Sniffer class.
+def _most_frequent(counts):
+  """Return the key of `counts` with the largest value; on a tie the key iterated last wins."""
+  best = None
+  for key in counts:
+    if best is None or counts[key] >= counts[best]:
+      best = key
+  return best
+
+def _guess_quote_and_delimiter(_self, data, delimiters):
+  """
+  Guess the quote character and delimiter of `data` from its quoted fields.
+
+  Looks for text enclosed between two identical quotes
+  (the probable quotechar) which are preceded and followed
+  by the same character (the probable delimiter).
+  For example:
+           ,'some text',
+  The quote with the most wins, same with the delimiter.
+  If there is no quotechar the delimiter can't be determined
+  this way.
+
+  `_self` is unused. `delimiters` restricts the candidate delimiter characters; None allows
+  any. Returns (quotechar, doublequote, delimiter, skipinitialspace), or ('', False, None, 0)
+  when `data` contains no quoted field.
+  """
+  regexp = re.compile(
+    r"""
+    (?:(?P<delim>[^\w\n"\'])|^|\n)  # delimiter or start-of-line
+    (?P<space>\ ?)           # optional initial space
+    (?P<quote>["\']).*?(?P=quote)   # quote-surrounded field
+    (?:(?P=delim)|$|\r?\n)      # delimiter or end-of-line
+    """, re.VERBOSE | re.DOTALL | re.MULTILINE)
+  matches = regexp.findall(data)
+  if not matches:
+    # (quotechar, doublequote, delimiter, skipinitialspace)
+    return ('', False, None, 0)
+
+  # findall() yields one tuple of groups per match; group number N sits at index N - 1.
+  quote_idx = regexp.groupindex['quote'] - 1
+  delim_idx = regexp.groupindex['delim'] - 1
+  space_idx = regexp.groupindex['space'] - 1
+  quotes = {}
+  delims = {}
+  spaces = 0
+  for m in matches:
+    quote_char = m[quote_idx]
+    if quote_char:
+      quotes[quote_char] = quotes.get(quote_char, 0) + 1
+    delim_char = m[delim_idx]
+    if delim_char and (delimiters is None or delim_char in delimiters):
+      delims[delim_char] = delims.get(delim_char, 0) + 1
+    if m[space_idx]:
+      spaces += 1
+
+  # Pick winners without reduce(), which is a builtin only on Python 2.
+  quotechar = _most_frequent(quotes)
+  if delims:
+    delim = _most_frequent(delims)
+    skipinitialspace = delims[delim] == spaces
+    if delim == '\n': # most likely a file with a single column
+      delim = ''
+  else:
+    # there is *no* delimiter, it's a single column of quoted data
+    delim = ''
+    skipinitialspace = 0
+
+  # if we see an extra quote between delimiters, we've got a
+  # double quoted format
+  dq_regexp = re.compile(
+               (r"((%(delim)s)|^)\W*%(quote)s[^%(delim)s\n]*%(quote)" +
+                r"s[^%(delim)s\n]*%(quote)s\W*((%(delim)s)|$)") % \
+               {'delim':re.escape(delim), 'quote':quotechar}, re.MULTILINE)
+  doublequote = bool(dq_regexp.search(data))
+
+  return (quotechar, doublequote, delim, skipinitialspace)
+
+csv.Sniffer._guess_quote_and_delimiter = _guess_quote_and_delimiter  # override the stdlib method