mirror of
https://github.com/gristlabs/grist-core.git
synced 2026-03-02 04:09:24 +00:00
(core) move data engine code to core
Summary: this moves sandbox/grist to core, and adds a requirements.txt file for reconstructing the content of sandbox/thirdparty. Test Plan: existing tests pass. Tested core functionality manually. Tested docker build manually. Reviewers: dsagal Reviewed By: dsagal Differential Revision: https://phab.getgrist.com/D2563
This commit is contained in:
84
sandbox/grist/csv_patch.py
Normal file
84
sandbox/grist/csv_patch.py
Normal file
@@ -0,0 +1,84 @@
|
||||
import re
|
||||
import csv
|
||||
|
||||
# Monkey-patch csv.Sniffer class, in which the quote/delimiter detection has silly bugs in the
|
||||
# regexp that it uses. It also seems poorly-implemented in other ways. We can probably do better
|
||||
# by not using csv.Sniffer at all.
|
||||
# The method below is a modified copy of the same-named method in the standard csv.Sniffer class.
|
||||
def _most_frequent_key(counts):
  """
  Return the key of the non-empty dict `counts` that has the highest count.

  When several keys share the highest count, the one that comes last in the dict's
  iteration order wins.
  """
  best = None
  for key, count in counts.items():
    if best is None or count >= counts[best]:
      best = key
  return best


def _guess_quote_and_delimiter(_self, data, delimiters):
  """
  Guess the quote character and the delimiter used in a sample of CSV text.

  Looks for text enclosed between two identical quotes
  (the probable quotechar) which are preceded and followed
  by the same character (the probable delimiter).
  For example:
                   ,'some text',
  The quote with the most wins, same with the delimiter.
  If there is no quotechar the delimiter can't be determined
  this way.

  Args:
    _self: the object this is bound to as a method; not used.
    data: the sample text to examine.
    delimiters: None to accept any delimiter candidate, or a container (e.g. a string) of
      the only characters that may be reported as the delimiter.

  Returns:
    A tuple (quotechar, doublequote, delimiter, skipinitialspace). When no quoted field
    is found at all, the result is ('', False, None, 0). When quoted fields are found but
    no acceptable delimiter surrounds them, delimiter is '' and skipinitialspace is 0.
  """
  # A single pattern is used, so the 'delim', 'space' and 'quote' groups always exist.
  # The delimiter class excludes word characters, newlines and both quote characters,
  # so a captured delimiter is never '\n'.
  regexp = re.compile(
    r"""
    (?:(?P<delim>[^\w\n"\'])|^|\n)  # delimiter or start-of-line
    (?P<space>\ ?)                  # optional initial space
    (?P<quote>["\']).*?(?P=quote)   # quote-surrounded field
    (?:(?P=delim)|$|\r?\n)          # delimiter or end-of-line
    """, re.VERBOSE | re.DOTALL | re.MULTILINE)
  matches = regexp.findall(data)

  if not matches:
    # (quotechar, doublequote, delimiter, skipinitialspace)
    return ('', False, None, 0)

  # findall() returns one tuple per match, holding the groups in pattern order; groupindex
  # is 1-based, hence the -1.
  quote_idx = regexp.groupindex['quote'] - 1
  delim_idx = regexp.groupindex['delim'] - 1
  space_idx = regexp.groupindex['space'] - 1

  quotes = {}
  delims = {}
  spaces = 0
  for m in matches:
    quote = m[quote_idx]
    if quote:
      quotes[quote] = quotes.get(quote, 0) + 1
    # The delimiter group is empty when the field sat at the start of a line.
    delim = m[delim_idx]
    if delim and (delimiters is None or delim in delimiters):
      delims[delim] = delims.get(delim, 0) + 1
    if m[space_idx]:
      spaces += 1

  # Every match captured a quote character, so `quotes` is never empty here.
  quotechar = _most_frequent_key(quotes)

  if delims:
    delim = _most_frequent_key(delims)
    # Initial spaces are considered skippable only if the number of matches that had one
    # equals the number of times the winning delimiter was seen.
    skipinitialspace = delims[delim] == spaces
  else:
    # there is *no* delimiter, it's a single column of quoted data
    delim = ''
    skipinitialspace = 0

  # if we see an extra quote between delimiters, we've got a
  # double quoted format
  dq_regexp = re.compile(
    (r"((%(delim)s)|^)\W*%(quote)s[^%(delim)s\n]*%(quote)" +
     r"s[^%(delim)s\n]*%(quote)s\W*((%(delim)s)|$)") %
    {'delim': re.escape(delim), 'quote': quotechar}, re.MULTILINE)

  doublequote = bool(dq_regexp.search(data))

  return (quotechar, doublequote, delim, skipinitialspace)
|
||||
|
||||
# Apply the monkey-patch: replace the csv.Sniffer method with the modified copy defined in
# this module. This takes effect as a side effect of importing this module.
csv.Sniffer._guess_quote_and_delimiter = _guess_quote_and_delimiter
|
||||
Reference in New Issue
Block a user