1
0
mirror of https://github.com/gristlabs/grist-core.git synced 2024-10-27 20:44:07 +00:00
gristlabs_grist-core/sandbox/grist/csv_patch.py

86 lines
2.7 KiB
Python
Raw Normal View History

import re
import csv
from functools import reduce
# Monkey-patch csv.Sniffer class, in which the quote/delimiter detection has silly bugs in the
# regexp that it uses. It also seems poorly-implemented in other ways. We can probably do better
# by not using csv.Sniffer at all.
# The method below is a modified copy of the same-named method in the standard csv.Sniffer class.
def _guess_quote_and_delimiter(_self, data, delimiters):
"""
Looks for text enclosed between two identical quotes
(the probable quotechar) which are preceded and followed
by the same character (the probable delimiter).
For example:
,'some text',
The quote with the most wins, same with the delimiter.
If there is no quotechar the delimiter can't be determined
this way.
"""
regexp = re.compile(
r"""
(?:(?P<delim>[^\w\n"\'])|^|\n) # delimiter or start-of-line
(?P<space>\ ?) # optional initial space
(?P<quote>["\']).*?(?P=quote) # quote-surrounded field
(?:(?P=delim)|$|\r?\n) # delimiter or end-of-line
""", re.VERBOSE | re.DOTALL | re.MULTILINE)
matches = regexp.findall(data)
if not matches:
# (quotechar, doublequote, delimiter, skipinitialspace)
return ('', False, None, 0)
quotes = {}
delims = {}
spaces = 0
for m in matches:
n = regexp.groupindex['quote'] - 1
key = m[n]
if key:
quotes[key] = quotes.get(key, 0) + 1
try:
n = regexp.groupindex['delim'] - 1
key = m[n]
except KeyError:
continue
if key and (delimiters is None or key in delimiters):
delims[key] = delims.get(key, 0) + 1
try:
n = regexp.groupindex['space'] - 1
except KeyError:
continue
if m[n]:
spaces += 1
quotechar = reduce(lambda a, b, _quotes = quotes:
(_quotes[a] > _quotes[b]) and a or b, quotes.keys())
if delims:
delim = reduce(lambda a, b, _delims = delims:
(_delims[a] > _delims[b]) and a or b, delims.keys())
skipinitialspace = delims[delim] == spaces
if delim == '\n': # most likely a file with a single column
delim = ''
else:
# there is *no* delimiter, it's a single column of quoted data
delim = ''
skipinitialspace = 0
# if we see an extra quote between delimiters, we've got a
# double quoted format
dq_regexp = re.compile(
(r"((%(delim)s)|^)\W*%(quote)s[^%(delim)s\n]*%(quote)" +
r"s[^%(delim)s\n]*%(quote)s\W*((%(delim)s)|$)") % \
{'delim':re.escape(delim), 'quote':quotechar}, re.MULTILINE)
if dq_regexp.search(data):
doublequote = True
else:
doublequote = False
return (quotechar, doublequote, delim, skipinitialspace)
# Install the replacement on the class itself, so that every csv.Sniffer
# instance picks it up in place of the standard implementation.
csv.Sniffer._guess_quote_and_delimiter = _guess_quote_and_delimiter