gristlabs_grist-core/sandbox/grist/csv_patch.py

import re
import csv
from functools import reduce

# Monkey-patch csv.Sniffer class, in which the quote/delimiter detection has silly bugs in the
# regexp that it uses. It also seems poorly-implemented in other ways. We can probably do better
# by not using csv.Sniffer at all.
# The method below is a modified copy of the same-named method in the standard csv.Sniffer class.
def _guess_quote_and_delimiter(_self, data, delimiters):
  """
  Looks for text enclosed between two identical quotes
  (the probable quotechar) which are preceded and followed
  by the same character (the probable delimiter).
  For example:
           ,'some text',
  The quote with the most wins, same with the delimiter.
  If there is no quotechar the delimiter can't be determined
  this way.
  """

  regexp = re.compile(
    r"""
    (?:(?P<delim>[^\w\n"\'])|^|\n)  # delimiter or start-of-line
    (?P<space>\ ?)           # optional initial space
    (?P<quote>["\']).*?(?P=quote)   # quote-surrounded field
    (?:(?P=delim)|$|\r?\n)      # delimiter or end-of-line
    """, re.VERBOSE | re.DOTALL | re.MULTILINE)
  matches = regexp.findall(data)

  if not matches:
    # (quotechar, doublequote, delimiter, skipinitialspace)
    return ('', False, None, 0)
  quotes = {}
  delims = {}
  spaces = 0
  for m in matches:
    n = regexp.groupindex['quote'] - 1
    key = m[n]
    if key:
      quotes[key] = quotes.get(key, 0) + 1
    try:
      n = regexp.groupindex['delim'] - 1
      key = m[n]
    except KeyError:
      continue
    if key and (delimiters is None or key in delimiters):
      delims[key] = delims.get(key, 0) + 1
    try:
      n = regexp.groupindex['space'] - 1
    except KeyError:
      continue
    if m[n]:
      spaces += 1

  quotechar = reduce(lambda a, b, _quotes = quotes:
             (_quotes[a] > _quotes[b]) and a or b, quotes.keys())

  if delims:
    delim = reduce(lambda a, b, _delims = delims:
             (_delims[a] > _delims[b]) and a or b, delims.keys())
    skipinitialspace = delims[delim] == spaces
    if delim == '\n': # most likely a file with a single column
      delim = ''
  else:
    # there is *no* delimiter, it's a single column of quoted data
    delim = ''
    skipinitialspace = 0

  # if we see an extra quote between delimiters, we've got a
  # double quoted format
  dq_regexp = re.compile(
               (r"((%(delim)s)|^)\W*%(quote)s[^%(delim)s\n]*%(quote)" +
                r"s[^%(delim)s\n]*%(quote)s\W*((%(delim)s)|$)") % \
               {'delim':re.escape(delim), 'quote':quotechar}, re.MULTILINE)


  if dq_regexp.search(data):
    doublequote = True
  else:
    doublequote = False

  return (quotechar, doublequote, delim, skipinitialspace)

# Install the patch: replace the method on the csv.Sniffer class itself with the module-level
# function `_guess_quote_and_delimiter`. This runs as a side effect of importing this module.
csv.Sniffer._guess_quote_and_delimiter = _guess_quote_and_delimiter
(core) move data engine code to core Summary: this moves sandbox/grist to core, and adds a requirements.txt file for reconstructing the content of sandbox/thirdparty. Test Plan: existing tests pass. Tested core functionality manually. Tested docker build manually. Reviewers: dsagal Reviewed By: dsagal Differential Revision: https://phab.getgrist.com/D2563 2020-07-27 18:57:36 +00:00			`import re`
			`import csv`
(core) Simple Python 3 compatibility changes Summary: Changes that move towards python 3 compatibility that are easy to review without much thought Test Plan: The tests Reviewers: dsagal Reviewed By: dsagal Differential Revision: https://phab.getgrist.com/D2873 2021-06-22 15:12:25 +00:00			`from functools import reduce`
(core) move data engine code to core Summary: this moves sandbox/grist to core, and adds a requirements.txt file for reconstructing the content of sandbox/thirdparty. Test Plan: existing tests pass. Tested core functionality manually. Tested docker build manually. Reviewers: dsagal Reviewed By: dsagal Differential Revision: https://phab.getgrist.com/D2563 2020-07-27 18:57:36 +00:00
			`# Monkey-patch csv.Sniffer class, in which the quote/delimiter detection has silly bugs in the`
			`# regexp that it uses. It also seems poorly-implemented in other ways. We can probably do better`
			`# by not using csv.Sniffer at all.`
			`# The method below is a modified copy of the same-named method in the standard csv.Sniffer class.`
			`def _guess_quote_and_delimiter(_self, data, delimiters):`
			`"""`
			`Looks for text enclosed between two identical quotes`
			`(the probable quotechar) which are preceded and followed`
			`by the same character (the probable delimiter).`
			`For example:`
			`,'some text',`
			`The quote with the most wins, same with the delimiter.`
			`If there is no quotechar the delimiter can't be determined`
			`this way.`
			`"""`

			`regexp = re.compile(`
			`r"""`
			`(?:(?P<delim>[^\w\n"\'])\|^\|\n) # delimiter or start-of-line`
			`(?P<space>\ ?) # optional initial space`
			`(?P<quote>["\']).*?(?P=quote) # quote-surrounded field`
			`(?:(?P=delim)\|$\|\r?\n) # delimiter or end-of-line`
			`""", re.VERBOSE \| re.DOTALL \| re.MULTILINE)`
			`matches = regexp.findall(data)`

			`if not matches:`
			`# (quotechar, doublequote, delimiter, skipinitialspace)`
			`return ('', False, None, 0)`
			`quotes = {}`
			`delims = {}`
			`spaces = 0`
			`for m in matches:`
			`n = regexp.groupindex['quote'] - 1`
			`key = m[n]`
			`if key:`
			`quotes[key] = quotes.get(key, 0) + 1`
			`try:`
			`n = regexp.groupindex['delim'] - 1`
			`key = m[n]`
			`except KeyError:`
			`continue`
			`if key and (delimiters is None or key in delimiters):`
			`delims[key] = delims.get(key, 0) + 1`
			`try:`
			`n = regexp.groupindex['space'] - 1`
			`except KeyError:`
			`continue`
			`if m[n]:`
			`spaces += 1`

			`quotechar = reduce(lambda a, b, _quotes = quotes:`
			`(_quotes[a] > _quotes[b]) and a or b, quotes.keys())`

			`if delims:`
			`delim = reduce(lambda a, b, _delims = delims:`
			`(_delims[a] > _delims[b]) and a or b, delims.keys())`
			`skipinitialspace = delims[delim] == spaces`
			`if delim == '\n': # most likely a file with a single column`
			`delim = ''`
			`else:`
			`# there is no delimiter, it's a single column of quoted data`
			`delim = ''`
			`skipinitialspace = 0`

			`# if we see an extra quote between delimiters, we've got a`
			`# double quoted format`
			`dq_regexp = re.compile(`
			`(r"((%(delim)s)\|^)\W%(quote)s[^%(delim)s\n]%(quote)" +`
			`r"s[^%(delim)s\n]%(quote)s\W((%(delim)s)\|$)") % \`
			`{'delim':re.escape(delim), 'quote':quotechar}, re.MULTILINE)`



			`if dq_regexp.search(data):`
			`doublequote = True`
			`else:`
			`doublequote = False`

			`return (quotechar, doublequote, delim, skipinitialspace)`

			`csv.Sniffer._guess_quote_and_delimiter = _guess_quote_and_delimiter`