import re
import csv
from functools import reduce


# Monkey-patch the csv.Sniffer class: its quote/delimiter detection has bugs in the regexps it
# uses, and it seems poorly implemented in other ways too. We could probably do better by not
# using csv.Sniffer at all.
# The function below is a modified copy of the same-named method in the standard csv.Sniffer
# class; it is installed onto csv.Sniffer at the bottom of this module.


def _most_frequent_key(counts):
    """
    Return the key of the non-empty dict `counts` that has the highest count.

    On a tie the key that comes later in iteration order wins, which matches the
    behavior of the reduce() expression this helper replaces.
    """
    return reduce(lambda a, b: a if counts[a] > counts[b] else b, counts)


def _guess_quote_and_delimiter(_self, data, delimiters):
    """
    Guess the quote character and delimiter used in `data`.

    Looks for text enclosed between two identical quotes (the probable quotechar) which are
    preceded and followed by the same character (the probable delimiter), or by a line
    boundary. For example:
                     ,'some text',
    The quote with the most wins, same with the delimiter. If there is no quotechar the
    delimiter can't be determined this way.

    Args:
      _self: the csv.Sniffer instance (unused; present because this is installed as a method).
      data: the sample text to examine.
      delimiters: a collection (typically a string) of acceptable delimiter characters, or
        None to accept any candidate.

    Returns:
      A tuple (quotechar, doublequote, delimiter, skipinitialspace). When no quoted field is
      found the result is ('', False, None, 0). When quoted fields are found but no
      acceptable delimiter, the delimiter is '' and skipinitialspace is 0.
    """
    # The group names are required: they are looked up through regexp.groupindex below and
    # referenced by the (?P=quote) / (?P=delim) backreferences.
    regexp = re.compile(
        r"""
        (?:(?P<delim>[^\w\n"\'])|^|\n)  # delimiter or start-of-line
        (?P<space>\ ?)                  # optional initial space
        (?P<quote>["\']).*?(?P=quote)   # quote-surrounded field
        (?:(?P=delim)|$|\r?\n)          # delimiter or end-of-line
        """, re.VERBOSE | re.DOTALL | re.MULTILINE)
    matches = regexp.findall(data)

    if not matches:
        # (quotechar, doublequote, delimiter, skipinitialspace)
        return ('', False, None, 0)

    # findall() returns tuples indexed from 0, while groupindex is 1-based.
    quote_idx = regexp.groupindex['quote'] - 1
    delim_idx = regexp.groupindex['delim'] - 1
    space_idx = regexp.groupindex['space'] - 1

    quotes = {}
    delims = {}
    spaces = 0
    for m in matches:
        key = m[quote_idx]
        if key:
            quotes[key] = quotes.get(key, 0) + 1
        # The delimiter group is empty when the field was bounded by line start/end instead.
        key = m[delim_idx]
        if key and (delimiters is None or key in delimiters):
            delims[key] = delims.get(key, 0) + 1
        if m[space_idx]:
            spaces += 1

    quotechar = _most_frequent_key(quotes)

    if delims:
        delim = _most_frequent_key(delims)
        # Only skip initial spaces if every occurrence of the winning delimiter had one.
        skipinitialspace = delims[delim] == spaces
        if delim == '\n':  # most likely a file with a single column
            delim = ''
    else:
        # there is *no* delimiter, it's a single column of quoted data
        delim = ''
        skipinitialspace = 0

    # If we see an extra quote between delimiters, we've got a double quoted format.
    dq_regexp = re.compile(
        (r"((%(delim)s)|^)\W*%(quote)s[^%(delim)s\n]*%(quote)"
         r"s[^%(delim)s\n]*%(quote)s\W*((%(delim)s)|$)")
        % {'delim': re.escape(delim), 'quote': quotechar},
        re.MULTILINE)

    doublequote = bool(dq_regexp.search(data))

    return (quotechar, doublequote, delim, skipinitialspace)


csv.Sniffer._guess_quote_and_delimiter = _guess_quote_and_delimiter