gristlabs_grist-core/sandbox/grist/imports/dateguess.py

"""This module guesses possible formats of dates which can be parsed using datetime.strptime
based on samples.

dateguesser.guess(sample)
dateguesser.guess takes a sample date string and returns a set of
datetime.strftime/strptime-compliant date format strings that will correctly parse.

dateguesser.guess_bulk(list_of_samples, error_rate=0)
dateguesser.guess_bulk takes a list of sample date strings and acceptable error rate
and returns a list of datetime.strftime/strptime-compliant date format strings
sorted by error rate that will correctly parse.

Algorithm:

  1. Tokenize input string into chunks based on character type: digits, alphas, the rest.
  2. Analyze each token independently in terms what format codes could represent
  3. For given list of tokens generate all permutations of format codes
  4. During generating permutations check for validness of generated format and skip if invalid.
  5. Use rules listed below to decide if format is invalid:

Invalid format checks:

  Rule #1: Year MUST be in the date. Year is the minimum possible parsable date.
  Rule #2. No holes (missing parts) in the format parts.
  Rule #3. Time parts are neighbors to each other. No interleaving time with the date.
  Rule #4. It's highly impossible that minutes coming before hour, millis coming before seconds etc
  Rule #5. Pattern can't have some part of date/time defined more than once.
  Rule #6: Separators between elements of the time group should be the same.
  Rule #7: If am/pm is in date we assume that 12-hour dates are allowed only. Otherwise it's 24-hour
  Rule #8: Year can't be between other date elements

Note:
  dateguess doesn't support defaulting to current year because parsing should be deterministic,
  it's better to to fail guessing the format then to guess it incorrectly.

Examples:
  >>> guess('2014/05/05 14:00:00 UTC')
  set(['%Y/%d/%m %H:%M:%S %Z', '%Y/%m/%d %H:%M:%S %Z'])
  >>> guess('12/12/12')
  set(['%y/%m/%d', '%d/%m/%y', '%m/%d/%y', '%y/%d/%m'])
  >>> guess_bulk(['12-11-2014', '12-25-2014'])
  ['%m-%d-%Y']
  >>> guess_bulk(['12-11-2014', '25-25-2014'])
  []
  >>> guess_bulk(['12-11-2013', '13-8-2013', '05-25-2013', '12-25-2013'], error_rate=0.5)
  ['%m-%d-%Y']
"""


import calendar
import itertools
import logging
import re
from collections import defaultdict

from backports.functools_lru_cache import lru_cache
import moment


MONTH_NAME = calendar.month_name
MONTH_ABBR = calendar.month_abbr
TZ_VALID_NAMES = {z[0] for z in moment.get_tz_data().items()}
AM_PM = {'am', 'pm'}
DAYS_OF_WEEK_NAME = calendar.day_name
DAYS_OF_WEEK_ABBR = calendar.day_abbr

DATE_ELEMENTS = [
  # Name   Pattern  Predicate               Group (mutual exclusive)  Consumes N prev elements
  ("Year", "%Y", lambda x, p, v: x.isdigit() and len(x) == 4, "Y", 0),
  ("Year short", "%y", lambda x, p, v: x.isdigit() and len(x) == 2, "Y", 0),
  ("Month", "%m", lambda x, p, v: x.isdigit() and len(x) <= 2 and 0 < int(x) <= 12, "m", 0),
  ("Month name full", "%B", lambda x, p, v: x.isalpha() and x.capitalize() in MONTH_NAME, "m", 0),
  ("Month name abbr", "%b", lambda x, p, v: x.isalpha() and x.capitalize() in MONTH_ABBR, "m", 0),
  ("Day", "%d", lambda x, p, v: x.isdigit() and len(x) <= 2 and 0 < int(x) <= 31, "d", 0),
  ("Day of week", "%A", lambda x, p, v: x.isalpha()
                                        and x.capitalize() in DAYS_OF_WEEK_NAME, "a", 0),
  ("Day of week abbr", "%a", lambda x, p, v: x.isalpha()
                                             and x.capitalize() in DAYS_OF_WEEK_ABBR, "a", 0),

  ("Compound HHMMSS", "%H%M%S", lambda x, p, v: x.isdigit() and len(x) == 6
                                                and 0 <= int(x[0:2]) < 24
                                                and 0 <= int(x[2:4]) < 60
                                                and 0 <= int(x[4:6]) < 60, "HMS", 0),

  ("Hour", "%H", lambda x, p, v: x.isdigit() and len(x) <= 2 and 0 <= int(x) <= 23, "H", 0),
  ("Hour in 12hr mode", "%I", lambda x, p, v: x.isdigit() and len(x) <= 2
                                              and 0 <= int(x) <= 11, "H", 0),
  ("AM/PM", "%p", lambda x, p, v: x.isalpha() and len(x) == 2 and x.lower() in AM_PM, "p", 0),
  ("Minutes", "%M", lambda x, p, v: x.isdigit() and len(x) <= 2 and 0 <= int(x) <= 59, "M", 0),
  ("Seconds", "%S", lambda x, p, v: x.isdigit() and len(x) <= 2 and 0 <= int(x) <= 59, "S", 0),
  ("Fraction of second", "%f", lambda x, p, v: x.isdigit() and p is not None
                                               and p.val == '.', "f", 0),
  ("Timezone name", "%Z", lambda x, p, v: x.isalpha() and len(x) > 2
                                          and x in TZ_VALID_NAMES, "Z", 0),
  ("Timezone +HHMM", "%z", lambda x, p, v: x.isdigit() and len(x) == 4 and 0 <= int(x[0:2]) < 15
                                           and 0 <= int(x[2:4]) < 60 and p is not None
                                           and p.val == '+', "Z", 1),
  ("Timezone -HHMM", "%z", lambda x, p, v: x.isdigit() and len(x) == 4 and 0 <= int(x[0:2]) < 15
                                           and 0 <= int(x[2:4]) < 60 and p is not None
                                           and p.val == '-', "Z", 1),
]


class Token(object):
  """Represents a part of a date string that's being parsed.
  Note that __hash__ and __eq__ are overridden in order
  to compare only meaningful parts of an object.
  """
  def __init__(self, val, length):
    self.val = val
    self.length = length
    self.compatible_types = ()

  def __hash__(self):
    h = hash(self.length) + hash(self.compatible_types)
    if not self.compatible_types:
      h += hash(self.val)
    return hash(h)

  def __eq__(self, other):
    """
    Two tokens are equal when these both are true:
    a) length and compatible types are equal
    b) if it is separator (no compatible types), separator values must be equal
    """
    if self.length != other.length or self.compatible_types != other.compatible_types:
      return False
    if not other.compatible_types and self.val != other.val:
      return False
    return True


def _check_rule_1(pattern, types_used):
  """Rule #1: Year MUST be in the date. Year is the minimum possible parsable date.

  Examples:
    >>> _check_rule_1('%Y/%m/%d', 'Ymd')
    True
    >>> _check_rule_1('%m/%d', 'md')
    False
  """
  if 'Y' not in types_used:
    logging.debug("Rule #1 is violated for pattern %s. Types used: %s", pattern, types_used)
    return False
  return True


def _check_rule_2(pattern, types_used):
  """Rule #2: No holes (missing parts) in the format parts.

  Examples:
    >>> _check_rule_2('%Y:%H', 'YH')
    False
    >>> _check_rule_2('%Y/%m/%d %H', 'YmdH')
    True
  """
  priorities = 'YmdHMSf'
  seen_parts = [p in types_used for p in priorities]
  if sorted(seen_parts, reverse=True) != seen_parts:
    logging.debug("Rule #2 is violated for pattern %s. Types used: %s", pattern, types_used)
    return False
  return True


def _check_rule_3(pattern, types_used):
  """Rule #3: Time parts are neighbors to time only. No interleaving time with the date.

  Examples:
    >>> _check_rule_3('%m/%d %H:%M %Y', 'mdHMY')
    True
    >>> _check_rule_3('%m/%d %H:%Y:%M', 'mdHYM')
    False
  """
  time_parts = 'HMSf'
  time_parts_highlighted = [t in time_parts for t in types_used]
  time_parts_deduplicated = [a[0] for a in itertools.groupby(time_parts_highlighted)]
  if len(list(filter(lambda x: x, time_parts_deduplicated))) > 1:
    logging.debug("Rule #3 is violated for pattern %s. Types used: %s", pattern, types_used)
    return False
  return True


def _check_rule_4(pattern, types_used):
  """Rule #4: It's highly impossible that minutes coming before hours,
  millis coming before seconds etc.

  Examples:
    >>> _check_rule_4('%H:%M', 'HM')
    True
    >>> _check_rule_4('%S:%M', 'SM')
    False
  """
  time_parts_priority = 'HMSf'
  time_parts_indexes = list(filter(lambda x: x >= 0,
                                              [time_parts_priority.find(t) for t in types_used]))
  if sorted(time_parts_indexes) != time_parts_indexes:
    logging.debug("Rule #4 is violated for pattern %s. Types used: %s", pattern, types_used)
    return False
  return True


def _check_rule_5(pattern, types_used):
  """Rule #5: Pattern can't have some part of date/time defined more than once.

  Examples:
    >>> _check_rule_5('%Y/%Y', 'YY')
    False
    >>> _check_rule_5('%m/%b', 'mm')
    False
    >>> _check_rule_5('%Y/%m', 'Ym')
    True
  """
  if len(types_used) != len(set(types_used)):
    logging.debug("Rule #5 is violated for pattern %s. Types used: %s", pattern, types_used)
    return False
  return True


def _check_rule_6(tokens_chosen, pattern, types_used):
  """Rule #6: Separators between elements of the time group should be the same.

  Examples:
    _check_rule_5(tokens_chosen_1, '%Y-%m-%dT%H:%M:%S', 'YmdHMS') => True
    _check_rule_5(tokens_chosen_2, '%Y-%m-%dT%H %M %S', 'YmdHMS') => True
    _check_rule_5(tokens_chosen_3, '%Y-%m-%dT%H-%M:%S', 'YmdHMS') => False (different separators
                                                                  ('-' and ':') in time group)
  """
  time_parts = 'HMS'
  num_of_time_parts_used = len(list(filter(lambda x: x in time_parts, types_used)))
  time_parts_seen = 0
  separators_seen = []
  previous_was_a_separator = False

  for token in tokens_chosen:
    if token[1] is not None and token[1][3] in time_parts:
      # This rule doesn't work for separator-less time group so when we found the type
      # and it's three letters then it's (see type "Compound HHMMSS") then stop iterating
      if len(token[1][3]) == 3:
        break
      # If not a first time then
      if time_parts_seen > 0 and not previous_was_a_separator:
        separators_seen.append(None)
      time_parts_seen += 1
      if time_parts_seen == num_of_time_parts_used:
        break
      previous_was_a_separator = False
    else:
      if time_parts_seen > 0:
        separators_seen.append(token[0].val)
      previous_was_a_separator = True

  if len(set(separators_seen)) > 1:
    logging.debug("Rule #6 is violated for pattern %s. Seen separators: %s",
                  pattern, separators_seen)
    return False
  return True


def _check_rule_7a(pattern):
  """Rule #7a: If am/pm is in date we assume that 12-hour dates are allowed only.
  Otherwise it's 24-hour.

  Examples:
    >>> _check_rule_7a('%Y/%m/%d %H:%M %p')
    False
    >>> _check_rule_7a('%Y/%m/%d %I:%M %p')
    True
  """
  if '%p' in pattern and '%H' in pattern:
    logging.debug("Rule #7a is violated for pattern %s", pattern)
    return False
  return True


def _check_rule_7b(pattern):
  """Rule #7b: If am/pm is in date we assume that 12-hour dates are allowed only.
  Otherwise it's 24-hour.

  Examples:
    >>> _check_rule_7b('%Y/%m/%d %I:%M')
    False
    >>> _check_rule_7b('%Y/%m/%d %I:%M %p')
    True
  """
  if '%I' in pattern and '%p' not in pattern:
    logging.debug("Rule #7b is violated for pattern %s", pattern)
    return False
  return True


def _check_rule_8(pattern, types_used):
  """Rule #9: Year can't be between other date elements

  Examples:
    >>> _check_rule_8('%m/%Y/%d %I:%M', 'mYdIM')
    False
  """
  if 'mYd' in types_used or 'dYm' in types_used:
    logging.debug("Rule #8 is violated for pattern %s", pattern)
    return False
  return True


def _tokenize_by_character_class(s):
  """Return a list of strings by splitting s (tokenizing) by character class.

  Example:
    >>> t = _tokenize_by_character_class('Thu, May 14th, 2014 1:15 pm +0000')
    >>> [i.val for i in t]
    ['Thu', ',', ' ', 'May', ' ', '14', 'th', ',', ' ', '2014', ' ', '1', ':', '15', ' ', 'pm', ' ', '+', '0000']

    >>> t = _tokenize_by_character_class('5/14/2014')
    >>> [i.val for i in t]
    ['5', '/', '14', '/', '2014']
  """
  res = re.split(r'(\d+)|(\W)|(_)', s)
  return [Token(i, len(i)) for i in res if i]


def _sliding_triplets(tokens):
  for idx, t in enumerate(tokens):
    yield (t, tokens[idx-1] if idx > 0 else None, tokens[idx+1] if idx < len(tokens)-1 else None)


def _analyze_tokens(tokens):
  """Analize each token and find out compatible types for it."""
  for token, prev, nxt in _sliding_triplets(tokens):
    token.compatible_types = tuple([t for t in DATE_ELEMENTS if t[2](token.val, prev, nxt)])


@lru_cache()
def _generate_all_permutations(tokens):
  """Generate all permutations of format codes for given list of tokens.

  Brute-forcing of all possible permutations and rules checking eats most of the time or date
  parsing. But since the input is expected to be highly uniform then we can expect that
  memoization of this step will be very efficient.

  Token contains values for date parts but due to overridden eq and hash methods,
  we treat two tokens having the same length and same possible formats as equal
  tokens and separators should be the same
  """
  all_patterns = set()
  _generate_all_permutations_recursive(tokens, 0, [], "", all_patterns, "")

  return all_patterns


def _check_is_pattern_valid_quick_fail_rules(pattern, types_used):
  """Apply rules which are applicable for partially constructed patterns.

  Example: duplicates of a date part in a pattern.
  """
  return _check_rule_5(pattern, types_used) \
      and _check_rule_4(pattern, types_used) \
      and _check_rule_7a(pattern)


def _check_is_pattern_valid_full_pattern_rules(tokens_chosen, pattern, types_used):
  """Apply rules which are applicable for full pattern only.

  Example: existence of Year part in the pattern.
  """
  return _check_rule_1(pattern, types_used) \
      and _check_rule_2(pattern, types_used) \
      and _check_rule_3(pattern, types_used) \
      and _check_rule_6(tokens_chosen, pattern, types_used) \
      and _check_rule_7b(pattern) \
      and _check_rule_8(pattern, types_used)


def _generate_all_permutations_recursive(tokens, token_idx, tokens_chosen, pattern, found_patterns,
                                         types_used):
  """Generate all format elements permutations recursively.

  Args:
    tokens (list[Token]): List of tokens.
    token_idx (int): Index of token processing this cycle.
    tokens_chosen (list[(Token, Token.compatible_type)]): List of tuples
      containing token and compatible type
    pattern (str): String containing format for parsing
    found_patterns (set): Set of guessed patterns
    types_used (str): String of types used to build pattern.

  Returns:
    list: List of permutations
  """
  if not _check_is_pattern_valid_quick_fail_rules(pattern, types_used):
    return

  if token_idx < len(tokens):
    t = tokens[token_idx]
    if t.compatible_types:
      for ct in t.compatible_types:
        _generate_all_permutations_recursive(tokens, token_idx+1, tokens_chosen[:] + [(t, ct)],
                                             (pattern if ct[4] == 0 else pattern[:-ct[4]]) + ct[1],
                                             found_patterns, types_used + ct[3])
    else:
      # if no compatible types it should be separator, add it to the pattern
      _generate_all_permutations_recursive(tokens, token_idx+1,
                                           tokens_chosen[:] + [(t, None)], pattern + t.val,
                                           found_patterns, types_used)
  else:
    if _check_is_pattern_valid_full_pattern_rules(tokens_chosen, pattern, types_used):
      found_patterns.add(pattern)


def guess(date):
  """Guesses datetime.strftime/strptime-compliant date formats for date string.

  Args:
    date (str): Date string.

  Returns:
    set: Set of datetime.strftime/strptime-compliant date format strings

  Examples:
    >>> guess('2014/05/05 14:00:00 UTC')
    set(['%Y/%d/%m %H:%M:%S %Z', '%Y/%m/%d %H:%M:%S %Z'])
    >>> guess('12/12/12')
    set(['%y/%m/%d', '%d/%m/%y', '%m/%d/%y', '%y/%d/%m'])
  """
  tokens = _tokenize_by_character_class(date)
  _analyze_tokens(tokens)
  return _generate_all_permutations(tuple(tokens))


def guess_bulk(dates, error_rate=0):
  """Guesses datetime.strftime/strptime-compliant date formats for list of the samples.

  Args:
    dates (list): List of samples date strings.
    error_rate (float): Acceptable error rate (default 0.0)

  Returns:
    list: List of datetime.strftime/strptime-compliant date format strings sorted by error rate

  Examples:
    >>> guess_bulk(['12-11-2014', '12-25-2014'])
    ['%m-%d-%Y']
    >>> guess_bulk(['12-11-2014', '25-25-2014'])
    []
    >>> guess_bulk(['12-11-2013', '13-8-2013', '05-25-2013', '12-25-2013'], error_rate=0.5)
    ['%m-%d-%Y']
  """
  if error_rate == 0.0:
    patterns = None
    for date in dates:
      guesses_patterns = guess(date)
      if patterns is None:
        patterns = guesses_patterns
      else:
        patterns = patterns.intersection(guesses_patterns)
      if not patterns:
        break   # No need to iterate more if zero patterns found
    return list(patterns)
  else:
    found_dates = 0
    pattern_counters = defaultdict(lambda: 0)
    num_dates = len(dates)
    min_num_dates_to_be_found = num_dates - num_dates * error_rate

    for idx, date in enumerate(dates):
      patterns = guess(date)
      if patterns:
        found_dates += 1
      for pattern in patterns:
        pattern_counters[pattern] = pattern_counters[pattern] + 1

      # Early return if number of strings that can't be date is already over error rate
      cells_left = num_dates - idx - 1
      cannot_be_found = float(found_dates + cells_left) < min_num_dates_to_be_found
      if cannot_be_found:
        return []

    patterns = [(v, k) for k, v in pattern_counters.items()
                if v > min_num_dates_to_be_found]
    patterns.sort(reverse=True)
    return [k for (v, k) in patterns]
(core) support python3 in grist-core, and running engine via docker and/or gvisor Summary: * Moves essential plugins to grist-core, so that basic imports (e.g. csv) work. * Adds support for a `GRIST_SANDBOX_FLAVOR` flag that can systematically override how the data engine is run. - `GRIST_SANDBOX_FLAVOR=pynbox` is "classic" nacl-based sandbox. - `GRIST_SANDBOX_FLAVOR=docker` runs engines in individual docker containers. It requires an image specified in `sandbox/docker` (alternative images can be named with `GRIST_SANDBOX` flag - need to contain python and engine requirements). It is a simple reference implementation for sandboxing. - `GRIST_SANDBOX_FLAVOR=unsandboxed` runs whatever local version of python is specified by a `GRIST_SANDBOX` flag directly, with no sandboxing. Engine requirements must be installed, so an absolute path to a python executable in a virtualenv is easiest to manage. - `GRIST_SANDBOX_FLAVOR=gvisor` runs the data engine via gvisor's runsc. Experimental, with implementation not included in grist-core. Since gvisor runs on Linux only, this flavor supports wrapping the sandboxes in a single shared docker container. * Tweaks some recent express query parameter code to work in grist-core, which has a slightly different version of express (smoke test doesn't catch this since in Jenkins core is built within a workspace that has node_modules, and wires get crossed - in a dev environment the problem on master can be seen by doing `buildtools/build_core.sh /tmp/any_path_outside_grist`). The new sandbox options do not have tests yet, nor does this they change the behavior of grist servers today. They are there to clean up and consolidate a collection of patches I've been using that were getting cumbersome, and make it easier to run experiments. I haven't looked closely at imports beyond core. Test Plan: tested manually against regular grist and grist-core, including imports Reviewers: alexmojaki, dsagal Reviewed By: alexmojaki Differential Revision: https://phab.getgrist.com/D2942 2021-07-27 23:43:21 +00:00			`"""This module guesses possible formats of dates which can be parsed using datetime.strptime`
			`based on samples.`

			`dateguesser.guess(sample)`
			`dateguesser.guess takes a sample date string and returns a set of`
			`datetime.strftime/strptime-compliant date format strings that will correctly parse.`

			`dateguesser.guess_bulk(list_of_samples, error_rate=0)`
			`dateguesser.guess_bulk takes a list of sample date strings and acceptable error rate`
			`and returns a list of datetime.strftime/strptime-compliant date format strings`
			`sorted by error rate that will correctly parse.`

			`Algorithm:`

			`1. Tokenize input string into chunks based on character type: digits, alphas, the rest.`
			`2. Analyze each token independently in terms what format codes could represent`
			`3. For given list of tokens generate all permutations of format codes`
			`4. During generating permutations check for validness of generated format and skip if invalid.`
			`5. Use rules listed below to decide if format is invalid:`

			`Invalid format checks:`

			`Rule #1: Year MUST be in the date. Year is the minimum possible parsable date.`
			`Rule #2. No holes (missing parts) in the format parts.`
			`Rule #3. Time parts are neighbors to each other. No interleaving time with the date.`
			`Rule #4. It's highly impossible that minutes coming before hour, millis coming before seconds etc`
			`Rule #5. Pattern can't have some part of date/time defined more than once.`
			`Rule #6: Separators between elements of the time group should be the same.`
			`Rule #7: If am/pm is in date we assume that 12-hour dates are allowed only. Otherwise it's 24-hour`
			`Rule #8: Year can't be between other date elements`

			`Note:`
			`dateguess doesn't support defaulting to current year because parsing should be deterministic,`
			`it's better to to fail guessing the format then to guess it incorrectly.`

			`Examples:`
			`>>> guess('2014/05/05 14:00:00 UTC')`
			`set(['%Y/%d/%m %H:%M:%S %Z', '%Y/%m/%d %H:%M:%S %Z'])`
			`>>> guess('12/12/12')`
			`set(['%y/%m/%d', '%d/%m/%y', '%m/%d/%y', '%y/%d/%m'])`
			`>>> guess_bulk(['12-11-2014', '12-25-2014'])`
			`['%m-%d-%Y']`
			`>>> guess_bulk(['12-11-2014', '25-25-2014'])`
			`[]`
			`>>> guess_bulk(['12-11-2013', '13-8-2013', '05-25-2013', '12-25-2013'], error_rate=0.5)`
			`['%m-%d-%Y']`
			`"""`


			`import calendar`
			`import itertools`
			`import logging`
			`import re`
			`from collections import defaultdict`

			`from backports.functools_lru_cache import lru_cache`
			`import moment`


			`MONTH_NAME = calendar.month_name`
			`MONTH_ABBR = calendar.month_abbr`
			`TZ_VALID_NAMES = {z[0] for z in moment.get_tz_data().items()}`
			`AM_PM = {'am', 'pm'}`
			`DAYS_OF_WEEK_NAME = calendar.day_name`
			`DAYS_OF_WEEK_ABBR = calendar.day_abbr`

			`DATE_ELEMENTS = [`
			`# Name Pattern Predicate Group (mutual exclusive) Consumes N prev elements`
			`("Year", "%Y", lambda x, p, v: x.isdigit() and len(x) == 4, "Y", 0),`
			`("Year short", "%y", lambda x, p, v: x.isdigit() and len(x) == 2, "Y", 0),`
			`("Month", "%m", lambda x, p, v: x.isdigit() and len(x) <= 2 and 0 < int(x) <= 12, "m", 0),`
			`("Month name full", "%B", lambda x, p, v: x.isalpha() and x.capitalize() in MONTH_NAME, "m", 0),`
			`("Month name abbr", "%b", lambda x, p, v: x.isalpha() and x.capitalize() in MONTH_ABBR, "m", 0),`
			`("Day", "%d", lambda x, p, v: x.isdigit() and len(x) <= 2 and 0 < int(x) <= 31, "d", 0),`
			`("Day of week", "%A", lambda x, p, v: x.isalpha()`
			`and x.capitalize() in DAYS_OF_WEEK_NAME, "a", 0),`
			`("Day of week abbr", "%a", lambda x, p, v: x.isalpha()`
			`and x.capitalize() in DAYS_OF_WEEK_ABBR, "a", 0),`

			`("Compound HHMMSS", "%H%M%S", lambda x, p, v: x.isdigit() and len(x) == 6`
			`and 0 <= int(x[0:2]) < 24`
			`and 0 <= int(x[2:4]) < 60`
			`and 0 <= int(x[4:6]) < 60, "HMS", 0),`

			`("Hour", "%H", lambda x, p, v: x.isdigit() and len(x) <= 2 and 0 <= int(x) <= 23, "H", 0),`
			`("Hour in 12hr mode", "%I", lambda x, p, v: x.isdigit() and len(x) <= 2`
			`and 0 <= int(x) <= 11, "H", 0),`
			`("AM/PM", "%p", lambda x, p, v: x.isalpha() and len(x) == 2 and x.lower() in AM_PM, "p", 0),`
			`("Minutes", "%M", lambda x, p, v: x.isdigit() and len(x) <= 2 and 0 <= int(x) <= 59, "M", 0),`
			`("Seconds", "%S", lambda x, p, v: x.isdigit() and len(x) <= 2 and 0 <= int(x) <= 59, "S", 0),`
			`("Fraction of second", "%f", lambda x, p, v: x.isdigit() and p is not None`
			`and p.val == '.', "f", 0),`
			`("Timezone name", "%Z", lambda x, p, v: x.isalpha() and len(x) > 2`
			`and x in TZ_VALID_NAMES, "Z", 0),`
			`("Timezone +HHMM", "%z", lambda x, p, v: x.isdigit() and len(x) == 4 and 0 <= int(x[0:2]) < 15`
			`and 0 <= int(x[2:4]) < 60 and p is not None`
			`and p.val == '+', "Z", 1),`
			`("Timezone -HHMM", "%z", lambda x, p, v: x.isdigit() and len(x) == 4 and 0 <= int(x[0:2]) < 15`
			`and 0 <= int(x[2:4]) < 60 and p is not None`
			`and p.val == '-', "Z", 1),`
			`]`


			`class Token(object):`
			`"""Represents a part of a date string that's being parsed.`
			`Note that __hash__ and __eq__ are overridden in order`
			`to compare only meaningful parts of an object.`
			`"""`
			`def __init__(self, val, length):`
			`self.val = val`
			`self.length = length`
			`self.compatible_types = ()`

			`def __hash__(self):`
			`h = hash(self.length) + hash(self.compatible_types)`
			`if not self.compatible_types:`
			`h += hash(self.val)`
			`return hash(h)`

			`def __eq__(self, other):`
			`"""`
			`Two tokens are equal when these both are true:`
			`a) length and compatible types are equal`
			`b) if it is separator (no compatible types), separator values must be equal`
			`"""`
			`if self.length != other.length or self.compatible_types != other.compatible_types:`
			`return False`
			`if not other.compatible_types and self.val != other.val:`
			`return False`
			`return True`


			`def _check_rule_1(pattern, types_used):`
			`"""Rule #1: Year MUST be in the date. Year is the minimum possible parsable date.`

			`Examples:`
			`>>> _check_rule_1('%Y/%m/%d', 'Ymd')`
			`True`
			`>>> _check_rule_1('%m/%d', 'md')`
			`False`
			`"""`
			`if 'Y' not in types_used:`
			`logging.debug("Rule #1 is violated for pattern %s. Types used: %s", pattern, types_used)`
			`return False`
			`return True`


			`def _check_rule_2(pattern, types_used):`
			`"""Rule #2: No holes (missing parts) in the format parts.`

			`Examples:`
			`>>> _check_rule_2('%Y:%H', 'YH')`
			`False`
			`>>> _check_rule_2('%Y/%m/%d %H', 'YmdH')`
			`True`
			`"""`
			`priorities = 'YmdHMSf'`
			`seen_parts = [p in types_used for p in priorities]`
			`if sorted(seen_parts, reverse=True) != seen_parts:`
			`logging.debug("Rule #2 is violated for pattern %s. Types used: %s", pattern, types_used)`
			`return False`
			`return True`


			`def _check_rule_3(pattern, types_used):`
			`"""Rule #3: Time parts are neighbors to time only. No interleaving time with the date.`

			`Examples:`
			`>>> _check_rule_3('%m/%d %H:%M %Y', 'mdHMY')`
			`True`
			`>>> _check_rule_3('%m/%d %H:%Y:%M', 'mdHYM')`
			`False`
			`"""`
			`time_parts = 'HMSf'`
			`time_parts_highlighted = [t in time_parts for t in types_used]`
			`time_parts_deduplicated = [a[0] for a in itertools.groupby(time_parts_highlighted)]`
			`if len(list(filter(lambda x: x, time_parts_deduplicated))) > 1:`
			`logging.debug("Rule #3 is violated for pattern %s. Types used: %s", pattern, types_used)`
			`return False`
			`return True`


			`def _check_rule_4(pattern, types_used):`
			`"""Rule #4: It's highly impossible that minutes coming before hours,`
			`millis coming before seconds etc.`

			`Examples:`
			`>>> _check_rule_4('%H:%M', 'HM')`
			`True`
			`>>> _check_rule_4('%S:%M', 'SM')`
			`False`
			`"""`
			`time_parts_priority = 'HMSf'`
			`time_parts_indexes = list(filter(lambda x: x >= 0,`
			`[time_parts_priority.find(t) for t in types_used]))`
			`if sorted(time_parts_indexes) != time_parts_indexes:`
			`logging.debug("Rule #4 is violated for pattern %s. Types used: %s", pattern, types_used)`
			`return False`
			`return True`


			`def _check_rule_5(pattern, types_used):`
			`"""Rule #5: Pattern can't have some part of date/time defined more than once.`

			`Examples:`
			`>>> _check_rule_5('%Y/%Y', 'YY')`
			`False`
			`>>> _check_rule_5('%m/%b', 'mm')`
			`False`
			`>>> _check_rule_5('%Y/%m', 'Ym')`
			`True`
			`"""`
			`if len(types_used) != len(set(types_used)):`
			`logging.debug("Rule #5 is violated for pattern %s. Types used: %s", pattern, types_used)`
			`return False`
			`return True`


			`def _check_rule_6(tokens_chosen, pattern, types_used):`
			`"""Rule #6: Separators between elements of the time group should be the same.`

			`Examples:`
			`_check_rule_5(tokens_chosen_1, '%Y-%m-%dT%H:%M:%S', 'YmdHMS') => True`
			`_check_rule_5(tokens_chosen_2, '%Y-%m-%dT%H %M %S', 'YmdHMS') => True`
			`_check_rule_5(tokens_chosen_3, '%Y-%m-%dT%H-%M:%S', 'YmdHMS') => False (different separators`
			`('-' and ':') in time group)`
			`"""`
			`time_parts = 'HMS'`
			`num_of_time_parts_used = len(list(filter(lambda x: x in time_parts, types_used)))`
			`time_parts_seen = 0`
			`separators_seen = []`
			`previous_was_a_separator = False`

			`for token in tokens_chosen:`
			`if token[1] is not None and token[1][3] in time_parts:`
			`# This rule doesn't work for separator-less time group so when we found the type`
			`# and it's three letters then it's (see type "Compound HHMMSS") then stop iterating`
			`if len(token[1][3]) == 3:`
			`break`
			`# If not a first time then`
			`if time_parts_seen > 0 and not previous_was_a_separator:`
			`separators_seen.append(None)`
			`time_parts_seen += 1`
			`if time_parts_seen == num_of_time_parts_used:`
			`break`
			`previous_was_a_separator = False`
			`else:`
			`if time_parts_seen > 0:`
			`separators_seen.append(token[0].val)`
			`previous_was_a_separator = True`

			`if len(set(separators_seen)) > 1:`
			`logging.debug("Rule #6 is violated for pattern %s. Seen separators: %s",`
			`pattern, separators_seen)`
			`return False`
			`return True`


			`def _check_rule_7a(pattern):`
			`"""Rule #7a: If am/pm is in date we assume that 12-hour dates are allowed only.`
			`Otherwise it's 24-hour.`

			`Examples:`
			`>>> _check_rule_7a('%Y/%m/%d %H:%M %p')`
			`False`
			`>>> _check_rule_7a('%Y/%m/%d %I:%M %p')`
			`True`
			`"""`
			`if '%p' in pattern and '%H' in pattern:`
			`logging.debug("Rule #7a is violated for pattern %s", pattern)`
			`return False`
			`return True`


			`def _check_rule_7b(pattern):`
			`"""Rule #7b: If am/pm is in date we assume that 12-hour dates are allowed only.`
			`Otherwise it's 24-hour.`

			`Examples:`
			`>>> _check_rule_7b('%Y/%m/%d %I:%M')`
			`False`
			`>>> _check_rule_7b('%Y/%m/%d %I:%M %p')`
			`True`
			`"""`
			`if '%I' in pattern and '%p' not in pattern:`
			`logging.debug("Rule #7b is violated for pattern %s", pattern)`
			`return False`
			`return True`


			`def _check_rule_8(pattern, types_used):`
			`"""Rule #9: Year can't be between other date elements`

			`Examples:`
			`>>> _check_rule_8('%m/%Y/%d %I:%M', 'mYdIM')`
			`False`
			`"""`
			`if 'mYd' in types_used or 'dYm' in types_used:`
			`logging.debug("Rule #8 is violated for pattern %s", pattern)`
			`return False`
			`return True`


			`def _tokenize_by_character_class(s):`
			`"""Return a list of strings by splitting s (tokenizing) by character class.`

			`Example:`
			`>>> t = _tokenize_by_character_class('Thu, May 14th, 2014 1:15 pm +0000')`
			`>>> [i.val for i in t]`
			`['Thu', ',', ' ', 'May', ' ', '14', 'th', ',', ' ', '2014', ' ', '1', ':', '15', ' ', 'pm', ' ', '+', '0000']`

			`>>> t = _tokenize_by_character_class('5/14/2014')`
			`>>> [i.val for i in t]`
			`['5', '/', '14', '/', '2014']`
			`"""`
			`res = re.split(r'(\d+)\|(\W)\|(_)', s)`
			`return [Token(i, len(i)) for i in res if i]`


			`def _sliding_triplets(tokens):`
			`for idx, t in enumerate(tokens):`
			`yield (t, tokens[idx-1] if idx > 0 else None, tokens[idx+1] if idx < len(tokens)-1 else None)`


			`def _analyze_tokens(tokens):`
			`"""Analize each token and find out compatible types for it."""`
			`for token, prev, nxt in _sliding_triplets(tokens):`
			`token.compatible_types = tuple([t for t in DATE_ELEMENTS if t[2](token.val, prev, nxt)])`


			`@lru_cache()`
			`def _generate_all_permutations(tokens):`
			`"""Generate all permutations of format codes for given list of tokens.`

			`Brute-forcing of all possible permutations and rules checking eats most of the time or date`
			`parsing. But since the input is expected to be highly uniform then we can expect that`
			`memoization of this step will be very efficient.`

			`Token contains values for date parts but due to overridden eq and hash methods,`
			`we treat two tokens having the same length and same possible formats as equal`
			`tokens and separators should be the same`
			`"""`
			`all_patterns = set()`
			`_generate_all_permutations_recursive(tokens, 0, [], "", all_patterns, "")`

			`return all_patterns`


			`def _check_is_pattern_valid_quick_fail_rules(pattern, types_used):`
			`"""Apply rules which are applicable for partially constructed patterns.`

			`Example: duplicates of a date part in a pattern.`
			`"""`
			`return _check_rule_5(pattern, types_used) \`
			`and _check_rule_4(pattern, types_used) \`
			`and _check_rule_7a(pattern)`


			`def _check_is_pattern_valid_full_pattern_rules(tokens_chosen, pattern, types_used):`
			`"""Apply rules which are applicable for full pattern only.`

			`Example: existence of Year part in the pattern.`
			`"""`
			`return _check_rule_1(pattern, types_used) \`
			`and _check_rule_2(pattern, types_used) \`
			`and _check_rule_3(pattern, types_used) \`
			`and _check_rule_6(tokens_chosen, pattern, types_used) \`
			`and _check_rule_7b(pattern) \`
			`and _check_rule_8(pattern, types_used)`


			`def _generate_all_permutations_recursive(tokens, token_idx, tokens_chosen, pattern, found_patterns,`
			`types_used):`
			`"""Generate all format elements permutations recursively.`

			`Args:`
			`tokens (list[Token]): List of tokens.`
			`token_idx (int): Index of token processing this cycle.`
			`tokens_chosen (list[(Token, Token.compatible_type)]): List of tuples`
			`containing token and compatible type`
			`pattern (str): String containing format for parsing`
			`found_patterns (set): Set of guessed patterns`
			`types_used (str): String of types used to build pattern.`

			`Returns:`
			`list: List of permutations`
			`"""`
			`if not _check_is_pattern_valid_quick_fail_rules(pattern, types_used):`
			`return`

			`if token_idx < len(tokens):`
			`t = tokens[token_idx]`
			`if t.compatible_types:`
			`for ct in t.compatible_types:`
			`_generate_all_permutations_recursive(tokens, token_idx+1, tokens_chosen[:] + [(t, ct)],`
			`(pattern if ct[4] == 0 else pattern[:-ct[4]]) + ct[1],`
			`found_patterns, types_used + ct[3])`
			`else:`
			`# if no compatible types it should be separator, add it to the pattern`
			`_generate_all_permutations_recursive(tokens, token_idx+1,`
			`tokens_chosen[:] + [(t, None)], pattern + t.val,`
			`found_patterns, types_used)`
			`else:`
			`if _check_is_pattern_valid_full_pattern_rules(tokens_chosen, pattern, types_used):`
			`found_patterns.add(pattern)`


			`def guess(date):`
			`"""Guesses datetime.strftime/strptime-compliant date formats for date string.`

			`Args:`
			`date (str): Date string.`

			`Returns:`
			`set: Set of datetime.strftime/strptime-compliant date format strings`

			`Examples:`
			`>>> guess('2014/05/05 14:00:00 UTC')`
			`set(['%Y/%d/%m %H:%M:%S %Z', '%Y/%m/%d %H:%M:%S %Z'])`
			`>>> guess('12/12/12')`
			`set(['%y/%m/%d', '%d/%m/%y', '%m/%d/%y', '%y/%d/%m'])`
			`"""`
			`tokens = _tokenize_by_character_class(date)`
			`_analyze_tokens(tokens)`
			`return _generate_all_permutations(tuple(tokens))`


			`def guess_bulk(dates, error_rate=0):`
			`"""Guesses datetime.strftime/strptime-compliant date formats for list of the samples.`

			`Args:`
			`dates (list): List of samples date strings.`
			`error_rate (float): Acceptable error rate (default 0.0)`

			`Returns:`
			`list: List of datetime.strftime/strptime-compliant date format strings sorted by error rate`

			`Examples:`
			`>>> guess_bulk(['12-11-2014', '12-25-2014'])`
			`['%m-%d-%Y']`
			`>>> guess_bulk(['12-11-2014', '25-25-2014'])`
			`[]`
			`>>> guess_bulk(['12-11-2013', '13-8-2013', '05-25-2013', '12-25-2013'], error_rate=0.5)`
			`['%m-%d-%Y']`
			`"""`
			`if error_rate == 0.0:`
			`patterns = None`
			`for date in dates:`
			`guesses_patterns = guess(date)`
			`if patterns is None:`
			`patterns = guesses_patterns`
			`else:`
			`patterns = patterns.intersection(guesses_patterns)`
			`if not patterns:`
			`break # No need to iterate more if zero patterns found`
			`return list(patterns)`
			`else:`
			`found_dates = 0`
			`pattern_counters = defaultdict(lambda: 0)`
			`num_dates = len(dates)`
			`min_num_dates_to_be_found = num_dates - num_dates * error_rate`

			`for idx, date in enumerate(dates):`
			`patterns = guess(date)`
			`if patterns:`
			`found_dates += 1`
			`for pattern in patterns:`
			`pattern_counters[pattern] = pattern_counters[pattern] + 1`

			`# Early return if number of strings that can't be date is already over error rate`
			`cells_left = num_dates - idx - 1`
			`cannot_be_found = float(found_dates + cells_left) < min_num_dates_to_be_found`
			`if cannot_be_found:`
			`return []`

			`patterns = [(v, k) for k, v in pattern_counters.items()`
			`if v > min_num_dates_to_be_found]`
			`patterns.sort(reverse=True)`
			`return [k for (v, k) in patterns]`