mirror of
https://github.com/gristlabs/grist-core.git
synced 2026-03-02 04:09:24 +00:00
(core) Lossless imports
Summary: - Removed string parsing and some type guessing code from parse_data.py. That logic is now implicitly done by ValueGuesser by leaving the initial column type as Any. parse_data.py mostly comes into play when importing files (e.g. Excel) containing values that already have types, i.e. numbers and dates. - 0s and 1s are treated as numbers instead of booleans to keep imports lossless. - Removed dateguess.py and test_dateguess.py. - Changed what `guessDateFormat` does when multiple date formats work equally well for the given data, in order to be consistent with the old dateguess.py. - Columns containing numbers are now always imported as Numeric, never Int. - Removed `NullIfEmptyParser` because it was interfering with the new system. Its purpose was to avoid pointlessly changing a column from Any to Text when no actual data was inserted. A different solution to that problem was already added to `_ensure_column_accepts_data` in the data engine in a recent related diff. Test Plan: - Added 2 `nbrowser/Importer2` tests. - Updated various existing tests. - Extended testing of `guessDateFormat`. Added `guessDateFormats` to show how ambiguous dates are handled internally. Reviewers: georgegevoian Reviewed By: georgegevoian Differential Revision: https://phab.getgrist.com/D3302
This commit is contained in:
@@ -1,490 +0,0 @@
|
||||
"""This module guesses possible formats of dates which can be parsed using datetime.strptime
|
||||
based on samples.
|
||||
|
||||
dateguesser.guess(sample)
|
||||
dateguesser.guess takes a sample date string and returns a set of
|
||||
datetime.strftime/strptime-compliant date format strings that will correctly parse.
|
||||
|
||||
dateguesser.guess_bulk(list_of_samples, error_rate=0)
|
||||
dateguesser.guess_bulk takes a list of sample date strings and acceptable error rate
|
||||
and returns a list of datetime.strftime/strptime-compliant date format strings
|
||||
sorted by error rate that will correctly parse.
|
||||
|
||||
Algorithm:
|
||||
|
||||
1. Tokenize input string into chunks based on character type: digits, alphas, the rest.
|
||||
2. Analyze each token independently in terms what format codes could represent
|
||||
3. For given list of tokens generate all permutations of format codes
|
||||
4. During generating permutations check for validness of generated format and skip if invalid.
|
||||
5. Use rules listed below to decide if format is invalid:
|
||||
|
||||
Invalid format checks:
|
||||
|
||||
Rule #1: Year MUST be in the date. Year is the minimum possible parsable date.
|
||||
Rule #2. No holes (missing parts) in the format parts.
|
||||
Rule #3. Time parts are neighbors to each other. No interleaving time with the date.
|
||||
Rule #4. It's highly impossible that minutes coming before hour, millis coming before seconds etc
|
||||
Rule #5. Pattern can't have some part of date/time defined more than once.
|
||||
Rule #6: Separators between elements of the time group should be the same.
|
||||
Rule #7: If am/pm is in date we assume that 12-hour dates are allowed only. Otherwise it's 24-hour
|
||||
Rule #8: Year can't be between other date elements
|
||||
|
||||
Note:
|
||||
dateguess doesn't support defaulting to current year because parsing should be deterministic,
|
||||
it's better to to fail guessing the format then to guess it incorrectly.
|
||||
|
||||
Examples:
|
||||
>>> guess('2014/05/05 14:00:00 UTC')
|
||||
set(['%Y/%d/%m %H:%M:%S %Z', '%Y/%m/%d %H:%M:%S %Z'])
|
||||
>>> guess('12/12/12')
|
||||
set(['%y/%m/%d', '%d/%m/%y', '%m/%d/%y', '%y/%d/%m'])
|
||||
>>> guess_bulk(['12-11-2014', '12-25-2014'])
|
||||
['%m-%d-%Y']
|
||||
>>> guess_bulk(['12-11-2014', '25-25-2014'])
|
||||
[]
|
||||
>>> guess_bulk(['12-11-2013', '13-8-2013', '05-25-2013', '12-25-2013'], error_rate=0.5)
|
||||
['%m-%d-%Y']
|
||||
"""
|
||||
|
||||
|
||||
import calendar
|
||||
import itertools
|
||||
import logging
|
||||
import re
|
||||
from collections import defaultdict
|
||||
|
||||
from backports.functools_lru_cache import lru_cache
|
||||
import moment
|
||||
|
||||
|
||||
MONTH_NAME = calendar.month_name
|
||||
MONTH_ABBR = calendar.month_abbr
|
||||
TZ_VALID_NAMES = {z[0] for z in moment.get_tz_data().items()}
|
||||
AM_PM = {'am', 'pm'}
|
||||
DAYS_OF_WEEK_NAME = calendar.day_name
|
||||
DAYS_OF_WEEK_ABBR = calendar.day_abbr
|
||||
ASCII_DIGITS_RE = re.compile(r'^[0-9]+$')
|
||||
|
||||
# Using x.isdigit() matches strings like u'\xb2' (superscripts) which we don't want.
|
||||
# Use isdigit(x) instead, to only match ASCII digits 0-9.
|
||||
isdigit = ASCII_DIGITS_RE.match
|
||||
|
||||
DATE_ELEMENTS = [
|
||||
# Name Pattern Predicate Group (mutual exclusive) Consumes N prev elements
|
||||
("Year", "%Y", lambda x, p, v: isdigit(x) and len(x) == 4, "Y", 0),
|
||||
("Year short", "%y", lambda x, p, v: isdigit(x) and len(x) == 2, "Y", 0),
|
||||
("Month", "%m", lambda x, p, v: isdigit(x) and len(x) <= 2 and 0 < int(x) <= 12, "m", 0),
|
||||
("Month name full", "%B", lambda x, p, v: x.isalpha() and x.capitalize() in MONTH_NAME, "m", 0),
|
||||
("Month name abbr", "%b", lambda x, p, v: x.isalpha() and x.capitalize() in MONTH_ABBR, "m", 0),
|
||||
("Day", "%d", lambda x, p, v: isdigit(x) and len(x) <= 2 and 0 < int(x) <= 31, "d", 0),
|
||||
("Day of week", "%A", lambda x, p, v: x.isalpha()
|
||||
and x.capitalize() in DAYS_OF_WEEK_NAME, "a", 0),
|
||||
("Day of week abbr", "%a", lambda x, p, v: x.isalpha()
|
||||
and x.capitalize() in DAYS_OF_WEEK_ABBR, "a", 0),
|
||||
|
||||
("Compound HHMMSS", "%H%M%S", lambda x, p, v: isdigit(x) and len(x) == 6
|
||||
and 0 <= int(x[0:2]) < 24
|
||||
and 0 <= int(x[2:4]) < 60
|
||||
and 0 <= int(x[4:6]) < 60, "HMS", 0),
|
||||
|
||||
("Hour", "%H", lambda x, p, v: isdigit(x) and len(x) <= 2 and 0 <= int(x) <= 23, "H", 0),
|
||||
("Hour in 12hr mode", "%I", lambda x, p, v: isdigit(x) and len(x) <= 2
|
||||
and 0 <= int(x) <= 11, "H", 0),
|
||||
("AM/PM", "%p", lambda x, p, v: x.isalpha() and len(x) == 2 and x.lower() in AM_PM, "p", 0),
|
||||
("Minutes", "%M", lambda x, p, v: isdigit(x) and len(x) <= 2 and 0 <= int(x) <= 59, "M", 0),
|
||||
("Seconds", "%S", lambda x, p, v: isdigit(x) and len(x) <= 2 and 0 <= int(x) <= 59, "S", 0),
|
||||
("Fraction of second", "%f", lambda x, p, v: isdigit(x) and p is not None
|
||||
and p.val == '.', "f", 0),
|
||||
("Timezone name", "%Z", lambda x, p, v: x.isalpha() and len(x) > 2
|
||||
and x in TZ_VALID_NAMES, "Z", 0),
|
||||
("Timezone +HHMM", "%z", lambda x, p, v: isdigit(x) and len(x) == 4 and 0 <= int(x[0:2]) < 15
|
||||
and 0 <= int(x[2:4]) < 60 and p is not None
|
||||
and p.val == '+', "Z", 1),
|
||||
("Timezone -HHMM", "%z", lambda x, p, v: isdigit(x) and len(x) == 4 and 0 <= int(x[0:2]) < 15
|
||||
and 0 <= int(x[2:4]) < 60 and p is not None
|
||||
and p.val == '-', "Z", 1),
|
||||
]
|
||||
|
||||
|
||||
class Token(object):
|
||||
"""Represents a part of a date string that's being parsed.
|
||||
Note that __hash__ and __eq__ are overridden in order
|
||||
to compare only meaningful parts of an object.
|
||||
"""
|
||||
def __init__(self, val, length):
|
||||
self.val = val
|
||||
self.length = length
|
||||
self.compatible_types = ()
|
||||
|
||||
def __hash__(self):
|
||||
h = hash(self.length) + hash(self.compatible_types)
|
||||
if not self.compatible_types:
|
||||
h += hash(self.val)
|
||||
return hash(h)
|
||||
|
||||
def __eq__(self, other):
|
||||
"""
|
||||
Two tokens are equal when these both are true:
|
||||
a) length and compatible types are equal
|
||||
b) if it is separator (no compatible types), separator values must be equal
|
||||
"""
|
||||
if self.length != other.length or self.compatible_types != other.compatible_types:
|
||||
return False
|
||||
if not other.compatible_types and self.val != other.val:
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def _check_rule_1(pattern, types_used):
|
||||
"""Rule #1: Year MUST be in the date. Year is the minimum possible parsable date.
|
||||
|
||||
Examples:
|
||||
>>> _check_rule_1('%Y/%m/%d', 'Ymd')
|
||||
True
|
||||
>>> _check_rule_1('%m/%d', 'md')
|
||||
False
|
||||
"""
|
||||
if 'Y' not in types_used:
|
||||
logging.debug("Rule #1 is violated for pattern %s. Types used: %s", pattern, types_used)
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def _check_rule_2(pattern, types_used):
|
||||
"""Rule #2: No holes (missing parts) in the format parts.
|
||||
|
||||
Examples:
|
||||
>>> _check_rule_2('%Y:%H', 'YH')
|
||||
False
|
||||
>>> _check_rule_2('%Y/%m/%d %H', 'YmdH')
|
||||
True
|
||||
"""
|
||||
priorities = 'YmdHMSf'
|
||||
seen_parts = [p in types_used for p in priorities]
|
||||
if sorted(seen_parts, reverse=True) != seen_parts:
|
||||
logging.debug("Rule #2 is violated for pattern %s. Types used: %s", pattern, types_used)
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def _check_rule_3(pattern, types_used):
|
||||
"""Rule #3: Time parts are neighbors to time only. No interleaving time with the date.
|
||||
|
||||
Examples:
|
||||
>>> _check_rule_3('%m/%d %H:%M %Y', 'mdHMY')
|
||||
True
|
||||
>>> _check_rule_3('%m/%d %H:%Y:%M', 'mdHYM')
|
||||
False
|
||||
"""
|
||||
time_parts = 'HMSf'
|
||||
time_parts_highlighted = [t in time_parts for t in types_used]
|
||||
time_parts_deduplicated = [a[0] for a in itertools.groupby(time_parts_highlighted)]
|
||||
if len(list(filter(lambda x: x, time_parts_deduplicated))) > 1:
|
||||
logging.debug("Rule #3 is violated for pattern %s. Types used: %s", pattern, types_used)
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def _check_rule_4(pattern, types_used):
|
||||
"""Rule #4: It's highly impossible that minutes coming before hours,
|
||||
millis coming before seconds etc.
|
||||
|
||||
Examples:
|
||||
>>> _check_rule_4('%H:%M', 'HM')
|
||||
True
|
||||
>>> _check_rule_4('%S:%M', 'SM')
|
||||
False
|
||||
"""
|
||||
time_parts_priority = 'HMSf'
|
||||
time_parts_indexes = list(filter(lambda x: x >= 0,
|
||||
[time_parts_priority.find(t) for t in types_used]))
|
||||
if sorted(time_parts_indexes) != time_parts_indexes:
|
||||
logging.debug("Rule #4 is violated for pattern %s. Types used: %s", pattern, types_used)
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def _check_rule_5(pattern, types_used):
|
||||
"""Rule #5: Pattern can't have some part of date/time defined more than once.
|
||||
|
||||
Examples:
|
||||
>>> _check_rule_5('%Y/%Y', 'YY')
|
||||
False
|
||||
>>> _check_rule_5('%m/%b', 'mm')
|
||||
False
|
||||
>>> _check_rule_5('%Y/%m', 'Ym')
|
||||
True
|
||||
"""
|
||||
if len(types_used) != len(set(types_used)):
|
||||
logging.debug("Rule #5 is violated for pattern %s. Types used: %s", pattern, types_used)
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def _check_rule_6(tokens_chosen, pattern, types_used):
|
||||
"""Rule #6: Separators between elements of the time group should be the same.
|
||||
|
||||
Examples:
|
||||
_check_rule_5(tokens_chosen_1, '%Y-%m-%dT%H:%M:%S', 'YmdHMS') => True
|
||||
_check_rule_5(tokens_chosen_2, '%Y-%m-%dT%H %M %S', 'YmdHMS') => True
|
||||
_check_rule_5(tokens_chosen_3, '%Y-%m-%dT%H-%M:%S', 'YmdHMS') => False (different separators
|
||||
('-' and ':') in time group)
|
||||
"""
|
||||
time_parts = 'HMS'
|
||||
num_of_time_parts_used = len(list(filter(lambda x: x in time_parts, types_used)))
|
||||
time_parts_seen = 0
|
||||
separators_seen = []
|
||||
previous_was_a_separator = False
|
||||
|
||||
for token in tokens_chosen:
|
||||
if token[1] is not None and token[1][3] in time_parts:
|
||||
# This rule doesn't work for separator-less time group so when we found the type
|
||||
# and it's three letters then it's (see type "Compound HHMMSS") then stop iterating
|
||||
if len(token[1][3]) == 3:
|
||||
break
|
||||
# If not a first time then
|
||||
if time_parts_seen > 0 and not previous_was_a_separator:
|
||||
separators_seen.append(None)
|
||||
time_parts_seen += 1
|
||||
if time_parts_seen == num_of_time_parts_used:
|
||||
break
|
||||
previous_was_a_separator = False
|
||||
else:
|
||||
if time_parts_seen > 0:
|
||||
separators_seen.append(token[0].val)
|
||||
previous_was_a_separator = True
|
||||
|
||||
if len(set(separators_seen)) > 1:
|
||||
logging.debug("Rule #6 is violated for pattern %s. Seen separators: %s",
|
||||
pattern, separators_seen)
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def _check_rule_7a(pattern):
|
||||
"""Rule #7a: If am/pm is in date we assume that 12-hour dates are allowed only.
|
||||
Otherwise it's 24-hour.
|
||||
|
||||
Examples:
|
||||
>>> _check_rule_7a('%Y/%m/%d %H:%M %p')
|
||||
False
|
||||
>>> _check_rule_7a('%Y/%m/%d %I:%M %p')
|
||||
True
|
||||
"""
|
||||
if '%p' in pattern and '%H' in pattern:
|
||||
logging.debug("Rule #7a is violated for pattern %s", pattern)
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def _check_rule_7b(pattern):
|
||||
"""Rule #7b: If am/pm is in date we assume that 12-hour dates are allowed only.
|
||||
Otherwise it's 24-hour.
|
||||
|
||||
Examples:
|
||||
>>> _check_rule_7b('%Y/%m/%d %I:%M')
|
||||
False
|
||||
>>> _check_rule_7b('%Y/%m/%d %I:%M %p')
|
||||
True
|
||||
"""
|
||||
if '%I' in pattern and '%p' not in pattern:
|
||||
logging.debug("Rule #7b is violated for pattern %s", pattern)
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def _check_rule_8(pattern, types_used):
|
||||
"""Rule #9: Year can't be between other date elements
|
||||
|
||||
Examples:
|
||||
>>> _check_rule_8('%m/%Y/%d %I:%M', 'mYdIM')
|
||||
False
|
||||
"""
|
||||
if 'mYd' in types_used or 'dYm' in types_used:
|
||||
logging.debug("Rule #8 is violated for pattern %s", pattern)
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def _tokenize_by_character_class(s):
|
||||
"""Return a list of strings by splitting s (tokenizing) by character class.
|
||||
|
||||
Example:
|
||||
>>> t = _tokenize_by_character_class('Thu, May 14th, 2014 1:15 pm +0000')
|
||||
>>> [i.val for i in t]
|
||||
['Thu', ',', ' ', 'May', ' ', '14', 'th', ',', ' ', '2014', ' ', '1', ':', '15', ' ', 'pm', ' ', '+', '0000']
|
||||
|
||||
>>> t = _tokenize_by_character_class('5/14/2014')
|
||||
>>> [i.val for i in t]
|
||||
['5', '/', '14', '/', '2014']
|
||||
"""
|
||||
res = re.split(r'(\d+)|(\W)|(_)', s)
|
||||
return [Token(i, len(i)) for i in res if i]
|
||||
|
||||
|
||||
def _sliding_triplets(tokens):
|
||||
for idx, t in enumerate(tokens):
|
||||
yield (t, tokens[idx-1] if idx > 0 else None, tokens[idx+1] if idx < len(tokens)-1 else None)
|
||||
|
||||
|
||||
def _analyze_tokens(tokens):
|
||||
"""Analyze each token and find out compatible types for it."""
|
||||
for token, prev, nxt in _sliding_triplets(tokens):
|
||||
token.compatible_types = tuple([t for t in DATE_ELEMENTS if t[2](token.val, prev, nxt)])
|
||||
|
||||
|
||||
@lru_cache()
|
||||
def _generate_all_permutations(tokens):
|
||||
"""Generate all permutations of format codes for given list of tokens.
|
||||
|
||||
Brute-forcing of all possible permutations and rules checking eats most of the time or date
|
||||
parsing. But since the input is expected to be highly uniform then we can expect that
|
||||
memoization of this step will be very efficient.
|
||||
|
||||
Token contains values for date parts but due to overridden eq and hash methods,
|
||||
we treat two tokens having the same length and same possible formats as equal
|
||||
tokens and separators should be the same
|
||||
"""
|
||||
all_patterns = set()
|
||||
_generate_all_permutations_recursive(tokens, 0, [], "", all_patterns, "")
|
||||
|
||||
return all_patterns
|
||||
|
||||
|
||||
def _check_is_pattern_valid_quick_fail_rules(pattern, types_used):
|
||||
"""Apply rules which are applicable for partially constructed patterns.
|
||||
|
||||
Example: duplicates of a date part in a pattern.
|
||||
"""
|
||||
return _check_rule_5(pattern, types_used) \
|
||||
and _check_rule_4(pattern, types_used) \
|
||||
and _check_rule_7a(pattern)
|
||||
|
||||
|
||||
def _check_is_pattern_valid_full_pattern_rules(tokens_chosen, pattern, types_used):
|
||||
"""Apply rules which are applicable for full pattern only.
|
||||
|
||||
Example: existence of Year part in the pattern.
|
||||
"""
|
||||
return _check_rule_1(pattern, types_used) \
|
||||
and _check_rule_2(pattern, types_used) \
|
||||
and _check_rule_3(pattern, types_used) \
|
||||
and _check_rule_6(tokens_chosen, pattern, types_used) \
|
||||
and _check_rule_7b(pattern) \
|
||||
and _check_rule_8(pattern, types_used)
|
||||
|
||||
|
||||
def _generate_all_permutations_recursive(tokens, token_idx, tokens_chosen, pattern, found_patterns,
|
||||
types_used):
|
||||
"""Generate all format elements permutations recursively.
|
||||
|
||||
Args:
|
||||
tokens (list[Token]): List of tokens.
|
||||
token_idx (int): Index of token processing this cycle.
|
||||
tokens_chosen (list[(Token, Token.compatible_type)]): List of tuples
|
||||
containing token and compatible type
|
||||
pattern (str): String containing format for parsing
|
||||
found_patterns (set): Set of guessed patterns
|
||||
types_used (str): String of types used to build pattern.
|
||||
|
||||
Returns:
|
||||
list: List of permutations
|
||||
"""
|
||||
if not _check_is_pattern_valid_quick_fail_rules(pattern, types_used):
|
||||
return
|
||||
|
||||
if token_idx < len(tokens):
|
||||
t = tokens[token_idx]
|
||||
if t.compatible_types:
|
||||
for ct in t.compatible_types:
|
||||
_generate_all_permutations_recursive(tokens, token_idx+1, tokens_chosen[:] + [(t, ct)],
|
||||
(pattern if ct[4] == 0 else pattern[:-ct[4]]) + ct[1],
|
||||
found_patterns, types_used + ct[3])
|
||||
else:
|
||||
# if no compatible types it should be separator, add it to the pattern
|
||||
_generate_all_permutations_recursive(tokens, token_idx+1,
|
||||
tokens_chosen[:] + [(t, None)], pattern + t.val,
|
||||
found_patterns, types_used)
|
||||
else:
|
||||
if _check_is_pattern_valid_full_pattern_rules(tokens_chosen, pattern, types_used):
|
||||
found_patterns.add(pattern)
|
||||
|
||||
|
||||
def guess(date):
|
||||
"""Guesses datetime.strftime/strptime-compliant date formats for date string.
|
||||
|
||||
Args:
|
||||
date (str): Date string.
|
||||
|
||||
Returns:
|
||||
set: Set of datetime.strftime/strptime-compliant date format strings
|
||||
|
||||
Examples:
|
||||
>>> guess('2014/05/05 14:00:00 UTC')
|
||||
set(['%Y/%d/%m %H:%M:%S %Z', '%Y/%m/%d %H:%M:%S %Z'])
|
||||
>>> guess('12/12/12')
|
||||
set(['%y/%m/%d', '%d/%m/%y', '%m/%d/%y', '%y/%d/%m'])
|
||||
"""
|
||||
# Don't attempt to parse strings that are so long as to be certainly non-dates. Somewhat long
|
||||
# strings could be dates (like "Wednesday, September 16, 2020 A.D. 08:47:02.2667911 AM -06:00",
|
||||
# and who knows what other languages do). A limit is important also because the current approach
|
||||
# can run into "maximum recursion depth exceeded" on a very long string.
|
||||
if len(date) > 150:
|
||||
return set()
|
||||
tokens = _tokenize_by_character_class(date)
|
||||
_analyze_tokens(tokens)
|
||||
return _generate_all_permutations(tuple(tokens))
|
||||
|
||||
|
||||
def guess_bulk(dates, error_rate=0):
|
||||
"""Guesses datetime.strftime/strptime-compliant date formats for list of the samples.
|
||||
|
||||
Args:
|
||||
dates (list): List of samples date strings.
|
||||
error_rate (float): Acceptable error rate (default 0.0)
|
||||
|
||||
Returns:
|
||||
list: List of datetime.strftime/strptime-compliant date format strings sorted by error rate
|
||||
|
||||
Examples:
|
||||
>>> guess_bulk(['12-11-2014', '12-25-2014'])
|
||||
['%m-%d-%Y']
|
||||
>>> guess_bulk(['12-11-2014', '25-25-2014'])
|
||||
[]
|
||||
>>> guess_bulk(['12-11-2013', '13-8-2013', '05-25-2013', '12-25-2013'], error_rate=0.5)
|
||||
['%m-%d-%Y']
|
||||
"""
|
||||
if error_rate == 0.0:
|
||||
patterns = None
|
||||
for date in dates:
|
||||
guesses_patterns = guess(date)
|
||||
if patterns is None:
|
||||
patterns = guesses_patterns
|
||||
else:
|
||||
patterns = patterns.intersection(guesses_patterns)
|
||||
if not patterns:
|
||||
break # No need to iterate more if zero patterns found
|
||||
return list(patterns)
|
||||
else:
|
||||
found_dates = 0
|
||||
pattern_counters = defaultdict(lambda: 0)
|
||||
num_dates = len(dates)
|
||||
min_num_dates_to_be_found = num_dates - num_dates * error_rate
|
||||
|
||||
for idx, date in enumerate(dates):
|
||||
patterns = guess(date)
|
||||
if patterns:
|
||||
found_dates += 1
|
||||
for pattern in patterns:
|
||||
pattern_counters[pattern] = pattern_counters[pattern] + 1
|
||||
|
||||
# Early return if number of strings that can't be date is already over error rate
|
||||
cells_left = num_dates - idx - 1
|
||||
cannot_be_found = float(found_dates + cells_left) < min_num_dates_to_be_found
|
||||
if cannot_be_found:
|
||||
return []
|
||||
|
||||
patterns = [(v, k) for k, v in pattern_counters.items()
|
||||
if v > min_num_dates_to_be_found]
|
||||
patterns.sort(reverse=True)
|
||||
return [k for (v, k) in patterns]
|
||||
@@ -1,102 +0,0 @@
|
||||
import unittest
|
||||
from imports.dateguess import guess, guess_bulk
|
||||
|
||||
|
||||
class TestGuesser(unittest.TestCase):
|
||||
def assertDate(self, input_str, fmt_list):
|
||||
guessed = guess(input_str)
|
||||
self.assertEqual(set(guessed), set(fmt_list))
|
||||
|
||||
def assertDates(self, input_lst, error_rate, fmt_list):
|
||||
guessed = guess_bulk(input_lst, error_rate=error_rate)
|
||||
self.assertEqual(set(guessed), set(fmt_list))
|
||||
|
||||
def test_guess_dates(self):
|
||||
self.assertDate('', [])
|
||||
self.assertDate("2013-13-13", [])
|
||||
self.assertDate("25/25/1911", [])
|
||||
|
||||
self.assertDate("2014-01-11", ['%Y-%m-%d', '%Y-%d-%m'])
|
||||
self.assertDate("2014-11-01", ['%Y-%m-%d', '%Y-%d-%m'])
|
||||
self.assertDate("1990-05-05", ['%Y-%m-%d', '%Y-%d-%m'])
|
||||
self.assertDate("2013-12-13", ['%Y-%m-%d'])
|
||||
|
||||
self.assertDate("12/31/1999", ['%m/%d/%Y'])
|
||||
self.assertDate("11/11/1911", ['%m/%d/%Y', '%d/%m/%Y'])
|
||||
self.assertDate("5/9/1981", ['%m/%d/%Y', '%d/%m/%Y'])
|
||||
self.assertDate("6/3/1985", ['%m/%d/%Y', '%d/%m/%Y'])
|
||||
|
||||
self.assertDate("12/31/99", ['%m/%d/%y'])
|
||||
self.assertDate("11/11/11", ['%y/%m/%d', '%y/%d/%m', '%m/%d/%y', '%d/%m/%y'])
|
||||
self.assertDate("5/9/81", ['%m/%d/%y', '%d/%m/%y'])
|
||||
self.assertDate("6/3/85", ['%m/%d/%y', '%d/%m/%y'])
|
||||
|
||||
self.assertDate("31.12.91", ['%d.%m.%y'])
|
||||
self.assertDate("4.4.87", ['%m.%d.%y', '%d.%m.%y'])
|
||||
|
||||
self.assertDate("13.2.8", ['%y.%m.%d', '%y.%d.%m'])
|
||||
self.assertDate("31.12.1991", ['%d.%m.%Y'])
|
||||
self.assertDate("4.4.1987", ['%m.%d.%Y', '%d.%m.%Y'])
|
||||
self.assertDate("13.2.2008", ['%d.%m.%Y'])
|
||||
self.assertDate("31.12.91", ['%d.%m.%y'])
|
||||
self.assertDate("4.4.87", ['%m.%d.%y', '%d.%m.%y'])
|
||||
self.assertDate("13.2.8", ['%y.%m.%d', '%y.%d.%m'])
|
||||
|
||||
self.assertDate("9 May 1981", ['%d %b %Y', '%d %B %Y'])
|
||||
self.assertDate("31 Dec 1999", ['%d %b %Y'])
|
||||
self.assertDate("1 Jan 2012", ['%d %b %Y'])
|
||||
self.assertDate("3 August 2009", ['%d %B %Y'])
|
||||
self.assertDate("2 May 1980", ['%d %B %Y', '%d %b %Y'])
|
||||
|
||||
self.assertDate("13/1/2012", ['%d/%m/%Y'])
|
||||
|
||||
self.assertDate("Aug 1st 2014", ['%b %dst %Y'])
|
||||
self.assertDate("12/22/2015 00:00:00.10", ['%m/%d/%Y %H:%M:%S.%f'])
|
||||
|
||||
def test_guess_datetimes(self):
|
||||
self.assertDate("Thu Sep 25 10:36:28 2003", ['%a %b %d %H:%M:%S %Y'])
|
||||
self.assertDate("Thu Sep 25 2003 10:36:28", ['%a %b %d %Y %H:%M:%S'])
|
||||
self.assertDate("10:36:28 Thu Sep 25 2003", ['%H:%M:%S %a %b %d %Y'])
|
||||
|
||||
self.assertDate("2014-01-11T12:21:05", ['%Y-%m-%dT%H:%M:%S', '%Y-%d-%mT%H:%M:%S'])
|
||||
self.assertDate("2015-02-16T16:05:31", ['%Y-%m-%dT%H:%M:%S'])
|
||||
# TODO remove all except first one
|
||||
self.assertDate("2015-02-16T16:05", ['%Y-%m-%dT%H:%M', '%Y-%H-%MT%d:%m',
|
||||
'%Y-%m-%HT%M:%d', '%Y-%d-%HT%M:%m'])
|
||||
self.assertDate("2015-02-16T16", ['%Y-%m-%dT%H', '%Y-%m-%HT%d']) #TODO remove second one
|
||||
|
||||
self.assertDate("Mon Jan 13 9:52:52 am MST 2014", ['%a %b %d %I:%M:%S %p %Z %Y'])
|
||||
self.assertDate("Tue Jan 21 3:30:00 PM EST 2014", ['%a %b %d %I:%M:%S %p %Z %Y'])
|
||||
self.assertDate("Mon Jan 13 09:52:52 MST 2014", ['%a %b %d %H:%M:%S %Z %Y'])
|
||||
self.assertDate("Tue Jan 21 15:30:00 EST 2014", ['%a %b %d %H:%M:%S %Z %Y'])
|
||||
self.assertDate("Mon Jan 13 9:52 am MST 2014", ['%a %b %d %I:%M %p %Z %Y'])
|
||||
self.assertDate("Tue Jan 21 3:30 PM EST 2014", ['%a %b %d %I:%M %p %Z %Y'])
|
||||
|
||||
self.assertDate("2014-01-11T12:21:05", ['%Y-%m-%dT%H:%M:%S', '%Y-%d-%mT%H:%M:%S'])
|
||||
self.assertDate("2015-02-16T16:05:31", ['%Y-%m-%dT%H:%M:%S'])
|
||||
self.assertDate("Thu Sep 25 10:36:28 2003", ['%a %b %d %H:%M:%S %Y'])
|
||||
self.assertDate("10:36:28 Thu Sep 25 2003", ['%H:%M:%S %a %b %d %Y'])
|
||||
|
||||
self.assertDate("2014-01-11T12:21:05+0000", ['%Y-%d-%mT%H:%M:%S%z', '%Y-%m-%dT%H:%M:%S%z'])
|
||||
self.assertDate("2015-02-16T16:05:31-0400", ['%Y-%m-%dT%H:%M:%S%z'])
|
||||
self.assertDate("Thu, 25 Sep 2003 10:49:41 -0300", ['%a, %d %b %Y %H:%M:%S %z'])
|
||||
self.assertDate("Thu, 25 Sep 2003 10:49:41 +0300", ['%a, %d %b %Y %H:%M:%S %z'])
|
||||
|
||||
self.assertDate("2003-09-25T10:49:41", ['%Y-%m-%dT%H:%M:%S'])
|
||||
self.assertDate("2003-09-25T10:49", ['%Y-%m-%dT%H:%M'])
|
||||
|
||||
def test_guess_bulk_dates(self):
|
||||
self.assertDates(["11/11/1911", "25/11/1911", "11/11/1911", "11/11/1911"], 0.0, ['%d/%m/%Y'])
|
||||
self.assertDates(["25/11/1911", "25/25/1911", "11/11/1911", "11/11/1911"], 0.0, [])
|
||||
self.assertDates(["25/11/1911", "25/25/1911", "11/11/1911", "11/11/1911"], 0.5, ['%d/%m/%Y'])
|
||||
|
||||
self.assertDates(["25/11/1911", "25/25/1911", "11/11/1911", "11/11/1911"], 0.1, [])
|
||||
self.assertDates(["23/11/1911", '2004 May 12', "11/11/1911", "11/11/1911"], 0.5, ['%d/%m/%Y'])
|
||||
|
||||
self.assertDates(['2004 May 12', "11/11/1911", "11/11/1911", "23/11/1911"], 0.5, ['%d/%m/%Y'])
|
||||
self.assertDates(['2004 May 12', "11/11/1911", "11/11/1911", "23/11/1911"], 0.0, [])
|
||||
self.assertDates(['12/22/2015', "12/22/2015 1:15pm", "2018-02-27 16:08:39 +0000"], 0.1, [])
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
@@ -4,8 +4,6 @@ import textwrap
|
||||
import unittest
|
||||
from six import BytesIO, text_type
|
||||
import csv
|
||||
import calendar
|
||||
import datetime
|
||||
|
||||
from imports import import_csv
|
||||
|
||||
@@ -22,9 +20,15 @@ def bytes_io_from_str(string):
|
||||
|
||||
class TestImportCSV(unittest.TestCase):
|
||||
|
||||
def _check_col(self, sheet, index, name, typename, values):
|
||||
def _check_col(self, sheet, index, name, _typename, values):
|
||||
self.assertEqual(sheet["column_metadata"][index]["id"], name)
|
||||
self.assertEqual(sheet["column_metadata"][index]["type"], typename)
|
||||
# Previously, strings were parsed and types were guessed in CSV imports.
|
||||
# Now all data is kept as strings and the column type is left as Any
|
||||
# so that type guessing and parsing can happen elsewhere.
|
||||
# To avoid updating 85 calls to _check_col, the typename argument was kept but can be ignored,
|
||||
# and all values are converted back to strings for comparison.
|
||||
self.assertEqual(sheet["column_metadata"][index]["type"], "Any")
|
||||
values = [text_type(v) for v in values]
|
||||
self.assertEqual(sheet["table_data"][index], values)
|
||||
|
||||
def _check_num_cols(self, sheet, exp_cols):
|
||||
@@ -40,18 +44,16 @@ class TestImportCSV(unittest.TestCase):
|
||||
self._check_col(sheet, 1, "int2", "Int", [5, '', ''])
|
||||
self._check_col(sheet, 2, "textint", "Text", ["12345678902345689", '', ''])
|
||||
self._check_col(sheet, 3, "bigint", "Text", ["320150170634561830", '', ''])
|
||||
self._check_col(sheet, 4, "num2", "Numeric", [123456789.123456, '', ''])
|
||||
self._check_col(sheet, 5, "bignum", "Numeric", [7.22597e+86, '', ''])
|
||||
self._check_col(sheet, 4, "num2", "Numeric", ['123456789.1234560000', '', ''])
|
||||
self._check_col(sheet, 5, "bignum", "Numeric", ['7.22597E+86', '', ''])
|
||||
self._check_col(sheet, 6, "date1", "DateTime",
|
||||
[calendar.timegm(datetime.datetime(2015, 12, 22, 11, 59, 00).timetuple()), None, None])
|
||||
[u'12/22/15 11:59 AM', u'', u''])
|
||||
self._check_col(sheet, 7, "date2", "Date",
|
||||
[calendar.timegm(datetime.datetime(2015, 12, 20, 0, 0, 0).timetuple()), None, None])
|
||||
[u'December 20, 2015', u'', u''])
|
||||
self._check_col(sheet, 8, "datetext", "Date",
|
||||
[calendar.timegm(datetime.date(2015, 12, 22).timetuple()), None, None])
|
||||
[u'12/22/2015', u'', u''])
|
||||
self._check_col(sheet, 9, "datetimetext", "DateTime",
|
||||
[calendar.timegm(datetime.datetime(2015, 12, 22, 0, 0, 0).timetuple()),
|
||||
calendar.timegm(datetime.datetime(2015, 12, 22, 13, 15, 0).timetuple()),
|
||||
calendar.timegm(datetime.datetime(2018, 2, 27, 16, 8, 39).timetuple())])
|
||||
[u'12/22/2015 00:00:00', u'12/22/2015 13:15:00', u'02/27/2018 16:08:39'])
|
||||
|
||||
|
||||
def test_user_parse_options(self):
|
||||
@@ -68,7 +70,11 @@ class TestImportCSV(unittest.TestCase):
|
||||
self._check_col(parsed_file, 2, "PHONE", "Text", ['201-343-3434', '201.343.3434',
|
||||
'2013433434', '(201)343-3434'])
|
||||
self._check_col(parsed_file, 3, "VALUE", "Int", [45, 4545, 0, 4])
|
||||
self._check_col(parsed_file, 4, "DATE", "DateTime", [1519747719.0, 1519744119.0, 1519751319.0, None])
|
||||
self._check_col(parsed_file, 4, "DATE", "DateTime",
|
||||
[u'2018-02-27 16:08:39 +0000',
|
||||
u'2018-02-27 16:08:39 +0100',
|
||||
u'2018-02-27 16:08:39 -0100',
|
||||
u''])
|
||||
|
||||
def test_wrong_cols1(self):
|
||||
file_obj = bytes_io_from_str(textwrap.dedent(
|
||||
|
||||
@@ -16,31 +16,33 @@ class TestImportXLS(unittest.TestCase):
|
||||
def _check_col(self, sheet, index, name, typename, values):
|
||||
self.assertEqual(sheet["column_metadata"][index]["id"], name)
|
||||
self.assertEqual(sheet["column_metadata"][index]["type"], typename)
|
||||
if typename == "Any":
|
||||
# Convert values to strings to reduce changes to tests after imports were overhauled.
|
||||
values = [str(v) for v in values]
|
||||
self.assertEqual(sheet["table_data"][index], values)
|
||||
|
||||
def test_excel(self):
|
||||
parsed_file = import_xls.parse_file(*_get_fixture('test_excel.xlsx'))
|
||||
|
||||
# check that column type was correctly set to int and values are properly parsed
|
||||
self.assertEqual(parsed_file[1][0]["column_metadata"][0], {"type": "Int", "id": "numbers"})
|
||||
# check that column type was correctly set to numeric and values are properly parsed
|
||||
self.assertEqual(parsed_file[1][0]["column_metadata"][0], {"type": "Numeric", "id": "numbers"})
|
||||
self.assertEqual(parsed_file[1][0]["table_data"][0], [1, 2, 3, 4, 5, 6, 7, 8])
|
||||
|
||||
# check that column type was correctly set to text and values are properly parsed
|
||||
self.assertEqual(parsed_file[1][0]["column_metadata"][1], {"type": "Text", "id": "letters"})
|
||||
self.assertEqual(parsed_file[1][0]["column_metadata"][1], {"type": "Any", "id": "letters"})
|
||||
self.assertEqual(parsed_file[1][0]["table_data"][1],
|
||||
["a", "b", "c", "d", "e", "f", "g", "h"])
|
||||
|
||||
# messy tables does not support bool types yet, it classifies them as ints
|
||||
self.assertEqual(parsed_file[1][0]["column_metadata"][2], {"type": "Bool", "id": "boolean"})
|
||||
self.assertEqual(parsed_file[1][False]["table_data"][2],
|
||||
[True, False, True, False, True, False, True, False])
|
||||
# 0s and 1s become Numeric, not boolean like in the past
|
||||
self.assertEqual(parsed_file[1][0]["column_metadata"][2], {"type": "Numeric", "id": "boolean"})
|
||||
self.assertEqual(parsed_file[1][0]["table_data"][2], [1, 0, 1, 0, 1, 0, 1, 0])
|
||||
|
||||
# check that column type was correctly set to text and values are properly parsed
|
||||
self.assertEqual(parsed_file[1][0]["column_metadata"][3],
|
||||
{"type": "Text", "id": "corner-cases"})
|
||||
{"type": "Any", "id": "corner-cases"})
|
||||
self.assertEqual(parsed_file[1][0]["table_data"][3],
|
||||
# The type is detected as text, so all values should be text.
|
||||
[u'=function()', '3.0', u'two spaces after ',
|
||||
[u'=function()', u'3.0', u'two spaces after ',
|
||||
u' two spaces before', u'!@#$', u'€€€', u'√∫abc$$', u'line\nbreak'])
|
||||
|
||||
# check that multiple tables are created when there are multiple sheets in a document
|
||||
@@ -51,23 +53,19 @@ class TestImportXLS(unittest.TestCase):
|
||||
def test_excel_types(self):
|
||||
parsed_file = import_xls.parse_file(*_get_fixture('test_excel_types.xlsx'))
|
||||
sheet = parsed_file[1][0]
|
||||
self._check_col(sheet, 0, "int1", "Int", [-1234123, '', ''])
|
||||
self._check_col(sheet, 1, "int2", "Int", [5, '', ''])
|
||||
self._check_col(sheet, 2, "textint", "Text", ["12345678902345689", '', ''])
|
||||
self._check_col(sheet, 3, "bigint", "Text", ["320150170634561830", '', ''])
|
||||
self._check_col(sheet, 0, "int1", "Numeric", [-1234123, '', ''])
|
||||
self._check_col(sheet, 1, "int2", "Numeric", [5, '', ''])
|
||||
self._check_col(sheet, 2, "textint", "Any", ["12345678902345689", '', ''])
|
||||
self._check_col(sheet, 3, "bigint", "Any", ["320150170634561830", '', ''])
|
||||
self._check_col(sheet, 4, "num2", "Numeric", [123456789.123456, '', ''])
|
||||
self._check_col(sheet, 5, "bignum", "Numeric", [math.exp(200), '', ''])
|
||||
self._check_col(sheet, 6, "date1", "DateTime",
|
||||
[calendar.timegm(datetime.datetime(2015, 12, 22, 11, 59, 00).timetuple()), None, None])
|
||||
self._check_col(sheet, 7, "date2", "Date",
|
||||
[calendar.timegm(datetime.datetime(2015, 12, 20, 0, 0, 0).timetuple()), None, None])
|
||||
self._check_col(sheet, 8, "datetext", "Date",
|
||||
[calendar.timegm(datetime.date(2015, 12, 22).timetuple()), None, None])
|
||||
# TODO: all dates have different format
|
||||
# self._check_col(sheet, 9, "datetimetext", "DateTime",
|
||||
# [calendar.timegm(datetime.datetime(2015, 12, 22, 0, 0, 0).timetuple()),
|
||||
# calendar.timegm(datetime.datetime(2015, 12, 22, 13, 15, 0).timetuple()),
|
||||
# calendar.timegm(datetime.datetime(2018, 02, 27, 16, 8, 39).timetuple())])
|
||||
self._check_col(sheet, 8, "datetext", "Any", ['12/22/2015', '', ''])
|
||||
self._check_col(sheet, 9, "datetimetext", "Any",
|
||||
[u'12/22/2015', u'12/22/2015 1:15pm', u'2018-02-27 16:08:39 +0000'])
|
||||
|
||||
def test_excel_type_detection(self):
|
||||
# This tests goes over the second sheet of the fixture doc, which has multiple rows that try
|
||||
@@ -81,23 +79,20 @@ class TestImportXLS(unittest.TestCase):
|
||||
1454544000.0, 1199577600.0, 1451692800.0, 1451549340.0, 1483214940.0])
|
||||
self._check_col(sheet, 1, "float_not_int", "Numeric",
|
||||
[1,2,3,4,5,"",6,7,8,9,10,10.25,11,12,13,14,15,16,17,18])
|
||||
self._check_col(sheet, 2, "int_not_bool", "Int",
|
||||
self._check_col(sheet, 2, "int_not_bool", "Any",
|
||||
[0, 0, 1, 0, 1, 0, 0, 1, 0, 2, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0])
|
||||
self._check_col(sheet, 3, "float_not_bool", "Numeric",
|
||||
self._check_col(sheet, 3, "float_not_bool", "Any",
|
||||
[0, 0, 1, 0, 1, 0, 0, 1, 0, 0.5, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0])
|
||||
self._check_col(sheet, 4, "text_as_bool", "Bool",
|
||||
self._check_col(sheet, 4, "text_as_bool", "Any",
|
||||
[0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0])
|
||||
self._check_col(sheet, 5, "int_as_bool", "Bool",
|
||||
self._check_col(sheet, 5, "int_as_bool", "Numeric",
|
||||
[0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0])
|
||||
self._check_col(sheet, 6, "float_not_date", "Numeric",
|
||||
self._check_col(sheet, 6, "float_not_date", "Any",
|
||||
[4.0, 6.0, 4.0, 4.0, 6.0, 4.0, '--', 6.0, 4.0, 4.0, 4.0, 4.0, 4.0, 6.0, 6.0,
|
||||
4.0, 6.0, '3-4', 4.0, 6.5])
|
||||
self._check_col(sheet, 7, "float_not_text", "Numeric",
|
||||
[-10.25, -8.00, -5.75, -3.50, "n/a", 1.00, " ??? ", 5.50, "", "-",
|
||||
[-10.25, -8.00, -5.75, -3.50, "n/a", ' 1. ', " ??? ", 5.50, "", "-",
|
||||
12.25, 0.00, "", 0.00, "--", 23.50, "NA", 28.00, 30.25, 32.50])
|
||||
self._check_col(sheet, 8, "dollar_amts", "Numeric",
|
||||
[0.00, 0.75, 1.50, '', 3.00, 0.00, 0.75, 1.50, '--', 3.00, 1234.56, 1000,
|
||||
1001.50, '-', 3000000.000, 0000.00, 1234.56, 1000, 1001.50, 1000.01])
|
||||
|
||||
def test_excel_single_merged_cell(self):
|
||||
# An older version of xlrd had a bug where a single cell marked as 'merged' would cause an
|
||||
@@ -107,11 +102,11 @@ class TestImportXLS(unittest.TestCase):
|
||||
self.assertEqual(tables, [{
|
||||
'table_name': u'Transaction Report',
|
||||
'column_metadata': [
|
||||
{'type': 'Text', 'id': u''},
|
||||
{'type': 'Any', 'id': u''},
|
||||
{'type': 'Numeric', 'id': u'Start'},
|
||||
{'type': 'Numeric', 'id': u''},
|
||||
{'type': 'Numeric', 'id': u''},
|
||||
{'type': 'Text', 'id': u'Seek no easy ways'},
|
||||
{'type': 'Any', 'id': u'Seek no easy ways'},
|
||||
],
|
||||
'table_data': [
|
||||
[u'SINGLE MERGED', u'The End'],
|
||||
@@ -133,15 +128,15 @@ class TestImportXLS(unittest.TestCase):
|
||||
self.assertEqual(tables, [{
|
||||
'table_name': u'Sheet1',
|
||||
'column_metadata': [
|
||||
{'id': 'a', 'type': 'Text'},
|
||||
{'id': 'a', 'type': 'Any'},
|
||||
{'id': 'b', 'type': 'Date'},
|
||||
{'id': 'c', 'type': 'Text'},
|
||||
{'id': 'd', 'type': 'Text'},
|
||||
{'id': 'c', 'type': 'Any'},
|
||||
{'id': 'd', 'type': 'Any'},
|
||||
{'id': 'e', 'type': 'Numeric'},
|
||||
{'id': 'f', 'type': 'Int'},
|
||||
{'id': 'g', 'type': 'Date'},
|
||||
{'id': 'f', 'type': 'Numeric'},
|
||||
{'id': 'g', 'type': 'Any'},
|
||||
{'id': 'h', 'type': 'Date'},
|
||||
{'id': 'i', 'type': 'Bool'},
|
||||
{'id': 'i', 'type': 'Numeric'},
|
||||
],
|
||||
'table_data': [
|
||||
[u'21:14:00'],
|
||||
@@ -150,9 +145,9 @@ class TestImportXLS(unittest.TestCase):
|
||||
[u'10:20:30'],
|
||||
[4.180902777777778],
|
||||
[20],
|
||||
[-6106060800.0],
|
||||
[u'7/4/1776'],
|
||||
[205286400.0],
|
||||
[False], # This is not great either, we should be able to distinguish 0 from FALSE.
|
||||
[0],
|
||||
],
|
||||
}])
|
||||
|
||||
|
||||
Reference in New Issue
Block a user