mirror of
				https://github.com/gristlabs/grist-core.git
				synced 2025-06-13 20:53:59 +00:00 
			
		
		
		
	Remove messytables dependency from xlsx import
This commit is contained in:
		
							parent
							
								
									b2b0950c9c
								
							
						
					
					
						commit
						54703e2794
					
				| @ -4,6 +4,7 @@ Helper functions for import plugins | |||||||
| import itertools | import itertools | ||||||
| import logging | import logging | ||||||
| import os | import os | ||||||
|  | from collections import defaultdict | ||||||
| 
 | 
 | ||||||
| import six | import six | ||||||
| from six.moves import zip | from six.moves import zip | ||||||
| @ -14,6 +15,20 @@ if six.PY2: | |||||||
| 
 | 
 | ||||||
| log = logging.getLogger(__name__) | log = logging.getLogger(__name__) | ||||||
| 
 | 
 | ||||||
|  | def column_count_modal(rows): | ||||||
|  |   """ Return the modal (most common) count of non-empty cells per row | ||||||
|  |   among the given rows, ignoring rows with fewer than two non-empty cells. | ||||||
|  |   This can be assumed to be the number of columns of the table. """ | ||||||
|  |   counts = defaultdict(int) | ||||||
|  |   for row in rows: | ||||||
|  |     length = len([c for c in row if not empty(c)]) | ||||||
|  |     if length > 1: | ||||||
|  |       counts[length] += 1 | ||||||
|  |   if not len(counts): | ||||||
|  |     return 0 | ||||||
|  |   return max(list(counts.items()), key=lambda k_v: k_v[1])[0] | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
| def empty(value): | def empty(value): | ||||||
|   """ Stringify the value and check that it has a length. """ |   """ Stringify the value and check that it has a length. """ | ||||||
| @ -33,7 +48,7 @@ def capitalize(word): | |||||||
|   return word[0].capitalize() + word[1:] |   return word[0].capitalize() + word[1:] | ||||||
| 
 | 
 | ||||||
| def _is_numeric(text): | def _is_numeric(text): | ||||||
|   for t in six.integer_types + (float, complex): |   for t in six.integer_types + (float,): | ||||||
|     try: |     try: | ||||||
|       t(text) |       t(text) | ||||||
|       return True |       return True | ||||||
| @ -54,7 +69,6 @@ def _is_header(header, data_rows): | |||||||
| 
 | 
 | ||||||
|   # If it's all text, see if the values in the first row repeat in other rows. That's uncommon for |   # If it's all text, see if the values in the first row repeat in other rows. That's uncommon for | ||||||
|   # a header. |   # a header. | ||||||
|   count_repeats = [0 for cell in header] |  | ||||||
|   for row in data_rows: |   for row in data_rows: | ||||||
|     for cell, header_cell in zip(row, header): |     for cell, header_cell in zip(row, header): | ||||||
|       if cell and cell == header_cell: |       if cell and cell == header_cell: | ||||||
| @ -78,8 +92,11 @@ def find_first_non_empty_row(rows): | |||||||
|   Returns (data_offset, header) of the first row with non-empty fields |   Returns (data_offset, header) of the first row with non-empty fields | ||||||
|   or (0, []) if there are no non-empty rows. |   or (0, []) if there are no non-empty rows. | ||||||
|   """ |   """ | ||||||
|  |   tolerance = 1 | ||||||
|  |   modal = column_count_modal(rows) | ||||||
|   for i, row in enumerate(rows): |   for i, row in enumerate(rows): | ||||||
|     if _count_nonempty(row) > 0: |     length = _count_nonempty(row) | ||||||
|  |     if length >= modal - tolerance: | ||||||
|       return i + 1, row |       return i + 1, row | ||||||
|   # No non-empty rows. |   # No non-empty rows. | ||||||
|   return 0, [] |   return 0, [] | ||||||
|  | |||||||
| @ -4,7 +4,6 @@ and returns a object formatted so that it can be used by grist for a bulk add re | |||||||
| """ | """ | ||||||
| import logging | import logging | ||||||
| 
 | 
 | ||||||
| import messytables |  | ||||||
| import six | import six | ||||||
| import openpyxl | import openpyxl | ||||||
| from openpyxl.utils.datetime import from_excel | from openpyxl.utils.datetime import from_excel | ||||||
| @ -66,15 +65,10 @@ def parse_open_file(file_obj): | |||||||
|       # `if not any(row)` would be slightly faster, but would count `0` as empty. |       # `if not any(row)` would be slightly faster, but would count `0` as empty. | ||||||
|       if not set(row) <= {None, ""} |       if not set(row) <= {None, ""} | ||||||
|     ] |     ] | ||||||
|     sample = [ |     # Resetting dimensions via openpyxl causes rows to not be padded. Make sure | ||||||
|       # Create messytables.Cells for the sake of messytables.headers_guess |     # sample rows are padded; get_table_data will handle padding the rest. | ||||||
|       [messytables.Cell(cell) for cell in row] |     sample = _with_padding(rows[:1000]) | ||||||
|       # Resetting dimensions via openpyxl causes rows to not be padded. Make sure |     data_offset, headers = import_utils.headers_guess(sample) | ||||||
|       # sample rows are padded; get_table_data will handle padding the rest. |  | ||||||
|       for row in _with_padding(rows[:1000]) |  | ||||||
|     ] |  | ||||||
|     offset, headers = messytables.headers_guess(sample) |  | ||||||
|     data_offset = offset + 1  # Add the header line |  | ||||||
|     rows = rows[data_offset:] |     rows = rows[data_offset:] | ||||||
| 
 | 
 | ||||||
|     # Make sure all header values are strings. |     # Make sure all header values are strings. | ||||||
|  | |||||||
| @ -13,6 +13,8 @@ def _get_fixture(filename): | |||||||
| 
 | 
 | ||||||
| class TestImportXLS(unittest.TestCase): | class TestImportXLS(unittest.TestCase): | ||||||
| 
 | 
 | ||||||
|  |   maxDiff = None  # Display full diff if any. | ||||||
|  | 
 | ||||||
|   def _check_col(self, sheet, index, name, typename, values): |   def _check_col(self, sheet, index, name, typename, values): | ||||||
|     self.assertEqual(sheet["column_metadata"][index]["id"], name) |     self.assertEqual(sheet["column_metadata"][index]["id"], name) | ||||||
|     self.assertEqual(sheet["column_metadata"][index]["type"], typename) |     self.assertEqual(sheet["column_metadata"][index]["type"], typename) | ||||||
| @ -103,17 +105,17 @@ class TestImportXLS(unittest.TestCase): | |||||||
|       'table_name': u'Transaction Report', |       'table_name': u'Transaction Report', | ||||||
|       'column_metadata': [ |       'column_metadata': [ | ||||||
|         {'type': 'Any', 'id': u''}, |         {'type': 'Any', 'id': u''}, | ||||||
|         {'type': 'Numeric', 'id': u'Start'}, |         {'type': 'Any', 'id': u''}, | ||||||
|         {'type': 'Numeric', 'id': u''}, |         {'type': 'Numeric', 'id': u''}, | ||||||
|         {'type': 'Numeric', 'id': u''}, |         {'type': 'Numeric', 'id': u''}, | ||||||
|         {'type': 'Any', 'id': u'Seek no easy ways'}, |         {'type': 'Any', 'id': u''}, | ||||||
|       ], |       ], | ||||||
|       'table_data': [ |       'table_data': [ | ||||||
|         [u'SINGLE MERGED', u'The End'], |         ['', u'SINGLE MERGED', u'The End'], | ||||||
|         [1637384.52, None], |         ['Start', '1637384.52', ''], | ||||||
|         [2444344.06, None], |         [None, 2444344.06, None], | ||||||
|         [2444344.06, None], |         [None, 2444344.06, None], | ||||||
|         [u'', u''], |         ['Seek no easy ways', u'', u''], | ||||||
|       ], |       ], | ||||||
|     }]) |     }]) | ||||||
| 
 | 
 | ||||||
|  | |||||||
| @ -19,7 +19,6 @@ jdcal==1.4.1 | |||||||
| json_table_schema==0.2.1 | json_table_schema==0.2.1 | ||||||
| lazy_object_proxy==1.6.0 | lazy_object_proxy==1.6.0 | ||||||
| lxml==4.6.3                # used in csv plugin only? | lxml==4.6.3                # used in csv plugin only? | ||||||
| messytables==0.15.2 |  | ||||||
| python_dateutil==2.8.2 | python_dateutil==2.8.2 | ||||||
| openpyxl==3.0.10 | openpyxl==3.0.10 | ||||||
| python_magic==0.4.12 | python_magic==0.4.12 | ||||||
|  | |||||||
		Loading…
	
		Reference in New Issue
	
	Block a user