# This Python file uses the following encoding: utf-8 import calendar import datetime import math import os import unittest from imports import import_xls def _get_fixture(filename): return [os.path.join(os.path.dirname(__file__), "fixtures", filename)] class TestImportXLS(unittest.TestCase): def _check_col(self, sheet, index, name, typename, values): self.assertEqual(sheet["column_metadata"][index]["id"], name) self.assertEqual(sheet["column_metadata"][index]["type"], typename) if typename == "Any": # Convert values to strings to reduce changes to tests after imports were overhauled. values = [str(v) for v in values] self.assertEqual(sheet["table_data"][index], values) def test_excel(self): parsed_file = import_xls.parse_file(*_get_fixture('test_excel.xlsx')) # check that column type was correctly set to numeric and values are properly parsed self.assertEqual(parsed_file[1][0]["column_metadata"][0], {"type": "Numeric", "id": "numbers"}) self.assertEqual(parsed_file[1][0]["table_data"][0], [1, 2, 3, 4, 5, 6, 7, 8]) # check that column type was correctly set to text and values are properly parsed self.assertEqual(parsed_file[1][0]["column_metadata"][1], {"type": "Any", "id": "letters"}) self.assertEqual(parsed_file[1][0]["table_data"][1], ["a", "b", "c", "d", "e", "f", "g", "h"]) # 0s and 1s become Numeric, not boolean like in the past self.assertEqual(parsed_file[1][0]["column_metadata"][2], {"type": "Numeric", "id": "boolean"}) self.assertEqual(parsed_file[1][0]["table_data"][2], [1, 0, 1, 0, 1, 0, 1, 0]) # check that column type was correctly set to text and values are properly parsed self.assertEqual(parsed_file[1][0]["column_metadata"][3], {"type": "Any", "id": "corner-cases"}) self.assertEqual(parsed_file[1][0]["table_data"][3], # The type is detected as text, so all values should be text. [u'=function()', u'3', u'two spaces after ', u' two spaces before', u'!@#$', u'€€€', u'√∫abc$$', u'line\nbreak']) # check that multiple tables are created when there are multiple sheets in a document self.assertEqual(parsed_file[1][0]["table_name"], u"Sheet1") self.assertEqual(parsed_file[1][1]["table_name"], u"Sheet2") self.assertEqual(parsed_file[1][1]["table_data"][0], ["a", "b", "c", "d"]) def test_excel_types(self): parsed_file = import_xls.parse_file(*_get_fixture('test_excel_types.xlsx')) sheet = parsed_file[1][0] self._check_col(sheet, 0, "int1", "Numeric", [-1234123, None, None]) self._check_col(sheet, 1, "int2", "Numeric", [5, None, None]) self._check_col(sheet, 2, "textint", "Any", ["12345678902345689", '', '']) self._check_col(sheet, 3, "bigint", "Any", ["320150170634561830", '', '']) self._check_col(sheet, 4, "num2", "Numeric", [123456789.123456, None, None]) self._check_col(sheet, 5, "bignum", "Numeric", [math.exp(200), None, None]) self._check_col(sheet, 6, "date1", "DateTime", [calendar.timegm(datetime.datetime(2015, 12, 22, 11, 59, 00).timetuple()), None, None]) self._check_col(sheet, 7, "date2", "Date", [calendar.timegm(datetime.datetime(2015, 12, 20, 0, 0, 0).timetuple()), None, None]) self._check_col(sheet, 8, "datetext", "Any", ['12/22/2015', '', '']) self._check_col(sheet, 9, "datetimetext", "Any", [u'12/22/2015', u'12/22/2015 1:15pm', u'2018-02-27 16:08:39 +0000']) def test_excel_type_detection(self): # This tests goes over the second sheet of the fixture doc, which has multiple rows that try # to throw off the type detection. parsed_file = import_xls.parse_file(*_get_fixture('test_excel_types.xlsx')) sheet = parsed_file[1][1] self._check_col(sheet, 0, "date_with_other", "DateTime", [1467676800.0, 1451606400.0, 1451692800.0, 1454544000.0, 1199577600.0, 1467732614.0, u'n/a', 1207958400.0, 1451865600.0, 1451952000.0, None, 1452038400.0, 1451549340.0, 1483214940.0, None, 1454544000.0, 1199577600.0, 1451692800.0, 1451549340.0, 1483214940.0]) self._check_col(sheet, 1, "float_not_int", "Numeric", [1,2,3,4,5,None,6,7,8,9,10,10.25,11,12,13,14,15,16,17,18]) self._check_col(sheet, 2, "int_not_bool", "Any", [0, 0, 1, 0, 1, 0, 0, 1, 0, 2, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0]) self._check_col(sheet, 3, "float_not_bool", "Any", [0, 0, 1, 0, 1, 0, 0, 1, 0, 0.5, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0]) self._check_col(sheet, 4, "text_as_bool", "Any", [0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0]) self._check_col(sheet, 5, "int_as_bool", "Numeric", [0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0]) self._check_col(sheet, 6, "float_not_date", "Any", [4.0, 6.0, 4.0, 4.0, 6.0, 4.0, '--', 6.0, 4.0, 4.0, 4.0, 4.0, 4.0, 6.0, 6.0, 4.0, 6.0, '3-4', 4.0, 6.5]) self._check_col(sheet, 7, "float_not_text", "Numeric", [-10.25, -8.00, -5.75, -3.50, "n/a", ' 1. ', " ??? ", 5.50, None, "-", 12.25, 0.00, None, 0.00, "--", 23.50, "NA", 28.00, 30.25, 32.50]) def test_excel_single_merged_cell(self): # An older version had a bug where a single cell marked as 'merged' would cause an exception. parsed_file = import_xls.parse_file(*_get_fixture('test_single_merged_cell.xlsx')) tables = parsed_file[1] self.assertEqual(tables, [{ 'table_name': u'Transaction Report', 'column_metadata': [ {'type': 'Any', 'id': u''}, {'type': 'Numeric', 'id': u'Start'}, {'type': 'Numeric', 'id': u''}, {'type': 'Numeric', 'id': u''}, {'type': 'Any', 'id': u'Seek no easy ways'}, ], 'table_data': [ [u'SINGLE MERGED', u'The End'], [1637384.52, None], [2444344.06, None], [2444344.06, None], [u'', u''], ], }]) def test_excel_strange_dates(self): # Check that we don't fail when encountering unusual dates and times (e.g. 0 or 38:00:00). parsed_file = import_xls.parse_file(*_get_fixture('strange_dates.xlsx')) tables = parsed_file[1] # We test non-failure, but the result is not really what we want. E.g. "1:10" and "100:20:30" # would be best left as text. self.assertEqual(tables, [{ 'table_name': u'Sheet1', 'column_metadata': [ {'id': 'a', 'type': 'Any'}, {'id': 'b', 'type': 'Date'}, {'id': 'c', 'type': 'Any'}, {'id': 'd', 'type': 'Any'}, {'id': 'e', 'type': 'DateTime'}, {'id': 'f', 'type': 'Date'}, {'id': 'g', 'type': 'Any'}, {'id': 'h', 'type': 'Date'}, {'id': 'i', 'type': 'Date'}, ], 'table_data': [ [u'21:14:00'], [1568851200.0], [u'01:10:00'], [u'10:20:30'], [-2208713970.0], [-2207347200.0], [u'7/4/1776'], [205286400.0], [-2209161600.0], ], }]) def test_empty_rows(self): # Check that empty rows aren't imported, # and that files with lots of empty rows are imported quickly. # The fixture file is mostly empty but has data in the last row, # with over a million empty rows in between. parsed_file = import_xls.parse_file(*_get_fixture('test_empty_rows.xlsx')) tables = parsed_file[1] self.assertEqual(tables, [{ 'table_name': u'Sheet1', 'column_metadata': [ {'id': 'a', 'type': 'Numeric'}, {'id': 'b', 'type': 'Numeric'}, ], 'table_data': [ [0, None, 1], [None, 0, 2], ], }]) if __name__ == '__main__': unittest.main()