From a4621a24e746333378109839aadf49b3e7ae4ba0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niccol=C3=B2=20Cant=C3=B9?= Date: Fri, 23 Feb 2024 09:30:50 +0100 Subject: [PATCH] handle binary excels --- wizard/parsers/parser.py | 46 ++++++++++++++++++++++------- wizard/parsers/parser_base.py | 30 ++++++++++++++++++- wizard/parsers/parser_excel.py | 33 +++++++++++++++++++++ wizard/parsers/tests/parser_test.py | 4 +++ 4 files changed, 102 insertions(+), 11 deletions(-) create mode 100644 wizard/parsers/parser_excel.py diff --git a/wizard/parsers/parser.py b/wizard/parsers/parser.py index 4a2990e..edfc314 100644 --- a/wizard/parsers/parser.py +++ b/wizard/parsers/parser.py @@ -2,12 +2,13 @@ import logging from chardet.universaldetector import UniversalDetector -from .parser_base import Parser +from .parser_base import Parser, ParserNotSupported from .parser_gps import PARSERS as GPS_PARSERS from .parser_accelerometer import PARSERS as ACCELEROMETER_PARSERS from .parser_tdr import PARSERS as TDR_PARSERS from .parser_gpx import GPXParser from .parser_pathtrack import PathtrackParser, PathtrackParserNoUnknown +from .parser_excel import GPSUnknownFormatExcelParser available_parsers = [ PathtrackParser, @@ -15,23 +16,48 @@ GPXParser, ] + GPS_PARSERS + ACCELEROMETER_PARSERS + TDR_PARSERS +binary_parsers = [ + GPSUnknownFormatExcelParser, +] + def detect(stream) -> Parser: - for parser in available_parsers: - try: - stream.seek(0) - return parser(stream) - except: - logging.warning(traceback.format_exc()) + if 'b' in stream.mode: + for parser in binary_parsers: + try: + stream.seek(0) + return parser(stream) + except ParserNotSupported: + logging.warning('Expected: ' + traceback.format_exc()) + except: + logging.error(traceback.format_exc()) + else: + for parser in available_parsers: + try: + stream.seek(0) + return parser(stream) + except ParserNotSupported: + logging.debug('Expected: ' + traceback.format_exc()) + except: + logging.error(traceback.format_exc()) raise NotImplementedError("File not supported") def detect_file(path): encoding = detect_encoding(path) - print(encoding) - with open(path, encoding=encoding) as stream: - return detect(stream) + if encoding: + print('here...') + with open(path, 'r', encoding=encoding) as stream: + return detect(stream) + else: + try: + with open(path, 'r') as stream: + stream.read() + return detect(stream) + except UnicodeDecodeError: + with open(path, 'rb') as stream: + return detect(stream) def detect_encoding(path): diff --git a/wizard/parsers/parser_base.py b/wizard/parsers/parser_base.py index 3375695..9a5161d 100644 --- a/wizard/parsers/parser_base.py +++ b/wizard/parsers/parser_base.py @@ -5,7 +5,7 @@ import pyarrow as pa import pyarrow.parquet as pq import pyarrow.csv as pacsw - +import logging MAX_SPEED = float(os.environ.get('MAX_SPEED', default="10")) @@ -88,6 +88,10 @@ class CSVParser(Parser): def __init__(self, stream): super().__init__(stream) + + if 'b' in self.stream.mode: + self._raise_not_supported('Stream is binary') + if not self.stream.seekable(): self._raise_not_supported('Stream not seekable') @@ -98,3 +102,27 @@ def __init__(self, stream): self.stream.seek(0) self.data = pd.read_csv(self.stream, header=1, names=self.FIELDS, sep=self.SEPARATOR, index_col=False) + + +class ExcelParser(Parser): + DATATYPE = "generic_excel" + FIELDS = [] + SKIPROWS = 0 + + def __init__(self, stream): + super().__init__(stream) + + if not 'b' in self.stream.mode: + self._raise_not_supported('Stream is not binary') + + if 'xls' not in pathlib.Path(self.stream.name).suffix: + self._raise_not_supported('Extension is not xls') + + self.data = pd.read_excel(self.stream, header=0, index_col=False, skiprows=self.SKIPROWS) + if set(self.data.columns.values) != set(self.FIELDS): + self._raise_not_supported(f'Field name not matching: { + str({ + "missing": list(set(self.data.columns.values) - set(self.FIELDS)), + "extra": list(set(self.FIELDS) - set(self.data.columns.values)), + }) + }') diff --git a/wizard/parsers/parser_excel.py b/wizard/parsers/parser_excel.py new file mode 100644 index 0000000..bff05be --- /dev/null +++ b/wizard/parsers/parser_excel.py @@ -0,0 +1,33 @@ +from .parser_base import ExcelParser + + +class GPSUnknownFormatExcelParser(ExcelParser): + ''' + Parser for a format, its a GPS excel like format + with the following fields + ''' + DATATYPE = "gps_unknown" + FIELDS = [ +"ID","DateTime","Date","Time","Altitude","Speed","Type","Course","Distance","Latitude","Longitude","Tripnr","DistAdj","DistMax"] + + MAPPINGS = { + "id": "ID", + "date": "Date", + "time": "Time", + "latitude": "Latitude", + "longitude": "Longitude", + "altitude": "Altitude", + "speed_km_h": "Speed", + "type": "Type", + "distance": "Distance", + "course": "Course", + "hdop": None, + "pdop": None, + "satellites_count": None, + "temperature": None, + "solar_I_mA": None, + "bat_soc_pct": None, + "ring_nr": None, + "trip_nr": "Tripnr", + } + diff --git a/wizard/parsers/tests/parser_test.py b/wizard/parsers/tests/parser_test.py index 0cfe8c9..0180bd3 100644 --- a/wizard/parsers/tests/parser_test.py +++ b/wizard/parsers/tests/parser_test.py @@ -11,6 +11,10 @@ ] IGNORED_DIRS = [ # 'gps_gpx', + # 'gps_cattrack', + # 'gps_igotugl', + # 'gps_unknown', + # 'gps_2jm', # 'accelerometer', # 'gps_pathtrack', # 'tdr',