From a4621a24e746333378109839aadf49b3e7ae4ba0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Niccol=C3=B2=20Cant=C3=B9?= <niccolo.cantu@nina.no>
Date: Fri, 23 Feb 2024 09:30:50 +0100
Subject: [PATCH] Handle binary Excel files

---
 wizard/parsers/parser.py            | 46 ++++++++++++++++++++++-------
 wizard/parsers/parser_base.py       | 30 ++++++++++++++++++-
 wizard/parsers/parser_excel.py      | 33 +++++++++++++++++++++
 wizard/parsers/tests/parser_test.py |  4 +++
 4 files changed, 102 insertions(+), 11 deletions(-)
 create mode 100644 wizard/parsers/parser_excel.py

diff --git a/wizard/parsers/parser.py b/wizard/parsers/parser.py
index 4a2990e..edfc314 100644
--- a/wizard/parsers/parser.py
+++ b/wizard/parsers/parser.py
@@ -2,12 +2,13 @@
 import logging
 from chardet.universaldetector import UniversalDetector
 
-from .parser_base import Parser
+from .parser_base import Parser, ParserNotSupported
 from .parser_gps import PARSERS as GPS_PARSERS
 from .parser_accelerometer import PARSERS as ACCELEROMETER_PARSERS
 from .parser_tdr import PARSERS as TDR_PARSERS
 from .parser_gpx import GPXParser
 from .parser_pathtrack import PathtrackParser, PathtrackParserNoUnknown
+from .parser_excel import GPSUnknownFormatExcelParser
 
 available_parsers = [
     PathtrackParser,
@@ -15,23 +16,48 @@
     GPXParser,
 ] + GPS_PARSERS + ACCELEROMETER_PARSERS + TDR_PARSERS
 
+binary_parsers = [
+    GPSUnknownFormatExcelParser,
+]
+
 
 def detect(stream) -> Parser:
-    for parser in available_parsers:
-        try:
-            stream.seek(0)
-            return parser(stream)
-        except:
-            logging.warning(traceback.format_exc())
+    if 'b' in stream.mode:  # binary streams are only offered to binary_parsers
+        for parser in binary_parsers:
+            try:
+                stream.seek(0)  # every candidate parser starts from the beginning
+                return parser(stream)
+            except ParserNotSupported:
+                logging.warning('Expected: ' + traceback.format_exc())
+            except Exception:  # unexpected failure: log it, then try the next parser
+                logging.error(traceback.format_exc())
+    else:
+        for parser in available_parsers:
+            try:
+                stream.seek(0)  # every candidate parser starts from the beginning
+                return parser(stream)
+            except ParserNotSupported:
+                logging.debug('Expected: ' + traceback.format_exc())
+            except Exception:  # not bare: KeyboardInterrupt/SystemExit must propagate
+                logging.error(traceback.format_exc())
     
     raise NotImplementedError("File not supported")
 
 
 def detect_file(path):
     encoding = detect_encoding(path)
-    print(encoding)
-    with open(path, encoding=encoding) as stream:
-        return detect(stream)
+    if encoding:
+        logging.debug('Detected encoding: %s', encoding)
+        with open(path, 'r', encoding=encoding) as stream:
+            return detect(stream)
+    else:
+        try:
+            with open(path, 'r') as stream:  # no encoding detected: try the default one
+                stream.read()  # decode the whole file so undecodable content fails here
+                return detect(stream)
+        except UnicodeDecodeError:
+            with open(path, 'rb') as stream:  # not decodable as text: retry as binary
+                return detect(stream)
 
 
 def detect_encoding(path):
diff --git a/wizard/parsers/parser_base.py b/wizard/parsers/parser_base.py
index 3375695..9a5161d 100644
--- a/wizard/parsers/parser_base.py
+++ b/wizard/parsers/parser_base.py
@@ -5,7 +5,7 @@
 import pyarrow as pa
 import pyarrow.parquet as pq
 import pyarrow.csv as pacsw
-
+import logging
 
 MAX_SPEED = float(os.environ.get('MAX_SPEED', default="10"))
 
@@ -88,6 +88,10 @@ class CSVParser(Parser):
 
     def __init__(self, stream):
         super().__init__(stream)
+
+        if 'b' in self.stream.mode:
+            self._raise_not_supported('Stream is binary')
+
         if not self.stream.seekable():
             self._raise_not_supported('Stream not seekable')
 
@@ -98,3 +102,27 @@ def __init__(self, stream):
 
         self.stream.seek(0)
         self.data = pd.read_csv(self.stream, header=1, names=self.FIELDS, sep=self.SEPARATOR, index_col=False)
+
+
+class ExcelParser(Parser):
+    '''Base parser for Excel workbooks; subclasses list the expected column names in FIELDS.'''
+    DATATYPE = "generic_excel"
+    FIELDS = []
+    SKIPROWS = 0
+
+    def __init__(self, stream):
+        super().__init__(stream)
+        # Only binary-mode streams are handled by this parser family.
+        if 'b' not in self.stream.mode:
+            self._raise_not_supported('Stream is not binary')
+        # Substring match accepts any suffix containing "xls" (.xls, .xlsx, ...), in any case.
+        if 'xls' not in pathlib.Path(self.stream.name).suffix.lower():
+            self._raise_not_supported('Extension is not xls')
+
+        self.data = pd.read_excel(self.stream, header=0, index_col=False, skiprows=self.SKIPROWS)
+        expected, found = set(self.FIELDS), set(self.data.columns.values)
+        if found != expected:
+            # "missing" = expected but absent from the sheet; "extra" = present but unexpected.
+            mismatch = {"missing": list(expected - found), "extra": list(found - expected)}
+            # Built outside the f-string: nested multi-line replacement fields need Python 3.12+.
+            self._raise_not_supported(f'Field name not matching: {mismatch}')
diff --git a/wizard/parsers/parser_excel.py b/wizard/parsers/parser_excel.py
new file mode 100644
index 0000000..bff05be
--- /dev/null
+++ b/wizard/parsers/parser_excel.py
@@ -0,0 +1,33 @@
+from .parser_base import ExcelParser
+
+
+class GPSUnknownFormatExcelParser(ExcelParser):
+    '''Excel GPS export of unidentified origin, recognised by its column names.'''
+    DATATYPE = "gps_unknown"
+    FIELDS = [
+        "ID", "DateTime", "Date", "Time", "Altitude", "Speed", "Type", "Course",
+        "Distance", "Latitude", "Longitude", "Tripnr", "DistAdj", "DistMax",
+    ]
+
+    # Key -> column name from FIELDS; None where no column is mapped (consumer not shown here).
+    MAPPINGS = {
+        "id": "ID",
+        "date": "Date",
+        "time": "Time",
+        "latitude": "Latitude",
+        "longitude": "Longitude",
+        "altitude": "Altitude",
+        "speed_km_h": "Speed",
+        "type": "Type",
+        "distance": "Distance",
+        "course": "Course",
+        "hdop": None,
+        "pdop": None,
+        "satellites_count": None,
+        "temperature": None,
+        "solar_I_mA": None,
+        "bat_soc_pct": None,
+        "ring_nr": None,
+        "trip_nr": "Tripnr",
+    }
+
diff --git a/wizard/parsers/tests/parser_test.py b/wizard/parsers/tests/parser_test.py
index 0cfe8c9..0180bd3 100644
--- a/wizard/parsers/tests/parser_test.py
+++ b/wizard/parsers/tests/parser_test.py
@@ -11,6 +11,10 @@
 ]
 IGNORED_DIRS = [
     # 'gps_gpx',
+    # 'gps_cattrack',
+    # 'gps_igotugl',
+    # 'gps_unknown',
+    # 'gps_2jm',
     # 'accelerometer',
     # 'gps_pathtrack',
     # 'tdr',