Skip to content

Commit

Permalink
handle binary excels
Browse files Browse the repository at this point in the history
  • Loading branch information
nicokant committed Feb 23, 2024
1 parent 3a51eac commit a4621a2
Show file tree
Hide file tree
Showing 4 changed files with 102 additions and 11 deletions.
46 changes: 36 additions & 10 deletions wizard/parsers/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,36 +2,62 @@
import logging
from chardet.universaldetector import UniversalDetector

from .parser_base import Parser
from .parser_base import Parser, ParserNotSupported
from .parser_gps import PARSERS as GPS_PARSERS
from .parser_accelerometer import PARSERS as ACCELEROMETER_PARSERS
from .parser_tdr import PARSERS as TDR_PARSERS
from .parser_gpx import GPXParser
from .parser_pathtrack import PathtrackParser, PathtrackParserNoUnknown
from .parser_excel import GPSUnknownFormatExcelParser

# Parsers for text-mode streams. detect() tries these, in list order, on
# streams that are not opened in binary mode.
available_parsers = [
PathtrackParser,
PathtrackParserNoUnknown,
GPXParser,
] + GPS_PARSERS + ACCELEROMETER_PARSERS + TDR_PARSERS

# Parsers that need a binary-mode stream. detect() tries these, in list
# order, when 'b' is present in stream.mode.
binary_parsers = [
GPSUnknownFormatExcelParser,
]


def detect(stream) -> Parser:
    """Return the first parser that accepts ``stream``.

    Streams opened in binary mode (``'b'`` in ``stream.mode``) are offered to
    ``binary_parsers``; every other stream is offered to ``available_parsers``.
    Candidates are tried in list order and the stream is rewound to offset 0
    before each attempt.

    Args:
        stream: An open, seekable file-like object.

    Returns:
        The object produced by the first parser whose constructor succeeds.

    Raises:
        NotImplementedError: If every candidate parser rejects the stream.
    """
    # Streams without a ``mode`` attribute are treated as text instead of
    # failing with AttributeError.
    is_binary = 'b' in getattr(stream, 'mode', '')
    candidates = binary_parsers if is_binary else available_parsers

    for parser in candidates:
        try:
            stream.seek(0)
            return parser(stream)
        except ParserNotSupported:
            # Expected rejection: this parser does not match the file, so it
            # is logged at debug level for both text and binary streams.
            logging.debug('Expected: ' + traceback.format_exc())
        except Exception:
            # Unexpected parser failure: record it and try the next parser.
            # ``Exception`` (not a bare ``except``) lets KeyboardInterrupt
            # and SystemExit propagate.
            logging.error(traceback.format_exc())

    raise NotImplementedError("File not supported")


def detect_file(path):
    """Open the file at ``path`` and return the parser that accepts it.

    The file is opened in one of three ways:

    1. If ``detect_encoding(path)`` returns a truthy value, the file is
       opened as text with that encoding.
    2. Otherwise it is opened as text with the default encoding and read
       once in full to check that it decodes.
    3. If that read raises ``UnicodeDecodeError``, the file is reopened in
       binary mode so that the binary parsers can be tried.

    Args:
        path: Path of the file to inspect.

    Returns:
        Whatever ``detect`` returns for the opened stream.

    Raises:
        NotImplementedError: Propagated from ``detect`` when no parser
            accepts the file.
    """
    encoding = detect_encoding(path)
    if encoding:
        with open(path, 'r', encoding=encoding) as stream:
            return detect(stream)

    try:
        with open(path, 'r') as stream:
            # Decode the whole file up front: undecodable content raises
            # UnicodeDecodeError here and routes the file to binary mode.
            stream.read()
            return detect(stream)
    except UnicodeDecodeError:
        with open(path, 'rb') as stream:
            return detect(stream)


def detect_encoding(path):
Expand Down
30 changes: 29 additions & 1 deletion wizard/parsers/parser_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.csv as pacsw

import logging

MAX_SPEED = float(os.environ.get('MAX_SPEED', default="10"))

Expand Down Expand Up @@ -88,6 +88,10 @@ class CSVParser(Parser):

def __init__(self, stream):
super().__init__(stream)

if 'b' in self.stream.mode:
self._raise_not_supported('Stream is binary')

if not self.stream.seekable():
self._raise_not_supported('Stream not seekable')

Expand All @@ -98,3 +102,27 @@ def __init__(self, stream):

self.stream.seek(0)
self.data = pd.read_csv(self.stream, header=1, names=self.FIELDS, sep=self.SEPARATOR, index_col=False)


class ExcelParser(Parser):
    """Base parser for spreadsheets loaded with ``pandas.read_excel``.

    Subclasses list the accepted column names in ``FIELDS``. The constructor
    reports the stream as not supported (via ``_raise_not_supported``) when
    it is not binary, when its file name has no ``xls``-style suffix, or when
    the sheet's columns are not exactly the names in ``FIELDS``.
    """
    DATATYPE = "generic_excel"
    # Column names the sheet must contain; compared as a set, so order is ignored.
    FIELDS = []
    # Passed to ``pandas.read_excel`` as ``skiprows``.
    SKIPROWS = 0

    def __init__(self, stream):
        """Load ``stream`` into ``self.data`` and validate its columns.

        Args:
            stream: A binary-mode file object exposing ``mode`` and ``name``.
        """
        super().__init__(stream)

        if 'b' not in self.stream.mode:
            self._raise_not_supported('Stream is not binary')

        # Substring test, so '.xls', '.xlsx', '.xlsm', ... are all accepted.
        if 'xls' not in pathlib.Path(self.stream.name).suffix:
            self._raise_not_supported('Extension is not xls')

        self.data = pd.read_excel(self.stream, header=0, index_col=False, skiprows=self.SKIPROWS)

        found = set(self.data.columns.values)
        expected = set(self.FIELDS)
        if found != expected:
            # "missing" = expected columns absent from the sheet;
            # "extra" = sheet columns that are not expected.
            # The dict is built outside the f-string so the module still
            # parses on Python versions older than 3.12.
            mismatch = {
                "missing": sorted(expected - found, key=str),
                "extra": sorted(found - expected, key=str),
            }
            self._raise_not_supported(f'Field name not matching: {mismatch}')
33 changes: 33 additions & 0 deletions wizard/parsers/parser_excel.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
from .parser_base import ExcelParser


class GPSUnknownFormatExcelParser(ExcelParser):
    """
    Excel parser for a GPS export whose originating format is unknown.

    FIELDS lists the column names of that format. MAPPINGS pairs each
    lower-case key with one of those column names, or with None
    (presumably meaning the format has no such column; confirm against
    the code that consumes MAPPINGS).
    """

    DATATYPE = "gps_unknown"

    FIELDS = [
        "ID",
        "DateTime",
        "Date",
        "Time",
        "Altitude",
        "Speed",
        "Type",
        "Course",
        "Distance",
        "Latitude",
        "Longitude",
        "Tripnr",
        "DistAdj",
        "DistMax",
    ]

    MAPPINGS = {
        # Keys backed by a column of the sheet.
        "id": "ID",
        "date": "Date",
        "time": "Time",
        "latitude": "Latitude",
        "longitude": "Longitude",
        "altitude": "Altitude",
        "speed_km_h": "Speed",
        "type": "Type",
        "distance": "Distance",
        "course": "Course",
        # Keys with no column assigned.
        "hdop": None,
        "pdop": None,
        "satellites_count": None,
        "temperature": None,
        "solar_I_mA": None,
        "bat_soc_pct": None,
        "ring_nr": None,
        # Trip number column.
        "trip_nr": "Tripnr",
    }

4 changes: 4 additions & 0 deletions wizard/parsers/tests/parser_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@
]
IGNORED_DIRS = [
# 'gps_gpx',
# 'gps_cattrack',
# 'gps_igotugl',
# 'gps_unknown',
# 'gps_2jm',
# 'accelerometer',
# 'gps_pathtrack',
# 'tdr',
Expand Down

0 comments on commit a4621a2

Please sign in to comment.