-
Notifications
You must be signed in to change notification settings - Fork 137
/
Copy pathxls2json_backends.py
298 lines (266 loc) · 10.2 KB
/
xls2json_backends.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
# -*- coding: utf-8 -*-
"""
XLS-to-dict and csv-to-dict are essentially backends for xls2json.
"""
import csv
import datetime
import re
from collections import OrderedDict
from functools import reduce
from io import StringIO
import xlrd
from xlrd import XLRDError
from xlrd.xldate import XLDateAmbiguous
from pyxform import constants
from pyxform.errors import PyXFormError
XL_DATE_AMBIGOUS_MSG = (
"The xls file provided has an invalid date on the %s sheet, under"
" the %s column on row number %s"
)
def _list_to_dict_list(list_items):
"""
Takes a list and creates a dict with the list values as keys.
Returns a list of the created dict or an empty list
"""
if list_items:
k = OrderedDict()
for item in list_items:
k["%s" % item] = ""
return [k]
return []
def xls_to_dict(path_or_file):
"""
Return a Python dictionary with a key for each worksheet
name. For each sheet there is a list of dictionaries, each
dictionary corresponds to a single row in the worksheet. A
dictionary has keys taken from the column headers and values
equal to the cell value for that row and column.
All the keys and leaf elements are unicode text.
"""
try:
if isinstance(path_or_file, str):
workbook = xlrd.open_workbook(filename=path_or_file)
else:
workbook = xlrd.open_workbook(file_contents=path_or_file.read())
except XLRDError as error:
raise PyXFormError("Error reading .xls file: %s" % error)
def xls_to_dict_normal_sheet(sheet):
def iswhitespace(string):
return isinstance(string, str) and len(string.strip()) == 0
# Check for duplicate column headers
column_header_list = list()
for column in range(0, sheet.ncols):
column_header = sheet.cell_value(0, column)
if column_header in column_header_list:
raise PyXFormError("Duplicate column header: %s" % column_header)
# xls file with 3 columns mostly have a 3 more columns that are
# blank by default or something, skip during check
if column_header is not None:
if not iswhitespace(column_header):
# strip whitespaces from the header
clean_header = re.sub(r"( )+", " ", column_header.strip())
column_header_list.append(clean_header)
result = []
for row in range(1, sheet.nrows):
row_dict = OrderedDict()
for column in range(0, sheet.ncols):
# Changing to cell_value function
# convert to string, in case it is not string
key = "%s" % sheet.cell_value(0, column)
key = key.strip()
value = sheet.cell_value(row, column)
# remove whitespace at the beginning and end of value
if isinstance(value, str):
value = value.strip()
value_type = sheet.cell_type(row, column)
if value is not None:
if not iswhitespace(value):
try:
row_dict[key] = xls_value_to_unicode(
value, value_type, workbook.datemode
)
except XLDateAmbiguous:
raise PyXFormError(
XL_DATE_AMBIGOUS_MSG % (sheet.name, column_header, row)
)
# Taking this condition out so I can get accurate row numbers.
# TODO: Do the same for csvs
# if row_dict != {}:
result.append(row_dict)
return result, _list_to_dict_list(column_header_list)
def xls_value_from_sheet(sheet, row, column):
value = sheet.cell_value(row, column)
value_type = sheet.cell_type(row, column)
if value is not None and value != "":
try:
return xls_value_to_unicode(value, value_type, workbook.datemode)
except XLDateAmbiguous:
raise PyXFormError(XL_DATE_AMBIGOUS_MSG % (sheet.name, column, row))
else:
raise PyXFormError("Empty Value")
result = OrderedDict()
for sheet in workbook.sheets():
# Note that the sheet exists but do no further processing here.
result[sheet.name] = []
# Do not process sheets that have nothing to do with XLSForm.
if sheet.name not in constants.SUPPORTED_SHEET_NAMES:
if len(workbook.sheets()) == 1:
(
result[constants.SURVEY],
result["%s_header" % constants.SURVEY],
) = xls_to_dict_normal_sheet(sheet)
else:
continue
else:
(
result[sheet.name],
result["%s_header" % sheet.name],
) = xls_to_dict_normal_sheet(sheet)
return result
def xls_value_to_unicode(value, value_type, datemode):
"""
Take a xls formatted value and try to make a unicode string
representation.
"""
if value_type == xlrd.XL_CELL_BOOLEAN:
return "TRUE" if value else "FALSE"
elif value_type == xlrd.XL_CELL_NUMBER:
# Try to display as an int if possible.
int_value = int(value)
if int_value == value:
return str(int_value)
else:
return str(value)
elif value_type is xlrd.XL_CELL_DATE:
# Warn that it is better to single quote as a string.
# error_location = cellFormatString % (ss_row_idx, ss_col_idx)
# raise Exception(
# "Cannot handle excel formatted date at " + error_location)
datetime_or_time_only = xlrd.xldate_as_tuple(value, datemode)
if datetime_or_time_only[:3] == (0, 0, 0):
# must be time only
return str(datetime.time(*datetime_or_time_only[3:]))
return str(datetime.datetime(*datetime_or_time_only))
else:
# ensure unicode and replace nbsp spaces with normal ones
# to avoid this issue:
# https://github.com/modilabs/pyxform/issues/83
return str(value).replace(chr(160), " ")
def get_cascading_json(sheet_list, prefix, level):
return_list = []
for row in sheet_list:
if "stopper" in row:
if row["stopper"] == level:
# last element's name IS the prefix; doesn't need level
return_list[-1]["name"] = prefix
return return_list
else:
continue
elif "lambda" in row:
def replace_prefix(d, prefix):
for k, v in d.items():
if isinstance(v, str):
d[k] = v.replace("$PREFIX$", prefix)
elif isinstance(v, dict):
d[k] = replace_prefix(v, prefix)
elif isinstance(v, list):
d[k] = map(lambda x: replace_prefix(x, prefix), v)
return d
return_list.append(replace_prefix(row["lambda"], prefix))
raise PyXFormError(
"Found a cascading_select "
+ level
+ ", but could not find "
+ level
+ "in cascades sheet."
)
def csv_to_dict(path_or_file):
if isinstance(path_or_file, str):
csv_data = open(path_or_file, "r", encoding="utf-8", newline="")
else:
csv_data = path_or_file
_dict = OrderedDict()
def first_column_as_sheet_name(row):
if len(row) == 0:
return None, None
elif len(row) == 1:
return row[0], None
else:
s_or_c = row[0]
content = row[1:]
if s_or_c == "":
s_or_c = None
# concatenate all the strings in content
if reduce(lambda x, y: x + y, content) == "":
# content is a list of empty strings
content = None
return s_or_c, content
reader = csv.reader(csv_data)
sheet_name = None
current_headers = None
for row in reader:
survey_or_choices, content = first_column_as_sheet_name(row)
if survey_or_choices is not None:
sheet_name = survey_or_choices
if sheet_name not in _dict:
_dict[str(sheet_name)] = []
current_headers = None
if content is not None:
if current_headers is None:
current_headers = content
_dict["%s_header" % sheet_name] = _list_to_dict_list(current_headers)
else:
_d = OrderedDict()
for key, val in zip(current_headers, content):
if val != "":
# Slight modification so values are striped
# this is because csvs often spaces following commas
# (but the csv reader might already handle that.)
_d[str(key)] = str(val.strip())
_dict[sheet_name].append(_d)
csv_data.close()
return _dict
"""
I want the ability to go:
xls => csv
so that we can go:
xls => csv => survey
and some day:
csv => xls
"""
def convert_file_to_csv_string(path):
"""
This will open a csv or xls file and return a CSV in the format:
sheet_name1
,col1,col2
,r1c1,r1c2
,r2c1,r2c2
sheet_name2
,col1,col2
,r1c1,r1c2
,r2c1,r2c2
Currently, it processes csv files and xls files to ensure consistent
csv delimiters, etc. for tests.
"""
if path.endswith(".csv"):
imported_sheets = csv_to_dict(path)
else:
imported_sheets = xls_to_dict(path)
foo = StringIO(newline="")
writer = csv.writer(foo, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
for sheet_name, rows in imported_sheets.items():
writer.writerow([sheet_name])
out_keys = []
out_rows = []
for row in rows:
out_row = []
for key in row.keys():
if key not in out_keys:
out_keys.append(key)
for out_key in out_keys:
out_row.append(row.get(out_key, None))
out_rows.append(out_row)
writer.writerow([None] + out_keys)
for out_row in out_rows:
writer.writerow([None] + out_row)
return foo.getvalue()