-
Notifications
You must be signed in to change notification settings - Fork 14
/
Copy pathODSReader.py
250 lines (218 loc) · 10.1 KB
/
ODSReader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
# Copyright 2011 Marco Conti
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Thanks to grt for the fixes
import odf.opendocument
from odf.table import Table, TableRow, TableCell
from odf.text import P
# http://stackoverflow.com/a/4544699/1846474
class GrowingList(list):
    '''A list that pads itself with None when an item is assigned past its end.

    Assigning to an index at or beyond the current length first extends the
    list with None up to that index, then stores the value.  Every other
    assignment (in-range index, negative index, slice) behaves exactly like
    a plain list.'''

    def __setitem__(self, index, value):
        # Slices cannot be compared with an int on Python 3, so they must
        # bypass the growth check and go straight to list's own handling.
        if not isinstance(index, slice) and index >= len(self):
            self.extend([None] * (index + 1 - len(self)))
        super().__setitem__(index, value)
class ODSReader:
    '''Loads an OpenDocument spreadsheet and keeps every sheet as rows of text.

    After construction, SHEETS maps each sheet name to a list of rows; each
    row is a GrowingList holding the text of its cells (None where a column
    was skipped before a later populated cell).'''

    # loads the file
    def __init__(self, file, clonespannedcolumns=None):
        '''Loads file with odf.opendocument.load and reads all of its tables.

        file is passed unchanged to odf.opendocument.load.
        clonespannedcolumns: any value other than None makes a cell that
        spans several columns repeat its text in each spanned column.'''
        self.clonespannedcolumns = clonespannedcolumns
        self.doc = odf.opendocument.load(file)
        self.SHEETS = {}
        for sheet in self.doc.spreadsheet.getElementsByType(Table):
            self.readSheet(sheet)

    @staticmethod
    def _cell_text(cell):
        '''Returns the concatenated text of all paragraphs in cell.'''
        parts = []
        for p in cell.getElementsByType(P):
            for n in p.childNodes:
                # nodeType 1 is an element node, 3 is a text node.
                if n.nodeType == 1 and n.tagName in ("text:span", "text:a"):
                    # Only text sitting directly inside the span/link is
                    # collected; deeper nesting is not descended into.
                    parts.extend(c.data for c in n.childNodes if c.nodeType == 3)
                elif n.nodeType == 3:
                    parts.append(n.data)
        return "".join(parts)

    # reads a sheet in the sheet dictionary, storing each sheet as an
    # array (rows) of arrays (columns)
    def readSheet(self, sheet):
        '''Reads one table element and stores its rows in SHEETS under its name.

        Empty cells advance the column position without storing anything.
        Cells whose text starts with "#" are treated as comments: they are
        not stored and do not advance the column position.  Rows that end
        up with no stored cell are left out.'''
        name = sheet.getAttribute("name")
        arrRows = []

        for row in sheet.getElementsByType(TableRow):
            arrCells = GrowingList()
            count = 0  # column position of the next cell to store

            for cell in row.getElementsByType(TableCell):
                # A missing repeat attribute means the cell occurs once.
                repeat = int(cell.getAttribute("numbercolumnsrepeated") or 1)
                spanned = int(cell.getAttribute("numbercolumnsspanned") or 0)

                # clone spanned cells
                if self.clonespannedcolumns is not None and spanned > 1:
                    repeat = spanned

                textContent = self._cell_text(cell)

                if not textContent:
                    count += repeat
                elif textContent[0] != "#":  # ignore comments cells
                    for _ in range(repeat):
                        arrCells[count] = textContent
                        count += 1

            # if row contained something
            if arrCells:
                arrRows.append(arrCells)

        self.SHEETS[name] = arrRows

    # returns a sheet as an array (rows) of arrays (columns)
    def getSheet(self, name):
        '''Returns the rows stored for sheet name; raises KeyError if unknown.'''
        return self.SHEETS[name]
def keyval_sheet_to_dict(sheet, sheetname, funcs=None):
    '''For a sheet with rows of 1 key and 1 value, returns a dictionary.
    sheet is an ODSReader().
    sheetname is the worksheet name.
    funcs is an optional sequence of conversion functions: the first is
    applied to keys and the second to values.
    Example: keyval_sheet_to_dict(sheet, sheetname, [str, int])
    If only one function is provided, it will apply the function to keys and values.
    If funcs is None or empty, keys and values are stored unchanged.'''
    rows = sheet.getSheet(sheetname)
    if not funcs:
        return {row[0]: row[1] for row in rows}

    # Pick the converters once instead of re-checking len(funcs) per row.
    key_func = funcs[0]
    val_func = funcs[0] if len(funcs) == 1 else funcs[1]

    out = {}
    for row in rows:
        out[key_func(row[0])] = val_func(row[1])
    return out
def convert_dict_vals_to_objs_in_dict_of_dicts(dictin, objclass, depth=1):
    '''Converts, in place, a dict of dicts into a dict of objects, with any
    nesting depth.
    dictin is the (possibly nested) dict to modify; nothing is returned.
    objclass is called with each innermost value as its single argument and
    the result replaces that value.
    depth is the number of dict levels above the values to convert; 1 means
    the values of dictin itself are converted.
    Raises AssertionError if depth is below 1.'''
    # Raised explicitly (rather than with an assert statement) so the check
    # still runs when Python is started with -O.
    if not depth >= 1:
        raise AssertionError('Depth must be 1 or higher.')
    if depth == 1:
        # Only existing keys are reassigned, so the dict's size is stable
        # while it is being iterated.
        for k, v in dictin.items():
            dictin[k] = objclass(v)
    else:
        for inner in dictin.values():
            convert_dict_vals_to_objs_in_dict_of_dicts(inner, objclass, depth - 1)
def dict_sheet_to_dict_of_objs(sheet, sheetname, objclass, keys=None, funcs=None, nones='fill'):
    '''Creates a dict of objects for a particular sheet in an ODSReader() object.
    sheet is an ODSReader().
    sheetname is the worksheet name.
    keys is the list of column labels whose values become the (nested) keys
    of the result, one nesting level per label.  It must not be empty.
    objclass is called once per row with that row's dict as its single
    argument; the returned object replaces the row's dict.
    funcs are functions that should be applied to the data as it becomes entries in the dict.
    nones describes how to handle empty fields. 'fill' fills with None, 'trim' removes, 'string' fills with 'None'.
    Raises ValueError if keys is None or empty.'''
    # The nesting depth is len(keys), so there is nothing sensible to build
    # without at least one key column; fail early with a clear message.
    if not keys:
        raise ValueError('keys must be a non-empty list of column labels.')
    out = dict_sheet_to_dict_of_dicts(sheet, sheetname, keys, funcs, nones)
    convert_dict_vals_to_objs_in_dict_of_dicts(out, objclass, len(keys))
    return out
def interpret_none(key, interpreted_dict, nones='fill'):
    '''Enters a value into interpreted_dict[key] based on nones.
    'fill' will enter a None
    'string' will enter 'None'
    'trim' is valid and will do nothing.
    Other values will raise assertion error.'''
    if nones == 'fill':
        interpreted_dict[key] = None
    elif nones == 'string':
        interpreted_dict[key] = 'None'
    elif nones != 'trim':
        # Raised explicitly so an unknown mode is still rejected when
        # Python runs with -O, which strips assert statements.
        raise AssertionError(f'Unknown interpretation of None: {nones}')
def row_to_dict(key_row, row, funcs=None, nones='fill'):
    '''Turns one spreadsheet row (a list) into a dict keyed by key_row.
    key_row holds the labels; the value at the same position in row is
    stored under each label.
    funcs is an optional sequence of converters matched to columns by
    position; columns without a converter are passed through str.
    nones is handed to interpret_none for every label whose cell is None or
    lies beyond the end of row.'''
    converters = funcs if funcs is not None else ()
    result = {}
    for idx, label in enumerate(key_row):
        # A cell is missing when the row is too short or holds None there.
        if idx >= len(row) or row[idx] is None:
            interpret_none(label, result, nones)
            continue
        # Columns past the end of the converters default to str.
        convert = converters[idx] if idx < len(converters) else str
        result[label] = convert(row[idx])
    return result
def rows_to_list_of_dicts(sheet, funcs=None, nones='fill'):
    '''Outputs a list of dicts from a spreadsheet, accepting functions to change the elements of the dicts.
    sheet is a list of rows.  The first row is the labels and is untouched;
    every later row becomes one dict keyed by those labels.
    If number of elements exceeds the functions provided, the rest are just handled as strings.
    nones selects how empty fields are handled: "fill" (store None, the
    default), "trim" (exclude from the dict), or "string" (store "None").
    A sheet with no rows at all yields an empty list.'''
    # Without a label row there is nothing to key on; return an empty
    # result instead of failing on sheet[0].
    if not sheet:
        return []
    key_row = sheet[0]
    return [row_to_dict(key_row, row, funcs, nones=nones) for row in sheet[1:]]
def dict_to_dict_of_dicts(dictin, keys):
    '''Given keys, this creates a nested dictionary (any depth).
    dictin is the dict to wrap; keys lists the entries of dictin whose
    values become the nesting keys, outermost first.  The innermost value
    is dictin itself (not a copy).
    Raises AssertionError if keys is empty.'''
    # Raised explicitly so the check survives python -O and also covers
    # empty sequences other than a list.
    if not keys:
        raise AssertionError('keys can not be empty list.')
    # Because this assumes creating a single dict of dicts using a single dict,
    # no caution has to be taken to avoid overwriting,
    # and therefore you can populate from the innermost to outermost layer.
    out = dictin
    for k in reversed(keys):
        out = {dictin[k]: out}
    return out
def add_dict_to_dict_of_dicts(dictin, keys, out):
    '''Adds a dict to a dict of dicts.
    dictin is the dict to insert.
    keys lists the entries of dictin whose values give the path into out,
    outermost first; missing intermediate levels are created as empty dicts.
    out is modified in place; nothing is returned.  An entry already stored
    at the same path is replaced.
    Raises AssertionError if keys is empty.'''
    # Raised explicitly so the check survives python -O.
    if not keys:
        raise AssertionError('Need populated list.')
    level = out
    # Walk (or create) every level above the one that receives dictin.
    for k in keys[:-1]:
        level = level.setdefault(dictin[k], {})
    level[dictin[keys[-1]]] = dictin
def list_of_dicts_to_dict_of_dicts(dicts, keys):
    '''Builds one nested dict (any depth) out of a list of dicts.
    Each dict in dicts is inserted with add_dict_to_dict_of_dicts, using
    keys as the labels whose values form its path.  Returns the new dict.'''
    nested = {}
    for entry in dicts:
        add_dict_to_dict_of_dicts(entry, keys, nested)
    return nested
def dict_sheet_to_dict_of_dicts(sheet, sheetname, keys, funcs=None, nones='fill'):
    '''Creates a dict of dicts (a mini-database) for a particular sheet in an ODSReader() object.
    sheet is an ODSReader().
    sheetname is the worksheet name.
    keys lists the column labels whose values become the nested keys of the
    result, outermost first.
    funcs are functions that should be applied to the data as it becomes entries in the dict.
    nones describes how to handle empty fields. 'fill' fills with None, 'trim' removes, 'string' fills with 'None'.'''
    rows = sheet.getSheet(sheetname)
    records = rows_to_list_of_dicts(rows, funcs, nones)
    return list_of_dicts_to_dict_of_dicts(records, keys)
def dict_sheet_to_list_of_dicts(sheet, sheetname, keys, funcs=None, nones='fill'):
    '''Creates a list of dicts for a particular sheet in an ODSReader() object.
    sheet is an ODSReader().
    sheetname is the worksheet name.
    keys is accepted but not used by this function.
    funcs are functions that should be applied to the data as it becomes entries in the dicts.
    nones describes how to handle empty fields. 'fill' fills with None, 'trim' removes, 'string' fills with 'None'.'''
    return rows_to_list_of_dicts(sheet.getSheet(sheetname), funcs, nones)