-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathInterlinearLoaders.py
More file actions
376 lines (310 loc) · 12.8 KB
/
InterlinearLoaders.py
File metadata and controls
376 lines (310 loc) · 12.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
from abc import ABC, abstractmethod
import openpyxl
from xml.etree.ElementTree import Element, SubElement, tostring
from xml.dom import minidom
class InterlinearLoader(ABC):
    """
    Abstract base class for step-wise interlinear loaders.

    Provides the following:
    - issuccess (attribute): initialised to False here; this class never sets it to True
    - isdone (property)
    - progress (property)
    - run (method)

    Concrete child classes must have:
    - a next_step attribute, assigned BEFORE this __init__ is called, holding the
      next processing function to call (or None once no steps remain)
    """

    def __init__(self):
        """
        Initialise the shared loader state and verify that next_step exists.

        Raises:
            AssertionError: if the instance has no next_step attribute.
        """
        self.issuccess = False
        super().__init__()  # for multiple inheritance...
        self._progress = 0
        # A Loader object must define next_step. The check is raised explicitly
        # (instead of using an assert statement) so it is not stripped when
        # Python runs with -O; AssertionError is kept so existing callers that
        # relied on the old assert see the same exception type.
        if not hasattr(self, 'next_step'):
            raise AssertionError(
                f"{type(self).__name__} must define a 'next_step' attribute "
                "before calling InterlinearLoader.__init__()"
            )

    @property
    def isdone(self):
        """
        Returns True if there are no more processing steps to take
        (i.e. next_step is None).
        """
        return self.next_step is None

    @property
    def progress(self):
        """
        Returns the current progress value, intended to be a float in the
        range of 0.0 - 1.0 (it starts at 0).
        """
        return self._progress

    def run(self):
        """
        Run all steps directly (no breaks): keep calling next_step until isdone.
        """
        while not self.isdone:
            self.next_step()
class InterlinearXML:
    """
    An XML object for interlinear data.

    The XML root ("text") contains a "text_metadata" element and a "body" element.
    New paragraphs are added to the body.
    New lines are added to the current paragraph.
    Similarly, all other new elements are added to the current parent element,
    so the new_xml_* / add_xml_* methods must be called parent-first
    (e.g. new_xml_line() before new_xml_il_lines()).

    There is no class-specific approach to retrieving previous lines, paragraphs, etc.
    but the XML objects have methods for this.

    InterlinearXML Methods:
        (new_xml_* take no arguments)
        new_xml_paragraph()
        new_xml_line()
        new_xml_il_lines()
        new_xml_vernacular_line()
        new_xml_gloss_line()
        (add_xml_* take text argument)
        add_xml_vernacular_word(text)
        add_xml_gloss_word(text)
        add_xml_free(text)
        (output handling)
        get_pretty_xml(indent=" ")
        write(filename, indent=" ")
    """

    def __init__(self):
        """Create the root, metadata and body elements plus a first (empty) paragraph."""
        self.xml_root = Element('text')
        self.xml_metadata = SubElement(self.xml_root, 'text_metadata')
        self.xml_body = SubElement(self.xml_root, 'body')
        self.new_xml_paragraph()
        super().__init__()  # for multiple inheritance...

    def new_xml_paragraph(self):
        """Start a new 'paragraph' in the body and make it the current paragraph."""
        self.xml_paragraph = SubElement(self.xml_body, 'paragraph')

    def new_xml_line(self):
        """Start a new 'line' in the current paragraph and make it the current line."""
        self.xml_line = SubElement(self.xml_paragraph, 'line')

    def new_xml_il_lines(self):
        """Start a new 'il-lines' container in the current line."""
        self.xml_il_lines = SubElement(self.xml_line, 'il-lines')

    def new_xml_vernacular_line(self):
        """Start a new 'vernacular-line' in the current il-lines container."""
        self.xml_vern_line = SubElement(self.xml_il_lines, 'vernacular-line')

    def new_xml_gloss_line(self):
        """Start a new 'gloss-line' in the current il-lines container."""
        self.xml_gloss_line = SubElement(self.xml_il_lines, 'gloss-line')

    def add_xml_vernacular_word(self, text):
        """Append a 'wrd' element containing text to the current vernacular line."""
        wrd = SubElement(self.xml_vern_line, 'wrd')
        wrd.text = text

    def add_xml_gloss_word(self, text):
        """Append a 'gls' element containing text to the current gloss line."""
        gls = SubElement(self.xml_gloss_line, 'gls')
        gls.text = text

    def add_xml_free(self, text):
        """Append a 'free' (free translation) element containing text to the current line."""
        xml_free = SubElement(self.xml_line, 'free')
        xml_free.text = text

    def get_pretty_xml(self, indent=" "):
        """
        Return a pretty-printed XML string for the root XML.

        Args:
            indent: string used for each level of indentation (default: one space,
                which is the indentation this method has always produced).

        Returns:
            The document as a str, starting with an XML declaration.
        """
        # Serialise to UTF-8 bytes so minidom can re-parse the document
        # unambiguously. toprettyxml() is called without an encoding, so it
        # returns a str whose XML declaration carries no encoding attribute.
        rough_string = tostring(self.xml_root, encoding='utf-8')
        reparsed = minidom.parseString(rough_string)
        return reparsed.toprettyxml(indent=indent)

    def write(self, filename, indent=" "):
        """
        Write pretty-printed XML to file (UTF-8 encoded).

        Args:
            filename: path of the file to create or overwrite.
            indent: string used for each level of indentation
                (passed through to get_pretty_xml).
        """
        pretty = self.get_pretty_xml(indent=indent)
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(pretty)
class ExcelInterlinearLoader(InterlinearLoader, InterlinearXML):
    """
    Handles loading of an Excel interlinear from template.

    Processing is split into steps (load_sheet -> read_metadata ->
    read_one_block (repeated) -> cleanup). self.next_step always holds the
    next step to call and becomes None when processing is finished.
    Non-fatal problems are collected in self.warning_list; fatal errors are
    raised as exceptions.

    Direct usage:
        e = ExcelInterlinearLoader(name_and_path_of_excel_file_to_load)
        e.run()
        txt = e.get_pretty_xml()

    To use tqdm for displaying progress (tqdm's update() takes an increment,
    so pass the change in progress, not the absolute value):
        e = ExcelInterlinearLoader(name_and_path_of_excel_file_to_load)
        with tqdm(total=1.0, desc="Processing Excel File") as pbar:
            while not e.isdone:
                e.next_step()
                pbar.update(e.progress - pbar.n)
    """

    def __init__(self, loadname):
        """
        Construct ExcelInterlinearLoader object with definitions and initialization.

        Args:
            loadname: name (and path) of the Excel file to load. The file is
                not opened here; that happens in the load_sheet step.
        """
        self.METADATA_CELLS = {
            'title': 'C2',
            'author': 'C3',
            'transcriber': 'C4',
            'writing_system_vernacular': 'N2',
            'writing_system_free': 'N3',
            'writing_system_gloss': 'N4'
        }
        self.DATA_START_ROW = 6
        self.DATA_START_COLUMN = 3  # Column C
        self.DATA_END_COLUMN = 26  # Column Z (where the free translation merge ends)
        self.ROWS_PER_LINE_BLOCK = 4
        self.BLANK_BLOCK_EXIT_THRESHOLD = 5  # Exit after 5 consecutive empty 4-row blocks (20 blank rows)
        self.FILE_LOAD_PROGRESS_WEIGHT = 0.5
        self.warning_list = []  # fatal errors are raised as exceptions to be handled elsewhere
        self.consecutive_empty_blocks = 0
        self.loadname = loadname
        self.n_blocks = None  # set by load_sheet
        self.current_block = None  # 1-based block index, set by read_metadata
        # next_step must exist before the base-class initialisers run
        self.next_step = self.load_sheet
        super().__init__()
        self.debug = False  # for printing to console

    @staticmethod
    def _column_letter(col):
        """Return the Excel column letter(s) for a 1-based column index (1 -> 'A', 27 -> 'AA')."""
        letters = ""
        while col > 0:
            col, remainder = divmod(col - 1, 26)
            letters = chr(65 + remainder) + letters
        return letters

    def update_progress(self, value=None):
        """
        Update the progress attribute, either from current_block or with a set value.

        load_sheet is assigned self.FILE_LOAD_PROGRESS_WEIGHT of the progress.
        The rest of the progressbar is evenly divided among:
            read_metadata: 1 unit
            read_one_block: 1 unit per block
            cleanup: 1 unit

        Args:
            value: None to derive the progress from current_block / n_blocks,
                -1 to jump to "all blocks done, only cleanup left",
                any other value is stored directly (as a float).
        """
        w = self.FILE_LOAD_PROGRESS_WEIGHT
        if value is None:
            self._progress = w + (1 - w) * self.current_block / (self.n_blocks + 2)
        elif value == -1:
            self._progress = w + (1 - w) * (self.n_blocks + 1) / (self.n_blocks + 2)
        else:
            self._progress = float(value)

    def load_sheet(self):
        """
        Load the Excel sheet as an openpyxl object and count the rows.

        Sets self.sheet (first worksheet) and self.n_blocks (number of complete
        interlinear blocks), then hands over to read_metadata.

        Raises:
            Exception: if the workbook or its first sheet cannot be loaded
                (the original error is chained as the cause).
        """
        try:
            workbook = openpyxl.load_workbook(self.loadname, data_only=True)
        except Exception as e:
            raise Exception(f"Error loading Excel file '{self.loadname}'") from e
        try:
            self.sheet = workbook.worksheets[0]
        except Exception as e:
            raise Exception(f"Error loading first sheet of Excel file '{self.loadname}'") from e

        # A sheet that ends above DATA_START_ROW has no data rows at all. Clamp
        # at zero: a negative count would otherwise yield a spurious
        # "partial block" warning (Python's % is never negative here) and a
        # negative n_blocks, which pushes the computed progress above 1.0.
        n_data_rows = max(0, self.sheet.max_row - self.DATA_START_ROW + 1)
        if self.debug:
            print(f' Total rows: {self.sheet.max_row}')
            print(f' Data rows: {n_data_rows}')

        n_extra = n_data_rows % self.ROWS_PER_LINE_BLOCK
        if n_extra:
            # extra row or part of an interlinear line
            self.warning_list.append('Partial interlinear block of data will be ignored ' +
                                     f'({n_extra} extra data rows found)')
        self.n_blocks = n_data_rows // self.ROWS_PER_LINE_BLOCK

        self.update_progress(self.FILE_LOAD_PROGRESS_WEIGHT)
        self.next_step = self.read_metadata
        # TODO: need an approach for when a step fails -- how to let the GUI know?
        # (e.g. set next_step to None and record the failure on the object)
        if self.debug:
            print(f'load_sheet: n_blocks = {self.n_blocks}')

    def read_metadata(self):
        """
        Read the metadata cells of the spreadsheet.

        Adds one child element per METADATA_CELLS entry to the XML metadata
        element (empty text when the cell is empty), then hands over to
        read_one_block starting at block 1.
        """
        for tag, cell_coord in self.METADATA_CELLS.items():
            cell = self.sheet[cell_coord]
            cell_value = self.get_cell_value(cell.row, cell.column)
            element = SubElement(self.xml_metadata, tag)
            element.text = cell_value if cell_value else ""
        self.current_block = 1
        self.update_progress()
        self.next_step = self.read_one_block

    def read_one_block(self):
        """
        Read one interlinear line (block) of data, and check if done.

        A block is ROWS_PER_LINE_BLOCK rows: vernacular words, glosses, free
        translation (the fourth row is not read). A non-empty block becomes
        one XML line; an empty block ends the current paragraph, and
        BLANK_BLOCK_EXIT_THRESHOLD consecutive empty blocks end the reading
        early. Word/gloss mismatches are recorded in warning_list.
        """
        vernacular_row = self.DATA_START_ROW + (self.current_block - 1) * self.ROWS_PER_LINE_BLOCK
        gloss_row = vernacular_row + 1
        free_row = vernacular_row + 2
        # The fourth row of the block (vernacular_row + 3) is not checked for blankness.

        vern_words = []
        gloss_words = []
        for col in range(self.DATA_START_COLUMN, self.DATA_END_COLUMN + 1):
            vern_val = self.get_cell_value(vernacular_row, col)
            gloss_val = self.get_cell_value(gloss_row, col)
            vern_is_present = bool(vern_val)
            gloss_is_present = bool(gloss_val)

            # Check alignment: a word without a gloss, or a gloss without a word
            if vern_is_present != gloss_is_present:
                col_letter = self._column_letter(col)
                problem_row = vernacular_row if vern_is_present else gloss_row
                self.warning_list.append(
                    f"Alignment Error: Mismatched word/gloss at column {col_letter}. "
                    f"Non-empty cell: {col_letter}{problem_row} (Rows {vernacular_row} and {gloss_row})."
                )

            # A gloss without a word is dropped (the warning above records it);
            # a word without a gloss gets an empty gloss to keep the lists aligned.
            if vern_is_present:
                vern_words.append(vern_val)
                gloss_words.append(gloss_val if gloss_val else "")

        free_translation = self.get_cell_value(free_row, self.DATA_START_COLUMN)
        if free_translation is None:
            free_translation = ""

        is_block_empty = (not vern_words) and (not free_translation)
        if not is_block_empty:
            self.consecutive_empty_blocks = 0
            self.new_xml_line()
            self.new_xml_il_lines()
            self.new_xml_vernacular_line()
            for word in vern_words:
                self.add_xml_vernacular_word(word)
            self.new_xml_gloss_line()
            for word in gloss_words:
                self.add_xml_gloss_word(word)
            self.add_xml_free(free_translation)
        else:
            # Paragraph break / Early Exit Logic
            self.consecutive_empty_blocks += 1
            if self.consecutive_empty_blocks >= self.BLANK_BLOCK_EXIT_THRESHOLD:
                # Finish early; no warning is recorded for this.
                self.update_progress(-1)
                self.next_step = self.cleanup
                return None
            elif len(self.xml_paragraph) > 0:
                # Only start a new paragraph if the current one has content,
                # so runs of blank blocks do not create empty paragraphs.
                self.new_xml_paragraph()

        self.current_block += 1
        self.update_progress()
        if self.current_block > self.n_blocks:
            self.next_step = self.cleanup

    def cleanup(self):
        """
        Post-processing: Remove the last paragraph element if it ended up empty.
        Then indicate that the processing is completed
        (next_step = None, progress = 1.0, issuccess = True).
        """
        if len(self.xml_paragraph) == 0:
            # iterate over a copy because the body is modified in the loop
            for paragraph in list(self.xml_body):
                if len(paragraph) == 0:
                    self.xml_body.remove(paragraph)
        self.next_step = None
        self.update_progress(1.0)
        self.issuccess = True

    def get_cell_value(self, row, col):
        """
        Get value of one cell of self.sheet, cleanly.

        Args:
            row: 1-based row number.
            col: 1-based column number.

        Returns:
            None for an empty cell, otherwise the value converted to str
            with surrounding whitespace stripped.
        """
        cell = self.sheet.cell(row=row, column=col)
        if cell.value is None:
            return None
        return str(cell.value).strip()
if __name__ == "__main__":
    # TEMP: for testing
    # Manual smoke test: convert one template workbook to XML, either with a
    # tqdm progress bar (test_tqdm = True) or by running all steps directly.
    import os

    # filename = r'Cerita Juari Atau (Barnabas) - in template.xlsx'
    filename = r'Interlinear Text Excel Template (80 lines)2.xlsx'
    xl = ExcelInterlinearLoader(filename)

    # Strip the extension whatever its length (previously a fixed 5 characters).
    basename = os.path.splitext(filename)[0]

    test_tqdm = False
    if test_tqdm:
        from tqdm import tqdm
        with tqdm(total=1.0, desc="Processing Excel File") as pbar:
            while xl.next_step is not None:
                xl.next_step()
                # tqdm.update() expects an increment, while xl.progress is
                # absolute, so advance the bar by the difference only.
                pbar.update(xl.progress - pbar.n)
        outputname = basename + r'_ClassTestTqdm.xml'
    else:
        xl.run()
        outputname = basename + r'_ClassTest.xml'

    xl.write(outputname)
    print(f'Output written to: {outputname}')
    print('')
    print(f'issuccess: {xl.issuccess}')
    print(f'next_step: {xl.next_step}')
    print(f'warning_list: {xl.warning_list}')