# FlexText-Excel-Import/xml_to_flextext.py at main · rulingAnts/FlexText-Excel-Import · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
#!/usr/bin/env python3
import argparse
import os
import sys
import traceback
from xml.etree.ElementTree import Element, SubElement, tostring, parse
from xml.dom import minidom

def transform_to_flextext_dom(xml_root_in, ws_vernacular, ws_gloss, ws_freetrans):
    """
    [MAIN CONVERSION FUNCTION]
    Transforms the custom interlinear XML DOM object into the FLExText XML format,
    using the user-provided writing system codes for the 'lang' attributes.

    The generated tree has the shape:
        <document version="2">
          <interlinear-text>
            <item type="title">...</item>
            <paragraphs><paragraph><phrases><phrase>...</phrase></phrases></paragraph></paragraphs>
            <languages>...</languages>
          </interlinear-text>
        </document>

    Each input <line> that has both an <il-lines>/<vernacular-line> and an
    <il-lines>/<gloss-line> becomes one <phrase> holding, in order: the joined
    vernacular sentence, the free translation (if any), and a <words> container.
    Lines lacking either of those two elements are skipped entirely: no <phrase>
    is emitted for them and they are not counted as missing a free translation.

    Args:
        xml_root_in (xml.etree.ElementTree.Element): The root XML element (<text> object).
        ws_vernacular (str): Writing system code for the vernacular text (e.g., 'xku').
        ws_gloss (str): Writing system code for the word glosses (e.g., 'gls').
        ws_freetrans (str): Writing system code for the free translation (e.g., 'en').

    Returns:
        tuple: (xml.etree.ElementTree.Element, int) The root <document> element (FLExText object)
               and the count of missing free translations.
    """

    # Nested helper: closes over the writing system codes of the enclosing call.
    def create_languages_block():
        """
        Creates the <languages> element for the FLExText file,
        using the writing system codes from the parent function's scope.
        """
        languages = Element('languages')

        # 1. Vernacular Language (The language of the text, marked as vernacular)
        SubElement(languages, 'language', lang=ws_vernacular, font="Charis SIL", vernacular="true")

        # 2. Analysis/Gloss Language (Used for word glosses)
        SubElement(languages, 'language', lang=ws_gloss, font="Times New Roman")

        # 3. Free Translation/Title Language (only declared when it differs from
        #    the gloss language, so the same code is never listed twice)
        if ws_freetrans != ws_gloss:
            SubElement(languages, 'language', lang=ws_freetrans, font="Times New Roman")

        return languages

    # Number of processed lines whose <free> element was absent or blank.
    missing_freetrans_count = 0

    # 1. Create the outermost root: <document>
    document_root = Element('document')
    document_root.set('version', '2')

    # 2. Create the <interlinear-text> container
    flextext_root = SubElement(document_root, 'interlinear-text')

    # 3. Extract and create the Title.
    #    A missing, empty, or whitespace-only <title> falls back to a placeholder
    #    so the output never carries a blank title.
    title_element = xml_root_in.find('.//title')
    title_text = ""
    if title_element is not None and title_element.text:
        title_text = title_element.text.strip()
    if not title_text:
        title_text = "Untitled Text"

    title_item = SubElement(flextext_root, 'item')
    title_item.set('type', 'title')
    title_item.set('lang', ws_freetrans)
    title_item.text = title_text

    # 4. Process the Body (Paragraphs and Phrases)
    paragraphs_container = SubElement(flextext_root, 'paragraphs')

    for paragraph_in in xml_root_in.findall('.//paragraph'):

        paragraph_out = SubElement(paragraphs_container, 'paragraph')
        phrases_container = SubElement(paragraph_out, 'phrases')

        # Iterate through lines (which become phrases)
        for line in paragraph_in.findall('./line'):

            # Look up the interlinear data BEFORE creating any output element,
            # so a skipped line does not leave an empty <phrase/> behind.
            vern_line = line.find('./il-lines/vernacular-line')
            gloss_line = line.find('./il-lines/gloss-line')

            if vern_line is None or gloss_line is None:
                # No interlinear data: emit nothing for this line.
                continue

            phrase_element = SubElement(phrases_container, 'phrase')

            vern_words = vern_line.findall('./wrd')
            glosses = gloss_line.findall('./gls')

            # --- STEP 4.1: Vernacular sentence (type="txt") comes first ---
            # Reconstruct the full sentence by joining the text of all
            # non-blank vernacular words with single spaces.
            full_vernacular_text = " ".join(
                wrd.text for wrd in vern_words if wrd.text is not None and wrd.text.strip() != ""
            )

            if full_vernacular_text:
                vern_item = SubElement(phrase_element, 'item')
                vern_item.set('type', 'txt')
                vern_item.set('lang', ws_vernacular)
                vern_item.text = full_vernacular_text

            # --- STEP 4.2: Free translation (<item type="gls">) comes second ---
            free_element = line.find('./free')

            if free_element is not None and free_element.text is not None:
                free_translation_text = free_element.text.strip()
            else:
                free_translation_text = ""

            if free_translation_text:
                free_item = SubElement(phrase_element, 'item')
                free_item.set('type', 'gls')
                free_item.set('lang', ws_freetrans)  # Use Free Translation WS code
                free_item.text = free_translation_text
            else:
                # No meaningful free translation text was found for this line.
                missing_freetrans_count += 1

            # --- STEP 4.3: The <words> container (interlinear data) comes last ---
            words_container = SubElement(phrase_element, 'words')

            # Pair words and glosses 1:1; zip stops at the shorter list, so any
            # unmatched trailing words or glosses are not emitted.
            for vern_word, word_gloss in zip(vern_words, glosses):
                word_element = SubElement(words_container, 'word')

                # Vernacular word (<item type="txt">)
                txt_item = SubElement(word_element, 'item')
                txt_item.set('type', 'txt')
                txt_item.set('lang', ws_vernacular)  # Use Vernacular WS code
                txt_item.text = vern_word.text or ""

                # Word gloss (<item type="gls">)
                gls_item = SubElement(word_element, 'item')
                gls_item.set('type', 'gls')
                gls_item.set('lang', ws_gloss)  # Use Gloss WS code
                gls_item.text = word_gloss.text or ""

    # 5. Append the <languages> block after the paragraphs
    flextext_root.append(create_languages_block())

    return document_root, missing_freetrans_count

# ======================================================================
# --- HELPER FUNCTIONS (Outside main conversion) ---
# ======================================================================

def prettify_xml(element):
    """
    Serialize an ElementTree element to an indented XML string.

    The element is first serialized to UTF-8 bytes, re-parsed with minidom,
    and then pretty-printed with a two-space indent. The returned text starts
    with an XML declaration naming the utf-8 encoding.
    """
    serialized = tostring(element, encoding='utf-8')
    dom = minidom.parseString(serialized)
    # Asking minidom for an encoding yields bytes; turn them back into str.
    pretty_bytes = dom.toprettyxml(indent="  ", encoding='utf-8')
    return str(pretty_bytes, 'utf-8')

# ======================================================================
# --- CLI WRAPPER (Execution Block) ---
# ======================================================================

def _abort_with_log(log_path, log_text, console_text, mode='w'):
    """Write log_text to log_path (opened with mode), print console_text, exit with status 1."""
    with open(log_path, mode, encoding='utf-8') as log_file:
        log_file.write(log_text)
    print(console_text)
    sys.exit(1)


def cli_wrapper():
    """
    Handles command-line arguments, user input, I/O, and error logging.

    Workflow:
      1. Read the input XML path from the command line and derive the output
         (".flextext") and error-log ("_error.log") paths from it.
      2. Prompt for the three writing system codes; all must be non-empty.
      3. Parse the input XML (root tag must be 'text'), transform it, and write
         the pretty-printed result.

    On any failure a message is written to the error log, a short notice is
    printed, and the process exits with status 1. After a successful run an
    existing error log is removed on a best-effort basis.
    """
    parser = argparse.ArgumentParser(
        description="Transforms a custom XML format into a FLExText XML format."
    )
    parser.add_argument(
        "input_xml_file",
        help="The path to the input XML document (e.g., output_text.xml)."
    )
    args = parser.parse_args()

    input_path = os.path.abspath(args.input_xml_file)
    base_name, _ = os.path.splitext(input_path)
    output_flextext_path = base_name + ".flextext"
    error_log_path = base_name + "_error.log"
    error_log_name = os.path.basename(error_log_path)

    # 1. Input File Validation and Prompts
    if not os.path.exists(input_path):
        _abort_with_log(
            error_log_path,
            f"FATAL ERROR: Input XML file not found at path: {input_path}\n",
            f"FATAL ERROR: Input XML file not found. Details logged to {error_log_name}",
        )

    print("\n--- FLEx Writing System Configuration ---")
    print("Please enter the exact writing system codes (WS Codes) used in your FLEx project.")
    print("This ensures the text imports correctly into the corresponding fields.")
    print("(You can find these under Tools -> Configure -> Writing Systems...)\n")

    ws_vernacular = input("1. Enter Vernacular (Baseline) WS Code (e.g., 'fau' or 'v'): ").strip()
    ws_gloss = input("2. Enter Word Gloss (Analysis) WS Code (e.g., 'en' or 'gls'): ").strip()
    ws_freetrans = input("3. Enter Free Translation WS Code (e.g., 'en' or 'ft'): ").strip()

    if not (ws_vernacular and ws_gloss and ws_freetrans):
        error_message = "\nFATAL ERROR: All three writing system codes must be provided."
        _abort_with_log(error_log_path, error_message + '\n', error_message)

    # 2. Parse Input XML
    try:
        print("\n1. Parsing Input XML...")
        xml_tree = parse(input_path)
        input_root = xml_tree.getroot()
        if input_root.tag != 'text':
            raise ValueError(f"Root tag expected to be 'text', found '{input_root.tag}'")
        print("   - Input XML successfully parsed.")
    except Exception:
        # Top-level boundary: any parse failure is logged with its traceback.
        _abort_with_log(
            error_log_path,
            f"\nFATAL ERROR during XML Parsing:\n{traceback.format_exc()}",
            f"\nFATAL ERROR: Could not parse XML file. Details logged to {error_log_name}",
        )

    # 3. Perform Conversion (Calls the main modular function)
    try:
        print("2. Transforming XML to FLExText object...")
        # Capture both the XML document root and the missing free translations count
        document_root, missing_freetrans_count = transform_to_flextext_dom(
            input_root, ws_vernacular, ws_gloss, ws_freetrans
        )
        print("   - Transformation successful.")
    except Exception:
        _abort_with_log(
            error_log_path,
            f"\nFATAL ERROR during XML Transformation:\n{traceback.format_exc()}",
            f"\nFATAL ERROR: Conversion failed. Details logged to {error_log_name}",
        )

    # 4. Write Output XML (FlexText)
    # Only serialization and the file write are guarded here, so that a problem
    # in the later reporting/cleanup steps is never mislabeled as a write failure.
    try:
        print("3. Writing output FlexText file...")
        pretty_xml = prettify_xml(document_root)

        with open(output_flextext_path, 'w', encoding='utf-8') as f:
            f.write(pretty_xml)
    except Exception:
        # Appends (mode 'a') rather than overwriting the error log.
        _abort_with_log(
            error_log_path,
            f"\nERROR during file writing. Output file may be incomplete.\n{traceback.format_exc()}",
            f"ERROR: Could not write FlexText file. Details logged to {error_log_name}",
            mode='a',
        )

    print("\nCOMPLETED SUCCESSFULLY.")
    print(f"   - FlexText output saved to: '{os.path.basename(output_flextext_path)}'")

    # --- Debugging Output Alert ---
    if missing_freetrans_count > 0:
        print("\n*** WARNING ***")
        print(f"The script skipped adding the Free Translation for {missing_freetrans_count} line(s).")
        print("This usually happens if the corresponding '<free>' element in your source XML was missing or contained only empty space.")

    # Clean up the error log if the entire process was successful.
    # Best effort: the output is already written, so a failed delete is only reported.
    if os.path.exists(error_log_path):
        try:
            os.remove(error_log_path)
        except OSError as exc:
            print(f"   - Note: could not remove old error log '{error_log_name}': {exc}")

# Run the command-line workflow only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    cli_wrapper()