-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathXMLUtil.py
More file actions
37 lines (32 loc) · 1.35 KB
/
XMLUtil.py
File metadata and controls
37 lines (32 loc) · 1.35 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
from nltk.corpus import stopwords
import re
import xml.etree.ElementTree as ET
# English stop words from the NLTK corpus, held in a set so the per-word
# membership test in compress_XML is a constant-time lookup.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be
# downloaded before this module is imported — confirm.
stop_words = set(stopwords.words("english"))
def compress_XML(input_file, output_file, stopword_set=None) -> None:
    """
    Process an XML file to reduce its length and save context-window quota.

    The ``text`` and ``tail`` of every element are cleaned: characters
    outside an ASCII allowlist are deleted, the text is lower-cased,
    whitespace runs are collapsed to single spaces (leading and trailing
    whitespace is dropped) and stop words are removed. Tag names and
    attribute values are not modified. If the input cannot be parsed as XML,
    the same clean-up is applied to the whole file as plain text and a
    notice is printed.

    Args:
        input_file: Path of the UTF-8 encoded file to read.
        output_file: Path the result is written to, encoded as UTF-8.
        stopword_set: Optional collection of lower-case words to drop.
            When omitted, the module-level ``stop_words`` is used.
    """
    if stopword_set is None:
        stopword_set = stop_words

    # Everything outside this allowlist (ASCII letters, digits, whitespace
    # and common punctuation) is deleted from the text.
    disallowed = re.compile(
        r"[^a-zA-Z0-9\s_.,!?:;@#$%^&*()+\-=[\]{}|\\<>`~'\"/]+"
    )

    def process_text(text):
        # split() with no separator already treats any run of whitespace
        # (newlines included) as one delimiter, so joining on a single space
        # normalises whitespace without extra regex passes.
        words = disallowed.sub("", text).lower().split()
        return " ".join(word for word in words if word not in stopword_set)

    # Use a distinct name for the handle so the ``input_file`` parameter is
    # not rebound to a (soon closed) file object.
    with open(input_file, "r", encoding="utf-8") as source:
        input_text = source.read()

    # Only the parse itself can raise ParseError, so keep the try minimal.
    try:
        root = ET.fromstring(input_text)
    except ET.ParseError:
        # Not well-formed XML: fall back to cleaning the raw text.
        processed_text = process_text(input_text)
        with open(output_file, "w", encoding="utf-8") as out_file:
            out_file.write(processed_text)
        print("Parsing failed, Text preprocessing finished but without XML structure.")
        return

    for elem in root.iter():
        if elem.text:
            elem.text = process_text(elem.text)
        if elem.tail:
            elem.tail = process_text(elem.tail)

    tree = ET.ElementTree(root)
    tree.write(output_file, encoding="utf-8", xml_declaration=True)