diff --git a/docx2everything/converters/markdown_converter.py b/docx2everything/converters/markdown_converter.py index 0ff408f..79e8389 100644 --- a/docx2everything/converters/markdown_converter.py +++ b/docx2everything/converters/markdown_converter.py @@ -83,32 +83,30 @@ def parse_run_to_markdown(r_elem, hyperlinks=None, images=None, img_dir=None, zi text = '' rPr = r_elem.find(qn('w:rPr')) - # Extract text from runs - for t_elem in r_elem.findall(qn('w:t')): - if t_elem.text: - text += t_elem.text - - # Handle tabs and breaks - for tab in r_elem.findall(qn('w:tab')): - text += ' ' # Convert tab to 4 spaces - for br in r_elem.findall(qn('w:br')): - text += '\n' - - # Handle footnote references - for footnote_ref in r_elem.findall(qn('w:footnoteReference')): - footnote_id = footnote_ref.get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}id') - if footnotes and footnote_id in footnotes: - text += f'[^{footnote_id}]' - else: - text += f'[^{footnote_id}]' - - # Handle endnote references - for endnote_ref in r_elem.findall(qn('w:endnoteReference')): - endnote_id = endnote_ref.get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}id') - if endnotes and endnote_id in endnotes: - text += f'[^{endnote_id}]' - else: - text += f'[^{endnote_id}]' + # Extract run content in document order. + for child in r_elem: + if child.tag == qn('w:t') and child.text: + text += child.text + elif child.tag == qn('w:tab'): + text += ' ' # Convert tab to 4 spaces + elif child.tag == qn('w:br'): + break_type = child.get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}type') + if break_type == 'page': + text += '\n\n\n\n' + else: + text += '\n' + elif child.tag == qn('w:footnoteReference'): + footnote_id = child.get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}id') + if footnotes and footnote_id in footnotes: + text += f'[^{footnote_id}]' + else: + text += f'[^{footnote_id}]' + elif child.tag == qn('w:endnoteReference'): + endnote_id = child.get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}id') + if endnotes and endnote_id in endnotes: + text += f'[^{endnote_id}]' + else: + text += f'[^{endnote_id}]' if not text: return '' diff --git a/tests/test_markdown_manual_page_break.py b/tests/test_markdown_manual_page_break.py new file mode 100644 index 0000000..6248683 --- /dev/null +++ b/tests/test_markdown_manual_page_break.py @@ -0,0 +1,42 @@ +import xml.etree.ElementTree as ET + +from docx2everything.converters.markdown_converter import parse_run_to_markdown + + +W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main" + + +def run_with_break(break_type=None): + type_attr = "" if break_type is None else f' w:type="{break_type}"' + xml = f""" + + Before + + After + + """ + return ET.fromstring(xml) + + +def test_page_break_run_outputs_page_break_marker(): + run = run_with_break("page") + + markdown = parse_run_to_markdown(run) + + assert markdown == "Before\n\n\n\nAfter" + + +def test_regular_break_run_outputs_newline(): + run = run_with_break() + + markdown = parse_run_to_markdown(run) + + assert markdown == "Before\nAfter" + + +def test_text_wrapping_break_run_outputs_newline(): + run = run_with_break("textWrapping") + + markdown = parse_run_to_markdown(run) + + assert markdown == "Before\nAfter"