Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 24 additions & 26 deletions docx2everything/converters/markdown_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,32 +83,30 @@ def parse_run_to_markdown(r_elem, hyperlinks=None, images=None, img_dir=None, zi
text = ''
rPr = r_elem.find(qn('w:rPr'))

# Extract text from runs
for t_elem in r_elem.findall(qn('w:t')):
if t_elem.text:
text += t_elem.text

# Handle tabs and breaks
for tab in r_elem.findall(qn('w:tab')):
text += ' ' # Convert tab to 4 spaces
for br in r_elem.findall(qn('w:br')):
text += '\n'

# Handle footnote references
for footnote_ref in r_elem.findall(qn('w:footnoteReference')):
footnote_id = footnote_ref.get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}id')
if footnotes and footnote_id in footnotes:
text += f'[^{footnote_id}]'
else:
text += f'[^{footnote_id}]'

# Handle endnote references
for endnote_ref in r_elem.findall(qn('w:endnoteReference')):
endnote_id = endnote_ref.get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}id')
if endnotes and endnote_id in endnotes:
text += f'[^{endnote_id}]'
else:
text += f'[^{endnote_id}]'
# Extract run content in document order.
for child in r_elem:
if child.tag == qn('w:t') and child.text:
text += child.text
elif child.tag == qn('w:tab'):
text += ' ' # Convert tab to 4 spaces
elif child.tag == qn('w:br'):
break_type = child.get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}type')
if break_type == 'page':
text += '\n\n<!-- Page Break -->\n\n'
else:
text += '\n'
elif child.tag == qn('w:footnoteReference'):
footnote_id = child.get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}id')
if footnotes and footnote_id in footnotes:
text += f'[^{footnote_id}]'
else:
text += f'[^{footnote_id}]'
elif child.tag == qn('w:endnoteReference'):
endnote_id = child.get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}id')
if endnotes and endnote_id in endnotes:
text += f'[^{endnote_id}]'
else:
text += f'[^{endnote_id}]'

if not text:
return ''
Expand Down
42 changes: 42 additions & 0 deletions tests/test_markdown_manual_page_break.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
import xml.etree.ElementTree as ET

from docx2everything.converters.markdown_converter import parse_run_to_markdown


# WordprocessingML main namespace URI; bound to the ``w:`` prefix in the test XML below.
W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"


def run_with_break(break_type=None):
    """Build a ``w:r`` run element: text "Before", a ``w:br``, then text "After".

    When *break_type* is None the ``w:br`` element carries no ``w:type``
    attribute; otherwise the value is written verbatim into ``w:type``.
    Returns the parsed root element of the run.
    """
    if break_type is None:
        br_attrs = ""
    else:
        br_attrs = f' w:type="{break_type}"'
    return ET.fromstring(f"""
<w:r xmlns:w="{W_NS}">
<w:t>Before</w:t>
<w:br{br_attrs}/>
<w:t>After</w:t>
</w:r>
""")


def test_page_break_run_outputs_page_break_marker():
    """A ``w:br`` typed as "page" is expected to render as the page-break marker."""
    expected = "Before\n\n<!-- Page Break -->\n\nAfter"

    assert parse_run_to_markdown(run_with_break("page")) == expected


def test_regular_break_run_outputs_newline():
    """A ``w:br`` with no ``w:type`` attribute is expected to render as one newline."""
    expected = "Before\nAfter"

    assert parse_run_to_markdown(run_with_break()) == expected


def test_text_wrapping_break_run_outputs_newline():
    """A ``w:br`` typed as "textWrapping" is expected to render as one newline."""
    expected = "Before\nAfter"

    assert parse_run_to_markdown(run_with_break("textWrapping")) == expected