From b8e26859997ca68f59cbc9d85c494e4864b45fa4 Mon Sep 17 00:00:00 2001 From: Bruce-anle <840596168@qq.com> Date: Sat, 9 May 2026 00:30:14 +0800 Subject: [PATCH] fix: detect paragraph property section breaks Background: DOCX section properties are normally stored under w:pPr/w:sectPr. The markdown converter only checked for direct w:p/w:sectPr children, so normal section breaks were missed. Changes: check both direct and paragraph-property section breaks, with tests for both forms. Verification: /home/brucean/doc4agent/.venv/bin/python -m pytest tests -q -p no:cacheprovider passed. --- .../converters/markdown_converter.py | 3 +- tests/test_markdown_section_break.py | 46 +++++++++++++++++++ 2 files changed, 48 insertions(+), 1 deletion(-) create mode 100644 tests/test_markdown_section_break.py diff --git a/docx2everything/converters/markdown_converter.py b/docx2everything/converters/markdown_converter.py index 0ff408f..475f714 100644 --- a/docx2everything/converters/markdown_converter.py +++ b/docx2everything/converters/markdown_converter.py @@ -167,7 +167,8 @@ def parse_paragraph_to_markdown(p_elem, numbering_info=None, hyperlinks=None, im # Check for section break has_section_break = False sectPr = p_elem.find(qn('w:sectPr')) - if sectPr is not None: + nested_sectPr = pPr.find(qn('w:sectPr')) if pPr is not None else None + if sectPr is not None or nested_sectPr is not None: has_section_break = True # Check for heading diff --git a/tests/test_markdown_section_break.py b/tests/test_markdown_section_break.py new file mode 100644 index 0000000..ce71a6e --- /dev/null +++ b/tests/test_markdown_section_break.py @@ -0,0 +1,46 @@ +import xml.etree.ElementTree as ET + +from docx2everything.converters.markdown_converter import parse_paragraph_to_markdown + + +W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main" + + +def paragraph_with_nested_section_break(): + xml = f""" + + + + + Body text + + """ + return ET.fromstring(xml) + + +def 
paragraph_with_direct_section_break(): + xml = f""" + + + Body text + + """ + return ET.fromstring(xml) + + +def test_nested_section_break_outputs_section_break_marker(): + paragraph = paragraph_with_nested_section_break() + + markdown = parse_paragraph_to_markdown(paragraph) + + assert markdown.startswith("") + assert markdown.endswith("Body text") + + +def test_direct_section_break_still_outputs_section_break_marker(): + paragraph = paragraph_with_direct_section_break() + + markdown = parse_paragraph_to_markdown(paragraph) + + assert markdown.startswith("") + assert markdown.endswith("Body text")