From 65fc820d08b868bf7c7360cb153f57a20e6886af Mon Sep 17 00:00:00 2001
From: Bruce-anle <840596168@qq.com>
Date: Sat, 9 May 2026 00:01:36 +0800
Subject: [PATCH] fix: respect disabled pageBreakBefore values

Background: OpenXML CT_OnOff treats a missing w:val as enabled, but an
explicit 0/false/off as disabled. The markdown converter previously
treated any w:pageBreakBefore element as enabled, producing false Page
Break markers.

Changes: add a CT_OnOff helper and tests for missing, true, and false
pageBreakBefore values.

Verification: the following command passed:

    /home/brucean/doc4agent/.venv/bin/python -m pytest tests -q -p no:cacheprovider

A smoke run with the fix converted the fumin/wenling samples with false
pageBreakBefore values to 0 page break markers, while retaining the page
break markers of the official samples that have no w:val.
---
 .../converters/markdown_converter.py          | 19 ++++++-
 tests/test_markdown_pagebreak.py              | 57 +++++++++++++++++++
 2 files changed, 74 insertions(+), 2 deletions(-)
 create mode 100644 tests/test_markdown_pagebreak.py

diff --git a/docx2everything/converters/markdown_converter.py b/docx2everything/converters/markdown_converter.py
index 0ff408f..f32470a 100644
--- a/docx2everything/converters/markdown_converter.py
+++ b/docx2everything/converters/markdown_converter.py
@@ -65,6 +65,22 @@ def get_heading_level(pStyle_val, styles_info=None):
     return None
 
 
+def is_on_off_enabled(elem):
+    """
+    Returns the effective value of an OpenXML CT_OnOff element.
+
+    Missing w:val defaults to true; explicit 0/false/off disables it.
+    """
+    if elem is None:
+        return False
+
+    val = elem.get(qn('w:val'))
+    if val is None:
+        return True
+
+    return val.strip().lower() not in ('0', 'false', 'off')
+
+
 def parse_run_to_markdown(r_elem, hyperlinks=None, images=None, img_dir=None, zipf=None, link_url=None, footnotes=None, endnotes=None):
     """
     Converts a text run (<w:r>) to markdown with formatting.
@@ -161,8 +177,7 @@ def parse_paragraph_to_markdown(p_elem, numbering_info=None, hyperlinks=None, im
     has_page_break = False
     if pPr is not None:
         page_break_before = pPr.find(qn('w:pageBreakBefore'))
-        if page_break_before is not None:
-            has_page_break = True
+        has_page_break = is_on_off_enabled(page_break_before)
 
     # Check for section break
     has_section_break = False
diff --git a/tests/test_markdown_pagebreak.py b/tests/test_markdown_pagebreak.py
new file mode 100644
index 0000000..bd11df9
--- /dev/null
+++ b/tests/test_markdown_pagebreak.py
@@ -0,0 +1,57 @@
+import xml.etree.ElementTree as ET
+
+from docx2everything.converters.markdown_converter import parse_paragraph_to_markdown
+
+
+W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
+
+
+def paragraph_with_page_break_before(value_marker):
+    value_attr = "" if value_marker is None else f' w:val="{value_marker}"'
+    xml = f"""
+    <w:p xmlns:w="{W_NS}">
+      <w:pPr>
+        <w:pageBreakBefore{value_attr}/>
+      </w:pPr>
+      <w:r><w:t>Body text</w:t></w:r>
+    </w:p>
+    """
+    return ET.fromstring(xml)
+
+
+def test_page_break_before_val_zero_is_disabled():
+    paragraph = paragraph_with_page_break_before("0")
+
+    markdown = parse_paragraph_to_markdown(paragraph)
+
+    assert markdown.startswith("Body text")
+    assert markdown == "Body text"
+
+
+def test_page_break_before_without_val_is_enabled():
+    paragraph = paragraph_with_page_break_before(None)
+
+    markdown = parse_paragraph_to_markdown(paragraph)
+
+    assert not markdown.startswith("Body text")
+    assert markdown.endswith("Body text")
+
+
+def test_page_break_before_true_values_are_enabled():
+    for value in ("1", "true", "on"):
+        paragraph = paragraph_with_page_break_before(value)
+
+        markdown = parse_paragraph_to_markdown(paragraph)
+
+        assert not markdown.startswith("Body text")
+        assert markdown.endswith("Body text")
+
+
+def test_page_break_before_false_values_are_disabled():
+    for value in ("false", "off"):
+        paragraph = paragraph_with_page_break_before(value)
+
+        markdown = parse_paragraph_to_markdown(paragraph)
+
+        assert markdown.startswith("Body text")
+        assert markdown == "Body text"