From 7bf02cbc733b02075e31fa276a2b2ea3fcab943a Mon Sep 17 00:00:00 2001 From: wei Date: Thu, 11 Aug 2022 17:31:27 +0800 Subject: [PATCH 1/3] table support cell merging --- htmldocx/h2d.py | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/htmldocx/h2d.py b/htmldocx/h2d.py index 6a0e113..13a6b1f 100644 --- a/htmldocx/h2d.py +++ b/htmldocx/h2d.py @@ -346,16 +346,29 @@ def handle_table(self): cols = self.get_table_columns(row) cell_col = 0 for col in cols: + colspan = int(col.attrs.get('colspan', 1)) + rowspan = int(col.attrs.get('rowspan', 1)) + cell_html = self.get_cell_html(col) if col.name == 'th': cell_html = "%s" % cell_html + docx_cell = self.table.cell(cell_row, cell_col) + while docx_cell.text != '': # Skip cells already filled by an earlier rowspan/colspan merge + cell_col += 1 + docx_cell = self.table.cell(cell_row, cell_col) + + cell_to_merge = self.table.cell(cell_row + rowspan - 1, cell_col + colspan - 1) + if docx_cell != cell_to_merge: + docx_cell.merge(cell_to_merge) + child_parser = HtmlToDocx() child_parser.copy_settings_from(self) - child_parser.add_html_to_cell(cell_html, docx_cell) - cell_col += 1 + child_parser.add_html_to_cell(cell_html or ' ', docx_cell) # write a placeholder for empty cells so the non-empty-text skip check treats them as occupied + + cell_col += colspan cell_row += 1 - + # skip all tags until corresponding closing tag self.instances_to_skip = len(table_soup.find_all('table')) self.skip_tag = 'table' @@ -581,7 +594,13 @@ def get_table_dimensions(self, table_soup): # Thus the row dimensions and column dimensions are assumed to be 0 cols = self.get_table_columns(rows[0]) if rows else [] - return len(rows), len(cols) + # Count columns by summing the colspan of each cell in the first row + col_count = 0 + for col in cols: + colspan = col.attrs.get('colspan', 1) + col_count += int(colspan) + + return len(rows), col_count def get_tables(self): if not hasattr(self, 'soup'): From 8f6695a778c68befb302e48ac0ed5201ddbd4524 Mon Sep 17 00:00:00 2001 From: wei Date: Fri, 12 Aug 2022 00:31:32 +0800 Subject: [PATCH 2/3] support tables cell 
merging --- .gitignore | 1 + htmldocx/h2d.py | 27 +++++++++++++++++++++++---- tests/tables3.html | 22 ++++++++++++++++++++++ tests/test_tables_cell_merging.py | 9 +++++++++ 4 files changed, 55 insertions(+), 4 deletions(-) create mode 100644 tests/tables3.html create mode 100644 tests/test_tables_cell_merging.py diff --git a/.gitignore b/.gitignore index 526144e..f8923a4 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ *.pyc venv +/.idea diff --git a/htmldocx/h2d.py b/htmldocx/h2d.py index 6a0e113..13a6b1f 100644 --- a/htmldocx/h2d.py +++ b/htmldocx/h2d.py @@ -346,16 +346,29 @@ def handle_table(self): cols = self.get_table_columns(row) cell_col = 0 for col in cols: + colspan = int(col.attrs.get('colspan', 1)) + rowspan = int(col.attrs.get('rowspan', 1)) + cell_html = self.get_cell_html(col) if col.name == 'th': cell_html = "%s" % cell_html + docx_cell = self.table.cell(cell_row, cell_col) + while docx_cell.text != '': # Skip cells already filled by an earlier rowspan/colspan merge + cell_col += 1 + docx_cell = self.table.cell(cell_row, cell_col) + + cell_to_merge = self.table.cell(cell_row + rowspan - 1, cell_col + colspan - 1) + if docx_cell != cell_to_merge: + docx_cell.merge(cell_to_merge) + child_parser = HtmlToDocx() child_parser.copy_settings_from(self) - child_parser.add_html_to_cell(cell_html, docx_cell) - cell_col += 1 + child_parser.add_html_to_cell(cell_html or ' ', docx_cell) # write a placeholder for empty cells so the non-empty-text skip check treats them as occupied + + cell_col += colspan cell_row += 1 - + # skip all tags until corresponding closing tag self.instances_to_skip = len(table_soup.find_all('table')) self.skip_tag = 'table' @@ -581,7 +594,13 @@ def get_table_dimensions(self, table_soup): # Thus the row dimensions and column dimensions are assumed to be 0 cols = self.get_table_columns(rows[0]) if rows else [] - return len(rows), len(cols) + # Count columns by summing the colspan of each cell in the first row + col_count = 0 + for col in cols: + colspan = col.attrs.get('colspan', 1) + col_count += int(colspan) + + return len(rows), col_count def get_tables(self): if not 
hasattr(self, 'soup'): diff --git a/tests/tables3.html b/tests/tables3.html new file mode 100644 index 0000000..1fd7694 --- /dev/null +++ b/tests/tables3.html @@ -0,0 +1,22 @@ + + + + + + + + + + + + + + + + + + + + + +
aabb
cc
ddee
ffgghh
iijj
\ No newline at end of file diff --git a/tests/test_tables_cell_merging.py b/tests/test_tables_cell_merging.py new file mode 100644 index 0000000..c809ba5 --- /dev/null +++ b/tests/test_tables_cell_merging.py @@ -0,0 +1,9 @@ +import os +from .context import HtmlToDocx, test_dir + +# Manual test (requires inspection of result) for converting html tables with merged cells (colspan/rowspan) + +filename = os.path.join(test_dir, 'tables3.html') +d = HtmlToDocx() + +d.parse_html_file(filename) From 0a965ba6a9fb63572432054bcc413ab7eecc0d82 Mon Sep 17 00:00:00 2001 From: wei Date: Mon, 12 Dec 2022 09:42:07 +0800 Subject: [PATCH 3/3] fixed handle rgba(38, 42, 51, 0.9) error "invalid literal for int() with base 10: '0.9'" --- htmldocx/h2d.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/htmldocx/h2d.py b/htmldocx/h2d.py index 13a6b1f..ad8de1c 100644 --- a/htmldocx/h2d.py +++ b/htmldocx/h2d.py @@ -230,7 +230,7 @@ def add_styles_to_run(self, style): if 'color' in style: if 'rgb' in style['color']: color = re.sub(r'[a-z()]+', '', style['color']) - colors = [int(x) for x in color.split(',')] + colors = [int(x) for x in color.split(',')[:3]] # Previously "color: rgba(38, 42, 51, 0.9);" raised an error because of the trailing 0.9 alpha value; now only the first 3 components are kept elif '#' in style['color']: color = style['color'].lstrip('#') colors = tuple(int(color[i:i+2], 16) for i in (0, 2, 4)) @@ -309,7 +309,13 @@ def handle_img(self, current_attrs): if image: try: if isinstance(self.doc, docx.document.Document): - self.doc.add_picture(image) + width = current_attrs.get('width') + height = current_attrs.get('height') + self.doc.add_picture( + image_path_or_stream=image, + width=Inches(int(width) / 72) if width else None, # 72 is the default dpi + height=Inches(int(height) / 72) if height else None + ) else: self.add_image_to_cell(self.doc, image) except FileNotFoundError: @@ -616,16 +622,20 @@ def run_process(self, html): html = str(self.soup) if self.include_tables: self.get_tables() + self.feed(html) + def add_html_to_document(self, html, 
document): if not isinstance(html, str): raise ValueError('First argument needs to be a %s' % str) elif not isinstance(document, docx.document.Document) and not isinstance(document, docx.table._Cell): raise ValueError('Second argument needs to be a %s' % docx.document.Document) + self.set_initial_attrs(document) self.run_process(html) + def add_html_to_cell(self, html, cell): if not isinstance(cell, docx.table._Cell): raise ValueError('Second argument needs to be a %s' % docx.table._Cell)