From 7bf02cbc733b02075e31fa276a2b2ea3fcab943a Mon Sep 17 00:00:00 2001
From: wei <my.cow@163.com>
Date: Thu, 11 Aug 2022 17:31:27 +0800
Subject: [PATCH 1/3] table support cell merging

---
 htmldocx/h2d.py | 27 +++++++++++++++++++++++----
 1 file changed, 23 insertions(+), 4 deletions(-)
diff --git a/htmldocx/h2d.py b/htmldocx/h2d.py
index 6a0e113..13a6b1f 100644
--- a/htmldocx/h2d.py
+++ b/htmldocx/h2d.py
@@ -346,16 +346,29 @@ def handle_table(self):
             cols = self.get_table_columns(row)
             cell_col = 0
             for col in cols:
+                colspan = int(col.attrs.get('colspan', 1))
+                rowspan = int(col.attrs.get('rowspan', 1))
+
                 cell_html = self.get_cell_html(col)
                 if col.name == 'th':
                     cell_html = "<b>%s</b>" % cell_html
+
                 docx_cell = self.table.cell(cell_row, cell_col)
+                while docx_cell.text != '':  # Skip the merged cell
+                    cell_col += 1
+                    docx_cell = self.table.cell(cell_row, cell_col)
+
+                cell_to_merge = self.table.cell(cell_row + rowspan - 1, cell_col + colspan - 1)
+                if docx_cell != cell_to_merge:
+                    docx_cell.merge(cell_to_merge)
+
                 child_parser = HtmlToDocx()
                 child_parser.copy_settings_from(self)
-                child_parser.add_html_to_cell(cell_html, docx_cell)
-                cell_col += 1
+                child_parser.add_html_to_cell(cell_html or ' ', docx_cell)  # ' ' placeholder so the cell is not left empty (the skip loop above treats empty text as a free cell)
+
+                cell_col += colspan
             cell_row += 1
-        
+
         # skip all tags until corresponding closing tag
         self.instances_to_skip = len(table_soup.find_all('table'))
         self.skip_tag = 'table'
@@ -581,7 +594,13 @@ def get_table_dimensions(self, table_soup):
         # Thus the row dimensions and column dimensions are assumed to be 0
 
         cols = self.get_table_columns(rows[0]) if rows else []
-        return len(rows), len(cols)
+        # Column count is the sum of the colspan values of the first row's cells
+        col_count = 0
+        for col in cols:
+            colspan = col.attrs.get('colspan', 1)
+            col_count += int(colspan)
+
+        return len(rows), col_count
 
     def get_tables(self):
         if not hasattr(self, 'soup'):

From 8f6695a778c68befb302e48ac0ed5201ddbd4524 Mon Sep 17 00:00:00 2001
From: wei <my.cow@163.com>
Date: Fri, 12 Aug 2022 00:31:32 +0800
Subject: [PATCH 2/3] support tables cell merging

---
 .gitignore                        |  1 +
 htmldocx/h2d.py                   | 27 +++++++++++++++++++++++----
 tests/tables3.html                | 22 ++++++++++++++++++++++
 tests/test_tables_cell_merging.py |  9 +++++++++
 4 files changed, 55 insertions(+), 4 deletions(-)
 create mode 100644 tests/tables3.html
 create mode 100644 tests/test_tables_cell_merging.py

diff --git a/.gitignore b/.gitignore
index 526144e..f8923a4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,4 @@
 *.pyc
 
 venv
+/.idea
diff --git a/htmldocx/h2d.py b/htmldocx/h2d.py
index 6a0e113..13a6b1f 100644
--- a/htmldocx/h2d.py
+++ b/htmldocx/h2d.py
@@ -346,16 +346,29 @@ def handle_table(self):
             cols = self.get_table_columns(row)
             cell_col = 0
             for col in cols:
+                colspan = int(col.attrs.get('colspan', 1))
+                rowspan = int(col.attrs.get('rowspan', 1))
+
                 cell_html = self.get_cell_html(col)
                 if col.name == 'th':
                     cell_html = "<b>%s</b>" % cell_html
+
                 docx_cell = self.table.cell(cell_row, cell_col)
+                while docx_cell.text != '':  # Skip the merged cell
+                    cell_col += 1
+                    docx_cell = self.table.cell(cell_row, cell_col)
+
+                cell_to_merge = self.table.cell(cell_row + rowspan - 1, cell_col + colspan - 1)
+                if docx_cell != cell_to_merge:
+                    docx_cell.merge(cell_to_merge)
+
                 child_parser = HtmlToDocx()
                 child_parser.copy_settings_from(self)
-                child_parser.add_html_to_cell(cell_html, docx_cell)
-                cell_col += 1
+                child_parser.add_html_to_cell(cell_html or ' ', docx_cell)  # ' ' placeholder so the cell is not left empty (the skip loop above treats empty text as a free cell)
+
+                cell_col += colspan
             cell_row += 1
-        
+
         # skip all tags until corresponding closing tag
         self.instances_to_skip = len(table_soup.find_all('table'))
         self.skip_tag = 'table'
@@ -581,7 +594,13 @@ def get_table_dimensions(self, table_soup):
         # Thus the row dimensions and column dimensions are assumed to be 0
 
         cols = self.get_table_columns(rows[0]) if rows else []
-        return len(rows), len(cols)
+        # Column count is the sum of the colspan values of the first row's cells
+        col_count = 0
+        for col in cols:
+            colspan = col.attrs.get('colspan', 1)
+            col_count += int(colspan)
+
+        return len(rows), col_count
 
     def get_tables(self):
         if not hasattr(self, 'soup'):
diff --git a/tests/tables3.html b/tests/tables3.html
new file mode 100644
index 0000000..1fd7694
--- /dev/null
+++ b/tests/tables3.html
@@ -0,0 +1,22 @@
+<table border="1">
+    <tr>
+        <td rowspan="2" colspan="2">aa</td>
+        <td>bb</td>
+    </tr>
+    <tr>
+        <td>cc</td>
+    </tr>
+    <tr>
+        <td>dd</td>
+        <td colspan="2">ee</td>
+    </tr>
+    <tr>
+        <td rowspan="2">ff</td>
+        <td>gg</td>
+        <td>hh</td>
+    </tr>
+    <tr>
+        <td>ii</td>
+        <td>jj</td>
+    </tr>
+</table>
\ No newline at end of file
diff --git a/tests/test_tables_cell_merging.py b/tests/test_tables_cell_merging.py
new file mode 100644
index 0000000..c809ba5
--- /dev/null
+++ b/tests/test_tables_cell_merging.py
@@ -0,0 +1,9 @@
+import os
+from .context import HtmlToDocx, test_dir
+
+# Manual test (requires inspection of result) for converting html tables with merged cells (rowspan/colspan)
+
+filename = os.path.join(test_dir, 'tables3.html')
+d = HtmlToDocx()
+
+d.parse_html_file(filename)

From 0a965ba6a9fb63572432054bcc413ab7eecc0d82 Mon Sep 17 00:00:00 2001
From: wei <my.cow@163.com>
Date: Mon, 12 Dec 2022 09:42:07 +0800
Subject: [PATCH 3/3] fixed handle rgba(38, 42, 51, 0.9) error "invalid literal
 for int() with base 10: '0.9'"

---
 htmldocx/h2d.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/htmldocx/h2d.py b/htmldocx/h2d.py
index 13a6b1f..ad8de1c 100644
--- a/htmldocx/h2d.py
+++ b/htmldocx/h2d.py
@@ -230,7 +230,7 @@ def add_styles_to_run(self, style):
         if 'color' in style:
             if 'rgb' in style['color']:
                 color = re.sub(r'[a-z()]+', '', style['color'])
-                colors = [int(x) for x in color.split(',')]
+                colors = [int(x) for x in color.split(',')[:3]] # keep only the first 3 components: the trailing alpha in e.g. rgba(38, 42, 51, 0.9) is not an int and used to raise an error
             elif '#' in style['color']:
                 color = style['color'].lstrip('#')
                 colors = tuple(int(color[i:i+2], 16) for i in (0, 2, 4))
@@ -309,7 +309,13 @@ def handle_img(self, current_attrs):
         if image:
             try:
                 if isinstance(self.doc, docx.document.Document):
-                    self.doc.add_picture(image)
+                    width = current_attrs.get('width')
+                    height = current_attrs.get('height')
+                    self.doc.add_picture(
+                        image_path_or_stream=image,
+                        width=Inches(int(width) / 72) if width else None,  # 72 is the default dpi
+                        height=Inches(int(height) / 72) if height else None
+                    )
                 else:
                     self.add_image_to_cell(self.doc, image)
             except FileNotFoundError:
@@ -616,16 +622,20 @@ def run_process(self, html):
             html = str(self.soup)
         if self.include_tables:
             self.get_tables()
+
         self.feed(html)
 
+
     def add_html_to_document(self, html, document):
         if not isinstance(html, str):
             raise ValueError('First argument needs to be a %s' % str)
         elif not isinstance(document, docx.document.Document) and not isinstance(document, docx.table._Cell):
             raise ValueError('Second argument needs to be a %s' % docx.document.Document)
+
         self.set_initial_attrs(document)
         self.run_process(html)
 
+
     def add_html_to_cell(self, html, cell):
         if not isinstance(cell, docx.table._Cell):
             raise ValueError('Second argument needs to be a %s' % docx.table._Cell)