From f152aaa0182aa615ab5659a7085f0f2c5e3e07a6 Mon Sep 17 00:00:00 2001
From: TheOutdoorProgrammer <joey@theoutdoorprogrammer.com>
Date: Fri, 26 Dec 2025 23:05:50 -0500
Subject: [PATCH] feat: remove nodes from html by default and allow data-llm to
 choose whether to remove nodes forcibly or not

---
 README.md                            | 11 ++--
 cmd/libschema/main.go                | 29 +++++++++--
 ffi_tests/nodejs/main.js             |  8 +--
 ffi_tests/python/main.py             |  9 ++--
 internal/converter/converter.go      | 50 +++++++++++-------
 internal/converter/converter_test.go | 76 +++++++++++++++++++++++++++-
 internal/middleware/middleware.go    |  2 +-
 scripts/run-ffi-tests.sh             | 18 +++++++
 8 files changed, 169 insertions(+), 34 deletions(-)
 create mode 100755 scripts/run-ffi-tests.sh
diff --git a/README.md b/README.md
index b9fc9d9..7cd0f6c 100644
--- a/README.md
+++ b/README.md
@@ -101,17 +101,20 @@ This generates:
 ### Using the Library from Python
 
 ```python
-from ctypes import cdll, c_char_p
+from ctypes import cdll, c_char_p, POINTER, c_int
 
 lib = cdll.LoadLibrary('./build/libschema.so')
-lib.Convert.argtypes = [c_char_p]
+lib.Convert.argtypes = [c_char_p, POINTER(c_char_p), c_int]
 lib.Convert.restype = c_char_p
 
-result = lib.Convert(b'<html><header>...</header><main>...</main></html>')
+# Convert HTML with optional elements to strip
+elements_to_strip = []
+arr = (c_char_p * len(elements_to_strip))(*[s.encode('utf-8') for s in elements_to_strip])
+result = lib.Convert(b'<html><header>...</header><main>...</main></html>', arr, len(elements_to_strip))
 print(result.decode())  # Header stripped, main content preserved
 ```
 
-See `examples/test_ffi.py` for a complete working example.
+See `ffi_tests/python/main.py` for a complete working example.
 
 ### Automated Releases
 
diff --git a/cmd/libschema/main.go b/cmd/libschema/main.go
index 71b173b..9dc9df3 100644
--- a/cmd/libschema/main.go
+++ b/cmd/libschema/main.go
@@ -11,20 +11,41 @@ import (
 )
 
 //export Convert
-func Convert(htmlInput *C.char, stripNav C.int, stripAside C.int, stripScript C.int) *C.char {
+// Convert processes HTML with optional element stripping configuration.
+//
+// IMPORTANT: The function signature uses **C.char and C.int instead of []*C.char because
+// Go slices cannot be properly passed across CGO/FFI boundaries. Go slices require a
+// slice header (pointer, length, capacity), but when called from Python/Node.js FFI,
+// the caller can only pass a C array pointer. Using a Go slice parameter causes a nil
+// pointer dereference because the slice header is not properly initialized.
+//
+// The correct pattern for passing arrays through CGO is to pass the array pointer
+// (as **C.char) and its length (as C.int) separately, then use unsafe.Slice to convert
+// to a Go slice inside the function.
+func Convert(htmlInput *C.char, elementsToStrip **C.char, elementsLen C.int) *C.char {
 	if htmlInput == nil {
 		return C.CString("")
 	}
 
 	// Convert C string to Go string
 	goHTML := C.GoString(htmlInput)
+	var goElementsToStrip []string
+
+	// Convert the C array to a Go slice (unsafe.Slice, no manual pointer arithmetic)
+	if elementsToStrip != nil && elementsLen > 0 {
+		// Create a slice from the C array
+		cArray := unsafe.Slice(elementsToStrip, elementsLen)
+		for _, cstr := range cArray {
+			if cstr != nil {
+				goElementsToStrip = append(goElementsToStrip, C.GoString(cstr))
+			}
+		}
+	}
 
 	// Use the converter package to process HTML with options
 	// Convert C ints to Go bools
 	stripConfig := converter.StripConfig{
-		StripNav:    stripNav != 0,
-		StripAside:  stripAside != 0,
-		StripScript: stripScript != 0,
+		ElementsToStrip: goElementsToStrip,
 	}
 	processed, err := converter.ProcessHTML([]byte(goHTML), stripConfig)
 	if err != nil {
diff --git a/ffi_tests/nodejs/main.js b/ffi_tests/nodejs/main.js
index 70f58da..8e222d8 100644
--- a/ffi_tests/nodejs/main.js
+++ b/ffi_tests/nodejs/main.js
@@ -4,7 +4,8 @@ const koffi = require('koffi')
 const lib = koffi.load('./build/libschema.so')
 
 // Define function signatures - using char* for auto string conversion
-const Convert = lib.func('char* Convert(char* htmlInput, int stripNav, int stripAside, int stripScript)')
+// Second param is char** (array of strings), third is int (array length)
+const Convert = lib.func('char* Convert(char* htmlInput, char** elementsToStrip, int elementsLen)')
 
 // Note: Not using Free() due to koffi memory management complexity
 // In production, you'd need a proper memory management strategy
@@ -25,8 +26,9 @@ console.log('Testing Convert()...\n')
 console.log('Input HTML:')
 console.log(htmlInput)
 
-// Call with all options enabled (strip nav, aside, script)
-const result = Convert(htmlInput, 1, 1, 1)
+// Call with empty array (use defaults)
+const elementsToStrip = []
+const result = Convert(htmlInput, elementsToStrip, elementsToStrip.length)
 
 console.log('\nOutput HTML:')
 console.log(result)
diff --git a/ffi_tests/python/main.py b/ffi_tests/python/main.py
index 124faaa..3c50e92 100644
--- a/ffi_tests/python/main.py
+++ b/ffi_tests/python/main.py
@@ -3,7 +3,7 @@
 Test script to verify the CGO library works via Python FFI
 """
 
-from ctypes import cdll, c_char_p, c_bool
+from ctypes import cdll, c_char_p, POINTER
 import os
 
 # Load the shared library
@@ -11,7 +11,8 @@
 lib = cdll.LoadLibrary(lib_path)
 
 # Set up function signatures
-lib.Convert.argtypes = [c_char_p, c_bool, c_bool, c_bool]
+from ctypes import c_int
+lib.Convert.argtypes = [c_char_p, POINTER(c_char_p), c_int]
 lib.Convert.restype = c_char_p
 
 # Test HTML
@@ -27,7 +28,9 @@
 </html>"""
 
 print("Testing Convert()...")
-result = lib.Convert(test_html, True, True, True)
+strings = []
+arr = (c_char_p * len(strings))(*[s.encode('utf-8') for s in strings])
+result = lib.Convert(test_html, arr, len(strings))
 result_str = result.decode('utf-8')
 
 print("\nInput HTML:")
diff --git a/internal/converter/converter.go b/internal/converter/converter.go
index 2778cdc..c473bbe 100644
--- a/internal/converter/converter.go
+++ b/internal/converter/converter.go
@@ -7,9 +7,7 @@ import (
 )
 
 type StripConfig struct {
-	StripNav    bool
-	StripAside  bool
-	StripScript bool
+	ElementsToStrip []string // tag names to strip in addition to the "header" and "footer" that ProcessHTML always appends
 }
 
 // StripElements removes specified HTML elements from the DOM
@@ -26,9 +24,34 @@ func StripElements(n *html.Node, tags ...string) {
 
 		for c := n.FirstChild; c != nil; c = c.NextSibling {
 			if c.Type == html.ElementNode && tagSet[c.Data] {
-				toRemove = append(toRemove, c)
+				shouldKeep := false
+
+				for _, attr := range c.Attr {
+					// Check for data-llm attribute, which overrides stripping
+					if attr.Key == "data-llm" && attr.Val == "keep" {
+						shouldKeep = true
+					}
+				}
+
+				// If we decided not to keep, mark for removal
+				if !shouldKeep {
+					toRemove = append(toRemove, c)
+				}
 			} else {
-				f(c) // Recurse on children
+				shouldKeep := true
+
+				for _, attr := range c.Attr {
+					// Check for data-llm attribute, which overrides stripping
+					if attr.Key == "data-llm" && attr.Val == "drop" {
+						shouldKeep = false
+					}
+				}
+
+				if !shouldKeep {
+					toRemove = append(toRemove, c)
+				} else {
+					f(c)
+				}
 			}
 		}
 
@@ -47,20 +70,13 @@ func ProcessHTML(htmlContent []byte, stripConfig StripConfig) ([]byte, error) {
 		return nil, err
 	}
 
-	// Build list of tags to strip (always include header and footer)
-	tags := []string{"header", "footer"}
-	if stripConfig.StripNav {
-		tags = append(tags, "nav")
-	}
-	if stripConfig.StripAside {
-		tags = append(tags, "aside")
-	}
-	if stripConfig.StripScript {
-		tags = append(tags, "script", "style")
-	}
+	// Always strip header and footer as well (data-llm="keep" opts an element out).
+	// Copy first: appending to the caller's slice could write into its backing array.
+	elementsToStrip := make([]string, 0, len(stripConfig.ElementsToStrip)+2)
+	elementsToStrip = append(append(elementsToStrip, stripConfig.ElementsToStrip...), "header", "footer")
 
 	// Strip specified tags
-	StripElements(doc, tags...)
+	StripElements(doc, elementsToStrip...)
 
 	// Serialize back to HTML
 	var buf bytes.Buffer
diff --git a/internal/converter/converter_test.go b/internal/converter/converter_test.go
index 6cdc4f2..d429e7e 100644
--- a/internal/converter/converter_test.go
+++ b/internal/converter/converter_test.go
@@ -16,7 +16,7 @@ func TestProcessHTML(t *testing.T) {
 </body>
 </html>`
 
-	result, err := ProcessHTML([]byte(input), StripConfig{StripNav: true, StripAside: true, StripScript: true})
+	result, err := ProcessHTML([]byte(input), StripConfig{ElementsToStrip: []string{}})
 	if err != nil {
 		t.Fatalf("processHTML failed: %v", err)
 	}
@@ -38,6 +38,78 @@ func TestProcessHTML(t *testing.T) {
 	if !strings.Contains(resultStr, "Main Content") {
 		t.Error("Result missing main content")
 	}
+}
+
+func TestProcessHTMLWithElementsToStrip(t *testing.T) {
+	input := `<!DOCTYPE html>
+<html>
+<head><title>Test</title></head>
+<body>
+	<div><p>Content to keep</p></div>
+	<span><p>Content to strip</p></span>
+</body>
+</html>`
+	result, err := ProcessHTML([]byte(input), StripConfig{ElementsToStrip: []string{"span"}})
+	if err != nil {
+		t.Fatalf("processHTML failed: %v", err)
+	}
+	resultStr := string(result)
+	// Only the listed <span> (and its content) may disappear; the <div> must survive.
+	if strings.Contains(resultStr, "<span>") || strings.Contains(resultStr, "Content to strip") {
+		t.Error("Result still contains <span> tag or its content")
+	}
+	if !strings.Contains(resultStr, "Content to keep") {
+		t.Error("Result missing content that should have been kept")
+	}
+}
+
+func TestProcessHTMLWithDataLLMKeep(t *testing.T) {
+	input := `<!DOCTYPE html>
+<html>
+<head><title>Test</title></head>
+<body>
+	<header data-llm="keep"><h1>Header Content</h1></header>
+	<footer><p>Footer Content</p></footer>
+</body>
+</html>`
+
+	result, err := ProcessHTML([]byte(input), StripConfig{ElementsToStrip: []string{}})
+	if err != nil {
+		t.Fatalf("processHTML failed: %v", err)
+	}
+
+	resultStr := string(result)
+	// Check that header with data-llm="keep" is preserved
+	if !strings.Contains(resultStr, "<header data-llm=\"keep\">") {
+		t.Error("Result missing <header> tag with data-llm=\"keep\"")
+	}
+	// Check that footer is removed
+	if strings.Contains(resultStr, "<footer>") {
+		t.Error("Result still contains <footer> tag")
+	}
+}
+
+func TestProcessHTMLWithDataLLMDrop(t *testing.T) {
+	input := `<!DOCTYPE html>
+<html>
+<head><title>Test</title></head>
+<body>
+	<div data-llm="drop"><h1>drop this</h1></div>
+	<footer><p>Footer Content</p></footer>
+</body>
+</html>`
+	result, err := ProcessHTML([]byte(input), StripConfig{ElementsToStrip: []string{}})
+	if err != nil {
+		t.Fatalf("processHTML failed: %v", err)
+	}
+	// <div> is not in the strip list; data-llm="drop" alone must remove it.
+	resultStr := string(result)
 
-	t.Logf("Processed HTML:\n%s", resultStr)
+	if strings.Contains(resultStr, "<div data-llm=\"drop\">") {
+		t.Error("Result still contains <div> tag with data-llm=\"drop\"")
+	}
+	// The dropped element's children must be removed along with it.
+	if strings.Contains(resultStr, "drop this") {
+		t.Error("Result still contains content of the dropped <div>")
+	}
 }
diff --git a/internal/middleware/middleware.go b/internal/middleware/middleware.go
index 1ee359c..0565c94 100644
--- a/internal/middleware/middleware.go
+++ b/internal/middleware/middleware.go
@@ -58,7 +58,7 @@ func GremllmMiddleware(next http.Handler) http.Handler {
 			next.ServeHTTP(rw, r)
 
 			// Process the HTML: strip header and footer tags using converter
-			processed, err := converter.ProcessHTML(rw.body.Bytes(), converter.StripConfig{StripNav: true, StripAside: true, StripScript: true})
+			processed, err := converter.ProcessHTML(rw.body.Bytes(), converter.StripConfig{})
 			if err != nil {
 				// If processing fails, return the original HTML
 				http.Error(w, err.Error(), http.StatusInternalServerError)
diff --git a/scripts/run-ffi-tests.sh b/scripts/run-ffi-tests.sh
new file mode 100755
index 0000000..6501418
--- /dev/null
+++ b/scripts/run-ffi-tests.sh
@@ -0,0 +1,18 @@
+#!/usr/bin/env bash
+# Builds the shared library, then runs the Python and NodeJS FFI smoke tests.
+set -euo pipefail
+
+mkdir -p build
+go build -buildmode=c-shared -o build/libschema.so ./cmd/libschema/
+
+# Install NodeJS test dependencies on first run; the subshell keeps the
+# working directory unchanged and avoids the path that `cd -` prints.
+if [ ! -d "ffi_tests/nodejs/node_modules" ]; then
+    (cd ffi_tests/nodejs/ && npm install)
+fi
+
+echo "Running Python FFI test..."
+python3 ffi_tests/python/main.py
+
+echo "Running NodeJS FFI test..."
+node ffi_tests/nodejs/main.js