From f152aaa0182aa615ab5659a7085f0f2c5e3e07a6 Mon Sep 17 00:00:00 2001 From: TheOutdoorProgrammer Date: Fri, 26 Dec 2025 23:05:50 -0500 Subject: [PATCH] feat: remove nodes from html by default and allow data-llm to choose whether to remove nodes forcibly or not --- README.md | 11 ++-- cmd/libschema/main.go | 29 +++++++++-- ffi_tests/nodejs/main.js | 8 +-- ffi_tests/python/main.py | 9 ++-- internal/converter/converter.go | 50 +++++++++++------- internal/converter/converter_test.go | 76 +++++++++++++++++++++++++++- internal/middleware/middleware.go | 2 +- scripts/run-ffi-tests.sh | 18 +++++++ 8 files changed, 169 insertions(+), 34 deletions(-) create mode 100755 scripts/run-ffi-tests.sh diff --git a/README.md b/README.md index b9fc9d9..7cd0f6c 100644 --- a/README.md +++ b/README.md @@ -101,17 +101,20 @@ This generates: ### Using the Library from Python ```python -from ctypes import cdll, c_char_p +from ctypes import cdll, c_char_p, POINTER, c_int lib = cdll.LoadLibrary('./build/libschema.so') -lib.Convert.argtypes = [c_char_p] +lib.Convert.argtypes = [c_char_p, POINTER(c_char_p), c_int] lib.Convert.restype = c_char_p -result = lib.Convert(b'
...
...
') +# Convert HTML with optional elements to strip +elements_to_strip = [] +arr = (c_char_p * len(elements_to_strip))(*[s.encode('utf-8') for s in elements_to_strip]) +result = lib.Convert(b'
...
...
', arr, len(elements_to_strip)) print(result.decode()) # Header stripped, main content preserved ``` -See `examples/test_ffi.py` for a complete working example. +See `ffi_tests/python/main.py` for a complete working example. ### Automated Releases diff --git a/cmd/libschema/main.go b/cmd/libschema/main.go index 71b173b..9dc9df3 100644 --- a/cmd/libschema/main.go +++ b/cmd/libschema/main.go @@ -11,20 +11,41 @@ import ( ) //export Convert -func Convert(htmlInput *C.char, stripNav C.int, stripAside C.int, stripScript C.int) *C.char { +// Convert processes HTML with optional element stripping configuration. +// +// IMPORTANT: The function signature uses **C.char and C.int instead of []*C.char because +// Go slices cannot be properly passed across CGO/FFI boundaries. Go slices require a +// slice header (pointer, length, capacity), but when called from Python/Node.js FFI, +// the caller can only pass a C array pointer. Using a Go slice parameter causes a nil +// pointer dereference because the slice header is not properly initialized. +// +// The correct pattern for passing arrays through CGO is to pass the array pointer +// (as **C.char) and its length (as C.int) separately, then use unsafe.Slice to convert +// to a Go slice inside the function. +func Convert(htmlInput *C.char, elementsToStrip **C.char, elementsLen C.int) *C.char { if htmlInput == nil { return C.CString("") } // Convert C string to Go string goHTML := C.GoString(htmlInput) + var goElementsToStrip []string + + // Convert C array to Go slice using pointer arithmetic + if elementsToStrip != nil && elementsLen > 0 { + // Create a slice from the C array + cArray := unsafe.Slice(elementsToStrip, elementsLen) + for _, cstr := range cArray { + if cstr != nil { + goElementsToStrip = append(goElementsToStrip, C.GoString(cstr)) + } + } + } // Use the converter package to process HTML with options // Convert C ints to Go bools stripConfig := converter.StripConfig{ - StripNav: stripNav != 0, - StripAside: stripAside != 0, - StripScript: stripScript != 0, + ElementsToStrip: goElementsToStrip, } processed, err := converter.ProcessHTML([]byte(goHTML), stripConfig) if err != nil { diff --git a/ffi_tests/nodejs/main.js b/ffi_tests/nodejs/main.js index 70f58da..8e222d8 100644 --- a/ffi_tests/nodejs/main.js +++ b/ffi_tests/nodejs/main.js @@ -4,7 +4,8 @@ const koffi = require('koffi') const lib = koffi.load('./build/libschema.so') // Define function signatures - using char* for auto string conversion -const Convert = lib.func('char* Convert(char* htmlInput, int stripNav, int stripAside, int stripScript)') +// Second param is char** (array of strings), third is int (array length) +const Convert = lib.func('char* Convert(char* htmlInput, char** elementsToStrip, int elementsLen)') // Note: Not using Free() due to koffi memory management complexity // In production, you'd need a proper memory management strategy @@ -25,8 +26,9 @@ console.log('Testing Convert()...\n') console.log('Input HTML:') console.log(htmlInput) -// Call with all options enabled (strip nav, aside, script) -const result = Convert(htmlInput, 1, 1, 1) +// Call with empty array (use defaults) +const elementsToStrip = [] +const result = Convert(htmlInput, elementsToStrip, elementsToStrip.length) console.log('\nOutput HTML:') console.log(result) diff --git a/ffi_tests/python/main.py b/ffi_tests/python/main.py index 124faaa..3c50e92 100644 --- a/ffi_tests/python/main.py +++ b/ffi_tests/python/main.py @@ -3,7 +3,7 @@ Test script to verify the CGO library works via Python FFI """ -from ctypes import cdll, c_char_p, c_bool +from ctypes import cdll, c_char_p, POINTER import os # Load the shared library @@ -11,7 +11,8 @@ lib = cdll.LoadLibrary(lib_path) # Set up function signatures -lib.Convert.argtypes = [c_char_p, c_bool, c_bool, c_bool] +from ctypes import c_int +lib.Convert.argtypes = [c_char_p, POINTER(c_char_p), c_int] lib.Convert.restype = c_char_p # Test HTML @@ -27,7 +28,9 @@ """ print("Testing Convert()...") -result = lib.Convert(test_html, True, True, True) +strings = [] +arr = (c_char_p * len(strings))(*[s.encode('utf-8') for s in strings]) +result = lib.Convert(test_html, arr, len(strings)) result_str = result.decode('utf-8') print("\nInput HTML:") diff --git a/internal/converter/converter.go b/internal/converter/converter.go index 2778cdc..c473bbe 100644 --- a/internal/converter/converter.go +++ b/internal/converter/converter.go @@ -7,9 +7,7 @@ import ( ) type StripConfig struct { - StripNav bool - StripAside bool - StripScript bool + ElementsToStrip []string } // StripElements removes specified HTML elements from the DOM @@ -26,9 +24,34 @@ func StripElements(n *html.Node, tags ...string) { for c := n.FirstChild; c != nil; c = c.NextSibling { if c.Type == html.ElementNode && tagSet[c.Data] { - toRemove = append(toRemove, c) + shouldKeep := false + + for _, attr := range c.Attr { + // Check for data-llm attribute, which overrides stripping + if attr.Key == "data-llm" && attr.Val == "keep" { + shouldKeep = true + } + } + + // If we decided not to keep, mark for removal + if !shouldKeep { + toRemove = append(toRemove, c) + } } else { - f(c) // Recurse on children + shouldKeep := true + + for _, attr := range c.Attr { + // Check for data-llm attribute, which overrides stripping + if attr.Key == "data-llm" && attr.Val == "drop" { + shouldKeep = false + } + } + + if !shouldKeep { + toRemove = append(toRemove, c) + } else { + f(c) + } } } @@ -47,20 +70,13 @@ func ProcessHTML(htmlContent []byte, stripConfig StripConfig) ([]byte, error) { return nil, err } - // Build list of tags to strip (always include header and footer) - tags := []string{"header", "footer"} - if stripConfig.StripNav { - tags = append(tags, "nav") - } - if stripConfig.StripAside { - tags = append(tags, "aside") - } - if stripConfig.StripScript { - tags = append(tags, "script", "style") - } + // Add default elements to strip, if folks want to keep these they can use + // data-llm="keep" on them + elementsToStrip := stripConfig.ElementsToStrip + elementsToStrip = append(elementsToStrip, "header", "footer") // Strip specified tags - StripElements(doc, tags...) + StripElements(doc, elementsToStrip...) // Serialize back to HTML var buf bytes.Buffer diff --git a/internal/converter/converter_test.go b/internal/converter/converter_test.go index 6cdc4f2..d429e7e 100644 --- a/internal/converter/converter_test.go +++ b/internal/converter/converter_test.go @@ -16,7 +16,7 @@ func TestProcessHTML(t *testing.T) { ` - result, err := ProcessHTML([]byte(input), StripConfig{StripNav: true, StripAside: true, StripScript: true}) + result, err := ProcessHTML([]byte(input), StripConfig{ElementsToStrip: []string{}}) if err != nil { t.Fatalf("processHTML failed: %v", err) } @@ -38,6 +38,78 @@ func TestProcessHTML(t *testing.T) { if !strings.Contains(resultStr, "Main Content") { t.Error("Result missing main content") } +} + +func TestProcessHTMLWithElementsToStrip(t *testing.T) { + input := ` + +Test + +

Content to keep

+

Content to strip

+ +` + + result, err := ProcessHTML([]byte(input), StripConfig{ElementsToStrip: []string{"span"}}) + if err != nil { + t.Fatalf("processHTML failed: %v", err) + } + + resultStr := string(result) + + // Check that span is removed + if strings.Contains(resultStr, "") { + t.Error("Result still contains tag") + } +} + +func TestProcessHTMLWithDataLLMKeep(t *testing.T) { + input := ` + +Test + +

Header Content

+

Footer Content

+ +` + + result, err := ProcessHTML([]byte(input), StripConfig{ElementsToStrip: []string{}}) + if err != nil { + t.Fatalf("processHTML failed: %v", err) + } + + resultStr := string(result) + // Check that header with data-llm="keep" is preserved + if !strings.Contains(resultStr, "
") { + t.Error("Result missing
tag with data-llm=\"keep\"") + } + // Check that footer is removed + if strings.Contains(resultStr, "
") { + t.Error("Result still contains
tag") + } +} + +func TestProcessHTMLWithDataLLMDrop(t *testing.T) { + input := ` + +Test + +

drop this

+

Footer Content

+ +` + result, err := ProcessHTML([]byte(input), StripConfig{ElementsToStrip: []string{}}) + if err != nil { + t.Fatalf("processHTML failed: %v", err) + } + + resultStr := string(result) - t.Logf("Processed HTML:\n%s", resultStr) + if strings.Contains(resultStr, "
") { + t.Error("Result still contains
tag with data-llm=\"drop\"") + } + + if strings.Contains(resultStr, "drop this") { + t.Error("Result still contains
tag with data-llm=\"drop\"") + } } diff --git a/internal/middleware/middleware.go b/internal/middleware/middleware.go index 1ee359c..0565c94 100644 --- a/internal/middleware/middleware.go +++ b/internal/middleware/middleware.go @@ -58,7 +58,7 @@ func GremllmMiddleware(next http.Handler) http.Handler { next.ServeHTTP(rw, r) // Process the HTML: strip header and footer tags using converter - processed, err := converter.ProcessHTML(rw.body.Bytes(), converter.StripConfig{StripNav: true, StripAside: true, StripScript: true}) + processed, err := converter.ProcessHTML(rw.body.Bytes(), converter.StripConfig{}) if err != nil { // If processing fails, return the original HTML http.Error(w, err.Error(), http.StatusInternalServerError) diff --git a/scripts/run-ffi-tests.sh b/scripts/run-ffi-tests.sh new file mode 100755 index 0000000..6501418 --- /dev/null +++ b/scripts/run-ffi-tests.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash + +set -e + +mkdir -p build +go build -buildmode=c-shared -o build/libschema.so ./cmd/libschema/ + +cd ffi_tests/nodejs/ +if [ ! -d "node_modules" ]; then + npm install +fi +cd - + +echo "Running Python FFI test..." +python3 ffi_tests/python/main.py + +echo "Running NodeJS FFI test..." +node ffi_tests/nodejs/main.js \ No newline at end of file