Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 7 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -101,17 +101,20 @@ This generates:
### Using the Library from Python

```python
from ctypes import cdll, c_char_p
from ctypes import cdll, c_char_p, POINTER, c_int

lib = cdll.LoadLibrary('./build/libschema.so')
lib.Convert.argtypes = [c_char_p]
lib.Convert.argtypes = [c_char_p, POINTER(c_char_p), c_int]
lib.Convert.restype = c_char_p

result = lib.Convert(b'<html><header>...</header><main>...</main></html>')
# Convert HTML with optional elements to strip
elements_to_strip = []
arr = (c_char_p * len(elements_to_strip))(*[s.encode('utf-8') for s in elements_to_strip])
result = lib.Convert(b'<html><header>...</header><main>...</main></html>', arr, len(elements_to_strip))
print(result.decode()) # Header stripped, main content preserved
```

See `examples/test_ffi.py` for a complete working example.
See `ffi_tests/python/main.py` for a complete working example.

### Automated Releases

Expand Down
29 changes: 25 additions & 4 deletions cmd/libschema/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,20 +11,41 @@ import (
)

//export Convert
func Convert(htmlInput *C.char, stripNav C.int, stripAside C.int, stripScript C.int) *C.char {
// Convert processes HTML with optional element stripping configuration.
//
// IMPORTANT: The function signature uses **C.char and C.int instead of []*C.char because
// Go slices cannot be properly passed across CGO/FFI boundaries. Go slices require a
// slice header (pointer, length, capacity), but when called from Python/Node.js FFI,
// the caller can only pass a C array pointer. Using a Go slice parameter causes a nil
// pointer dereference because the slice header is not properly initialized.
//
// The correct pattern for passing arrays through CGO is to pass the array pointer
// (as **C.char) and its length (as C.int) separately, then use unsafe.Slice to convert
// to a Go slice inside the function.
func Convert(htmlInput *C.char, elementsToStrip **C.char, elementsLen C.int) *C.char {
if htmlInput == nil {
return C.CString("")
}

// Convert C string to Go string
goHTML := C.GoString(htmlInput)
var goElementsToStrip []string

// Convert the C array to a Go slice with unsafe.Slice (no manual pointer arithmetic)
if elementsToStrip != nil && elementsLen > 0 {
// Create a slice from the C array
cArray := unsafe.Slice(elementsToStrip, elementsLen)
for _, cstr := range cArray {
if cstr != nil {
goElementsToStrip = append(goElementsToStrip, C.GoString(cstr))
}
}
}

// Use the converter package to process HTML with options
// Pass the caller-supplied element names through to the converter
stripConfig := converter.StripConfig{
StripNav: stripNav != 0,
StripAside: stripAside != 0,
StripScript: stripScript != 0,
ElementsToStrip: goElementsToStrip,
}
processed, err := converter.ProcessHTML([]byte(goHTML), stripConfig)
if err != nil {
Expand Down
8 changes: 5 additions & 3 deletions ffi_tests/nodejs/main.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@ const koffi = require('koffi')
const lib = koffi.load('./build/libschema.so')

// Define function signatures - using char* for auto string conversion
const Convert = lib.func('char* Convert(char* htmlInput, int stripNav, int stripAside, int stripScript)')
// Second param is char** (array of strings), third is int (array length)
const Convert = lib.func('char* Convert(char* htmlInput, char** elementsToStrip, int elementsLen)')

// Note: Not using Free() due to koffi memory management complexity
// In production, you'd need a proper memory management strategy
Expand All @@ -25,8 +26,9 @@ console.log('Testing Convert()...\n')
console.log('Input HTML:')
console.log(htmlInput)

// Call with all options enabled (strip nav, aside, script)
const result = Convert(htmlInput, 1, 1, 1)
// Call with empty array (use defaults)
const elementsToStrip = []
const result = Convert(htmlInput, elementsToStrip, elementsToStrip.length)

console.log('\nOutput HTML:')
console.log(result)
Expand Down
9 changes: 6 additions & 3 deletions ffi_tests/python/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,16 @@
Test script to verify the CGO library works via Python FFI
"""

from ctypes import cdll, c_char_p, c_bool
from ctypes import cdll, c_char_p, POINTER
import os

# Load the shared library
lib_path = os.path.join(os.path.dirname(__file__), '..', '..', 'build', 'libschema.so')
lib = cdll.LoadLibrary(lib_path)

# Set up function signatures
lib.Convert.argtypes = [c_char_p, c_bool, c_bool, c_bool]
from ctypes import c_int
lib.Convert.argtypes = [c_char_p, POINTER(c_char_p), c_int]
lib.Convert.restype = c_char_p

# Test HTML
Expand All @@ -27,7 +28,9 @@
</html>"""

print("Testing Convert()...")
result = lib.Convert(test_html, True, True, True)
strings = []
arr = (c_char_p * len(strings))(*[s.encode('utf-8') for s in strings])
result = lib.Convert(test_html, arr, len(strings))
result_str = result.decode('utf-8')

print("\nInput HTML:")
Expand Down
50 changes: 33 additions & 17 deletions internal/converter/converter.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,7 @@ import (
)

type StripConfig struct {
StripNav bool
StripAside bool
StripScript bool
ElementsToStrip []string
}

// StripElements removes specified HTML elements from the DOM
Expand All @@ -26,9 +24,34 @@ func StripElements(n *html.Node, tags ...string) {

for c := n.FirstChild; c != nil; c = c.NextSibling {
if c.Type == html.ElementNode && tagSet[c.Data] {
toRemove = append(toRemove, c)
shouldKeep := false

for _, attr := range c.Attr {
// Check for data-llm attribute, which overrides stripping
if attr.Key == "data-llm" && attr.Val == "keep" {
shouldKeep = true
}
}

// If we decided not to keep, mark for removal
if !shouldKeep {
toRemove = append(toRemove, c)
}
} else {
f(c) // Recurse on children
shouldKeep := true

for _, attr := range c.Attr {
// data-llm="drop" forces removal even though the tag is not in the strip list
if attr.Key == "data-llm" && attr.Val == "drop" {
shouldKeep = false
}
}

if !shouldKeep {
toRemove = append(toRemove, c)
} else {
f(c)
}
}
}

Expand All @@ -47,20 +70,13 @@ func ProcessHTML(htmlContent []byte, stripConfig StripConfig) ([]byte, error) {
return nil, err
}

// Build list of tags to strip (always include header and footer)
tags := []string{"header", "footer"}
if stripConfig.StripNav {
tags = append(tags, "nav")
}
if stripConfig.StripAside {
tags = append(tags, "aside")
}
if stripConfig.StripScript {
tags = append(tags, "script", "style")
}
// Always strip header and footer in addition to the caller-supplied elements.
// To keep one of them, mark it with data-llm="keep".
elementsToStrip := stripConfig.ElementsToStrip
elementsToStrip = append(elementsToStrip, "header", "footer")

// Strip specified tags
StripElements(doc, tags...)
StripElements(doc, elementsToStrip...)

// Serialize back to HTML
var buf bytes.Buffer
Expand Down
76 changes: 74 additions & 2 deletions internal/converter/converter_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ func TestProcessHTML(t *testing.T) {
</body>
</html>`

result, err := ProcessHTML([]byte(input), StripConfig{StripNav: true, StripAside: true, StripScript: true})
result, err := ProcessHTML([]byte(input), StripConfig{ElementsToStrip: []string{}})
if err != nil {
t.Fatalf("processHTML failed: %v", err)
}
Expand All @@ -38,6 +38,78 @@ func TestProcessHTML(t *testing.T) {
if !strings.Contains(resultStr, "Main Content") {
t.Error("Result missing main content")
}
}

// TestProcessHTMLWithElementsToStrip verifies that a caller-supplied tag name
// in StripConfig.ElementsToStrip is removed together with its content, while
// elements that were not requested are left in place.
func TestProcessHTMLWithElementsToStrip(t *testing.T) {
	input := `<!DOCTYPE html>
<html>
<head><title>Test</title></head>
<body>
<div><p>Content to keep</p></div>
<span><p>Content to strip</p></span>
</body>
</html>`

	result, err := ProcessHTML([]byte(input), StripConfig{ElementsToStrip: []string{"span"}})
	if err != nil {
		t.Fatalf("processHTML failed: %v", err)
	}

	resultStr := string(result)

	// The requested element must be gone...
	if strings.Contains(resultStr, "<span>") {
		t.Error("Result still contains <span> tag")
	}
	// ...along with everything nested inside it.
	if strings.Contains(resultStr, "Content to strip") {
		t.Error("Result still contains content of stripped <span>")
	}
	// Elements that were not requested must survive; without this check a
	// converter that strips everything would pass the test.
	if !strings.Contains(resultStr, "Content to keep") {
		t.Error("Result missing content of <div> that should be kept")
	}
}

// TestProcessHTMLWithDataLLMKeep verifies that a default-stripped element
// carrying data-llm="keep" survives processing, while an unmarked default
// element (footer) is still removed.
func TestProcessHTMLWithDataLLMKeep(t *testing.T) {
	const page = `<!DOCTYPE html>
<html>
<head><title>Test</title></head>
<body>
<header data-llm="keep"><h1>Header Content</h1></header>
<footer><p>Footer Content</p></footer>
</body>
</html>`

	out, err := ProcessHTML([]byte(page), StripConfig{ElementsToStrip: []string{}})
	if err != nil {
		t.Fatalf("processHTML failed: %v", err)
	}
	rendered := string(out)

	// The header opted out of stripping, so its opening tag must still be present.
	if headerKept := strings.Contains(rendered, `<header data-llm="keep">`); !headerKept {
		t.Error("Result missing <header> tag with data-llm=\"keep\"")
	}

	// The footer carries no override and is stripped by default.
	if footerLeft := strings.Contains(rendered, "<footer>"); footerLeft {
		t.Error("Result still contains <footer> tag")
	}
}

// TestProcessHTMLWithDataLLMDrop verifies that an element marked with
// data-llm="drop" is removed together with its content even though its tag
// name is not in the strip list.
func TestProcessHTMLWithDataLLMDrop(t *testing.T) {
	// The <div> is closed with a matching </div>. A mismatched closing tag
	// would be ignored by the HTML parser, leaving the div open so that it
	// swallows the following <footer> and muddies what the test exercises.
	input := `<!DOCTYPE html>
<html>
<head><title>Test</title></head>
<body>
<div data-llm="drop"><h1>drop this</h1></div>
<footer><p>Footer Content</p></footer>
</body>
</html>`
	result, err := ProcessHTML([]byte(input), StripConfig{ElementsToStrip: []string{}})
	if err != nil {
		t.Fatalf("processHTML failed: %v", err)
	}

	resultStr := string(result)

	t.Logf("Processed HTML:\n%s", resultStr)

	// The marked element itself must be gone.
	if strings.Contains(resultStr, "<div data-llm=\"drop\">") {
		t.Error("Result still contains <div> tag with data-llm=\"drop\"")
	}

	// Its nested content must be gone as well; report this separately so a
	// failure points at the right problem.
	if strings.Contains(resultStr, "drop this") {
		t.Error("Result still contains content of <div> with data-llm=\"drop\"")
	}
}
2 changes: 1 addition & 1 deletion internal/middleware/middleware.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ func GremllmMiddleware(next http.Handler) http.Handler {
next.ServeHTTP(rw, r)

// Process the HTML: strip header and footer tags using converter
processed, err := converter.ProcessHTML(rw.body.Bytes(), converter.StripConfig{StripNav: true, StripAside: true, StripScript: true})
processed, err := converter.ProcessHTML(rw.body.Bytes(), converter.StripConfig{})
if err != nil {
// If processing fails, return the original HTML
http.Error(w, err.Error(), http.StatusInternalServerError)
Expand Down
18 changes: 18 additions & 0 deletions scripts/run-ffi-tests.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#!/usr/bin/env bash
#
# Builds the shared library and runs the Python and Node.js FFI smoke tests.
# May be invoked from any directory: all paths are resolved against the
# repository root (the parent of this script's directory).

set -euo pipefail

# The build output path and the Node.js test's './build/libschema.so' are
# relative to the working directory, so always run from the repository root.
repo_root="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
cd "$repo_root"

mkdir -p build
go build -buildmode=c-shared -o build/libschema.so ./cmd/libschema/

# Install Node.js dependencies on first run only. The subshell keeps the
# directory change local, so no 'cd -' (which echoes a path) is needed.
if [ ! -d "ffi_tests/nodejs/node_modules" ]; then
  (cd ffi_tests/nodejs/ && npm install)
fi

echo "Running Python FFI test..."
python3 ffi_tests/python/main.py

echo "Running NodeJS FFI test..."
node ffi_tests/nodejs/main.js