From 75ec45237479e4a52699dae87170d3d9ae025a7d Mon Sep 17 00:00:00 2001
From: Jake Nesler <62406670+JakeNesler@users.noreply.github.com>
Date: Sat, 27 Dec 2025 02:17:12 -0500
Subject: [PATCH 1/4] added script converter
---
.gitignore | 6 +-
examples/index.html | 89 +++++++++--
internal/converter/converter.go | 140 +++++++++++++++--
internal/converter/converter_test.go | 217 +++++++++++++++++++++++++++
internal/middleware/middleware.go | 61 +++++---
5 files changed, 469 insertions(+), 44 deletions(-)
diff --git a/.gitignore b/.gitignore
index 7192557..8de2f8d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -37,4 +37,8 @@ libschema.so
**/node_modules/
# Generated release notes
-RELEASE_NOTES.md
\ No newline at end of file
+RELEASE_NOTES.md
+bakes/
+.mcp.json
+fast.yaml
+gremllm-server
diff --git a/examples/index.html b/examples/index.html
index 78f0745..bda9478 100644
--- a/examples/index.html
+++ b/examples/index.html
@@ -3,8 +3,7 @@
-
- Sample Page - LLM Schema Test
+ Sample Page - Gremllm Test
- Welcome to the LLM Schema Test Site
+ Welcome to the Gremllm Test Site
Introduction
- This is a sample HTML page designed to test the LLM Schema middleware. The middleware should be able to serve this content in two ways:
+ This is a sample HTML page designed to test the Gremllm middleware. The middleware serves content in two ways:
- As HTML: When you visit
/index.html
- - As optimized content: When you visit
/index.md
+ - As optimized content: When you visit
/index.html?gremllm
Features
- The LLM Schema project aims to provide:
+ The Gremllm project aims to provide:
- Token-optimized content for LLMs (50-80% reduction)
- Semantic preservation of meaning
@@ -38,10 +37,82 @@ Features
- Easy adoption for site owners
+ Interactive Widget Examples
+ The scripts below will be replaced with their descriptions for LLMs:
+
+ Live Clock
+
+
+
+ Calculator
+
+
+
+ Image Example
+ This image will be replaced with its alt text:
+
+
Code Example
func Convert(html string) string {
- // Convert HTML to LLM-optimized markdown
- return markdown
+ // Convert HTML to LLM-optimized content
+ return optimized
}
@@ -56,7 +127,7 @@ Newsletter Signup
diff --git a/internal/converter/converter.go b/internal/converter/converter.go
index c473bbe..447710e 100644
--- a/internal/converter/converter.go
+++ b/internal/converter/converter.go
@@ -2,17 +2,131 @@ package converter
import (
"bytes"
+ "strings"
"golang.org/x/net/html"
)
+// StripConfig holds the caller-supplied options that ProcessHTML reads.
type StripConfig struct {
- ElementsToStrip []string
+ ElementsToStrip []string
+ RemoveImagesNoAlt bool // If true, images without alt text are removed entirely; passed to ProcessImages
+}
+
+// defaultStripElements lists the tags ProcessHTML strips in addition to any
+// user-specified ones; an element can be preserved with data-llm="keep".
+var defaultStripElements = []string{"nav", "aside", "footer", "header", "script", "style", "noscript", "svg", "iframe"}
+
+// ProcessScripts replaces described script elements with plain-text summaries.
+//
+// A script carrying a non-blank data-llm-description attribute is swapped for
+// a text node containing that description (with surrounding whitespace
+// trimmed). Scripts marked data-llm="keep" are left untouched, and scripts
+// without a description are left in place so StripElements can remove them.
+// A nil root is a no-op.
+func ProcessScripts(n *html.Node) {
+	if n == nil {
+		return
+	}
+
+	type scriptReplacement struct {
+		node *html.Node
+		desc string
+	}
+
+	var walk func(*html.Node)
+	walk = func(parent *html.Node) {
+		var toReplace []scriptReplacement
+
+		for c := parent.FirstChild; c != nil; c = c.NextSibling {
+			if c.Type != html.ElementNode || c.Data != "script" {
+				walk(c) // Recurse into non-script nodes only
+				continue
+			}
+
+			keep := false
+			var description string
+			for _, attr := range c.Attr {
+				if attr.Key == "data-llm" && attr.Val == "keep" {
+					keep = true
+					break
+				}
+				if attr.Key == "data-llm-description" {
+					// Trim once so the blank check and the emitted text agree.
+					description = strings.TrimSpace(attr.Val)
+				}
+			}
+
+			if !keep && description != "" {
+				toReplace = append(toReplace, scriptReplacement{node: c, desc: description})
+			}
+		}
+
+		// Mutate the tree only after the sibling walk has finished, so that
+		// removing a node never invalidates the NextSibling iteration above.
+		for _, item := range toReplace {
+			textNode := &html.Node{
+				Type: html.TextNode,
+				Data: "There's some javascript that has the following description: " + item.desc,
+			}
+			parent.InsertBefore(textNode, item.node)
+			parent.RemoveChild(item.node)
+		}
+	}
+	walk(n)
+}
+
+// ProcessImages replaces every img element with a text placeholder.
+//
+// An image with alt text becomes "[Image: alt text]"; an image without alt
+// text becomes "[Image]", or is removed outright when removeIfNoAlt is true.
+// Alt text that is empty or consists only of whitespace counts as missing.
+// A nil root is a no-op.
+func ProcessImages(n *html.Node, removeIfNoAlt bool) {
+	if n == nil {
+		return
+	}
+
+	type imageReplacement struct {
+		node *html.Node
+		text string // placeholder text; empty means "remove without replacement"
+	}
+
+	var walk func(*html.Node)
+	walk = func(parent *html.Node) {
+		var toProcess []imageReplacement
+
+		for c := parent.FirstChild; c != nil; c = c.NextSibling {
+			if c.Type != html.ElementNode || c.Data != "img" {
+				walk(c) // Recurse
+				continue
+			}
+
+			var altText string
+			for _, attr := range c.Attr {
+				if attr.Key == "alt" {
+					altText = strings.TrimSpace(attr.Val)
+					break
+				}
+			}
+
+			var text string
+			switch {
+			case altText != "":
+				text = "[Image: " + altText + "]"
+			case !removeIfNoAlt:
+				text = "[Image]"
+			}
+			toProcess = append(toProcess, imageReplacement{node: c, text: text})
+		}
+
+		// Mutate the tree only after the sibling walk has finished, so that
+		// removing a node never invalidates the NextSibling iteration above.
+		for _, item := range toProcess {
+			if item.text != "" {
+				parent.InsertBefore(&html.Node{Type: html.TextNode, Data: item.text}, item.node)
+			}
+			parent.RemoveChild(item.node)
+		}
+	}
+	walk(n)
}
// StripElements removes specified HTML elements from the DOM
func StripElements(n *html.Node, tags ...string) {
- tagSet := make(map[string]bool)
+ tagSet := make(map[string]bool, len(tags))
for _, tag := range tags {
tagSet[tag] = true
}
@@ -25,28 +139,23 @@ func StripElements(n *html.Node, tags ...string) {
for c := n.FirstChild; c != nil; c = c.NextSibling {
if c.Type == html.ElementNode && tagSet[c.Data] {
shouldKeep := false
-
for _, attr := range c.Attr {
- // Check for data-llm attribute, which overrides stripping
if attr.Key == "data-llm" && attr.Val == "keep" {
shouldKeep = true
+ break
}
}
-
- // If we decided not to keep, mark for removal
if !shouldKeep {
toRemove = append(toRemove, c)
}
} else {
shouldKeep := true
-
for _, attr := range c.Attr {
- // Check for data-llm attribute, which overrides stripping
if attr.Key == "data-llm" && attr.Val == "drop" {
shouldKeep = false
+ break
}
}
-
if !shouldKeep {
toRemove = append(toRemove, c)
} else {
@@ -70,10 +179,15 @@ func ProcessHTML(htmlContent []byte, stripConfig StripConfig) ([]byte, error) {
return nil, err
}
- // Add default elements to strip, if folks want to keep these they can use
- // data-llm="keep" on them
- elementsToStrip := stripConfig.ElementsToStrip
- elementsToStrip = append(elementsToStrip, "header", "footer")
+ // Process scripts with data-llm-description FIRST (before stripping)
+ // This extracts descriptions before scripts are removed
+ ProcessScripts(doc)
+
+ // Process images (replace with alt text)
+ ProcessImages(doc, stripConfig.RemoveImagesNoAlt)
+
+ // Combine user-specified elements with defaults
+ elementsToStrip := append(stripConfig.ElementsToStrip, defaultStripElements...)
// Strip specified tags
StripElements(doc, elementsToStrip...)
diff --git a/internal/converter/converter_test.go b/internal/converter/converter_test.go
index d429e7e..6385931 100644
--- a/internal/converter/converter_test.go
+++ b/internal/converter/converter_test.go
@@ -113,3 +113,220 @@ func TestProcessHTMLWithDataLLMDrop(t *testing.T) {
t.Error("Result still contains tag with data-llm=\"drop\"")
}
}
+
+func TestProcessHTMLExpandedDefaults(t *testing.T) {
+ input := `
+
+
Test
+
+
+
+
Content
+
+
+
+
+
+
+`
+
+ result, err := ProcessHTML([]byte(input), StripConfig{})
+ if err != nil {
+ t.Fatalf("ProcessHTML failed: %v", err)
+ }
+
+ resultStr := string(result)
+
+ for _, tag := range []string{"