From 75ec45237479e4a52699dae87170d3d9ae025a7d Mon Sep 17 00:00:00 2001 From: Jake Nesler <62406670+JakeNesler@users.noreply.github.com> Date: Sat, 27 Dec 2025 02:17:12 -0500 Subject: [PATCH 1/4] added script converter --- .gitignore | 6 +- examples/index.html | 89 +++++++++-- internal/converter/converter.go | 140 +++++++++++++++-- internal/converter/converter_test.go | 217 +++++++++++++++++++++++++++ internal/middleware/middleware.go | 61 +++++--- 5 files changed, 469 insertions(+), 44 deletions(-) diff --git a/.gitignore b/.gitignore index 7192557..8de2f8d 100644 --- a/.gitignore +++ b/.gitignore @@ -37,4 +37,8 @@ libschema.so **/node_modules/ # Generated release notes -RELEASE_NOTES.md \ No newline at end of file +RELEASE_NOTES.md +bakes/ +.mcp.json +fast.yaml +gremllm-server diff --git a/examples/index.html b/examples/index.html index 78f0745..bda9478 100644 --- a/examples/index.html +++ b/examples/index.html @@ -3,8 +3,7 @@ - - Sample Page - LLM Schema Test + Sample Page - Gremllm Test
-

Welcome to the LLM Schema Test Site

+

Welcome to the Gremllm Test Site

Introduction

-

This is a sample HTML page designed to test the LLM Schema middleware. The middleware should be able to serve this content in two ways:

+

This is a sample HTML page designed to test the Gremllm middleware. The middleware serves content in two ways:

Features

-

The LLM Schema project aims to provide:

+

The Gremllm project aims to provide:

  1. Token-optimized content for LLMs (50-80% reduction)
  2. Semantic preservation of meaning
  3. @@ -38,10 +37,82 @@

    Features

  4. Easy adoption for site owners
+

Interactive Widget Examples

+

The scripts below will be replaced with their descriptions for LLMs:

+ +

Live Clock

+
+
+
+
+
+
+
+
+
+ + +

Calculator

+
+
+ + + +
+ + Result: +
+ + +

Image Example

+

This image will be replaced with its alt text:

+ Architecture diagram showing HTML flowing through Gremllm middleware to produce LLM-optimized output +

Code Example

func Convert(html string) string {
-    // Convert HTML to LLM-optimized markdown
-    return markdown
+    // Convert HTML to LLM-optimized content
+    return optimized
 }
@@ -56,7 +127,7 @@

Newsletter Signup

diff --git a/internal/converter/converter.go b/internal/converter/converter.go index c473bbe..447710e 100644 --- a/internal/converter/converter.go +++ b/internal/converter/converter.go @@ -2,17 +2,131 @@ package converter import ( "bytes" + "strings" "golang.org/x/net/html" ) type StripConfig struct { - ElementsToStrip []string + ElementsToStrip []string + RemoveImagesNoAlt bool // If true, remove images without alt text entirely +} + +// Default elements to strip - users can preserve with data-llm="keep" +var defaultStripElements = []string{"nav", "aside", "footer", "header", "script", "style", "noscript", "svg", "iframe"} + +// ProcessScripts handles script tags with data-llm-description attribute. +// If a script has data-llm-description, it is replaced with a descriptive text node. +// Scripts with data-llm="keep" are preserved. Scripts without description are left +// for StripElements to remove. +func ProcessScripts(n *html.Node) { + type scriptReplacement struct { + node *html.Node + desc string + } + + var f func(*html.Node) + f = func(parent *html.Node) { + var toReplace []scriptReplacement + + for c := parent.FirstChild; c != nil; c = c.NextSibling { + if c.Type == html.ElementNode && c.Data == "script" { + // Check for data-llm="keep" - if present, skip entirely + shouldSkip := false + var description string + + for _, attr := range c.Attr { + if attr.Key == "data-llm" && attr.Val == "keep" { + shouldSkip = true + break + } + if attr.Key == "data-llm-description" { + description = attr.Val + } + } + + if !shouldSkip && strings.TrimSpace(description) != "" { + toReplace = append(toReplace, scriptReplacement{node: c, desc: description}) + } + // Scripts without description are left for StripElements to remove + } else { + f(c) // Recurse into non-script elements + } + } + + // Replace scripts with text nodes + for _, item := range toReplace { + textNode := &html.Node{ + Type: html.TextNode, + Data: "There's some javascript that has the following description: " + item.desc, + } + parent.InsertBefore(textNode, item.node) + parent.RemoveChild(item.node) + } + } + f(n) +} + +// ProcessImages replaces img tags with their alt text. +// Format: "[Image: alt text]" or "[Image]" if no alt. +// If removeIfNoAlt is true and no alt text exists, the image is removed entirely. +func ProcessImages(n *html.Node, removeIfNoAlt bool) { + type imageReplacement struct { + node *html.Node + alt string + remove bool + } + + var f func(*html.Node) + f = func(parent *html.Node) { + var toProcess []imageReplacement + + for c := parent.FirstChild; c != nil; c = c.NextSibling { + if c.Type == html.ElementNode && c.Data == "img" { + var altText string + for _, attr := range c.Attr { + if attr.Key == "alt" { + altText = attr.Val + break + } + } + + shouldRemove := altText == "" && removeIfNoAlt + toProcess = append(toProcess, imageReplacement{ + node: c, + alt: altText, + remove: shouldRemove, + }) + } else { + f(c) // Recurse + } + } + + for _, item := range toProcess { + if item.remove { + parent.RemoveChild(item.node) + } else { + var text string + if item.alt != "" { + text = "[Image: " + item.alt + "]" + } else { + text = "[Image]" + } + textNode := &html.Node{ + Type: html.TextNode, + Data: text, + } + parent.InsertBefore(textNode, item.node) + parent.RemoveChild(item.node) + } + } + } + f(n) } // StripElements removes specified HTML elements from the DOM func StripElements(n *html.Node, tags ...string) { - tagSet := make(map[string]bool) + tagSet := make(map[string]bool, len(tags)) for _, tag := range tags { tagSet[tag] = true } @@ -25,28 +139,23 @@ func StripElements(n *html.Node, tags ...string) { for c := n.FirstChild; c != nil; c = c.NextSibling { if c.Type == html.ElementNode && tagSet[c.Data] { shouldKeep := false - for _, attr := range c.Attr { - // Check for data-llm attribute, which overrides stripping if attr.Key == "data-llm" && attr.Val == "keep" { shouldKeep = true + break } } - - // If we decided not to keep, mark for removal if !shouldKeep { toRemove = append(toRemove, c) } } else { shouldKeep := true - for _, attr := range c.Attr { - // Check for data-llm attribute, which overrides stripping if attr.Key == "data-llm" && attr.Val == "drop" { shouldKeep = false + break } } - if !shouldKeep { toRemove = append(toRemove, c) } else { @@ -70,10 +179,15 @@ func ProcessHTML(htmlContent []byte, stripConfig StripConfig) ([]byte, error) { return nil, err } - // Add default elements to strip, if folks want to keep these they can use - // data-llm="keep" on them - elementsToStrip := stripConfig.ElementsToStrip - elementsToStrip = append(elementsToStrip, "header", "footer") + // Process scripts with data-llm-description FIRST (before stripping) + // This extracts descriptions before scripts are removed + ProcessScripts(doc) + + // Process images (replace with alt text) + ProcessImages(doc, stripConfig.RemoveImagesNoAlt) + + // Combine user-specified elements with defaults + elementsToStrip := append(stripConfig.ElementsToStrip, defaultStripElements...) // Strip specified tags StripElements(doc, elementsToStrip...) diff --git a/internal/converter/converter_test.go b/internal/converter/converter_test.go index d429e7e..6385931 100644 --- a/internal/converter/converter_test.go +++ b/internal/converter/converter_test.go @@ -113,3 +113,220 @@ func TestProcessHTMLWithDataLLMDrop(t *testing.T) { t.Error("Result still contains
tag with data-llm=\"drop\"") } } + +func TestProcessHTMLExpandedDefaults(t *testing.T) { + input := ` + +Test + + + +
Content
+ + + + + + +` + + result, err := ProcessHTML([]byte(input), StripConfig{}) + if err != nil { + t.Fatalf("ProcessHTML failed: %v", err) + } + + resultStr := string(result) + + for _, tag := range []string{"