From 93a2784629d3a98de914887d1763d24d1123b4f0 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 28 Feb 2026 17:07:06 +0000 Subject: [PATCH] Rewrite Go library to faithfully port TypeScript architecture Replace the previous recursive descent parser with the same matcher-based lexer and rule-based parser architecture as the TypeScript implementation: - Matcher-based lexer: fixed, space, line, string, comment, number, text matchers tried in order, with IGNORE token skipping - Rule engine: Rule/RuleSpec/AltSpec with open/close states, push/replace/pop stack operations, 2-token lookahead, counter methods (Eq/Lt/Gt/Lte/Gte) - Grammar: val/map/list/pair/elem rules with exact alternate orderings matching the TypeScript JSON phase + Jsonic extension phase (including unshift vs push semantics, modlist delete/move operations) - Undefined sentinel to distinguish "no value" from nil (null), matching TypeScript's undefined vs null semantics - Slice propagation: elem.BC propagates updated slices to parent list rule to handle Go's append semantics (vs JS reference-type arrays) - Result chain following: parser follows replacement chain to find the actual result rule for implicit list cases All 30 TSV spec tests pass (26 parser + 4 utility). https://claude.ai/code/session_01FUyByRfJWUvyPkFoLZ5z6H --- go/grammar.go | 470 +++++++++++++++++++++++ go/jsonic.go | 27 +- go/lexer.go | 1006 +++++++++++++++++++++++++++++++------------------ go/parser.go | 682 +++++---------------------------- go/rule.go | 392 +++++++++++++++++++ go/token.go | 93 +++++ 6 files changed, 1692 insertions(+), 978 deletions(-) create mode 100644 go/grammar.go create mode 100644 go/rule.go create mode 100644 go/token.go diff --git a/go/grammar.go b/go/grammar.go new file mode 100644 index 0000000..96cc426 --- /dev/null +++ b/go/grammar.go @@ -0,0 +1,470 @@ +package jsonic + +// Grammar builds the default jsonic grammar rules. 
+// This is a faithful port of grammar.ts, matching the exact alternate orderings +// produced by the JSON phase followed by the Jsonic extension phase. +// +// Key: "unshift" means prepend (default when no {append:true}), +// "push" means append (when {append:true}). +func Grammar(rsm map[string]*RuleSpec, cfg *LexConfig) { + // Token sets + VAL := TinSetVAL // TX, NR, ST, VL + KEY := TinSetKEY // TX, NR, ST, VL + + // Helper: merge two Tin slices + merge := func(a, b []Tin) []Tin { + r := make([]Tin, 0, len(a)+len(b)) + r = append(r, a...) + r = append(r, b...) + return r + } + + // finish error function: if auto-close is disabled, return error token + finish := func(r *Rule, ctx *Context) *Token { + if !cfg.FinishRule { + return ctx.T0 + } + return nil + } + + // pairkey action: extract key from first matched token + pairkey := func(r *Rule, ctx *Context) { + _ = ctx + keyToken := r.O0 + var key string + if keyToken.Tin == TinST || keyToken.Tin == TinTX { + key, _ = keyToken.Val.(string) + } else { + key = keyToken.Src // Numbers, etc. use src as key + } + r.U["key"] = key + } + + // pairval: set key:value on node with merging support. + // Uses r.U["prev"] (saved by pair.BC[0]) to get the previous value, + // then deep-merges if both previous and new values are maps. 
+ pairval := func(r *Rule, ctx *Context) { + _ = ctx + key, _ := r.U["key"].(string) + val := r.Child.Node + + // Convert undefined to null + if IsUndefined(val) { + val = nil + } + + // Use saved previous value (set by pair.BC JSON phase before overwrite) + prev := r.U["prev"] + + if prev == nil { + nodeMapSet(r.Node, key, val) + } else { + // Deep merge using the Deep utility + nodeMapSet(r.Node, key, Deep(prev, val)) + } + } + + // ====== VAL rule ====== + valSpec := &RuleSpec{Name: "val"} + + // BO callbacks (JSON then Jsonic): + // JSON: clear node for new value + valSpec.BO = []StateAction{ + func(r *Rule, ctx *Context) { + _ = ctx + r.Node = Undefined // undefined in TS + }, + } + + // BC callbacks (JSON then Jsonic): + // JSON: resolve node value + valSpec.BC = []StateAction{ + func(r *Rule, ctx *Context) { + _ = ctx + if IsUndefined(r.Node) { + if IsUndefined(r.Child.Node) { + if r.OS == 0 { + r.Node = Undefined // no value + } else { + r.Node = r.O0.ResolveVal() + } + } else { + r.Node = r.Child.Node + } + } + // else: keep r.Node as is + }, + } + + // val.Open ordering (after JSON + Jsonic with append:true, delete:[2]): + // [0] OB → map (JSON) + // [1] OS → list (JSON) + // [2] KEY CL → implicit map at top (Jsonic, was [3] before delete) + // [3] KEY CL → pair dive (Jsonic) + // [4] VAL → plain value (Jsonic, replaces deleted JSON [2]) + // [5] CB/CS → implicit null ends (Jsonic) + // [6] CA → implicit list top (Jsonic) + // [7] CA → null before commas (Jsonic) + // [8] ZZ → end (Jsonic) + valSpec.Open = []*AltSpec{ + // JSON: A map: { ... + {S: [][]Tin{{TinOB}}, P: "map", B: 1}, + // JSON: A list: [ ... + {S: [][]Tin{{TinOS}}, P: "list", B: 1}, + // Jsonic: Implicit map at top level: a: ... + {S: [][]Tin{KEY, {TinCL}}, C: func(r *Rule, ctx *Context) bool { return r.D == 0 }, P: "map", B: 2}, + // Jsonic: Pair dive: a:b: ... 
+ {S: [][]Tin{KEY, {TinCL}}, P: "map", B: 2, N: map[string]int{"pk": 1}}, + // Jsonic: A plain value + {S: [][]Tin{VAL}}, + // Jsonic: Implicit ends {a:} → null, [a:] → null + {S: [][]Tin{{TinCB, TinCS}}, B: 1, C: func(r *Rule, ctx *Context) bool { return r.D > 0 }}, + // Jsonic: Implicit list at top level: a,b + {S: [][]Tin{{TinCA}}, C: func(r *Rule, ctx *Context) bool { return r.D == 0 }, P: "list", B: 1}, + // Jsonic: Value is implicitly null before commas + {S: [][]Tin{{TinCA}}, B: 1}, + // Jsonic: End of source + {S: [][]Tin{{TinZZ}}}, + } + + // val.Close ordering (after JSON + Jsonic with append:true, move:[1,-1]): + // [0] ZZ (JSON end) + // [1] CB/CS close (Jsonic) + // [2] CA implist comma (Jsonic) + // [3] condition-only implist space (Jsonic) + // [4] ZZ (Jsonic end) + // [5] b:1 more (JSON, moved to end) + valSpec.Close = []*AltSpec{ + // JSON: End of source + {S: [][]Tin{{TinZZ}}}, + // Jsonic: Explicitly close map or list: }, ] + {S: [][]Tin{{TinCB, TinCS}}, B: 1, + E: func(r *Rule, ctx *Context) *Token { + if r.D == 0 { + return ctx.T0 + } + return nil + }, + }, + // Jsonic: Implicit list (comma sep) + {S: [][]Tin{{TinCA}}, + C: func(r *Rule, ctx *Context) bool { return r.Lte("dlist", 0) && r.Lte("dmap", 0) }, + R: "list", U: map[string]any{"implist": true}}, + // Jsonic: Implicit list (space sep) - no token match, just condition + {C: func(r *Rule, ctx *Context) bool { return r.Lte("dlist", 0) && r.Lte("dmap", 0) }, + R: "list", U: map[string]any{"implist": true}, B: 1}, + // Jsonic: End of source + {S: [][]Tin{{TinZZ}}}, + // JSON: There's more - backtrack (MOVED TO END by move:[1,-1]) + {B: 1}, + } + + // ====== MAP rule ====== + mapSpec := &RuleSpec{Name: "map"} + + // BO callbacks (JSON then Jsonic): + mapSpec.BO = []StateAction{ + // JSON: create empty map + func(r *Rule, ctx *Context) { + _ = ctx + r.Node = make(map[string]any) + }, + // Jsonic: increment dmap depth + func(r *Rule, ctx *Context) { + _ = ctx + if v, ok := r.N["dmap"]; ok { + 
r.N["dmap"] = v + 1 + } else { + r.N["dmap"] = 1 + } + }, + } + + // map.Open ordering (after Jsonic unshift + append): + // [0] OB ZZ auto-close (Jsonic, unshifted) + // [1] OB CB empty map (JSON) + // [2] OB pair (JSON) + // [3] KEY CL implicit pair (Jsonic, appended) + mapSpec.Open = []*AltSpec{ + // Jsonic: Auto-close at EOF: {ZZ (unshifted - no append flag) + {S: [][]Tin{{TinOB}, {TinZZ}}, B: 1, E: finish}, + // JSON: Empty map: {} + {S: [][]Tin{{TinOB}, {TinCB}}, B: 1, N: map[string]int{"pk": 0}}, + // JSON: Start pairs: {key: + {S: [][]Tin{{TinOB}}, P: "pair", N: map[string]int{"pk": 0}}, + // Jsonic: Pair from implicit map (no braces) (appended) + {S: [][]Tin{KEY, {TinCL}}, P: "pair", B: 2}, + } + + // map.Close ordering (after Jsonic append + delete:[0]): + // [0] CB with lte(pk) (Jsonic, replaces deleted JSON [0]) + // [1] CB b:1 ascending path (Jsonic) + // [2] CA/CS/VAL b:1 end of implicit path (Jsonic) + // [3] ZZ auto-close (Jsonic) + mapSpec.Close = []*AltSpec{ + // Normal end of map, no path dive + {S: [][]Tin{{TinCB}}, C: func(r *Rule, ctx *Context) bool { return r.Lte("pk", 0) }}, + // Not yet at end of path dive, keep ascending + {S: [][]Tin{{TinCB}}, B: 1}, + // End of implicit path: comma, close-square, or value token + {S: [][]Tin{merge([]Tin{TinCA, TinCS}, VAL)}, B: 1}, + // Auto-close at EOF + {S: [][]Tin{{TinZZ}}, E: finish}, + } + + // ====== LIST rule ====== + listSpec := &RuleSpec{Name: "list"} + + // BO callbacks (JSON then Jsonic): + listSpec.BO = []StateAction{ + // JSON: create empty list + func(r *Rule, ctx *Context) { + _ = ctx + r.Node = make([]any, 0) + }, + // Jsonic: increment dlist depth, handle implist + func(r *Rule, ctx *Context) { + _ = ctx + if v, ok := r.N["dlist"]; ok { + r.N["dlist"] = v + 1 + } else { + r.N["dlist"] = 1 + } + // If previous rule was an implicit list, adopt its node + if r.Prev != NoRule && r.Prev != nil { + if implist, ok := r.Prev.U["implist"]; ok && implist == true { + arr := r.Node.([]any) + 
prevNode := r.Prev.Node + if IsUndefined(prevNode) { + prevNode = nil + } + arr = append(arr, prevNode) + r.Node = arr + r.Prev.Node = r.Node + } + } + }, + } + + // list.Open ordering (Jsonic unshift + JSON + Jsonic append): + // [0] implist condition (Jsonic, unshifted as single object) + // [1] OS CS empty list (JSON) + // [2] OS elem (JSON) + // [3] CA elem b:1 initial comma (Jsonic, appended) + // [4] p:elem (Jsonic, appended) + listSpec.Open = []*AltSpec{ + // Jsonic: if prev was implist, just push elem (unshifted) + {C: func(r *Rule, ctx *Context) bool { + return r.Prev != NoRule && r.Prev != nil && r.Prev.U["implist"] == true + }, P: "elem"}, + // JSON: Empty list: [] + {S: [][]Tin{{TinOS}, {TinCS}}, B: 1}, + // JSON: Start elements: [elem + {S: [][]Tin{{TinOS}}, P: "elem"}, + // Jsonic: Initial comma [, will insert null + {S: [][]Tin{{TinCA}}, P: "elem", B: 1}, + // Jsonic: Another element (no token match needed) + {P: "elem"}, + } + + // list.Close ordering (JSON + Jsonic append): + // [0] CS end of list (JSON) + // [1] ZZ auto-close (Jsonic, appended) + listSpec.Close = []*AltSpec{ + // JSON: End of list + {S: [][]Tin{{TinCS}}}, + // Jsonic: Auto-close at EOF + {S: [][]Tin{{TinZZ}}, E: finish}, + } + + // ====== PAIR rule ====== + pairSpec := &RuleSpec{Name: "pair"} + + // BC callbacks (JSON then Jsonic): + pairSpec.BC = []StateAction{ + // JSON phase: set key=value + func(r *Rule, ctx *Context) { + if _, ok := r.U["pair"]; ok { + r.U["prev"] = nodeMapGetVal(r.Node, r.U["key"]) + nodeMapSet(r.Node, r.U["key"].(string), r.Child.Node) + } + }, + // Jsonic phase: pairval with merge support + func(r *Rule, ctx *Context) { + if _, ok := r.U["pair"]; ok { + pairval(r, ctx) + } + }, + } + + // pair.Open ordering (JSON + Jsonic append): + // [0] KEY CL pair (JSON) + // [1] CA ignore comma (Jsonic, appended) + pairSpec.Open = []*AltSpec{ + // JSON: key:value pair + {S: [][]Tin{KEY, {TinCL}}, P: "val", U: map[string]any{"pair": true}, A: pairkey}, + // Jsonic: 
Ignore initial comma: {,a:1 + {S: [][]Tin{{TinCA}}}, + } + + // pair.Close ordering (after Jsonic unshift + delete:[0,1]): + // Jsonic alternates unshifted, then JSON [0] and [1] deleted. + // Final ordering: + // [0] CB lte(pk) b:1 (Jsonic) + // [1] CA CB lte(pk) b:1 (Jsonic) + // [2] CA ZZ (Jsonic) + // [3] CA lte(pk) r:pair (Jsonic) + // [4] CA lte(dmap,1) r:pair (Jsonic) + // [5] KEY lte(dmap,1) r:pair b:1 (Jsonic) + // [6] CB/CA/CS/KEY pk>0 b:1 (Jsonic) + // [7] CS error (Jsonic) + // [8] ZZ finish (Jsonic) + // [9] r:pair b:1 (Jsonic) + pairSpec.Close = []*AltSpec{ + // Jsonic: End of map, check pk depth + {S: [][]Tin{{TinCB}}, C: func(r *Rule, ctx *Context) bool { return r.Lte("pk", 0) }, B: 1}, + // Jsonic: Ignore trailing comma at end of map + {S: [][]Tin{{TinCA}, {TinCB}}, C: func(r *Rule, ctx *Context) bool { return r.Lte("pk", 0) }, B: 1}, + // Jsonic: Comma then EOF + {S: [][]Tin{{TinCA}, {TinZZ}}}, + // Jsonic: Comma means new pair at same level (with pk check) + {S: [][]Tin{{TinCA}}, C: func(r *Rule, ctx *Context) bool { return r.Lte("pk", 0) }, R: "pair"}, + // Jsonic: Comma means new pair if implicit top level map + {S: [][]Tin{{TinCA}}, C: func(r *Rule, ctx *Context) bool { return r.Lte("dmap", 1) }, R: "pair"}, + // Jsonic: Value means new pair (space-separated) if implicit top level map + {S: [][]Tin{KEY}, C: func(r *Rule, ctx *Context) bool { return r.Lte("dmap", 1) }, R: "pair", B: 1}, + // Jsonic: End of implicit path, keep closing until pk=0 + {S: [][]Tin{merge([]Tin{TinCB, TinCA, TinCS}, KEY)}, + C: func(r *Rule, ctx *Context) bool { _, ok := r.N["pk"]; return ok && r.N["pk"] > 0 }, + B: 1}, + // Jsonic: Can't close map with ] + {S: [][]Tin{{TinCS}}, E: func(r *Rule, ctx *Context) *Token { return r.C0 }}, + // Jsonic: Auto-close at EOF + {S: [][]Tin{{TinZZ}}, E: finish}, + // Jsonic: Who needs commas anyway? 
(implicit continuation) + {R: "pair", B: 1}, + } + + // ====== ELEM rule ====== + elemSpec := &RuleSpec{Name: "elem"} + + // BC callbacks (JSON then Jsonic): + elemSpec.BC = []StateAction{ + // JSON: push child node onto list + func(r *Rule, ctx *Context) { + _ = ctx + done, _ := r.U["done"].(bool) + if !done && !IsUndefined(r.Child.Node) { + if arr, ok := r.Node.([]any); ok { + r.Node = append(arr, r.Child.Node) + // Propagate updated slice to parent list rule + // (Go slices may reallocate on append, unlike JS arrays which are reference types) + if r.Parent != NoRule && r.Parent != nil { + r.Parent.Node = r.Node + } + } + } + }, + // Jsonic: handle pair-in-list + func(r *Rule, ctx *Context) { + if pair, ok := r.U["pair"]; ok && pair == true { + r.U["prev"] = nodeMapGetVal(r.Node, r.U["key"]) + pairval(r, ctx) + } + }, + } + + // elem.Open ordering (Jsonic unshifted + JSON): + // [0] CA CA double comma null (Jsonic, unshifted) + // [1] CA single comma null (Jsonic, unshifted) + // [2] KEY CL pair in list (Jsonic, unshifted) + // [3] p:val (JSON, original) + elemSpec.Open = []*AltSpec{ + // Jsonic: Empty commas insert null (CA CA) + {S: [][]Tin{{TinCA}, {TinCA}}, B: 2, + U: map[string]any{"done": true}, + A: func(r *Rule, ctx *Context) { + _ = ctx + if arr, ok := r.Node.([]any); ok { + r.Node = append(arr, nil) + // Propagate to parent + if r.Parent != NoRule && r.Parent != nil { + r.Parent.Node = r.Node + } + } + }}, + // Jsonic: Single comma inserts null + {S: [][]Tin{{TinCA}}, + U: map[string]any{"done": true}, + A: func(r *Rule, ctx *Context) { + _ = ctx + if arr, ok := r.Node.([]any); ok { + r.Node = append(arr, nil) + // Propagate to parent + if r.Parent != NoRule && r.Parent != nil { + r.Parent.Node = r.Node + } + } + }}, + // Jsonic: Pair in list [key:val] + {S: [][]Tin{KEY, {TinCL}}, P: "val", + N: map[string]int{"pk": 1, "dmap": 1}, + U: map[string]any{"done": true, "pair": true, "list": true}, + A: pairkey}, + // JSON: Element is a value + {P: "val"}, + 
} + + // elem.Close ordering (Jsonic unshifted + delete:[-1,-2]): + // [0] CA CS/ZZ trailing comma (Jsonic, unshifted) + // [1] CA r:elem next element (Jsonic, unshifted) + // [2] CS b:1 end of list (Jsonic, unshifted) + // [3] ZZ finish (Jsonic, unshifted) + // [4] CB error (Jsonic, unshifted) + // [5] r:elem b:1 implicit (Jsonic, unshifted) + // JSON [0] and [1] deleted by delete:[-1,-2] + elemSpec.Close = []*AltSpec{ + // Jsonic: Ignore trailing comma before ] or ZZ + {S: [][]Tin{{TinCA}, {TinCS, TinZZ}}, B: 1}, + // Jsonic: Next element + {S: [][]Tin{{TinCA}}, R: "elem"}, + // Jsonic: End of list + {S: [][]Tin{{TinCS}}, B: 1}, + // Jsonic: Auto-close at EOF + {S: [][]Tin{{TinZZ}}, E: finish}, + // Jsonic: Can't close list with } + {S: [][]Tin{{TinCB}}, E: func(r *Rule, ctx *Context) *Token { return r.C0 }}, + // Jsonic: Who needs commas anyway? (implicit element) + {R: "elem", B: 1}, + } + + rsm["val"] = valSpec + rsm["map"] = mapSpec + rsm["list"] = listSpec + rsm["pair"] = pairSpec + rsm["elem"] = elemSpec +} + +// nodeMapSet sets a key on a map node. +func nodeMapSet(node any, key any, val any) { + if m, ok := node.(map[string]any); ok { + k, _ := key.(string) + m[k] = val + } +} + +// nodeMapGet gets a value from a map node. +func nodeMapGet(node any, key any) (any, bool) { + if m, ok := node.(map[string]any); ok { + k, _ := key.(string) + v, exists := m[k] + return v, exists + } + return nil, false +} + +// nodeMapGetVal is a helper that returns the value or nil. +func nodeMapGetVal(node any, key any) any { + v, _ := nodeMapGet(node, key) + return v +} diff --git a/go/jsonic.go b/go/jsonic.go index 4036799..df935d6 100644 --- a/go/jsonic.go +++ b/go/jsonic.go @@ -2,7 +2,8 @@ // including unquoted keys, implicit objects/arrays, comments, trailing commas, // single-quoted strings, path diving (nested object shorthand), and more. // -// It is a Go port of the jsonic TypeScript library. 
+// It is a Go port of the jsonic TypeScript library, faithfully implementing +// the same matcher-based lexer and rule-based parser architecture. package jsonic // Parse parses a jsonic string and returns the resulting Go value. @@ -17,33 +18,13 @@ func Parse(src string) any { // Preprocess: handle literal \n, \r\n, \t in test input src = preprocessEscapes(src) - if len(src) == 0 { - return nil - } - - // Check if input is only whitespace - allWhitespace := true - for _, ch := range src { - if ch != ' ' && ch != '\t' && ch != '\n' && ch != '\r' { - allWhitespace = false - break - } - } - if allWhitespace { - return nil - } - - lex := newLexer(src) - tokens := lex.tokenize() - - p := newParser(tokens) - return p.parse() + p := NewParser() + return p.Start(src) } // preprocessEscapes replaces literal backslash-n sequences with real newlines, etc. // This handles the case where TSV test files contain literal "\n" in the input. func preprocessEscapes(s string) string { - // Only process if there are actual backslash characters if len(s) == 0 { return s } diff --git a/go/lexer.go b/go/lexer.go index a194e3a..3c86e52 100644 --- a/go/lexer.go +++ b/go/lexer.go @@ -5,503 +5,755 @@ import ( "unicode" ) -// tokenType identifies the kind of token. -type tokenType int - -const ( - tokenEOF tokenType = iota - tokenOpenBrace // { - tokenCloseBrace // } - tokenOpenBracket // [ - tokenCloseBracket // ] - tokenColon // : - tokenComma // , - tokenString // quoted string value - tokenNumber // numeric literal - tokenText // unquoted text - tokenTrue // true - tokenFalse // false - tokenNull // null - tokenNewline // \n or \r\n (significant for implicit commas) -) - -// token represents a lexical token. -type token struct { - typ tokenType - val string // raw string value; for strings, the unescaped content +// Lex is the lexer that produces tokens from source text. 
+type Lex struct { + Src string + pnt Point + end *Token // End-of-source token (cached) + tokens []*Token // Lookahead token queue + Config *LexConfig } -// lexer tokenizes jsonic input. -type lexer struct { - src []rune - pos int +// LexConfig holds lexer configuration. +type LexConfig struct { + StringChars map[rune]bool // Quote characters + MultiChars map[rune]bool // Multiline quote characters + EscapeChar rune + SpaceChars map[rune]bool + LineChars map[rune]bool + RowChars map[rune]bool + CommentLine []string // Line comment starters: "#", "//" + CommentBlock [][2]string // Block comment: [start, end] pairs + NumberHex bool + NumberOct bool + NumberBin bool + NumberSep rune // Separator char (underscore) + AllowUnknownEscape bool + FinishRule bool // Auto-close unclosed structures at EOF } -func newLexer(src string) *lexer { - return &lexer{src: []rune(src), pos: 0} -} - -func (l *lexer) peek() rune { - if l.pos >= len(l.src) { - return 0 +// DefaultLexConfig returns the default lexer configuration matching jsonic defaults. +func DefaultLexConfig() *LexConfig { + return &LexConfig{ + StringChars: map[rune]bool{'\'': true, '"': true, '`': true}, + MultiChars: map[rune]bool{'`': true}, + EscapeChar: '\\', + SpaceChars: map[rune]bool{' ': true, '\t': true}, + LineChars: map[rune]bool{'\r': true, '\n': true}, + RowChars: map[rune]bool{'\n': true}, + CommentLine: []string{"#", "//"}, + CommentBlock: [][2]string{{"/*", "*/"}}, + NumberHex: true, + NumberOct: true, + NumberBin: true, + NumberSep: '_', + AllowUnknownEscape: true, + FinishRule: true, } - return l.src[l.pos] } -func (l *lexer) advance() rune { - ch := l.src[l.pos] - l.pos++ - return ch +// NewLex creates a new lexer for the given source. +func NewLex(src string, cfg *LexConfig) *Lex { + return &Lex{ + Src: src, + pnt: Point{Len: len(src), SI: 0, RI: 1, CI: 1}, + Config: cfg, + } } -func (l *lexer) atEnd() bool { - return l.pos >= len(l.src) +// Token creates a new token at the current point. 
+func (l *Lex) Token(name string, tin Tin, val any, src string) *Token { + return MakeToken(name, tin, val, src, l.pnt) } -// tokenize produces all tokens from the input. -func (l *lexer) tokenize() []token { - var tokens []token +// Next returns the next non-IGNORE token. +// This is the core lexing method called by the parser. +// rule and tI are provided for context but not currently used in the simplified port. +func (l *Lex) Next() *Token { for { - tok := l.nextToken() - tokens = append(tokens, tok) - if tok.typ == tokenEOF { - break + tkn := l.nextRaw() + if tkn == nil { + return l.bad("unexpected", l.pnt.SI, l.pnt.SI+1) + } + // Skip IGNORE tokens (space, line, comment) + if TinSetIGNORE[tkn.Tin] { + continue } + return tkn } - return tokens } -func (l *lexer) nextToken() token { - l.skipSpacesAndComments() +// nextRaw returns the next raw token (including IGNORE tokens). +func (l *Lex) nextRaw() *Token { + // Return cached end token + if l.end != nil { + return l.end + } + + // Return queued lookahead tokens + if len(l.tokens) > 0 { + tkn := l.tokens[0] + l.tokens = l.tokens[1:] + return tkn + } - if l.atEnd() { - return token{typ: tokenEOF} + // End of source + if l.pnt.SI >= l.pnt.Len { + l.end = l.Token("#ZZ", TinZZ, Undefined, "") + return l.end } - ch := l.peek() + // Try matchers in order: match, fixed, space, line, string, comment, number, text + // (We skip 'match' as it's for plugins only) - // Check for newlines (emit as tokens for implicit comma handling) - if ch == '\n' || ch == '\r' { - l.consumeNewline() - // Skip additional whitespace/newlines - for !l.atEnd() { - c := l.peek() - if c == '\n' || c == '\r' { - l.consumeNewline() - } else if c == ' ' || c == '\t' { - l.advance() - } else { - break - } - } - return token{typ: tokenNewline} - } - - switch ch { - case '{': - l.advance() - return token{typ: tokenOpenBrace, val: "{"} - case '}': - l.advance() - return token{typ: tokenCloseBrace, val: "}"} - case '[': - l.advance() - return 
token{typ: tokenOpenBracket, val: "["} - case ']': - l.advance() - return token{typ: tokenCloseBracket, val: "]"} - case ':': - l.advance() - return token{typ: tokenColon, val: ":"} - case ',': - l.advance() - return token{typ: tokenComma, val: ","} - case '"', '\'', '`': - return l.readString(ch) - default: - // Try number, then fall back to text - if ch == '-' || ch == '+' || (ch >= '0' && ch <= '9') || ch == '.' { - if tok, ok := l.tryNumber(); ok { - return tok - } - } - return l.readText() + if tkn := l.matchFixed(); tkn != nil { + return tkn + } + if tkn := l.matchSpace(); tkn != nil { + return tkn + } + if tkn := l.matchLine(); tkn != nil { + return tkn + } + if tkn := l.matchString(); tkn != nil { + return tkn } + if tkn := l.matchComment(); tkn != nil { + return tkn + } + if tkn := l.matchNumber(); tkn != nil { + return tkn + } + if tkn := l.matchText(); tkn != nil { + return tkn + } + + // Bad token - no matcher matched + return l.bad("unexpected", l.pnt.SI, l.pnt.SI+1) } -// skipSpacesAndComments skips spaces, tabs, and comments but NOT newlines. 
-func (l *lexer) skipSpacesAndComments() { - for !l.atEnd() { - ch := l.peek() - if ch == ' ' || ch == '\t' { - l.advance() - continue - } - // Line comments - if ch == '#' { - l.skipLineComment() - continue - } - if ch == '/' && l.pos+1 < len(l.src) { - next := l.src[l.pos+1] - if next == '/' { - l.skipLineComment() - continue - } - if next == '*' { - l.skipBlockComment() - continue - } - } - break +func (l *Lex) bad(why string, pstart, pend int) *Token { + src := "" + if pstart >= 0 && pstart < len(l.Src) && pend <= len(l.Src) { + src = l.Src[pstart:pend] + } else if l.pnt.SI < len(l.Src) { + src = string(l.Src[l.pnt.SI]) } + tkn := l.Token("#BD", TinBD, nil, src) + tkn.Why = why + return tkn } -func (l *lexer) skipLineComment() { - for !l.atEnd() { - ch := l.advance() - if ch == '\n' { - break - } +// matchFixed matches fixed tokens: { } [ ] : , +func (l *Lex) matchFixed() *Token { + if l.pnt.SI >= l.pnt.Len { + return nil } + ch := l.Src[l.pnt.SI] + src := string(ch) + tin, ok := FixedTokens[src] + if !ok { + return nil + } + tkn := l.Token(tinName(tin), tin, nil, src) + l.pnt.SI++ + l.pnt.CI++ + return tkn } -func (l *lexer) skipBlockComment() { - l.advance() // / - l.advance() // * - for !l.atEnd() { - ch := l.advance() - if ch == '*' && !l.atEnd() && l.peek() == '/' { - l.advance() - return +// matchSpace matches space and tab characters. +func (l *Lex) matchSpace() *Token { + sI := l.pnt.SI + cI := l.pnt.CI + for sI < l.pnt.Len && l.Config.SpaceChars[rune(l.Src[sI])] { + sI++ + cI++ + } + if sI > l.pnt.SI { + src := l.Src[l.pnt.SI:sI] + tkn := l.Token("#SP", TinSP, nil, src) + l.pnt.SI = sI + l.pnt.CI = cI + return tkn + } + return nil +} + +// matchLine matches line ending characters (\r, \n). 
+func (l *Lex) matchLine() *Token { + sI := l.pnt.SI + rI := l.pnt.RI + for sI < l.pnt.Len && l.Config.LineChars[rune(l.Src[sI])] { + if l.Config.RowChars[rune(l.Src[sI])] { + rI++ } + sI++ + } + if sI > l.pnt.SI { + src := l.Src[l.pnt.SI:sI] + tkn := l.Token("#LN", TinLN, nil, src) + l.pnt.SI = sI + l.pnt.RI = rI + l.pnt.CI = 1 + return tkn } + return nil } -func (l *lexer) consumeNewline() { - ch := l.advance() - if ch == '\r' && !l.atEnd() && l.peek() == '\n' { - l.advance() +// matchComment matches line comments (# //) and block comments (/* */). +func (l *Lex) matchComment() *Token { + fwd := l.Src[l.pnt.SI:] + + // Line comments + for _, start := range l.Config.CommentLine { + if strings.HasPrefix(fwd, start) { + fI := len(start) + cI := l.pnt.CI + len(start) + for fI < len(fwd) && !l.Config.LineChars[rune(fwd[fI])] { + cI++ + fI++ + } + src := fwd[:fI] + tkn := l.Token("#CM", TinCM, nil, src) + l.pnt.SI += len(src) + l.pnt.CI = cI + return tkn + } + } + + // Block comments + for _, pair := range l.Config.CommentBlock { + start, end := pair[0], pair[1] + if strings.HasPrefix(fwd, start) { + rI := l.pnt.RI + cI := l.pnt.CI + len(start) + fI := len(start) + for fI < len(fwd) && !strings.HasPrefix(fwd[fI:], end) { + if l.Config.RowChars[rune(fwd[fI])] { + rI++ + cI = 0 + } + cI++ + fI++ + } + if strings.HasPrefix(fwd[fI:], end) { + cI += len(end) + src := fwd[:fI+len(end)] + tkn := l.Token("#CM", TinCM, nil, src) + l.pnt.SI += len(src) + l.pnt.RI = rI + l.pnt.CI = cI + return tkn + } + // Unterminated comment - return bad token + return l.bad("unterminated_comment", l.pnt.SI, l.pnt.SI+len(start)*9) + } } + + return nil } -// readString reads a quoted string (", ', or `). 
-func (l *lexer) readString(quote rune) token { - l.advance() // skip opening quote +// matchString matches quoted strings: "...", '...', `...` +func (l *Lex) matchString() *Token { + if l.pnt.SI >= l.pnt.Len { + return nil + } + q := rune(l.Src[l.pnt.SI]) + if !l.Config.StringChars[q] { + return nil + } + + isMultiLine := l.Config.MultiChars[q] + src := l.Src + sI := l.pnt.SI + 1 + rI := l.pnt.RI + cI := l.pnt.CI + 1 + var sb strings.Builder - for !l.atEnd() { - ch := l.advance() - if ch == '\\' && quote != '`' { - if l.atEnd() { - sb.WriteRune('\\') + srclen := len(src) + + for sI < srclen { + cI++ + c := rune(src[sI]) + + // End quote + if c == q { + sI++ + break + } + + // Escape character + if c == l.Config.EscapeChar && q != '`' { + sI++ + cI++ + if sI >= srclen { break } - esc := l.advance() + esc := src[sI] switch esc { + case 'b': + sb.WriteByte('\b') + case 'f': + sb.WriteByte('\f') case 'n': - sb.WriteRune('\n') + sb.WriteByte('\n') case 'r': - sb.WriteRune('\r') + sb.WriteByte('\r') case 't': - sb.WriteRune('\t') - case '\\': - sb.WriteRune('\\') - case '/': - sb.WriteRune('/') - case 'b': - sb.WriteRune('\b') - case 'f': - sb.WriteRune('\f') + sb.WriteByte('\t') + case 'v': + sb.WriteByte('\v') case '"': - sb.WriteRune('"') + sb.WriteByte('"') case '\'': - sb.WriteRune('\'') + sb.WriteByte('\'') case '`': - sb.WriteRune('`') - case 'u': - sb.WriteRune(l.readUnicodeEscape()) + sb.WriteByte('`') + case '\\': + sb.WriteByte('\\') + case '/': + sb.WriteByte('/') case 'x': - sb.WriteRune(l.readHexEscape()) + // ASCII escape \x** + sI++ + if sI+2 <= srclen { + cc := parseHexInt(src[sI : sI+2]) + if cc >= 0 { + sb.WriteRune(rune(cc)) + sI += 1 // loop will increment + cI += 2 + } else { + sb.WriteByte(esc) + sI-- + } + } + case 'u': + // Unicode escape \u**** or \u{*****} + sI++ + if sI < srclen && src[sI] == '{' { + sI++ + endI := strings.IndexByte(src[sI:], '}') + if endI >= 0 { + cc := parseHexInt(src[sI : sI+endI]) + if cc >= 0 { + sb.WriteRune(rune(cc)) 
+ sI += endI // skip past digits, loop handles +1 + cI += endI + 2 + } + } + } else if sI+4 <= srclen { + cc := parseHexInt(src[sI : sI+4]) + if cc >= 0 { + sb.WriteRune(rune(cc)) + sI += 3 + cI += 4 + } + } default: - // Allow unknown escapes - just emit the character - sb.WriteRune(esc) + if l.Config.AllowUnknownEscape { + sb.WriteByte(esc) + } } - } else if ch == quote { - break - } else { - sb.WriteRune(ch) + sI++ + continue } - } - return token{typ: tokenString, val: sb.String()} -} -func (l *lexer) readUnicodeEscape() rune { - // Handle \u{HHHH} or \uHHHH - if !l.atEnd() && l.peek() == '{' { - l.advance() // skip { - var hex strings.Builder - for !l.atEnd() && l.peek() != '}' { - hex.WriteRune(l.advance()) + // Check for unprintable / multiline + if c < 32 { + if isMultiLine && l.Config.LineChars[c] { + if l.Config.RowChars[c] { + rI++ + } + cI = 1 + sb.WriteByte(src[sI]) + sI++ + continue + } + // Non-multiline unprintable - bad + break } - if !l.atEnd() { - l.advance() // skip } + + // Normal body - fast scan + bI := sI + for sI < srclen { + cc := rune(src[sI]) + if cc < 32 || cc == q || cc == rune(l.Config.EscapeChar) { + break + } + sI++ + cI++ } - return parseHexRune(hex.String()) - } - // Standard 4-digit - var hex strings.Builder - for i := 0; i < 4 && !l.atEnd(); i++ { - hex.WriteRune(l.advance()) + cI-- // loop will re-increment + sb.WriteString(src[bI:sI]) + continue } - return parseHexRune(hex.String()) -} -func (l *lexer) readHexEscape() rune { - var hex strings.Builder - for i := 0; i < 2 && !l.atEnd(); i++ { - hex.WriteRune(l.advance()) + // Check for unterminated string + if sI > l.pnt.SI+1 && (sI <= l.pnt.Len && src[sI-1] != byte(q)) { + // Unterminated string + return l.bad("unterminated_string", l.pnt.SI, sI) } - return parseHexRune(hex.String()) + + val := sb.String() + ssrc := src[l.pnt.SI:sI] + tkn := l.Token("#ST", TinST, val, ssrc) + l.pnt.SI = sI + l.pnt.RI = rI + l.pnt.CI = cI + return tkn } -func parseHexRune(hex string) rune { - var 
val rune - for _, ch := range hex { - val <<= 4 - switch { - case ch >= '0' && ch <= '9': - val |= ch - '0' - case ch >= 'a' && ch <= 'f': - val |= ch - 'a' + 10 - case ch >= 'A' && ch <= 'F': - val |= ch - 'A' + 10 - } +// matchNumber matches numeric literals: decimal, hex (0x), octal (0o), binary (0b). +// Returns nil if the text at current position is not a valid number (lets text matcher try). +func (l *Lex) matchNumber() *Token { + if l.pnt.SI >= l.pnt.Len { + return nil } - return val -} -// tryNumber attempts to parse a number. Returns (token, true) on success. -// If the token turns out to be text (like "1a" or "0x" with no digits), returns false. -func (l *lexer) tryNumber() (token, bool) { - start := l.pos - var sb strings.Builder + src := l.Src + sI := l.pnt.SI + ch := src[sI] - ch := l.peek() + // Must start with digit, +, -, or . + if !isDigit(ch) && ch != '-' && ch != '+' && ch != '.' { + return nil + } + + // Save start position for backtracking + start := sI // Handle sign + hasSign := false if ch == '-' || ch == '+' { - sb.WriteRune(ch) - l.advance() - if l.atEnd() { - l.pos = start - return token{}, false + hasSign = true + sI++ + if sI >= len(src) { + return nil } - ch = l.peek() + ch = src[sI] } - // Check for hex: 0x or 0X - if ch == '0' && l.pos+1 < len(l.src) && (l.src[l.pos+1] == 'x' || l.src[l.pos+1] == 'X') { - sb.WriteRune(l.advance()) // 0 - sb.WriteRune(l.advance()) // x - hexCount := 0 - for !l.atEnd() && isHexDigit(l.peek()) { - sb.WriteRune(l.advance()) - hexCount++ + // Hex: 0x... 
+ if ch == '0' && sI+1 < len(src) && (src[sI+1] == 'x' || src[sI+1] == 'X') && l.Config.NumberHex { + sI += 2 + hexStart := sI + for sI < len(src) && (isHexDigitByte(src[sI]) || (l.Config.NumberSep != 0 && rune(src[sI]) == l.Config.NumberSep)) { + sI++ + } + if sI == hexStart { + // "0x" with no hex digits → let text matcher handle + return nil + } + // Check trailing text + if sI < len(src) && isTextContinuation(src[sI]) { + return nil } - if hexCount == 0 { - // "0x" with no hex digits → text - l.pos = start - return token{}, false + msrc := src[start:sI] + nstr := msrc + if l.Config.NumberSep != 0 { + nstr = strings.ReplaceAll(nstr, string(l.Config.NumberSep), "") } - // Check for trailing alpha/digit that would make this text - if !l.atEnd() && isTextChar(l.peek()) && !isTokenBreak(l.peek()) { - l.pos = start - return token{}, false + num := parseNumericString(nstr) + if num != num { // NaN check + return nil } - return token{typ: tokenNumber, val: sb.String()}, true + tkn := l.Token("#NR", TinNR, num, msrc) + l.pnt.SI = sI + l.pnt.CI += sI - start + return tkn } - // Check for octal: 0o - if ch == '0' && l.pos+1 < len(l.src) && (l.src[l.pos+1] == 'o' || l.src[l.pos+1] == 'O') { - sb.WriteRune(l.advance()) // 0 - sb.WriteRune(l.advance()) // o - octCount := 0 - for !l.atEnd() && l.peek() >= '0' && l.peek() <= '7' { - sb.WriteRune(l.advance()) - octCount++ + // Octal: 0o... 
+ if ch == '0' && sI+1 < len(src) && (src[sI+1] == 'o' || src[sI+1] == 'O') && l.Config.NumberOct { + sI += 2 + octStart := sI + for sI < len(src) && ((src[sI] >= '0' && src[sI] <= '7') || (l.Config.NumberSep != 0 && rune(src[sI]) == l.Config.NumberSep)) { + sI++ } - if octCount == 0 { - l.pos = start - return token{}, false + if sI == octStart { + return nil } - if !l.atEnd() && isTextChar(l.peek()) && !isTokenBreak(l.peek()) { - l.pos = start - return token{}, false + if sI < len(src) && isTextContinuation(src[sI]) { + return nil } - return token{typ: tokenNumber, val: sb.String()}, true + msrc := src[start:sI] + nstr := msrc + if l.Config.NumberSep != 0 { + nstr = strings.ReplaceAll(nstr, string(l.Config.NumberSep), "") + } + num := parseNumericString(nstr) + if num != num { + return nil + } + tkn := l.Token("#NR", TinNR, num, msrc) + l.pnt.SI = sI + l.pnt.CI += sI - start + return tkn } - // Check for binary: 0b - if ch == '0' && l.pos+1 < len(l.src) && (l.src[l.pos+1] == 'b' || l.src[l.pos+1] == 'B') { - sb.WriteRune(l.advance()) // 0 - sb.WriteRune(l.advance()) // b - binCount := 0 - for !l.atEnd() && (l.peek() == '0' || l.peek() == '1') { - sb.WriteRune(l.advance()) - binCount++ + // Binary: 0b... 
+ if ch == '0' && sI+1 < len(src) && (src[sI+1] == 'b' || src[sI+1] == 'B') && l.Config.NumberBin { + sI += 2 + binStart := sI + for sI < len(src) && ((src[sI] == '0' || src[sI] == '1') || (l.Config.NumberSep != 0 && rune(src[sI]) == l.Config.NumberSep)) { + sI++ + } + if sI == binStart { + return nil } - if binCount == 0 { - l.pos = start - return token{}, false + if sI < len(src) && isTextContinuation(src[sI]) { + return nil } - if !l.atEnd() && isTextChar(l.peek()) && !isTokenBreak(l.peek()) { - l.pos = start - return token{}, false + msrc := src[start:sI] + nstr := msrc + if l.Config.NumberSep != 0 { + nstr = strings.ReplaceAll(nstr, string(l.Config.NumberSep), "") } - return token{typ: tokenNumber, val: sb.String()}, true + num := parseNumericString(nstr) + if num != num { + return nil + } + tkn := l.Token("#NR", TinNR, num, msrc) + l.pnt.SI = sI + l.pnt.CI += sI - start + return tkn } - // Regular number: digits, optional dot, optional exponent - hasDigit := false + // Decimal number: optional leading dot, digits, decimal, exponent + // Pattern: \.?[0-9]+([0-9_]*[0-9])? (\.[0-9]?([0-9_]*[0-9])?)? ([eE][-+]?[0-9]+([0-9_]*[0-9])?)? + hasDigits := false + + // Leading dot + if ch == '.' { + if sI+1 >= len(src) || !isDigit(src[sI+1]) { + return nil // Just a dot, not a number + } + sI++ // consume dot + for sI < len(src) && (isDigit(src[sI]) || (l.Config.NumberSep != 0 && rune(src[sI]) == l.Config.NumberSep)) { + sI++ + hasDigits = true + } + } else { + // Integer part + for sI < len(src) && (isDigit(src[sI]) || (l.Config.NumberSep != 0 && rune(src[sI]) == l.Config.NumberSep)) { + hasDigits = true + sI++ + } + } - // Integer part - for !l.atEnd() && l.peek() >= '0' && l.peek() <= '9' { - sb.WriteRune(l.advance()) - hasDigit = true + if !hasDigits { + return nil } // Decimal point - if !l.atEnd() && l.peek() == '.' 
{ - nextAfterDot := rune(0) - if l.pos+1 < len(l.src) { - nextAfterDot = l.src[l.pos+1] - } - // Only treat as decimal if followed by digit or end/token break - if nextAfterDot >= '0' && nextAfterDot <= '9' { - sb.WriteRune(l.advance()) // . - for !l.atEnd() && l.peek() >= '0' && l.peek() <= '9' { - sb.WriteRune(l.advance()) - } - } else if hasDigit { - // "0." followed by non-digit like "0.a" → text - // Check if what follows the dot is a letter - if nextAfterDot != 0 && !isTokenBreak(nextAfterDot) && nextAfterDot != ',' && nextAfterDot != '}' && nextAfterDot != ']' && nextAfterDot != ':' && nextAfterDot != ' ' && nextAfterDot != '\t' && nextAfterDot != '\n' && nextAfterDot != '\r' { - l.pos = start - return token{}, false + if sI < len(src) && src[sI] == '.' { + // Check what follows the dot + if sI+1 < len(src) && isDigit(src[sI+1]) { + sI++ // consume dot + for sI < len(src) && (isDigit(src[sI]) || (l.Config.NumberSep != 0 && rune(src[sI]) == l.Config.NumberSep)) { + sI++ } - // "0." at end or before delimiter → just "0" - back up - // Actually "0." should be 0. Let's consume it - sb.WriteRune(l.advance()) // . + } else if sI+1 < len(src) && isTextContinuation(src[sI+1]) && src[sI+1] != '.' { + // "0.a" → not a number, let text handle it + return nil + } else { + // Trailing dot: "0." at end or before delimiter + sI++ // consume dot } } - if !hasDigit { - l.pos = start - return token{}, false - } - // Exponent - if !l.atEnd() && (l.peek() == 'e' || l.peek() == 'E') { - nextAfterE := rune(0) - if l.pos+1 < len(l.src) { - nextAfterE = l.src[l.pos+1] - } - // Is it a valid exponent? 
- if nextAfterE >= '0' && nextAfterE <= '9' || nextAfterE == '+' || nextAfterE == '-' { - sb.WriteRune(l.advance()) // e/E - if !l.atEnd() && (l.peek() == '+' || l.peek() == '-') { - sb.WriteRune(l.advance()) - } - expDigits := 0 - for !l.atEnd() && l.peek() >= '0' && l.peek() <= '9' { - sb.WriteRune(l.advance()) - expDigits++ - } - // Check trailing text after exponent - if !l.atEnd() && isTextChar(l.peek()) && !isTokenBreak(l.peek()) { - l.pos = start - return token{}, false - } - if expDigits == 0 { - l.pos = start - return token{}, false - } - } else { - // e followed by non-numeric → this might be text like "1e2e" - // We have digits so far, check if the 'e' is followed by something that makes it all text - if nextAfterE != 0 && !isTokenBreak(nextAfterE) && nextAfterE != ',' && nextAfterE != '}' && nextAfterE != ']' && nextAfterE != ':' && nextAfterE != ' ' && nextAfterE != '\t' && nextAfterE != '\n' && nextAfterE != '\r' { - l.pos = start - return token{}, false + if sI < len(src) && (src[sI] == 'e' || src[sI] == 'E') { + eSI := sI + sI++ // consume e + if sI < len(src) && (src[sI] == '+' || src[sI] == '-') { + sI++ + } + expStart := sI + for sI < len(src) && (isDigit(src[sI]) || (l.Config.NumberSep != 0 && rune(src[sI]) == l.Config.NumberSep)) { + sI++ + } + if sI == expStart { + // No exponent digits - check if trailing makes it text + if sI < len(src) && isTextContinuation(src[sI]) { + return nil } + sI = eSI // backtrack, 'e' is not part of number + } + // Check for trailing text after exponent + if sI < len(src) && isTextContinuation(src[sI]) { + return nil } } - // Check for trailing alpha that would make it text (e.g. 
"10b", "1a") - if !l.atEnd() { - next := l.peek() - if isTextChar(next) && !isTokenBreak(next) { - l.pos = start - return token{}, false - } + // Check for trailing alpha/text that would make this text + if sI < len(src) && isTextContinuation(src[sI]) { + return nil + } + + msrc := src[start:sI] + if len(msrc) == 0 || (hasSign && len(msrc) == 1) { + return nil + } + + // Check if this matches a value keyword (e.g. if value.def had this string) + // Not applicable for standard numbers, skip. + + nstr := msrc + if l.Config.NumberSep != 0 { + nstr = strings.ReplaceAll(nstr, string(l.Config.NumberSep), "") + } + + num := parseNumericString(nstr) + if num != num { // NaN + return nil } - return token{typ: tokenNumber, val: sb.String()}, true + tkn := l.Token("#NR", TinNR, num, msrc) + l.pnt.SI = sI + l.pnt.CI += sI - start + return tkn } -// readText reads unquoted text until a delimiter is hit. -// Text tokens stop at spaces, which act as token separators in jsonic. -func (l *lexer) readText() token { - var sb strings.Builder - for !l.atEnd() { - ch := l.peek() +// matchText matches unquoted text and checks for value keywords (true, false, null). +// Text is terminated by fixed tokens, whitespace, quotes, and comment starters. 
+func (l *Lex) matchText() *Token { + if l.pnt.SI >= l.pnt.Len { + return nil + } + + src := l.Src + sI := l.pnt.SI + start := sI + + for sI < len(src) { + ch := rune(src[sI]) + // Stop at: fixed tokens, whitespace, quotes, line chars if ch == '{' || ch == '}' || ch == '[' || ch == ']' || ch == ':' || ch == ',' || - ch == ' ' || ch == '\t' || - ch == '\n' || ch == '\r' || - ch == '"' || ch == '\'' || ch == '`' { + l.Config.SpaceChars[ch] || l.Config.LineChars[ch] || + l.Config.StringChars[ch] { break } - // Comments - if ch == '#' { - break + // Comment starters + rest := src[sI:] + isComment := false + for _, cs := range l.Config.CommentLine { + if strings.HasPrefix(rest, cs) { + isComment = true + break + } + } + if !isComment { + for _, cb := range l.Config.CommentBlock { + if strings.HasPrefix(rest, cb[0]) { + isComment = true + break + } + } } - if ch == '/' && l.pos+1 < len(l.src) && (l.src[l.pos+1] == '/' || l.src[l.pos+1] == '*') { + if isComment { break } + sI++ + } - sb.WriteRune(l.advance()) + if sI == start { + return nil } - val := sb.String() + msrc := src[start:sI] + mlen := len(msrc) - // Check for keywords - switch val { + // Check for value keywords + switch msrc { case "true": - return token{typ: tokenTrue, val: val} + tkn := l.Token("#VL", TinVL, true, msrc) + l.pnt.SI += mlen + l.pnt.CI += mlen + return tkn case "false": - return token{typ: tokenFalse, val: val} + tkn := l.Token("#VL", TinVL, false, msrc) + l.pnt.SI += mlen + l.pnt.CI += mlen + return tkn case "null": - return token{typ: tokenNull, val: val} + tkn := l.Token("#VL", TinVL, nil, msrc) + l.pnt.SI += mlen + l.pnt.CI += mlen + return tkn + } + + // Plain text + tkn := l.Token("#TX", TinTX, msrc, msrc) + l.pnt.SI += mlen + l.pnt.CI += mlen + + // Check if next char is a fixed token - push as lookahead (subMatchFixed) + if l.pnt.SI < l.pnt.Len { + nextCh := string(src[l.pnt.SI]) + if tin, ok := FixedTokens[nextCh]; ok { + fixTkn := l.Token(tinName(tin), tin, nil, nextCh) + l.pnt.SI++ 
+ l.pnt.CI++ + l.tokens = append(l.tokens, fixTkn) + } } - return token{typ: tokenText, val: val} + return tkn } -func isHexDigit(ch rune) bool { +// Helper functions + +func tinName(tin Tin) string { + switch tin { + case TinOB: + return "#OB" + case TinCB: + return "#CB" + case TinOS: + return "#OS" + case TinCS: + return "#CS" + case TinCL: + return "#CL" + case TinCA: + return "#CA" + default: + return "#UK" + } +} + +func isDigit(ch byte) bool { + return ch >= '0' && ch <= '9' +} + +func isHexDigitByte(ch byte) bool { return (ch >= '0' && ch <= '9') || (ch >= 'a' && ch <= 'f') || (ch >= 'A' && ch <= 'F') } -func isTextChar(ch rune) bool { - return ch != '{' && ch != '}' && ch != '[' && ch != ']' && - ch != ':' && ch != ',' && - ch != '"' && ch != '\'' && ch != '`' && - ch != '\n' && ch != '\r' && - !unicode.IsSpace(ch) +// isTextContinuation returns true if the character can continue a text token +// (i.e., it's not a delimiter). +func isTextContinuation(ch byte) bool { + r := rune(ch) + return !unicode.IsSpace(r) && ch != '{' && ch != '}' && ch != '[' && ch != ']' && + ch != ':' && ch != ',' && ch != '"' && ch != '\'' && ch != '`' } -func isTokenBreak(ch rune) bool { - return ch == '{' || ch == '}' || ch == '[' || ch == ']' || - ch == ':' || ch == ',' || - ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r' || - ch == '"' || ch == '\'' || ch == '`' +func parseHexInt(s string) int { + val := 0 + for _, ch := range s { + val <<= 4 + switch { + case ch >= '0' && ch <= '9': + val |= int(ch - '0') + case ch >= 'a' && ch <= 'f': + val |= int(ch-'a') + 10 + case ch >= 'A' && ch <= 'F': + val |= int(ch-'A') + 10 + default: + return -1 + } + } + return val } diff --git a/go/parser.go b/go/parser.go index f80f8c6..56fd9ba 100644 --- a/go/parser.go +++ b/go/parser.go @@ -6,565 +6,145 @@ import ( "strings" ) -// parser converts tokens into Go values. 
-type parser struct { - tokens []token - pos int -} - -func newParser(tokens []token) *parser { - return &parser{tokens: tokens, pos: 0} -} - -func (p *parser) peek() token { - if p.pos >= len(p.tokens) { - return token{typ: tokenEOF} - } - return p.tokens[p.pos] -} - -func (p *parser) peekAt(offset int) token { - idx := p.pos + offset - if idx >= len(p.tokens) { - return token{typ: tokenEOF} - } - return p.tokens[idx] -} - -func (p *parser) advance() token { - tok := p.tokens[p.pos] - p.pos++ - return tok -} - -func (p *parser) skipNewlines() { - for p.peek().typ == tokenNewline { - p.advance() - } -} - -// Parse is the entry point. It parses the full token stream. -func (p *parser) parse() any { - p.skipNewlines() - - if p.peek().typ == tokenEOF { +// Context holds the parse state. +type Context struct { + UI int // Unique rule ID counter + T0 *Token // First lookahead token + T1 *Token // Second lookahead token + V1 *Token // Previous token 1 + V2 *Token // Previous token 2 + RS []*Rule // Rule stack + RSI int // Rule stack index + RSM map[string]*RuleSpec // Rule spec map + KI int // Iteration counter +} + +// Parser orchestrates the parsing process. +type Parser struct { + Config *LexConfig + RSM map[string]*RuleSpec +} + +// NewParser creates a parser with default configuration. +func NewParser() *Parser { + cfg := DefaultLexConfig() + rsm := make(map[string]*RuleSpec) + Grammar(rsm, cfg) + return &Parser{Config: cfg, RSM: rsm} +} + +// Start parses the source string and returns the result. +func (p *Parser) Start(src string) any { + if src == "" { return nil } - result := p.parseTopLevel() - return result -} - -// parseTopLevel handles implicit top-level objects and lists. 
-// At the top level, without braces/brackets, jsonic tries to detect: -// - key:value pairs → implicit object -// - comma-separated values → implicit list -func (p *parser) parseTopLevel() any { - p.skipNewlines() - - tok := p.peek() - - // Explicit object/array - if tok.typ == tokenOpenBrace { - val := p.parseObject() - p.skipNewlines() - // Check if there's more at top level after the object: {a:1}, - if p.peek().typ == tokenComma { - p.advance() // skip comma - p.skipNewlines() - if p.peek().typ == tokenEOF { - // trailing comma after top-level object: `{a:1},` → [{"a":1}] - return []any{val} - } - // More values: `{a:1},{b:2}` → [{a:1},{b:2}] - arr := []any{val} - for { - p.skipNewlines() - if p.peek().typ == tokenEOF { - break - } - arr = append(arr, p.parseValue()) - p.skipNewlines() - if p.peek().typ == tokenComma { - p.advance() - continue - } - break - } - return arr - } - return val - } - - if tok.typ == tokenOpenBracket { - return p.parseArray() - } - - // Determine if this is an implicit object or implicit list. - // If we see VALUE COLON, it's an object. - // If we see VALUE COMMA VALUE or VALUE NEWLINE VALUE where the second - // value is not a key (no colon after), it could be a list. 
- // Also handle: `a,b` → ["a","b"], `1,2` → [1,2] - // And: `a:1,b:2` → {"a":1,"b":2} - if p.looksLikeImplicitObject() { - return p.parseImplicitObject() - } - - // Check for implicit list: comma-only like `,` or `,,` or `value,value` - if tok.typ == tokenComma { - return p.parseImplicitList() - } - - // Single value or implicit list - val := p.parseValue() - - if p.peek().typ == tokenComma { - // It's an implicit list: `a,b` or `1,2` - p.advance() // skip comma - return p.continueImplicitList(val) - } - if p.peek().typ == tokenNewline { - p.skipNewlines() - if p.peek().typ != tokenEOF { - // Newline-separated values: `1\n2\n3` - arr := []any{val} - for p.peek().typ != tokenEOF { - arr = append(arr, p.parseValue()) - p.skipNewlines() - } - return arr - } - } - - return val -} - -// looksLikeImplicitObject checks if the token stream from current position -// looks like key:value pairs (implicit object). -func (p *parser) looksLikeImplicitObject() bool { - // Scan forward: if we see a value token followed by colon, it's an object - i := p.pos - for i < len(p.tokens) { - tok := p.tokens[i] - if tok.typ == tokenNewline { - i++ - continue - } - if isValueToken(tok) { - // Check if next non-newline token is colon - j := i + 1 - for j < len(p.tokens) && p.tokens[j].typ == tokenNewline { - j++ - } - if j < len(p.tokens) && p.tokens[j].typ == tokenColon { - return true - } - } - break - } - return false -} - -// parseImplicitObject parses key:value pairs at the top level without braces. -func (p *parser) parseImplicitObject() any { - obj := make(map[string]any) - p.parsePairsInto(obj, tokenEOF) - return obj -} - -// parseImplicitList parses a comma-only top level like `,` or `,,` or `1,2,3`. 
-func (p *parser) parseImplicitList() any { - arr := []any{} - - for { - p.skipNewlines() - if p.peek().typ == tokenEOF { + // Check if all whitespace + allWS := true + for _, ch := range src { + if ch != ' ' && ch != '\t' && ch != '\n' && ch != '\r' { + allWS = false break } - if p.peek().typ == tokenComma { - arr = append(arr, nil) // leading/double comma → null - p.advance() - continue - } - val := p.parseValue() - arr = append(arr, val) - p.skipNewlines() - if p.peek().typ == tokenComma { - p.advance() - p.skipNewlines() - if p.peek().typ == tokenEOF { - // trailing comma → ignore - break - } - continue - } - break } - return arr -} - -// continueImplicitList continues parsing an implicit list after the first -// value and comma have been consumed. -func (p *parser) continueImplicitList(first any) any { - arr := []any{first} - - for { - p.skipNewlines() - if p.peek().typ == tokenEOF { - break - } - if p.peek().typ == tokenComma { - arr = append(arr, nil) - p.advance() - continue - } - val := p.parseValue() - arr = append(arr, val) - p.skipNewlines() - if p.peek().typ == tokenComma { - p.advance() - p.skipNewlines() - if p.peek().typ == tokenEOF { - break - } - continue - } - break - } - return arr -} - -// parseValue parses a single value. -func (p *parser) parseValue() any { - p.skipNewlines() - tok := p.peek() - - switch tok.typ { - case tokenOpenBrace: - return p.parseObject() - case tokenOpenBracket: - return p.parseArray() - case tokenString: - p.advance() - return tok.val - case tokenNumber: - p.advance() - return parseNumber(tok.val) - case tokenTrue: - p.advance() - return true - case tokenFalse: - p.advance() - return false - case tokenNull: - p.advance() - return nil - case tokenText: - p.advance() - return tok.val - default: - // Unexpected token - return nil - if tok.typ != tokenEOF { - p.advance() - } + if allWS { return nil } -} -// parseObject parses { ... }. 
-func (p *parser) parseObject() any { - p.advance() // skip { - p.skipNewlines() + lex := NewLex(src, p.Config) - if p.peek().typ == tokenCloseBrace { - p.advance() - return map[string]any{} + ctx := &Context{ + UI: 0, + T0: NoToken, + T1: NoToken, + V1: NoToken, + V2: NoToken, + RS: make([]*Rule, len(src)*4+100), + RSI: 0, + RSM: p.RSM, } - obj := make(map[string]any) - p.parsePairsInto(obj, tokenCloseBrace) - - p.skipNewlines() - if p.peek().typ == tokenCloseBrace { - p.advance() + startSpec := p.RSM["val"] + if startSpec == nil { + return nil } - // If no closing brace, auto-close (jsonic is lenient) - return obj -} - -// parsePairsInto parses key:value pairs into the given map until endToken. -func (p *parser) parsePairsInto(obj map[string]any, endToken tokenType) { - for { - p.skipNewlines() - tok := p.peek() - - if tok.typ == endToken || tok.typ == tokenEOF { - break - } + rule := MakeRule(startSpec, ctx, nil) + root := rule - // Skip extra commas in objects - if tok.typ == tokenComma { - p.advance() - continue - } - - // Closing tokens that shouldn't be consumed - if tok.typ == tokenCloseBrace || tok.typ == tokenCloseBracket { - break - } - - // Read key - key := p.parseKey() - - p.skipNewlines() - if p.peek().typ == tokenColon { - p.advance() // skip colon - } else { - // No colon found - treat as a value (shouldn't normally happen in object context) - obj[key] = key - continue - } - - p.skipNewlines() - - // Check for implicit null: key with no value (next is comma, close, newline, or EOF) - next := p.peek() - if next.typ == tokenComma || next.typ == tokenCloseBrace || next.typ == tokenEOF || - next.typ == tokenNewline { - obj[key] = nil - continue - } - - // Check for path diving: key:key2:value → nested objects - // If the value position has a value token followed by colon, it's path diving - if p.looksLikePathDive() { - p.parsePathDive(obj, key, endToken) - } else { - val := p.parseValue() - obj[key] = val - } - - p.skipNewlines() - - // Separator: 
comma, newline, or space (implicit) - if p.peek().typ == tokenComma { - p.advance() - } else if p.peek().typ == tokenNewline { - // newline acts as implicit comma - continue - } - // Space-separated pairs: no explicit separator needed + // Maximum iterations: 2 * numRules * srcLen * 2 * maxmul(3) + maxr := 2 * len(p.RSM) * len(src) * 2 * 3 + if maxr < 100 { + maxr = 100 } -} -// looksLikePathDive checks if the next tokens look like a path dive: -// VALUE COLON (not followed by open brace/bracket without a further colon) -func (p *parser) looksLikePathDive() bool { - // We're positioned right after the first colon. - // Look ahead: VALUE COLON means path dive. - tok := p.peek() - if !isValueToken(tok) { - return false - } - // Skip forward to find if there's a colon after this value - i := p.pos + 1 - for i < len(p.tokens) && p.tokens[i].typ == tokenNewline { - i++ + kI := 0 + for rule != NoRule && kI < maxr { + ctx.KI = kI + rule = rule.Process(ctx, lex) + kI++ } - if i < len(p.tokens) && p.tokens[i].typ == tokenColon { - return true - } - return false -} -// parsePathDive handles path diving like a:b:c:1 → {"a":{"b":{"c":1}}} -// It also handles merging: a:b:1 followed by a:c:2 → {"a":{"b":1,"c":2}} -func (p *parser) parsePathDive(obj map[string]any, key string, endToken tokenType) { - // Collect the chain of keys: we already have 'key', now collect more - keys := []string{key} - for { - // Current position should be at the next key token - keyTok := p.advance() // consume the key token - nextKey := tokenToString(keyTok) - p.skipNewlines() - - if p.peek().typ != tokenColon { - // This was the final value in the chain, not a key - setNestedValue(obj, keys, tokenToValue(keyTok)) - return - } - // It's key:... 
so consume the colon - keys = append(keys, nextKey) - p.advance() // skip colon - p.skipNewlines() - - // Check if next is still a path dive - if !p.looksLikePathDive() { - // Parse the final value - next := p.peek() - if next.typ == tokenComma || next.typ == tokenCloseBrace || next.typ == tokenEOF || - next.typ == tokenNewline { - setNestedValue(obj, keys, nil) - } else { - val := p.parseValue() - setNestedValue(obj, keys, val) - } - return - } + // Follow replacement chain: when val is replaced by list (implicit list), + // root.Node is stale. Follow Next/Prev links to find the actual result. + result := root + for result.Next != NoRule && result.Next != nil && result.Next.Prev == result { + result = result.Next } -} -// setNestedValue sets a value in a nested map structure. -// keys = ["a", "b", "c"], val = 1 → obj["a"]["b"]["c"] = 1 -// If intermediate maps exist and are maps, merge into them. -func setNestedValue(obj map[string]any, keys []string, val any) { - current := obj - for i := 0; i < len(keys)-1; i++ { - k := keys[i] - if existing, ok := current[k]; ok { - if m, ok := existing.(map[string]any); ok { - current = m - continue - } - } - m := make(map[string]any) - current[k] = m - current = m + if IsUndefined(result.Node) { + return nil } - current[keys[len(keys)-1]] = val + return result.Node } -// parseKey reads a key (string, text, number, true, false, null can all be keys). -func (p *parser) parseKey() string { - tok := p.advance() - switch tok.typ { - case tokenString: - return tok.val - case tokenText: - return tok.val - case tokenNumber: - return tok.val - case tokenTrue: - return "true" - case tokenFalse: - return "false" - case tokenNull: - return "null" - default: - return tok.val +// parseNumericString converts a numeric string to float64. +// Handles standard decimals, hex (0x), octal (0o), binary (0b), and signs. +func parseNumericString(s string) float64 { + if len(s) == 0 { + return math.NaN() } -} - -// parseArray parses [ ... ]. 
-func (p *parser) parseArray() any { - p.advance() // skip [ - p.skipNewlines() - - arr := []any{} - if p.peek().typ == tokenCloseBracket { - p.advance() - return arr + // Handle sign prefix for special formats + sign := 1.0 + ns := s + if ns[0] == '-' { + sign = -1.0 + ns = ns[1:] + } else if ns[0] == '+' { + sign = 1.0 + ns = ns[1:] } - for { - p.skipNewlines() - tok := p.peek() - - if tok.typ == tokenCloseBracket || tok.typ == tokenEOF { - break - } - - if tok.typ == tokenComma { - arr = append(arr, nil) // comma without value → null - p.advance() - p.skipNewlines() - continue - } - - val := p.parseValue() - arr = append(arr, val) - - p.skipNewlines() - - if p.peek().typ == tokenComma { - p.advance() - p.skipNewlines() - // Check for trailing comma before close bracket - if p.peek().typ == tokenCloseBracket { - break + if len(ns) >= 2 { + switch { + case ns[0] == '0' && (ns[1] == 'x' || ns[1] == 'X'): + val, err := strconv.ParseInt(ns[2:], 16, 64) + if err != nil { + return math.NaN() } - continue - } - - // Implicit comma: space/newline separated values in arrays - if p.peek().typ == tokenNewline { - p.skipNewlines() - if p.peek().typ == tokenCloseBracket || p.peek().typ == tokenEOF { - break + return sign * float64(val) + case ns[0] == '0' && (ns[1] == 'o' || ns[1] == 'O'): + val, err := strconv.ParseInt(ns[2:], 8, 64) + if err != nil { + return math.NaN() } - continue - } - - // Implicit comma between adjacent values: [a b], [a [b]], [a {b:2}] - if p.peek().typ != tokenCloseBracket && p.peek().typ != tokenEOF { - continue - } - - break - } - - if p.peek().typ == tokenCloseBracket { - p.advance() - } - - return arr -} - -// parseNumber converts a number string to float64. 
-func parseNumber(s string) float64 { - // Handle underscore separators - s = strings.ReplaceAll(s, "_", "") - - // Handle hex - if len(s) >= 2 && s[0] == '0' && (s[1] == 'x' || s[1] == 'X') { - val, err := strconv.ParseInt(s[2:], 16, 64) - if err != nil { - return 0 - } - return float64(val) - } - // Handle negative hex - if len(s) >= 3 && s[0] == '-' && s[1] == '0' && (s[2] == 'x' || s[2] == 'X') { - val, err := strconv.ParseInt(s[3:], 16, 64) - if err != nil { - return 0 - } - return float64(-val) - } - - // Handle octal - if len(s) >= 2 && s[0] == '0' && (s[1] == 'o' || s[1] == 'O') { - val, err := strconv.ParseInt(s[2:], 8, 64) - if err != nil { - return 0 + return sign * float64(val) + case ns[0] == '0' && (ns[1] == 'b' || ns[1] == 'B'): + val, err := strconv.ParseInt(ns[2:], 2, 64) + if err != nil { + return math.NaN() + } + return sign * float64(val) } - return float64(val) } - // Handle binary - if len(s) >= 2 && s[0] == '0' && (s[1] == 'b' || s[1] == 'B') { - val, err := strconv.ParseInt(s[2:], 2, 64) - if err != nil { - return 0 - } - return float64(val) - } + // Remove underscores if present + ns = strings.ReplaceAll(s, "_", "") - val, err := strconv.ParseFloat(s, 64) + val, err := strconv.ParseFloat(ns, 64) if err != nil { - return 0 + return math.NaN() } // Normalize -0 to 0 @@ -574,57 +154,3 @@ func parseNumber(s string) float64 { return val } - -// isValueToken returns true if the token can be a value or key. -func isValueToken(tok token) bool { - switch tok.typ { - case tokenString, tokenText, tokenNumber, tokenTrue, tokenFalse, tokenNull: - return true - } - return false -} - -// tokenToString returns the string representation of a token for use as a key. -func tokenToString(tok token) string { - switch tok.typ { - case tokenString: - return tok.val - case tokenTrue: - return "true" - case tokenFalse: - return "false" - case tokenNull: - return "null" - default: - return tok.val - } -} - -// tokenToValue converts a token to its Go value. 
-func tokenToValue(tok token) any { - switch tok.typ { - case tokenString: - return tok.val - case tokenNumber: - return parseNumber(tok.val) - case tokenTrue: - return true - case tokenFalse: - return false - case tokenNull: - return nil - case tokenText: - return tok.val - default: - return tok.val - } -} - -// normalizeNumber cleans up float64 for JSON comparison: -// if the float64 is a whole number, ensure it compares properly. -func normalizeNumber(v float64) any { - if math.IsInf(v, 0) || math.IsNaN(v) { - return v - } - return v -} diff --git a/go/rule.go b/go/rule.go new file mode 100644 index 0000000..9e857b5 --- /dev/null +++ b/go/rule.go @@ -0,0 +1,392 @@ +package jsonic + +// RuleState represents whether a rule is in open or close state. +type RuleState = string + +const ( + OPEN RuleState = "o" + CLOSE RuleState = "c" +) + +// Undefined is a sentinel value distinguishing "no value" from nil (null). +// In TypeScript, undefined !== null. In Go, we use this sentinel. +type undefinedType struct{} + +var Undefined any = &undefinedType{} + +// IsUndefined checks if a value is the Undefined sentinel. +func IsUndefined(v any) bool { + _, ok := v.(*undefinedType) + return ok +} + +// UnwrapUndefined converts Undefined sentinels to nil in the result. +func UnwrapUndefined(v any) any { + if IsUndefined(v) { + return nil + } + switch val := v.(type) { + case map[string]any: + for k, vv := range val { + val[k] = UnwrapUndefined(vv) + } + return val + case []any: + for i, vv := range val { + val[i] = UnwrapUndefined(vv) + } + return val + } + return v +} + +// AltCond is a condition function for an alternate. +type AltCond func(r *Rule, ctx *Context) bool + +// AltAction is an action function for an alternate. +type AltAction func(r *Rule, ctx *Context) + +// AltError is an error function for an alternate. +type AltError func(r *Rule, ctx *Context) *Token + +// StateAction is a before/after action on a rule state transition. 
+type StateAction func(r *Rule, ctx *Context) + +// AltSpec defines a parse alternate specification. +type AltSpec struct { + S [][]Tin // Token Tin sequences to match: s[0] for t0, s[1] for t1 + P string // Push rule name (create child) + R string // Replace rule name (create sibling) + B int // Move token pointer backward (backtrack) + C AltCond // Custom condition + N map[string]int // Counter increments + A AltAction // Match action + U map[string]any // Custom props added to Rule.u + K map[string]any // Custom props added to Rule.k (propagated) + G string // Named group tags (comma-separated) + E AltError // Error generation +} + +// RuleSpec defines the specification for a parsing rule. +type RuleSpec struct { + Name string + Open []*AltSpec + Close []*AltSpec + BO []StateAction // Before-open actions + BC []StateAction // Before-close actions + AO []StateAction // After-open actions + AC []StateAction // After-close actions +} + +// Rule represents a rule instance during parsing. +type Rule struct { + I int + Name string + Spec *RuleSpec + Node any + State RuleState + D int + Child *Rule + Parent *Rule + Prev *Rule + Next *Rule + O0 *Token + O1 *Token + C0 *Token + C1 *Token + OS int + CS int + N map[string]int + U map[string]any + K map[string]any + Why string +} + +// NoRule is a sentinel rule. +// Node is Undefined (like TS where NORULE.node = undefined). +var NoRule *Rule + +func init() { + NoRule = &Rule{Name: "norule", I: -1, State: OPEN, Node: Undefined, + N: make(map[string]int), U: make(map[string]any), K: make(map[string]any)} +} + +// Eq checks if counter equals limit (nil/missing → true). +func (r *Rule) Eq(counter string, limit int) bool { + val, ok := r.N[counter] + return !ok || val == limit +} + +// Lt checks if counter < limit (nil/missing → true). +func (r *Rule) Lt(counter string, limit int) bool { + val, ok := r.N[counter] + return !ok || val < limit +} + +// Gt checks if counter > limit (nil/missing → true). 
+func (r *Rule) Gt(counter string, limit int) bool { + val, ok := r.N[counter] + return !ok || val > limit +} + +// Lte checks if counter <= limit (nil/missing → true). +func (r *Rule) Lte(counter string, limit int) bool { + val, ok := r.N[counter] + return !ok || val <= limit +} + +// Gte checks if counter >= limit (nil/missing → true). +func (r *Rule) Gte(counter string, limit int) bool { + val, ok := r.N[counter] + return !ok || val >= limit +} + +// MakeRule creates a new Rule from a RuleSpec. +func MakeRule(spec *RuleSpec, ctx *Context, node any) *Rule { + r := &Rule{ + I: ctx.UI, Name: spec.Name, Spec: spec, Node: node, + State: OPEN, D: ctx.RSI, + Child: NoRule, Parent: NoRule, Prev: NoRule, Next: NoRule, + O0: NoToken, O1: NoToken, C0: NoToken, C1: NoToken, + N: make(map[string]int), U: make(map[string]any), K: make(map[string]any), + } + ctx.UI++ + return r +} + +// Process processes this rule, returning the next rule to process. +func (r *Rule) Process(ctx *Context, lex *Lex) *Rule { + isOpen := r.State == OPEN + var next *Rule + if isOpen { + next = r + } else { + next = NoRule + } + + def := r.Spec + var alts []*AltSpec + if isOpen { + alts = def.Open + } else { + alts = def.Close + } + + // Before actions + if isOpen && len(def.BO) > 0 { + for _, action := range def.BO { + action(r, ctx) + } + } else if !isOpen && len(def.BC) > 0 { + for _, action := range def.BC { + action(r, ctx) + } + } + + // Match alternates + alt, _ := ParseAlts(isOpen, alts, lex, r, ctx) + + // Error check (lenient - ignore errors for auto-close) + if alt != nil && alt.E != nil { + errTkn := alt.E(r, ctx) + _ = errTkn // jsonic is lenient, auto-closes + } + + // Update counters + if alt != nil && alt.N != nil { + for cn, cv := range alt.N { + if cv == 0 { + r.N[cn] = 0 + } else { + if _, ok := r.N[cn]; !ok { + r.N[cn] = 0 + } + r.N[cn] += cv + } + } + } + + // Set custom properties + if alt != nil && alt.U != nil { + for k, v := range alt.U { + r.U[k] = v + } + } + if alt != nil 
&& alt.K != nil { + for k, v := range alt.K { + r.K[k] = v + } + } + + // Action callback + if alt != nil && alt.A != nil { + alt.A(r, ctx) + } + + // Push / Replace / Pop + if alt != nil && alt.P != "" { + rulespec, ok := ctx.RSM[alt.P] + if ok { + ctx.RS[ctx.RSI] = r + ctx.RSI++ + next = MakeRule(rulespec, ctx, r.Node) + r.Child = next + next.Parent = r + for k, v := range r.N { + next.N[k] = v + } + if len(r.K) > 0 { + for k, v := range r.K { + next.K[k] = v + } + } + } + } else if alt != nil && alt.R != "" { + rulespec, ok := ctx.RSM[alt.R] + if ok { + next = MakeRule(rulespec, ctx, r.Node) + next.Parent = r.Parent + next.Prev = r + for k, v := range r.N { + next.N[k] = v + } + if len(r.K) > 0 { + for k, v := range r.K { + next.K[k] = v + } + } + } + } else if !isOpen { + // Pop + if ctx.RSI > 0 { + ctx.RSI-- + next = ctx.RS[ctx.RSI] + } else { + next = NoRule + } + } + + r.Next = next + + // After actions + if isOpen && len(def.AO) > 0 { + for _, action := range def.AO { + action(r, ctx) + } + } else if !isOpen && len(def.AC) > 0 { + for _, action := range def.AC { + action(r, ctx) + } + } + + // State transition + if r.State == OPEN { + r.State = CLOSE + } + + // Token consumption with backtrack + backtrack := 0 + if alt != nil { + backtrack = alt.B + } + var consumed int + if isOpen { + consumed = r.OS - backtrack + } else { + consumed = r.CS - backtrack + } + + if consumed == 1 { + ctx.V2 = ctx.V1 + ctx.V1 = ctx.T0 + ctx.T0 = ctx.T1 + ctx.T1 = NoToken + } else if consumed == 2 { + ctx.V2 = ctx.T1 + ctx.V1 = ctx.T0 + ctx.T0 = NoToken + ctx.T1 = NoToken + } + + return next +} + +// ParseAlts attempts to match one of the alternates. 
+func ParseAlts(isOpen bool, alts []*AltSpec, lex *Lex, rule *Rule, ctx *Context) (*AltSpec, bool) { + if len(alts) == 0 { + return nil, false + } + + for _, alt := range alts { + has0, has1 := false, false + cond := true + + if len(alt.S) > 0 && len(alt.S[0]) > 0 { + if ctx.T0.IsNoToken() { + ctx.T0 = lex.Next() + } + has0 = true + cond = tinMatch(ctx.T0.Tin, alt.S[0]) + + if cond && len(alt.S) > 1 && len(alt.S[1]) > 0 { + if ctx.T1.IsNoToken() { + ctx.T1 = lex.Next() + } + has1 = true + cond = tinMatch(ctx.T1.Tin, alt.S[1]) + } + } + + if isOpen { + if has0 { + rule.O0 = ctx.T0 + } else { + rule.O0 = NoToken + } + if has1 { + rule.O1 = ctx.T1 + } else { + rule.O1 = NoToken + } + rule.OS = boolToInt(has0) + boolToInt(has1) + } else { + if has0 { + rule.C0 = ctx.T0 + } else { + rule.C0 = NoToken + } + if has1 { + rule.C1 = ctx.T1 + } else { + rule.C1 = NoToken + } + rule.CS = boolToInt(has0) + boolToInt(has1) + } + + if cond && alt.C != nil { + cond = alt.C(rule, ctx) + } + + if cond { + return alt, true + } + } + + return nil, false +} + +func tinMatch(tin Tin, tins []Tin) bool { + for _, t := range tins { + if tin == t { + return true + } + } + return false +} + +func boolToInt(b bool) int { + if b { + return 1 + } + return 0 +} diff --git a/go/token.go b/go/token.go new file mode 100644 index 0000000..1f85b6c --- /dev/null +++ b/go/token.go @@ -0,0 +1,93 @@ +package jsonic + +// Tin is a token identification number. +type Tin = int + +// Standard token Tins - assigned in order matching the TypeScript implementation. 
+const (
+ TinBD Tin = 1 // #BD - BAD
+ TinZZ Tin = 2 // #ZZ - END
+ TinUK Tin = 3 // #UK - UNKNOWN
+ TinAA Tin = 4 // #AA - ANY
+ TinSP Tin = 5 // #SP - SPACE
+ TinLN Tin = 6 // #LN - LINE
+ TinCM Tin = 7 // #CM - COMMENT
+ TinNR Tin = 8 // #NR - NUMBER
+ TinST Tin = 9 // #ST - STRING
+ TinTX Tin = 10 // #TX - TEXT
+ TinVL Tin = 11 // #VL - VALUE (true, false, null)
+ TinOB Tin = 12 // #OB - Open Brace {
+ TinCB Tin = 13 // #CB - Close Brace }
+ TinOS Tin = 14 // #OS - Open Square [
+ TinCS Tin = 15 // #CS - Close Square ]
+ TinCL Tin = 16 // #CL - Colon :
+ TinCA Tin = 17 // #CA - Comma ,
+ TinMAX Tin = 18 // Next available Tin
+)
+
+// Token set constants
+var (
+ // IGNORE tokens: space, line, comment
+ TinSetIGNORE = map[Tin]bool{TinSP: true, TinLN: true, TinCM: true}
+ // VAL tokens: text, number, string, value
+ TinSetVAL = []Tin{TinTX, TinNR, TinST, TinVL}
+ // KEY tokens: text, number, string, value (same as VAL)
+ TinSetKEY = []Tin{TinTX, TinNR, TinST, TinVL}
+)
+
+// Point tracks position in source text.
+type Point struct {
+ Len int // Source length
+ SI int // String index (0-based)
+ RI int // Row index (1-based)
+ CI int // Column index (1-based)
+}
+
+// Token represents a lexical token.
+type Token struct {
+ Name string // Token name (#OB, #ST, etc.)
+ Tin Tin // Token identification number
+ Val any // Resolved value
+ Src string // Source text
+ SI int // Start position
+ RI int // Row
+ CI int // Column
+ Err string // Error code
+ Why string // Tracing/reason
+}
+
+// IsNoToken returns true if this is a sentinel/empty token.
+func (t *Token) IsNoToken() bool {
+ return t.Tin == -1
+}
+
+// ResolveVal returns the token's resolved value (Val); key handling that needs the raw source (e.g. numeric keys) reads Src directly.
+func (t *Token) ResolveVal() any {
+ return t.Val
+}
+
+// MakeToken creates a new Token.
+func MakeToken(name string, tin Tin, val any, src string, pnt Point) *Token { + return &Token{ + Name: name, + Tin: tin, + Val: val, + Src: src, + SI: pnt.SI, + RI: pnt.RI, + CI: pnt.CI, + } +} + +// NoToken is a sentinel token indicating "no token". +var NoToken = &Token{Name: "", Tin: -1, SI: -1, RI: -1, CI: -1} + +// Fixed token source map: character -> Tin +var FixedTokens = map[string]Tin{ + "{": TinOB, + "}": TinCB, + "[": TinOS, + "]": TinCS, + ":": TinCL, + ",": TinCA, +}