From 0ccf015c218ed1565855a34b1fc01fd917181343 Mon Sep 17 00:00:00 2001
From: Thomas Lively <tlively@google.com>
Date: Wed, 20 May 2026 11:50:23 -0700
Subject: [PATCH 1/2] Fix OOB errors in the lexer

Fix an OOB string_view access that was just trying to get a one-past-the-end pointer. Fix two locations where we could have been peeking empty input.

Fixes #8732.
---
 src/parser/lexer.h       | 10 +++++++++-
 test/gtest/wat-lexer.cpp |  9 +++++++++
 2 files changed, 18 insertions(+), 1 deletion(-)
diff --git a/src/parser/lexer.h b/src/parser/lexer.h
index 2f5cb7a0291..e998c6db820 100644
--- a/src/parser/lexer.h
+++ b/src/parser/lexer.h
@@ -624,6 +624,9 @@ inline std::optional<uint64_t> Lexer::takeHexnum(OverflowBehavior behavior) {
 }
 
 inline Lexer::Sign Lexer::takeSign() {
+  if (empty()) {
+    return NoSign;
+  }
   auto c = peek();
   if (c == '+') {
     take(1);
@@ -812,7 +815,8 @@ inline std::optional<Lexer::LexedFloat> Lexer::takeFloat() {
   // we need to strip any underscores since `std::strtod` does not understand
   // them.
   std::stringstream ss;
-  for (const char *curr = &buffer[startPos], *end = &buffer[pos]; curr != end;
+  for (const char *curr = buffer.data() + startPos, *end = buffer.data() + pos;
+       curr != end;
        ++curr) {
     if (*curr != '_') {
       ss << *curr;
@@ -853,6 +857,10 @@ inline std::optional<Lexer::StringOrView> Lexer::takeStr() {
       // Escape sequences
       ensureBuildingEscaped();
       take(1);
+      if (empty()) {
+        pos = startPos;
+        return std::nullopt;
+      }
       auto c = peek();
       take(1);
       switch (c) {
diff --git a/test/gtest/wat-lexer.cpp b/test/gtest/wat-lexer.cpp
index 3a4cd49e246..38ad63dedca 100644
--- a/test/gtest/wat-lexer.cpp
+++ b/test/gtest/wat-lexer.cpp
@@ -22,6 +22,14 @@
 using namespace wasm::WATParser;
 using namespace std::string_view_literals;
 
+TEST(LexerTest, EmptyInput) {
+  EXPECT_TRUE(Lexer(""sv).empty());
+  EXPECT_EQ(Lexer(""sv).takeI32(), std::nullopt);
+  EXPECT_EQ(Lexer(""sv).takeF32(), std::nullopt);
+  EXPECT_EQ(Lexer(""sv).takeString(), std::nullopt);
+  EXPECT_EQ(Lexer(""sv).takeID(), std::nullopt);
+}
+
 TEST(LexerTest, LexWhitespace) {
   Lexer lexer(" 1\t2\n3\r4 \n\n\t 5 "sv);
 
@@ -915,6 +923,7 @@ TEST(LexerTest, LexString) {
     "_$_\xC2\xA3_\xE2\x82\xAC_\xF0\x90\x8D\x88_"s);
 
   EXPECT_FALSE(Lexer("\"unterminated"sv).takeString());
+  EXPECT_FALSE(Lexer("\"foo\\"sv).takeString());
   EXPECT_FALSE(Lexer("\"unescaped nul\0\""sv).takeString());
   EXPECT_FALSE(Lexer("\"unescaped U+19\x19\""sv).takeString());
   EXPECT_FALSE(Lexer("\"unescaped U+7f\x7f\""sv).takeString());

From 40103d823fdf818f4b78c21fb89fd4eeca96d6c3 Mon Sep 17 00:00:00 2001
From: Thomas Lively <tlively@google.com>
Date: Wed, 20 May 2026 13:45:11 -0700
Subject: [PATCH 2/2] [NFC] Return optional in Lexer::peek

This prevents errors where we peek the buffer without checking that there is more input to peek at.
---
 src/parser/lexer.h | 109 ++++++++++++++++++++++++---------------------
 1 file changed, 58 insertions(+), 51 deletions(-)

diff --git a/src/parser/lexer.h b/src/parser/lexer.h
index e998c6db820..f6da0aeedc7 100644
--- a/src/parser/lexer.h
+++ b/src/parser/lexer.h
@@ -69,6 +69,8 @@ struct Lexer {
   std::vector<Annotation> annotations;
   std::optional<std::string> file;
 
+  static bool isSpacechar(uint8_t c);
+
 public:
   std::string_view buffer;
 
@@ -93,11 +95,11 @@ struct Lexer {
 
   std::optional<char> peekChar() const;
 
-  bool peekLParen() { return !empty() && peek() == '('; }
+  bool peekLParen() { return peek() == uint8_t('('); }
 
   bool takeLParen();
 
-  bool peekRParen() { return !empty() && peek() == ')'; }
+  bool peekRParen() { return peek() == uint8_t(')'); }
 
   bool takeRParen();
 
@@ -134,7 +136,12 @@ struct Lexer {
 
   std::string_view next() const { return buffer.substr(pos); }
 
-  uint8_t peek() const { return buffer[pos]; }
+  std::optional<uint8_t> peek() const {
+    if (empty()) {
+      return std::nullopt;
+    }
+    return uint8_t(buffer[pos]);
+  }
 
   void advance() {
     annotations.clear();
@@ -247,8 +254,8 @@ inline Lexer::Lexer(std::string_view buffer, std::optional<std::string> file)
 }
 
 inline std::optional<char> Lexer::peekChar() const {
-  if (!empty()) {
-    return peek();
+  if (auto c = peek()) {
+    return char(*c);
   }
   return std::nullopt;
 }
@@ -298,16 +305,12 @@ inline std::optional<Name> Lexer::takeID() {
 }
 
 inline std::optional<std::string_view> Lexer::peekKeyword() {
-  if (empty()) {
+  auto start = peek();
+  if (!start || *start < 'a' || *start > 'z') {
     return std::nullopt;
   }
   auto startPos = pos;
-  uint8_t start = peek();
-  if ('a' <= start && start <= 'z') {
-    take(1);
-  } else {
-    return std::nullopt;
-  }
+  take(1);
   while (idchar()) {
     take(1);
   }
@@ -544,23 +547,21 @@ inline bool Lexer::takePrefix(std::string_view sv) {
 }
 
 inline std::optional<int> Lexer::takeDigit() {
-  if (empty()) {
-    return std::nullopt;
-  }
-  if (auto d = getDigit(peek())) {
-    take(1);
-    return d;
+  if (auto c = peek()) {
+    if (auto d = getDigit(*c)) {
+      take(1);
+      return d;
+    }
   }
   return std::nullopt;
 }
 
 inline std::optional<int> Lexer::takeHexdigit() {
-  if (empty()) {
-    return std::nullopt;
-  }
-  if (auto h = getHexDigit(peek())) {
-    take(1);
-    return h;
+  if (auto c = peek()) {
+    if (auto h = getHexDigit(*c)) {
+      take(1);
+      return h;
+    }
   }
   return std::nullopt;
 }
@@ -624,17 +625,15 @@ inline std::optional<uint64_t> Lexer::takeHexnum(OverflowBehavior behavior) {
 }
 
 inline Lexer::Sign Lexer::takeSign() {
-  if (empty()) {
-    return NoSign;
-  }
-  auto c = peek();
-  if (c == '+') {
-    take(1);
-    return Pos;
-  }
-  if (c == '-') {
-    take(1);
-    return Neg;
+  if (auto c = peek()) {
+    if (*c == '+') {
+      take(1);
+      return Pos;
+    }
+    if (*c == '-') {
+      take(1);
+      return Neg;
+    }
   }
   return NoSign;
 }
@@ -857,13 +856,13 @@ inline std::optional<Lexer::StringOrView> Lexer::takeStr() {
       // Escape sequences
       ensureBuildingEscaped();
       take(1);
-      if (empty()) {
+      auto c = peek();
+      if (!c) {
         pos = startPos;
         return std::nullopt;
       }
-      auto c = peek();
       take(1);
-      switch (c) {
+      switch (*c) {
         case 't':
           *escapeBuilder << '\t';
           break;
@@ -909,7 +908,7 @@ inline std::optional<Lexer::StringOrView> Lexer::takeStr() {
         default: {
           // Byte escape: \hh
           // We already took the first h as c.
-          auto first = getHexDigit(c);
+          auto first = getHexDigit(*c);
           auto second = takeHexdigit();
           if (!first || !second) {
             // TODO: Add error production for unrecognized escape sequence.
@@ -921,7 +920,8 @@ inline std::optional<Lexer::StringOrView> Lexer::takeStr() {
       }
     } else {
       // Normal characters
-      if (uint8_t c = peek(); c >= 0x20 && c != 0x7F) {
+      uint8_t c = *peek();
+      if (c >= 0x20 && c != 0x7F) {
         if (escapeBuilder) {
           *escapeBuilder << c;
         }
@@ -941,17 +941,17 @@ inline std::optional<Lexer::StringOrView> Lexer::takeStr() {
 }
 
 inline bool Lexer::idchar() {
-  if (empty()) {
+  auto c = peek();
+  if (!c) {
     return false;
   }
-  uint8_t c = peek();
   // All the allowed characters lie in the range '!' to '~', and within that
   // range the vast majority of characters are allowed, so it is significantly
   // faster to check for the disallowed characters instead.
-  if (c < '!' || c > '~') {
+  if (*c < '!' || *c > '~') {
     return false;
   }
-  switch (c) {
+  switch (*c) {
     case '"':
     case '(':
     case ')':
@@ -999,11 +999,8 @@ inline std::optional<Lexer::StringOrView> Lexer::takeIdent() {
   return std::nullopt;
 }
 
-inline bool Lexer::spacechar() {
-  if (empty()) {
-    return false;
-  }
-  switch (peek()) {
+inline bool Lexer::isSpacechar(uint8_t c) {
+  switch (c) {
     case ' ':
     case '\n':
     case '\r':
@@ -1014,6 +1011,13 @@ inline bool Lexer::spacechar() {
   }
 }
 
+inline bool Lexer::spacechar() {
+  if (auto c = peek()) {
+    return isSpacechar(*c);
+  }
+  return false;
+}
+
 inline bool Lexer::takeSpacechar() {
   if (spacechar()) {
     take(1);
@@ -1160,8 +1164,11 @@ inline bool Lexer::canFinish() {
   // actually want to parse more than a couple characters of space, so check
   // for individual space chars or comment starts instead.
   using namespace std::string_view_literals;
-  return empty() || spacechar() || peek() == '(' || peek() == ')' ||
-         startsWith(";;"sv);
+  auto c = peek();
+  if (!c) {
+    return true;
+  }
+  return isSpacechar(*c) || *c == '(' || *c == ')' || startsWith(";;"sv);
 }
 
 } // namespace wasm::WATParser