From 0ccf015c218ed1565855a34b1fc01fd917181343 Mon Sep 17 00:00:00 2001 From: Thomas Lively Date: Wed, 20 May 2026 11:50:23 -0700 Subject: [PATCH 1/2] Fix OOB errors in the lexer Fix an OOB string_view access that was just trying to get a one-past-the-end pointer. Fix two locations where we could have been peeking empty input. Fixes #8732. --- src/parser/lexer.h | 10 +++++++++- test/gtest/wat-lexer.cpp | 9 +++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/src/parser/lexer.h b/src/parser/lexer.h index 2f5cb7a0291..e998c6db820 100644 --- a/src/parser/lexer.h +++ b/src/parser/lexer.h @@ -624,6 +624,9 @@ inline std::optional Lexer::takeHexnum(OverflowBehavior behavior) { } inline Lexer::Sign Lexer::takeSign() { + if (empty()) { + return NoSign; + } auto c = peek(); if (c == '+') { take(1); @@ -812,7 +815,8 @@ inline std::optional Lexer::takeFloat() { // we need to strip any underscores since `std::strtod` does not understand // them. std::stringstream ss; - for (const char *curr = &buffer[startPos], *end = &buffer[pos]; curr != end; + for (const char *curr = buffer.data() + startPos, *end = buffer.data() + pos; + curr != end; ++curr) { if (*curr != '_') { ss << *curr; @@ -853,6 +857,10 @@ inline std::optional Lexer::takeStr() { // Escape sequences ensureBuildingEscaped(); take(1); + if (empty()) { + pos = startPos; + return std::nullopt; + } auto c = peek(); take(1); switch (c) { diff --git a/test/gtest/wat-lexer.cpp b/test/gtest/wat-lexer.cpp index 3a4cd49e246..38ad63dedca 100644 --- a/test/gtest/wat-lexer.cpp +++ b/test/gtest/wat-lexer.cpp @@ -22,6 +22,14 @@ using namespace wasm::WATParser; using namespace std::string_view_literals; +TEST(LexerTest, EmptyInput) { + EXPECT_TRUE(Lexer(""sv).empty()); + EXPECT_EQ(Lexer(""sv).takeI32(), std::nullopt); + EXPECT_EQ(Lexer(""sv).takeF32(), std::nullopt); + EXPECT_EQ(Lexer(""sv).takeString(), std::nullopt); + EXPECT_EQ(Lexer(""sv).takeID(), std::nullopt); +} + TEST(LexerTest, LexWhitespace) { Lexer lexer(" 1\t2\n3\r4 \n\n\t 5 "sv); @@ -915,6 +923,7 @@ TEST(LexerTest, LexString) { "_$_\xC2\xA3_\xE2\x82\xAC_\xF0\x90\x8D\x88_"s); EXPECT_FALSE(Lexer("\"unterminated"sv).takeString()); + EXPECT_FALSE(Lexer("\"foo\\"sv).takeString()); EXPECT_FALSE(Lexer("\"unescaped nul\0\""sv).takeString()); EXPECT_FALSE(Lexer("\"unescaped U+19\x19\""sv).takeString()); EXPECT_FALSE(Lexer("\"unescaped U+7f\x7f\""sv).takeString()); From 40103d823fdf818f4b78c21fb89fd4eeca96d6c3 Mon Sep 17 00:00:00 2001 From: Thomas Lively Date: Wed, 20 May 2026 13:45:11 -0700 Subject: [PATCH 2/2] [NFC] Return optional in Lexer::peek This prevents errors where we peek the buffer without checking that there is more input to peek at. --- src/parser/lexer.h | 109 ++++++++++++++++++++++++--------------------- 1 file changed, 58 insertions(+), 51 deletions(-) diff --git a/src/parser/lexer.h b/src/parser/lexer.h index e998c6db820..f6da0aeedc7 100644 --- a/src/parser/lexer.h +++ b/src/parser/lexer.h @@ -69,6 +69,8 @@ struct Lexer { std::vector annotations; std::optional file; + static bool isSpacechar(uint8_t c); + public: std::string_view buffer; @@ -93,11 +95,11 @@ struct Lexer { std::optional peekChar() const; - bool peekLParen() { return !empty() && peek() == '('; } + bool peekLParen() { return peek() == uint8_t('('); } bool takeLParen(); - bool peekRParen() { return !empty() && peek() == ')'; } + bool peekRParen() { return peek() == uint8_t(')'); } bool takeRParen(); @@ -134,7 +136,12 @@ struct Lexer { std::string_view next() const { return buffer.substr(pos); } - uint8_t peek() const { return buffer[pos]; } + std::optional peek() const { + if (empty()) { + return std::nullopt; + } + return uint8_t(buffer[pos]); + } void advance() { annotations.clear(); @@ -247,8 +254,8 @@ inline Lexer::Lexer(std::string_view buffer, std::optional file) } inline std::optional Lexer::peekChar() const { - if (!empty()) { - return peek(); + if (auto c = peek()) { + return char(*c); } return std::nullopt; } @@ -298,16 +305,12 @@ inline std::optional Lexer::takeID() { } inline std::optional Lexer::peekKeyword() { - if (empty()) { + auto start = peek(); + if (!start || *start < 'a' || *start > 'z') { return std::nullopt; } auto startPos = pos; - uint8_t start = peek(); - if ('a' <= start && start <= 'z') { - take(1); - } else { - return std::nullopt; - } + take(1); while (idchar()) { take(1); } @@ -544,23 +547,21 @@ inline bool Lexer::takePrefix(std::string_view sv) { } inline std::optional Lexer::takeDigit() { - if (empty()) { - return std::nullopt; - } - if (auto d = getDigit(peek())) { - take(1); - return d; + if (auto c = peek()) { + if (auto d = getDigit(*c)) { + take(1); + return d; + } } return std::nullopt; } inline std::optional Lexer::takeHexdigit() { - if (empty()) { - return std::nullopt; - } - if (auto h = getHexDigit(peek())) { - take(1); - return h; + if (auto c = peek()) { + if (auto h = getHexDigit(*c)) { + take(1); + return h; + } } return std::nullopt; } @@ -624,17 +625,15 @@ inline std::optional Lexer::takeHexnum(OverflowBehavior behavior) { } inline Lexer::Sign Lexer::takeSign() { - if (empty()) { - return NoSign; - } - auto c = peek(); - if (c == '+') { - take(1); - return Pos; - } - if (c == '-') { - take(1); - return Neg; + if (auto c = peek()) { + if (*c == '+') { + take(1); + return Pos; + } + if (*c == '-') { + take(1); + return Neg; + } } return NoSign; } @@ -857,13 +856,13 @@ inline std::optional Lexer::takeStr() { // Escape sequences ensureBuildingEscaped(); take(1); - if (empty()) { + auto c = peek(); + if (!c) { pos = startPos; return std::nullopt; } - auto c = peek(); take(1); - switch (c) { + switch (*c) { case 't': *escapeBuilder << '\t'; break; @@ -909,7 +908,7 @@ inline std::optional Lexer::takeStr() { default: { // Byte escape: \hh // We already took the first h as c. - auto first = getHexDigit(c); + auto first = getHexDigit(*c); auto second = takeHexdigit(); if (!first || !second) { // TODO: Add error production for unrecognized escape sequence. @@ -921,7 +920,8 @@ inline std::optional Lexer::takeStr() { } } else { // Normal characters - if (uint8_t c = peek(); c >= 0x20 && c != 0x7F) { + uint8_t c = *peek(); + if (c >= 0x20 && c != 0x7F) { if (escapeBuilder) { *escapeBuilder << c; } @@ -941,17 +941,17 @@ inline std::optional Lexer::takeStr() { } inline bool Lexer::idchar() { - if (empty()) { + auto c = peek(); + if (!c) { return false; } - uint8_t c = peek(); // All the allowed characters lie in the range '!' to '~', and within that // range the vast majority of characters are allowed, so it is significantly // faster to check for the disallowed characters instead. - if (c < '!' || c > '~') { + if (*c < '!' || *c > '~') { return false; } - switch (c) { + switch (*c) { case '"': case '(': case ')': @@ -999,11 +999,8 @@ inline std::optional Lexer::takeIdent() { return std::nullopt; } -inline bool Lexer::spacechar() { - if (empty()) { - return false; - } - switch (peek()) { +inline bool Lexer::isSpacechar(uint8_t c) { + switch (c) { case ' ': case '\n': case '\r': @@ -1014,6 +1011,13 @@ inline bool Lexer::spacechar() { } } +inline bool Lexer::spacechar() { + if (auto c = peek()) { + return isSpacechar(*c); + } + return false; +} + inline bool Lexer::takeSpacechar() { if (spacechar()) { take(1); @@ -1160,8 +1164,11 @@ inline bool Lexer::canFinish() { // actually want to parse more than a couple characters of space, so check // for individual space chars or comment starts instead. using namespace std::string_view_literals; - return empty() || spacechar() || peek() == '(' || peek() == ')' || - startsWith(";;"sv); + auto c = peek(); + if (!c) { + return true; + } + return isSpacechar(*c) || *c == '(' || *c == ')' || startsWith(";;"sv); } } // namespace wasm::WATParser