diff options
| author | sweiglbosker <stefan@s00.xyz> | 2026-02-24 13:04:50 -0500 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2026-02-24 13:04:50 -0500 |
| commit | 9d386221c9d6265f8ab85b42fcb93b4a0cafbb54 (patch) | |
| tree | 317c3f56776538eae9980ad93bd16432d396470b | |
| parent | 4b005e4a6e646c0b2788fc261097cdca2a93696c (diff) | |
| download | compiler-9d386221c9d6265f8ab85b42fcb93b4a0cafbb54.tar.gz | |
| -rw-r--r-- | BUILD.bazel | 1 | ||||
| -rw-r--r-- | willow/tools/willowc/include/tokenizer.hpp | 11 | ||||
| -rw-r--r-- | willow/tools/willowc/lib/tokenizer.cpp | 23 | ||||
| -rw-r--r-- | willow/tools/willowc/unittest/BUILD.bazel | 16 | ||||
| -rw-r--r-- | willow/tools/willowc/unittest/TokenizerTest.cpp | 197 |
5 files changed, 246 insertions, 2 deletions
diff --git a/BUILD.bazel b/BUILD.bazel index 60088b1..4c0f6f7 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -6,5 +6,6 @@ refresh_compile_commands( targets = { "//willow:willow": "", "//willow:willowc": "", + "//willow/tools/willowc/unittest": "", }, ) diff --git a/willow/tools/willowc/include/tokenizer.hpp b/willow/tools/willowc/include/tokenizer.hpp index 47577ab..d3970f6 100644 --- a/willow/tools/willowc/include/tokenizer.hpp +++ b/willow/tools/willowc/include/tokenizer.hpp @@ -22,6 +22,9 @@ enum class TokenKind { LCurly, RCurly, Equals, + Star, + LTriangle, + RTriangle, RArrow, Comment, @@ -66,7 +69,7 @@ class Tokenizer { bool scan_hex(); bool scan_constant(); public: - explicit Tokenizer(std::string_view buf, std::size_t offset = 0) + explicit constexpr Tokenizer(std::string_view buf, std::size_t offset = 0) : buf{buf}, offset{offset} {} Token scan(); @@ -103,6 +106,12 @@ constexpr std::string_view TokenKindName(TokenKind t) { return "RCurly"; case TokenKind::Equals: return "Equals"; + case TokenKind::Star: + return "Star"; + case TokenKind::LTriangle: + return "LTriangle"; + case TokenKind::RTriangle: + return "RTriangle"; case TokenKind::RArrow: return "RArrow"; case TokenKind::Comment: diff --git a/willow/tools/willowc/lib/tokenizer.cpp b/willow/tools/willowc/lib/tokenizer.cpp index 7ad28a6..f7dd04f 100644 --- a/willow/tools/willowc/lib/tokenizer.cpp +++ b/willow/tools/willowc/lib/tokenizer.cpp @@ -1,3 +1,4 @@ +#include <iostream> #include <tokenizer.hpp> namespace willowc { @@ -84,6 +85,15 @@ Token Tokenizer::scan() { case '=': skip(); return TokenKind::Equals; + case '*': + skip(); + return TokenKind::Star; + case '<': + skip(); + return TokenKind::LTriangle; + case '>': + skip(); + return TokenKind::RTriangle; case '-': { if (peek(1) == '>') { skip(2); @@ -133,12 +143,21 @@ Token Tokenizer::scan() { }(); Token t{start, offset, k}; + if (t.kind == TokenKind::Invalid) { - if (t.start == t.end) + if (t.start == t.end) { t.end++; + skip(); + } recover(); } + 
if (t.kind == TokenKind::Inst) { + auto lexeme = buf.substr(t.start, t.end - t.start); + if (lexeme == "func") + t.kind = TokenKind::FuncKW; + } + return t; } @@ -214,6 +233,8 @@ void Tokenizer::recover() { [[fallthrough]]; case '=': [[fallthrough]]; + case '^': + [[fallthrough]]; case '\0': return true; default: diff --git a/willow/tools/willowc/unittest/BUILD.bazel b/willow/tools/willowc/unittest/BUILD.bazel new file mode 100644 index 0000000..141bf82 --- /dev/null +++ b/willow/tools/willowc/unittest/BUILD.bazel @@ -0,0 +1,16 @@ +test_suite( + name = "unittest", + tests = [ + ":tokenizer" + ], +) + +cc_test( + name = "tokenizer", + srcs = ["TokenizerTest.cpp"], + deps = [ + "//willow/tools/willowc:willowc_lib", + "@catch2//:catch2_main" + ], + tags = ["tokenizer"] +) diff --git a/willow/tools/willowc/unittest/TokenizerTest.cpp b/willow/tools/willowc/unittest/TokenizerTest.cpp new file mode 100644 index 0000000..d085b1d --- /dev/null +++ b/willow/tools/willowc/unittest/TokenizerTest.cpp @@ -0,0 +1,197 @@ +#include <catch2/catch_test_macros.hpp> + +#include <iostream> +#include <parser.hpp> +#include <print> +#include <span> +#include <tokenizer.hpp> +#include <willow/Util/Color.h> + +using namespace willowc; +using namespace willow::termcolor; + +bool tokenizer_test(std::string_view buffer, std::span<const TokenKind> args) { + Tokenizer tokenizer(buffer); + + size_t token_index = 0; + while (true) { + Token t = tokenizer.scan(); + + if (token_index >= args.size()) + break; + + TokenKind expected = args[token_index++]; + if (t.kind != expected) { + size_t line_start = [&] { + size_t ls = t.start; + if (ls > buffer.size()) + ls = buffer.size(); + for (; ls > 0; ls--) { + if (buffer[ls - 1] == '\n') { + break; + } + } + return ls; + }(); + size_t line_end = [&]() { + auto p = buffer.find('\n', t.start); + return (p == std::string_view::npos) ? 
buffer.size() : p; + }(); + + std::println(std::cerr); + std::println(std::cerr, "{}FAIL:{} expected '{}', got '{}'{}", + willow::termcolor::TextStyle{AnsiColor::Red, Emphasis::Bold}, + TextStyle{AnsiColor::Default, AnsiColor::Default}, expected, + t.kind, + TextStyle{AnsiColor::None, AnsiColor::None, Emphasis::None}); + std::println(std::cerr, "{}", + buffer.substr(line_start, line_end - line_start)); + for (size_t i = line_start; i < t.start; i++) + std::print(std::cerr, " "); + + std::print(std::cerr, "{}", TextStyle{AnsiColor::Red, Emphasis::Bold}); + for (size_t i = t.start; i < t.end; i++) + std::print(std::cerr, "^"); + std::println(); + + return false; + } + + if (t.kind == willowc::TokenKind::Eof) + break; + } + + return true; +} + +TEST_CASE("basic code", "[tokenizer]") { + using namespace std::string_literals; + using enum TokenKind; + + REQUIRE(tokenizer_test( + R"( + func @add(%a: i32, %b: i32) -> i32 { + %c: i32 = add %a, %b; + // return the sum of %a and %b + return %c; + })", + // clang-format off + std::array{FuncKW, Function, LParen, Variable, Colon, Type, Comma, Variable, Colon, Type, RParen, RArrow, Type, LCurly, + Variable, Colon, Type, Equals, Inst, Variable, Comma, Variable, Semicolon, + Comment, + Inst, Variable, Semicolon, + RCurly, Eof})); + // clang-format on + + REQUIRE(tokenizer_test( + R"( + func @slt(%a: i32, %b: i32) -> i32 { + ^entry: + %retval: *i32 = alloca i32; + %pred: i1 = lt %a, %b; + br %pred, ^lt, ^ge; + ^lt: + store %a, %retval; + jmp ^ret; + ^gt: + store %b, %retval; + jmp ^ret; + ^ret: + %r: i32 = load %retval; + return %r; + })", + // clang-format off + std::array{ + FuncKW, Function, LParen, Variable, Colon, Type, Comma, Variable, Colon, Type, RParen, RArrow, Type, LCurly, + Label, Colon, // ^entry: + + Variable, Colon, Star, Type, Equals, Inst, Type, Semicolon, // %retval: *i32 = alloca i32; + Variable, Colon, Type, Equals, Inst, Variable, Comma, Variable, Semicolon, // %pred: i1 = lt %a, %b; + + Inst, Variable, Comma, 
Label, Comma, Label, Semicolon, // br %pred, ^lt, ^ge; + + Label, Colon, // ^lt: + Inst, Variable, Comma, Variable, Semicolon, // store %a, %retval; + Inst, Label, Semicolon, // jmp ^ret; + + Label, Colon, // ^gt: + Inst, Variable, Comma, Variable, Semicolon, // store %b, %retval; + Inst, Label, Semicolon, // jmp ^ret; + + Label, Colon, // ^ret: + Variable, Colon, Type, Equals, Inst, Variable, Semicolon, // %r: i32 = load %retval; + Inst, Variable, Semicolon, // return %r; + RCurly, Eof + })); + + // clang-format on +} + +TEST_CASE("constants", "[tokenizer]") { + using namespace std::string_literals; + using enum TokenKind; + + REQUIRE(tokenizer_test( + R"( + func @c() -> i32 { + %a: i32 = const 0; + %b: i64 = const -7; + %c: i64 = const 123; + %d: i32 = const 0x2a; + } + )", + // clang-format off + std::array{FuncKW, Function, LParen, RParen, RArrow, Type, LCurly, + Variable, Colon, Type, Equals, Inst, Constant, Semicolon, + Variable, Colon, Type, Equals, Inst, Constant, Semicolon, + Variable, Colon, Type, Equals, Inst, Constant, Semicolon, + Variable, Colon, Type, Equals, Inst, Constant, Semicolon, + RCurly, Eof})); + // clang-format on +} + +TEST_CASE("identifiers", "[identifiers]") { + using namespace std::string_literals; + using enum TokenKind; + + REQUIRE(tokenizer_test( + R"( + func @-foo.bar_baz-0(%$a0: i32, %_tmp_1: i32) -> i32 { + ret %_tmp_1; + } + )", + // clang-format off + std::array{ + FuncKW, Function, LParen, Variable, Colon, Type, Comma, Variable, Colon, Type, RParen, RArrow, Type, LCurly, + Inst, Variable, Semicolon, + RCurly, Eof + })); + // clang-format on +} + +TEST_CASE("edge cases", "[tokenizer]") { + using namespace std::string_literals; + using enum TokenKind; + + REQUIRE(tokenizer_test(R"()", std::array{Eof})); + + REQUIRE(tokenizer_test(R"(@foo // hi!)", std::array{Function, Comment, Eof})); + + REQUIRE(tokenizer_test(R"(--5)", std::array{Invalid, Eof})); + + REQUIRE(tokenizer_test(R"(/ %foo)", std::array{Invalid, Variable, Eof})); + + 
REQUIRE(tokenizer_test(R"(^:)", std::array{Invalid, Colon, Eof})); +} + +TEST_CASE("invalid token recovery", "[tokenizer]") { + using namespace std::string_literals; + using enum TokenKind; + + REQUIRE(tokenizer_test(R"(----xyz)", std::array{Invalid, Eof})); + + REQUIRE(tokenizer_test(R"(^^foo:)", std::array{Invalid, Label, Colon, Eof})); + + REQUIRE(tokenizer_test(R"(%13a %foo)", + std::array{Variable, Inst, Variable, Eof})); +} |