summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--BUILD.bazel1
-rw-r--r--willow/tools/willowc/include/tokenizer.hpp11
-rw-r--r--willow/tools/willowc/lib/tokenizer.cpp23
-rw-r--r--willow/tools/willowc/unittest/BUILD.bazel16
-rw-r--r--willow/tools/willowc/unittest/TokenizerTest.cpp197
5 files changed, 246 insertions, 2 deletions
diff --git a/BUILD.bazel b/BUILD.bazel
index 60088b1..4c0f6f7 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -6,5 +6,6 @@ refresh_compile_commands(
targets = {
"//willow:willow": "",
"//willow:willowc": "",
+ "//willow/tools/willowc/unittest": "",
},
)
diff --git a/willow/tools/willowc/include/tokenizer.hpp b/willow/tools/willowc/include/tokenizer.hpp
index 47577ab..d3970f6 100644
--- a/willow/tools/willowc/include/tokenizer.hpp
+++ b/willow/tools/willowc/include/tokenizer.hpp
@@ -22,6 +22,9 @@ enum class TokenKind {
LCurly,
RCurly,
Equals,
+ Star,
+ LTriangle,
+ RTriangle,
RArrow,
Comment,
@@ -66,7 +69,7 @@ class Tokenizer {
bool scan_hex();
bool scan_constant();
public:
- explicit Tokenizer(std::string_view buf, std::size_t offset = 0)
+ explicit constexpr Tokenizer(std::string_view buf, std::size_t offset = 0)
: buf{buf}, offset{offset} {}
Token scan();
@@ -103,6 +106,12 @@ constexpr std::string_view TokenKindName(TokenKind t) {
return "RCurly";
case TokenKind::Equals:
return "Equals";
+ case TokenKind::Star:
+ return "Star";
+ case TokenKind::LTriangle:
+ return "LTriangle";
+ case TokenKind::RTriangle:
+ return "RTriangle";
case TokenKind::RArrow:
return "RArrow";
case TokenKind::Comment:
diff --git a/willow/tools/willowc/lib/tokenizer.cpp b/willow/tools/willowc/lib/tokenizer.cpp
index 7ad28a6..f7dd04f 100644
--- a/willow/tools/willowc/lib/tokenizer.cpp
+++ b/willow/tools/willowc/lib/tokenizer.cpp
@@ -1,3 +1,4 @@
+#include <iostream>
#include <tokenizer.hpp>
namespace willowc {
@@ -84,6 +85,15 @@ Token Tokenizer::scan() {
case '=':
skip();
return TokenKind::Equals;
+ case '*':
+ skip();
+ return TokenKind::Star;
+ case '<':
+ skip();
+ return TokenKind::LTriangle;
+ case '>':
+ skip();
+ return TokenKind::RTriangle;
case '-': {
if (peek(1) == '>') {
skip(2);
@@ -133,12 +143,21 @@ Token Tokenizer::scan() {
}();
Token t{start, offset, k};
+
if (t.kind == TokenKind::Invalid) {
- if (t.start == t.end)
+ if (t.start == t.end) {
t.end++;
+ skip();
+ }
recover();
}
+ if (t.kind == TokenKind::Inst) {
+ auto lexeme = buf.substr(t.start, t.end - t.start);
+ if (lexeme == "func")
+ t.kind = TokenKind::FuncKW;
+ }
+
return t;
}
@@ -214,6 +233,8 @@ void Tokenizer::recover() {
[[fallthrough]];
case '=':
[[fallthrough]];
+ case '^':
+ [[fallthrough]];
case '\0':
return true;
default:
diff --git a/willow/tools/willowc/unittest/BUILD.bazel b/willow/tools/willowc/unittest/BUILD.bazel
new file mode 100644
index 0000000..141bf82
--- /dev/null
+++ b/willow/tools/willowc/unittest/BUILD.bazel
@@ -0,0 +1,16 @@
+test_suite(
+ name = "unittest",
+ tests = [
+ ":tokenizer"
+ ],
+)
+
+cc_test(
+ name = "tokenizer",
+ srcs = ["TokenizerTest.cpp"],
+ deps = [
+ "//willow/tools/willowc:willowc_lib",
+ "@catch2//:catch2_main"
+ ],
+ tags = ["tokenizer"]
+)
diff --git a/willow/tools/willowc/unittest/TokenizerTest.cpp b/willow/tools/willowc/unittest/TokenizerTest.cpp
new file mode 100644
index 0000000..d085b1d
--- /dev/null
+++ b/willow/tools/willowc/unittest/TokenizerTest.cpp
@@ -0,0 +1,197 @@
+#include <catch2/catch_test_macros.hpp>
+
+#include <iostream>
+#include <parser.hpp>
+#include <print>
+#include <span>
+#include <tokenizer.hpp>
+#include <willow/Util/Color.h>
+
+using namespace willowc;
+using namespace willow::termcolor;
+
+bool tokenizer_test(std::string_view buffer, std::span<const TokenKind> args) {
+ Tokenizer tokenizer(buffer);
+
+ size_t token_index = 0;
+ while (true) {
+ Token t = tokenizer.scan();
+
+ if (token_index >= args.size())
+ break;
+
+ TokenKind expected = args[token_index++];
+ if (t.kind != expected) {
+ size_t line_start = [&] {
+ size_t ls = t.start;
+ if (ls > buffer.size())
+ ls = buffer.size();
+ for (; ls > 0; ls--) {
+ if (buffer[ls - 1] == '\n') {
+ break;
+ }
+ }
+ return ls;
+ }();
+ size_t line_end = [&]() {
+ auto p = buffer.find('\n', t.start);
+ return (p == std::string_view::npos) ? buffer.size() : p;
+ }();
+
+ std::println(std::cerr);
+ std::println(std::cerr, "{}FAIL:{} expected '{}', got '{}'{}",
+ willow::termcolor::TextStyle{AnsiColor::Red, Emphasis::Bold},
+ TextStyle{AnsiColor::Default, AnsiColor::Default}, expected,
+ t.kind,
+ TextStyle{AnsiColor::None, AnsiColor::None, Emphasis::None});
+ std::println(std::cerr, "{}",
+ buffer.substr(line_start, line_end - line_start));
+ for (size_t i = line_start; i < t.start; i++)
+ std::print(std::cerr, " ");
+
+ std::print(std::cerr, "{}", TextStyle{AnsiColor::Red, Emphasis::Bold});
+ for (size_t i = t.start; i < t.end; i++)
+ std::print(std::cerr, "^");
+ std::println(std::cerr, "{}", TextStyle{AnsiColor::None, AnsiColor::None, Emphasis::None});
+
+ return false;
+ }
+
+ if (t.kind == willowc::TokenKind::Eof)
+ break;
+ }
+
+ return true;
+}
+
+TEST_CASE("basic code", "[tokenizer]") {
+ using namespace std::string_literals;
+ using enum TokenKind;
+
+ REQUIRE(tokenizer_test(
+ R"(
+ func @add(%a: i32, %b: i32) -> i32 {
+ %c: i32 = add %a, %b;
+ // return the sum of %a and %b
+ return %c;
+ })",
+ // clang-format off
+ std::array{FuncKW, Function, LParen, Variable, Colon, Type, Comma, Variable, Colon, Type, RParen, RArrow, Type, LCurly,
+ Variable, Colon, Type, Equals, Inst, Variable, Comma, Variable, Semicolon,
+ Comment,
+ Inst, Variable, Semicolon,
+ RCurly, Eof}));
+ // clang-format on
+
+ REQUIRE(tokenizer_test(
+ R"(
+ func @slt(%a: i32, %b: i32) -> i32 {
+ ^entry:
+ %retval: *i32 = alloca i32;
+ %pred: i1 = lt %a, %b;
+ br %pred, ^lt, ^ge;
+ ^lt:
+ store %a, %retval;
+ jmp ^ret;
+ ^ge:
+ store %b, %retval;
+ jmp ^ret;
+ ^ret:
+ %r: i32 = load %retval;
+ return %r;
+ })",
+ // clang-format off
+ std::array{
+ FuncKW, Function, LParen, Variable, Colon, Type, Comma, Variable, Colon, Type, RParen, RArrow, Type, LCurly,
+ Label, Colon, // ^entry:
+
+ Variable, Colon, Star, Type, Equals, Inst, Type, Semicolon, // %retval: *i32 = alloca i32;
+ Variable, Colon, Type, Equals, Inst, Variable, Comma, Variable, Semicolon, // %pred: i1 = lt %a, %b;
+
+ Inst, Variable, Comma, Label, Comma, Label, Semicolon, // br %pred, ^lt, ^ge;
+
+ Label, Colon, // ^lt:
+ Inst, Variable, Comma, Variable, Semicolon, // store %a, %retval;
+ Inst, Label, Semicolon, // jmp ^ret;
+
+ Label, Colon, // ^ge:
+ Inst, Variable, Comma, Variable, Semicolon, // store %b, %retval;
+ Inst, Label, Semicolon, // jmp ^ret;
+
+ Label, Colon, // ^ret:
+ Variable, Colon, Type, Equals, Inst, Variable, Semicolon, // %r: i32 = load %retval;
+ Inst, Variable, Semicolon, // return %r;
+ RCurly, Eof
+ }));
+
+ // clang-format on
+}
+
+TEST_CASE("constants", "[tokenizer]") {
+ using namespace std::string_literals;
+ using enum TokenKind;
+
+ REQUIRE(tokenizer_test(
+ R"(
+ func @c() -> i32 {
+ %a: i32 = const 0;
+ %b: i64 = const -7;
+ %c: i64 = const 123;
+ %d: i32 = const 0x2a;
+ }
+ )",
+ // clang-format off
+ std::array{FuncKW, Function, LParen, RParen, RArrow, Type, LCurly,
+ Variable, Colon, Type, Equals, Inst, Constant, Semicolon,
+ Variable, Colon, Type, Equals, Inst, Constant, Semicolon,
+ Variable, Colon, Type, Equals, Inst, Constant, Semicolon,
+ Variable, Colon, Type, Equals, Inst, Constant, Semicolon,
+ RCurly, Eof}));
+ // clang-format on
+}
+
+TEST_CASE("identifiers", "[tokenizer]") {
+ using namespace std::string_literals;
+ using enum TokenKind;
+
+ REQUIRE(tokenizer_test(
+ R"(
+ func @-foo.bar_baz-0(%$a0: i32, %_tmp_1: i32) -> i32 {
+ ret %_tmp_1;
+ }
+ )",
+ // clang-format off
+ std::array{
+ FuncKW, Function, LParen, Variable, Colon, Type, Comma, Variable, Colon, Type, RParen, RArrow, Type, LCurly,
+ Inst, Variable, Semicolon,
+ RCurly, Eof
+ }));
+ // clang-format on
+}
+
+TEST_CASE("edge cases", "[tokenizer]") {
+ using namespace std::string_literals;
+ using enum TokenKind;
+
+ REQUIRE(tokenizer_test(R"()", std::array{Eof}));
+
+ REQUIRE(tokenizer_test(R"(@foo // hi!)", std::array{Function, Comment, Eof}));
+
+ REQUIRE(tokenizer_test(R"(--5)", std::array{Invalid, Eof}));
+
+ REQUIRE(tokenizer_test(R"(/ %foo)", std::array{Invalid, Variable, Eof}));
+
+ REQUIRE(tokenizer_test(R"(^:)", std::array{Invalid, Colon, Eof}));
+}
+
+TEST_CASE("invalid token recovery", "[tokenizer]") {
+ using namespace std::string_literals;
+ using enum TokenKind;
+
+ REQUIRE(tokenizer_test(R"(----xyz)", std::array{Invalid, Eof}));
+
+ REQUIRE(tokenizer_test(R"(^^foo:)", std::array{Invalid, Label, Colon, Eof}));
+
+ REQUIRE(tokenizer_test(R"(%13a %foo)",
+ std::array{Variable, Inst, Variable, Eof}));
+}