summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--BUILD.bazel1
-rw-r--r--willow/tools/willowc/include/tokenizer.hpp11
-rw-r--r--willow/tools/willowc/lib/tokenizer.cpp23
-rw-r--r--willow/tools/willowc/unittest/BUILD.bazel16
-rw-r--r--willow/tools/willowc/unittest/TokenizerTest.cpp197
5 files changed, 246 insertions, 2 deletions
diff --git a/BUILD.bazel b/BUILD.bazel
index 60088b1..4c0f6f7 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -6,5 +6,6 @@ refresh_compile_commands(
targets = {
"//willow:willow": "",
"//willow:willowc": "",
+ "//willow/tools/willowc/unittest": "",
},
)
diff --git a/willow/tools/willowc/include/tokenizer.hpp b/willow/tools/willowc/include/tokenizer.hpp
index 47577ab..d3970f6 100644
--- a/willow/tools/willowc/include/tokenizer.hpp
+++ b/willow/tools/willowc/include/tokenizer.hpp
@@ -22,6 +22,9 @@ enum class TokenKind {
LCurly,
RCurly,
Equals,
+ Star,
+ LTriangle,
+ RTriangle,
RArrow,
Comment,
@@ -66,7 +69,7 @@ class Tokenizer {
bool scan_hex();
bool scan_constant();
public:
- explicit Tokenizer(std::string_view buf, std::size_t offset = 0)
+ explicit constexpr Tokenizer(std::string_view buf, std::size_t offset = 0)
: buf{buf}, offset{offset} {}
Token scan();
@@ -103,6 +106,12 @@ constexpr std::string_view TokenKindName(TokenKind t) {
return "RCurly";
case TokenKind::Equals:
return "Equals";
+ case TokenKind::Star:
+ return "Star";
+ case TokenKind::LTriangle:
+ return "LTriangle";
+ case TokenKind::RTriangle:
+ return "RTriangle";
case TokenKind::RArrow:
return "RArrow";
case TokenKind::Comment:
diff --git a/willow/tools/willowc/lib/tokenizer.cpp b/willow/tools/willowc/lib/tokenizer.cpp
index 7ad28a6..f7dd04f 100644
--- a/willow/tools/willowc/lib/tokenizer.cpp
+++ b/willow/tools/willowc/lib/tokenizer.cpp
@@ -1,3 +1,4 @@
+#include <iostream>
#include <tokenizer.hpp>
namespace willowc {
@@ -84,6 +85,15 @@ Token Tokenizer::scan() {
case '=':
skip();
return TokenKind::Equals;
+ case '*':
+ skip();
+ return TokenKind::Star;
+ case '<':
+ skip();
+ return TokenKind::LTriangle;
+ case '>':
+ skip();
+ return TokenKind::RTriangle;
case '-': {
if (peek(1) == '>') {
skip(2);
@@ -133,12 +143,21 @@ Token Tokenizer::scan() {
}();
Token t{start, offset, k};
+
if (t.kind == TokenKind::Invalid) {
- if (t.start == t.end)
+ if (t.start == t.end) {
t.end++;
+ skip();
+ }
recover();
}
+ if (t.kind == TokenKind::Inst) {
+ auto lexeme = buf.substr(t.start, t.end - t.start);
+ if (lexeme == "func")
+ t.kind = TokenKind::FuncKW;
+ }
+
return t;
}
@@ -214,6 +233,8 @@ void Tokenizer::recover() {
[[fallthrough]];
case '=':
[[fallthrough]];
+ case '^':
+ [[fallthrough]];
case '\0':
return true;
default:
diff --git a/willow/tools/willowc/unittest/BUILD.bazel b/willow/tools/willowc/unittest/BUILD.bazel
new file mode 100644
index 0000000..141bf82
--- /dev/null
+++ b/willow/tools/willowc/unittest/BUILD.bazel
@@ -0,0 +1,16 @@
+test_suite(
+ name = "unittest",
+ tests = [
+ ":tokenizer"
+ ],
+)
+
+cc_test(
+ name = "tokenizer",
+ srcs = ["TokenizerTest.cpp"],
+ deps = [
+ "//willow/tools/willowc:willowc_lib",
+ "@catch2//:catch2_main"
+ ],
+ tags = ["tokenizer"]
+)
diff --git a/willow/tools/willowc/unittest/TokenizerTest.cpp b/willow/tools/willowc/unittest/TokenizerTest.cpp
new file mode 100644
index 0000000..d085b1d
--- /dev/null
+++ b/willow/tools/willowc/unittest/TokenizerTest.cpp
@@ -0,0 +1,197 @@
+#include <catch2/catch_test_macros.hpp>
+
+#include <iostream>
+#include <parser.hpp>
+#include <print>
+#include <span>
+#include <tokenizer.hpp>
+#include <willow/Util/Color.h>
+
+using namespace willowc;
+using namespace willow::termcolor;
+
+bool tokenizer_test(std::string_view buffer, std::span<const TokenKind> args) {
+ Tokenizer tokenizer(buffer);
+
+ size_t token_index = 0;
+ while (true) {
+ Token t = tokenizer.scan();
+
+ if (token_index >= args.size())
+ break;
+
+ TokenKind expected = args[token_index++];
+ if (t.kind != expected) {
+ size_t line_start = [&] {
+ size_t ls = t.start;
+ if (ls > buffer.size())
+ ls = buffer.size();
+ for (; ls > 0; ls--) {
+ if (buffer[ls - 1] == '\n') {
+ break;
+ }
+ }
+ return ls;
+ }();
+ size_t line_end = [&]() {
+ auto p = buffer.find('\n', t.start);
+ return (p == std::string_view::npos) ? buffer.size() : p;
+ }();
+
+ std::println(std::cerr);
+ std::println(std::cerr, "{}FAIL:{} expected '{}', got '{}'{}",
+ willow::termcolor::TextStyle{AnsiColor::Red, Emphasis::Bold},
+ TextStyle{AnsiColor::Default, AnsiColor::Default}, expected,
+ t.kind,
+ TextStyle{AnsiColor::None, AnsiColor::None, Emphasis::None});
+ std::println(std::cerr, "{}",
+ buffer.substr(line_start, line_end - line_start));
+ for (size_t i = line_start; i < t.start; i++)
+ std::print(std::cerr, " ");
+
+ std::print(std::cerr, "{}", TextStyle{AnsiColor::Red, Emphasis::Bold});
+ for (size_t i = t.start; i < t.end; i++)
+ std::print(std::cerr, "^");
+ std::println(std::cerr, "{}", TextStyle{AnsiColor::None, AnsiColor::None, Emphasis::None});
+
+ return false;
+ }
+
+ if (t.kind == willowc::TokenKind::Eof)
+ break;
+ }
+
+ return true;
+}
+
+TEST_CASE("basic code", "[tokenizer]") {
+ using namespace std::string_literals;
+ using enum TokenKind;
+
+ REQUIRE(tokenizer_test(
+ R"(
+ func @add(%a: i32, %b: i32) -> i32 {
+ %c: i32 = add %a, %b;
+ // return the sum of %a and %b
+ return %c;
+ })",
+ // clang-format off
+ std::array{FuncKW, Function, LParen, Variable, Colon, Type, Comma, Variable, Colon, Type, RParen, RArrow, Type, LCurly,
+ Variable, Colon, Type, Equals, Inst, Variable, Comma, Variable, Semicolon,
+ Comment,
+ Inst, Variable, Semicolon,
+ RCurly, Eof}));
+ // clang-format on
+
+ REQUIRE(tokenizer_test(
+ R"(
+ func @slt(%a: i32, %b: i32) -> i32 {
+ ^entry:
+ %retval: *i32 = alloca i32;
+ %pred: i1 = lt %a, %b;
+ br %pred, ^lt, ^ge;
+ ^lt:
+ store %a, %retval;
+ jmp ^ret;
+ ^ge:
+ store %b, %retval;
+ jmp ^ret;
+ ^ret:
+ %r: i32 = load %retval;
+ return %r;
+ })",
+ // clang-format off
+ std::array{
+ FuncKW, Function, LParen, Variable, Colon, Type, Comma, Variable, Colon, Type, RParen, RArrow, Type, LCurly,
+ Label, Colon, // ^entry:
+
+ Variable, Colon, Star, Type, Equals, Inst, Type, Semicolon, // %retval: *i32 = alloca i32;
+ Variable, Colon, Type, Equals, Inst, Variable, Comma, Variable, Semicolon, // %pred: i1 = lt %a, %b;
+
+ Inst, Variable, Comma, Label, Comma, Label, Semicolon, // br %pred, ^lt, ^ge;
+
+ Label, Colon, // ^lt:
+ Inst, Variable, Comma, Variable, Semicolon, // store %a, %retval;
+ Inst, Label, Semicolon, // jmp ^ret;
+
+ Label, Colon, // ^ge:
+ Inst, Variable, Comma, Variable, Semicolon, // store %b, %retval;
+ Inst, Label, Semicolon, // jmp ^ret;
+
+ Label, Colon, // ^ret:
+ Variable, Colon, Type, Equals, Inst, Variable, Semicolon, // %r: i32 = load %retval;
+ Inst, Variable, Semicolon, // return %r;
+ RCurly, Eof
+ }));
+
+ // clang-format on
+}
+
+TEST_CASE("constants", "[tokenizer]") {
+ using namespace std::string_literals;
+ using enum TokenKind;
+
+ REQUIRE(tokenizer_test(
+ R"(
+ func @c() -> i32 {
+ %a: i32 = const 0;
+ %b: i64 = const -7;
+ %c: i64 = const 123;
+ %d: i32 = const 0x2a;
+ }
+ )",
+ // clang-format off
+ std::array{FuncKW, Function, LParen, RParen, RArrow, Type, LCurly,
+ Variable, Colon, Type, Equals, Inst, Constant, Semicolon,
+ Variable, Colon, Type, Equals, Inst, Constant, Semicolon,
+ Variable, Colon, Type, Equals, Inst, Constant, Semicolon,
+ Variable, Colon, Type, Equals, Inst, Constant, Semicolon,
+ RCurly, Eof}));
+ // clang-format on
+}
+
+TEST_CASE("identifiers", "[tokenizer]") {
+ using namespace std::string_literals;
+ using enum TokenKind;
+
+ REQUIRE(tokenizer_test(
+ R"(
+ func @-foo.bar_baz-0(%$a0: i32, %_tmp_1: i32) -> i32 {
+ ret %_tmp_1;
+ }
+ )",
+ // clang-format off
+ std::array{
+ FuncKW, Function, LParen, Variable, Colon, Type, Comma, Variable, Colon, Type, RParen, RArrow, Type, LCurly,
+ Inst, Variable, Semicolon,
+ RCurly, Eof
+ }));
+ // clang-format on
+}
+
+TEST_CASE("edge cases", "[tokenizer]") {
+ using namespace std::string_literals;
+ using enum TokenKind;
+
+ REQUIRE(tokenizer_test(R"()", std::array{Eof}));
+
+ REQUIRE(tokenizer_test(R"(@foo // hi!)", std::array{Function, Comment, Eof}));
+
+ REQUIRE(tokenizer_test(R"(--5)", std::array{Invalid, Eof}));
+
+ REQUIRE(tokenizer_test(R"(/ %foo)", std::array{Invalid, Variable, Eof}));
+
+ REQUIRE(tokenizer_test(R"(^:)", std::array{Invalid, Colon, Eof}));
+}
+
+TEST_CASE("invalid token recovery", "[tokenizer]") {
+ using namespace std::string_literals;
+ using enum TokenKind;
+
+ REQUIRE(tokenizer_test(R"(----xyz)", std::array{Invalid, Eof}));
+
+ REQUIRE(tokenizer_test(R"(^^foo:)", std::array{Invalid, Label, Colon, Eof}));
+
+ REQUIRE(tokenizer_test(R"(%13a %foo)",
+ std::array{Variable, Inst, Variable, Eof}));
+}