summaryrefslogtreecommitdiff
path: root/willow/tools/willowc/lib/tokenizer.cpp
diff options
context:
space:
mode:
authorStefan Weigl-Bosker <stefan@s00.xyz>2026-02-03 14:59:53 -0500
committerGitHub <noreply@github.com>2026-02-03 14:59:53 -0500
commitadd95b14f74e6dbe04a6efe98ff0f20424930b73 (patch)
tree13ce413ee4190a4c8f8743c7740aaa8d04353f14 /willow/tools/willowc/lib/tokenizer.cpp
parentc5b2905c5a64433f8519531a77d3acc42d881f17 (diff)
downloadcompiler-add95b14f74e6dbe04a6efe98ff0f20424930b73.tar.gz
[willow]: initial frontend work, unit tests (#8)dev/stefan
Diffstat (limited to 'willow/tools/willowc/lib/tokenizer.cpp')
-rw-r--r--willow/tools/willowc/lib/tokenizer.cpp176
1 files changed, 176 insertions, 0 deletions
diff --git a/willow/tools/willowc/lib/tokenizer.cpp b/willow/tools/willowc/lib/tokenizer.cpp
new file mode 100644
index 0000000..0c1f917
--- /dev/null
+++ b/willow/tools/willowc/lib/tokenizer.cpp
@@ -0,0 +1,176 @@
+#include <tokenizer.hpp>
+
+namespace willowc {
+
+// Returns true when `c` is one of the four ASCII blanks: space, horizontal
+// tab, line feed or carriage return. Vertical tab and form feed are not
+// included.
+static inline bool is_space(unsigned char c) {
+  switch (c) {
+  case ' ':
+  case '\t':
+  case '\n':
+  case '\r':
+    return true;
+  default:
+    return false;
+  }
+}
+// Returns true when `c` is an ASCII decimal digit ('0'..'9').
+static inline bool is_digit(unsigned char c) {
+  // Unsigned subtraction wraps for bytes below '0', so a single comparison
+  // covers both ends of the range.
+  return static_cast<unsigned>(c) - static_cast<unsigned>('0') < 10u;
+}
+// Returns true when `c` is an ASCII hexadecimal digit: '0'..'9', 'a'..'f' or
+// 'A'..'F'.
+static inline bool is_xdigit(unsigned char c) {
+  if (is_digit(c))
+    return true;
+
+  // Setting bit 0x20 maps 'A'..'F' onto 'a'..'f'; no other byte lands in
+  // that range, so one comparison handles both cases.
+  const unsigned char lowered = static_cast<unsigned char>(c | 0x20);
+  return lowered >= 'a' && lowered <= 'f';
+}
+// Returns true when `c` is an ASCII letter ('a'..'z' or 'A'..'Z'). Bytes
+// outside the ASCII range are never letters.
+static inline bool is_alpha(unsigned char c) {
+  const bool lower = c >= 'a' && c <= 'z';
+  const bool upper = c >= 'A' && c <= 'Z';
+  return lower || upper;
+}
+
+// Returns true when `c` may appear in an identifier: an ASCII letter or one
+// of '$', '.', '_', '-'. Despite the name, scan_id() also uses this test for
+// the characters that follow the first one.
+//
+// `c` is narrowed to unsigned char before the letter test, so only its low
+// eight bits take part in that check.
+static inline bool valid_id_start(int c) {
+  switch (c) {
+  case '$':
+  case '.':
+  case '_':
+  case '-':
+    return true;
+  default:
+    return is_alpha(static_cast<unsigned char>(c));
+  }
+}
+
+// Scans an identifier at the current position and advances past it.
+//
+// Two shapes are accepted:
+//   * when `accept_digits` is true and the current character is a decimal
+//     digit, a run of decimal digits (a purely numeric name);
+//   * otherwise one character accepted by valid_id_start(), followed by any
+//     mix of such characters and decimal digits.
+//
+// Returns true if an identifier was consumed. Returns false, consuming
+// nothing, when the current character cannot begin an identifier.
+bool Tokenizer::scan_id(bool accept_digits = true) {
+  const char first = peek();
+
+  if (accept_digits && is_digit(first)) {
+    // A name that starts with a digit must consist of digits only.
+    while (is_digit(peek()))
+      skip();
+    return true;
+  }
+
+  if (!valid_id_start(first))
+    return false;
+
+  // Use the file-local is_digit() here, not <cctype>'s isdigit(): the
+  // character is handled as a plain char, and passing a negative char value
+  // to isdigit() is undefined behaviour (and its result is locale dependent).
+  while (valid_id_start(peek()) || is_digit(peek()))
+    skip();
+
+  return true;
+}
+
+// Scans the next token from the input and returns it.
+//
+// Whitespace, as defined by is_space(), is skipped first. The returned Token
+// is built from the offset of the token's first character, the offset just
+// past the last consumed character, and the token kind:
+//
+//   @name                  Function (name may not start with a digit)
+//   %name  ^name           Variable, Label
+//   , ; ( ) { } =          Comma, Semicolon, LParen, RParen, LCurly, RCurly,
+//                          Equals
+//   ->                     RArrow
+//   [-]digits  [-]0x<hex>  Constant (see scan_constant())
+//   // ...                 Comment, through the end of the line
+//   i<digits>              Type
+//   other words            Inst (a letter, then letters, digits and '.')
+//   NUL                    Eof (nothing is consumed)
+//
+// Anything else yields Invalid. An Invalid token always covers at least one
+// character, so repeated calls make progress until Eof.
+Token Tokenizer::scan() {
+  // All character tests use the file-local ASCII helpers. The <cctype>
+  // functions have undefined behaviour for negative char values and depend
+  // on the current locale.
+  while (is_space(peek()))
+    skip();
+
+  // Record the start only after the blanks are gone, so the token span
+  // covers the token text and not the whitespace in front of it.
+  const std::size_t start = this->offset;
+
+  const TokenKind k = [&] {
+    switch (peek()) {
+    case '@':
+      skip();
+      return scan_id(false) ? TokenKind::Function : TokenKind::Invalid;
+    case '%':
+      skip();
+      return scan_id() ? TokenKind::Variable : TokenKind::Invalid;
+    case '^':
+      skip();
+      return scan_id() ? TokenKind::Label : TokenKind::Invalid;
+    case ',':
+      skip();
+      return TokenKind::Comma;
+    case ';':
+      skip();
+      return TokenKind::Semicolon;
+    case '(':
+      skip();
+      return TokenKind::LParen;
+    case ')':
+      skip();
+      return TokenKind::RParen;
+    case '{':
+      skip();
+      return TokenKind::LCurly;
+    case '}':
+      skip();
+      return TokenKind::RCurly;
+    case '=':
+      skip();
+      return TokenKind::Equals;
+    case '-': {
+      if (peek(1) == '>') {
+        skip(2);
+        return TokenKind::RArrow;
+      }
+      // scan_constant() consumes the sign itself, so negative literals obey
+      // the same rules as unsigned ones: hexadecimal is accepted and a
+      // leading zero followed by more digits is rejected.
+      if (is_digit(peek(1)))
+        return scan_constant() ? TokenKind::Constant : TokenKind::Invalid;
+      return TokenKind::Invalid;
+    }
+    case '/': {
+      skip();
+      if (peek() != '/')
+        return TokenKind::Invalid;
+
+      skip();
+      // Consume the rest of the line; the loop stops once a newline or NUL
+      // has been returned by eat().
+      char c = eat();
+      while (c != '\0' && c != '\n')
+        c = eat();
+
+      return TokenKind::Comment;
+    }
+    case '\0':
+      return TokenKind::Eof;
+    default: {
+      if (is_digit(peek()))
+        return scan_constant() ? TokenKind::Constant : TokenKind::Invalid;
+
+      if (is_alpha(peek())) {
+        const bool maybe_type = peek() == 'i';
+        skip();
+
+        // 'i' immediately followed by decimal digits is an integer type.
+        if (maybe_type && scan_dec())
+          return TokenKind::Type;
+
+        // Otherwise the letter already consumed starts an instruction name.
+        // This also covers a lone "i" and names such as "i.foo", because
+        // the first letter is not required to be followed by another one.
+        while (is_alpha(peek()) || is_digit(peek()) || peek() == '.')
+          skip();
+        return TokenKind::Inst;
+      }
+
+      return TokenKind::Invalid;
+    }
+    }
+  }();
+
+  // An unrecognised character would otherwise produce an empty Invalid token
+  // and leave the cursor where it was, so a caller looping until Eof would
+  // never advance. Consume the offending character. It cannot be NUL,
+  // because NUL is reported as Eof above.
+  if (k == TokenKind::Invalid && this->offset == start)
+    skip();
+
+  return Token{start, offset, k};
+}
+
+// Consumes a non-empty run of decimal digits at the current position.
+// Returns true if at least one digit was consumed. Returns false, consuming
+// nothing, when the current character is not a decimal digit.
+bool Tokenizer::scan_dec() {
+  if (is_digit(peek())) {
+    do {
+      skip();
+    } while (is_digit(peek()));
+    return true;
+  }
+
+  return false;
+}
+
+// Consumes a non-empty run of hexadecimal digits (either case) at the
+// current position. Returns true if at least one digit was consumed. Returns
+// false, consuming nothing, when the current character is not a hex digit.
+bool Tokenizer::scan_hex() {
+  if (is_xdigit(peek())) {
+    do {
+      skip();
+    } while (is_xdigit(peek()));
+    return true;
+  }
+
+  return false;
+}
+
+// Scans an integer literal at the current position:
+//
+//   ['-'] '0'                    the value zero
+//   ['-'] '0' 'x' <hex digits>   hexadecimal (lower-case 'x' only)
+//   ['-'] <nonzero-led digits>   decimal
+//
+// Returns true when a literal was consumed. Returns false otherwise; the
+// characters examined up to the point of failure (an optional sign, a
+// leading '0', the 'x') stay consumed. A '0' followed directly by another
+// decimal digit is rejected.
+bool Tokenizer::scan_constant() {
+  // Optional sign.
+  if (peek() == '-')
+    skip();
+
+  // Anything that does not start with '0' is a plain decimal literal;
+  // scan_dec() itself fails, consuming nothing, if no digit is present.
+  if (peek() != '0')
+    return scan_dec();
+
+  skip(); // the leading '0'
+
+  const auto next = peek();
+  if (is_digit(next))
+    return false; // leading zeros are not allowed
+  if (next != 'x')
+    return true; // just "0"
+
+  skip(); // the 'x'
+  return scan_hex();
+}
+
+} // namespace willowc