#include #include namespace willowc { static inline bool is_space(unsigned char c) { return c == ' ' || c == '\t' || c == '\n' || c == '\r'; } static inline bool is_digit(unsigned char c) { return c >= '0' && c <= '9'; } static inline bool is_xdigit(unsigned char c) { return is_digit(c) || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F'); } static inline bool is_alpha(unsigned char c) { unsigned char x = static_cast(c | 0x20); return x >= 'a' && x <= 'z'; } static inline bool valid_id_start(int c) { return is_alpha(c) || c == '$' || c == '.' || c == '_' || c == '-'; } bool Tokenizer::scan_id(bool accept_digits = true) { char c = peek(); if (accept_digits && is_digit(c)) { // if it starts with a digit, must be all digits while (is_digit(peek())) skip(); return true; } if (!valid_id_start(c)) return false; while (valid_id_start(peek()) || isdigit(peek())) skip(); return true; } Token Tokenizer::scan() { while (is_space(peek())) skip(); std::size_t start = this->offset; TokenKind k = [&] { switch (peek()) { case '@': skip(); if (scan_id(false)) return TokenKind::Function; return TokenKind::Invalid; case '%': skip(); if (scan_id()) return TokenKind::Variable; return TokenKind::Invalid; case '^': skip(); if (scan_id()) return TokenKind::Label; return TokenKind::Invalid; case ',': skip(); return TokenKind::Comma; case ':': skip(); return TokenKind::Colon; case ';': skip(); return TokenKind::Semicolon; case '(': skip(); return TokenKind::LParen; case ')': skip(); return TokenKind::RParen; case '{': skip(); return TokenKind::LCurly; case '}': skip(); return TokenKind::RCurly; case '=': skip(); return TokenKind::Equals; case '*': skip(); return TokenKind::Star; case '<': skip(); return TokenKind::LTriangle; case '>': skip(); return TokenKind::RTriangle; case '-': { if (peek(1) == '>') { skip(2); return TokenKind::RArrow; } if (isdigit(peek(1))) { skip(); if (scan_dec()) return TokenKind::Constant; } return TokenKind::Invalid; } case '/': { skip(); if (peek() != '/') return TokenKind::Invalid; skip(); while (peek() != '\0' && peek() != '\n') skip(); return TokenKind::Comment; } case '\0': return TokenKind::Eof; default: { if (is_digit(peek())) return scan_constant() ? TokenKind::Constant : TokenKind::Invalid; if (peek() == 'i') { skip(); if (scan_dec()) return TokenKind::Type; } if (isalpha(peek())) { skip(); while (isalnum(peek()) || peek() == '.') skip(); return TokenKind::Inst; } skip(); return TokenKind::Invalid; } } }(); Token t{start, offset, k}; if (t.kind == TokenKind::Invalid) { if (t.start == t.end) t.end++; recover(); } if (t.kind == TokenKind::Inst) { auto lexeme = buf.substr(t.start, t.end - t.start); if (lexeme == "func") t.kind = TokenKind::FuncKW; } return t; } bool Tokenizer::scan_dec() { if (!is_digit(peek())) return false; skip(); while (is_digit(peek())) skip(); return true; } bool Tokenizer::scan_hex() { if (!is_xdigit(peek())) return false; skip(); while (is_xdigit(peek())) skip(); return true; } bool Tokenizer::scan_constant() { if (peek() == '-') skip(); if (peek() == '0') { skip(); if (is_digit(peek())) return false; if (peek() == 'x') { skip(); return scan_hex(); } else { return true; // 0 } } else if (is_digit(peek())) { return scan_dec(); } return false; } void Tokenizer::recover() { auto is_boundary = [&](char c) { switch (c) { case ' ': [[fallthrough]]; case '\n': [[fallthrough]]; case '\t': [[fallthrough]]; case '\\': [[fallthrough]]; case ',': [[fallthrough]]; case '%': [[fallthrough]]; case '@': [[fallthrough]]; case ':': [[fallthrough]]; case ';': [[fallthrough]]; case '(': [[fallthrough]]; case ')': [[fallthrough]]; case '{': [[fallthrough]]; case '}': [[fallthrough]]; case '=': [[fallthrough]]; case '^': [[fallthrough]]; case '\0': return true; default: return false; } }; while (!is_boundary(peek())) { skip(); } } } // namespace willowc