From 94b86481395cb2d2a594bb98a1380b9ddc8aa900 Mon Sep 17 00:00:00 2001 From: Stefan Weigl-Bosker Date: Sat, 14 Dec 2024 18:10:39 -0500 Subject: inital commit --- src/main.zig | 34 ++++++++++ src/regex.zig | 200 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 234 insertions(+) create mode 100644 src/main.zig create mode 100644 src/regex.zig (limited to 'src') diff --git a/src/main.zig b/src/main.zig new file mode 100644 index 0000000..8964c3c --- /dev/null +++ b/src/main.zig @@ -0,0 +1,34 @@ +const std = @import("std"); +const Regex = @import("regex.zig"); + +pub fn main() !void { + const stdio = std.io.getStdIn().writer(); + var gpa = std.heap.GeneralPurposeAllocator(.{}){}; + var allocator = gpa.allocator(); + + defer _ = gpa.deinit(); + + const lexer = try allocator.create(Regex.Lexer); + lexer.* = Regex.Lexer.init("abc|123[a-Z]()+", &allocator); + defer allocator.destroy(lexer); + + while (true) { + const token = lexer.advance() catch |err| switch (err) { + Regex.Lexer.Error.EndOfBuffer => { + break; + }, + else => { + return err; + }, + }; + + try stdio.print("{}\n", .{token}); + + if (token.value) |v| { + switch (v) { + .Class => |*rl| rl.deinit(), + else => {}, + } + } + } +} diff --git a/src/regex.zig b/src/regex.zig new file mode 100644 index 0000000..ed66df8 --- /dev/null +++ b/src/regex.zig @@ -0,0 +1,200 @@ +const std = @import("std"); + +pub const Token = struct { + pub const Kind = enum { Literal, LParen, RParen, Class, Dot, Star, Plus, Question, Or }; + pub const Value = union(Token.Kind) { Literal: u8, LParen, RParen, Class: RangeList, Dot, Star, Plus, Question, Or }; + + kind: Token.Kind, + value: ?Token.Value, + pos: usize, + + pub fn format(self: *const Token, comptime fmt: []const u8, _: std.fmt.FormatOptions, writer: anytype) !void { + if (fmt.len != 0) { // must be {} or {any} + return std.invalidFmtError(fmt, self); + } + + try writer.print("{{ kind: {s}", .{@tagName(self.kind)}); + + if (self.value) |val| { + switch (val) { + .Literal => |literal| try writer.print(", value: {c}", .{literal}), + .Class => |rl| try writer.print(", value: {}", .{rl}), + else => {}, + } + } + + try writer.print(" }}", .{}); + } +}; + +pub const Lexer = struct { + start: usize, + cursor: usize, + regexp: []const u8, + allocator: *std.mem.Allocator, + + pub const Error = error{ EndOfBuffer, InvalidCharacter, UnexpectedCharater }; + + pub fn init(regexp: []const u8, allocator: *std.mem.Allocator) Lexer { + return .{ .cursor = 0, .start = 0, .regexp = regexp, .allocator = allocator }; + } + + fn readChar(self: *Lexer) !u8 { + if (self.cursor >= self.regexp.len) { + return Error.EndOfBuffer; + } + + return self.regexp[self.cursor]; + } + + pub fn advance(self: *Lexer) !Token { + self.start = self.cursor; + + var c = try self.readChar(); + + const inferred_type: Token.Kind = switch (c) { + '(' => .LParen, + ')' => .RParen, + '[' => .Class, + '.' => .Dot, + '*' => .Star, + '+' => .Plus, + '?' => .Question, + '|' => .Or, + else => .Literal, + }; + + switch (inferred_type) { + .Literal => { + if (c == '\\') { + self.cursor += 1; + c = try self.readChar(); // needs more verbose error handling + c = escapedChar(c); + } + self.cursor += 1; + return .{ .kind = inferred_type, .value = .{ .Literal = c }, .pos = self.start }; + }, + .Class => { + while (c != ']') { // needs more verbose error handling + if (self.cursor >= self.regexp.len) { + return Error.EndOfBuffer; + } + self.cursor += 1; + } + return .{ .kind = inferred_type, .value = .{ .Class = try RangeList.init(self.regexp[self.start + 1 .. self.cursor - 1], self.allocator) }, .pos = self.start }; + }, + else => { + self.cursor += 1; + return .{ .kind = inferred_type, .pos = self.start, .value = null }; + }, + } + } + + fn escapedChar(c: u8) u8 { + return switch (c) { + 'n' => '\n', // placeholder for now + 't' => '\t', + 'r' => '\r', + else => c, + }; + } +}; + +// could use something like a bst, but thinking about what regex classes people actually write makes me think that a list is more efficient +pub const RangeList = struct { + negated: bool, + head: ?*Node, + allocator: *std.mem.Allocator, + + const Node = struct { + range: struct { min: u8, max: u8 }, + next: ?*Node, + }; + + // need to do more error handling + pub fn init(str: []const u8, allocator: *std.mem.Allocator) !RangeList { + var rl = RangeList{ + .negated = false, + .head = null, + .allocator = allocator, + }; + + var i: usize = 0; + + if (str.len == 0) { + return rl; + } + + if (str[0] == '^') { + rl.negated = true; + i += 1; + } + + var p = rl.head; + while (i < str.len) : (i += 1) { + const node = try rl.allocator.create(Node); + + const min = str[i]; + var max = min; + + i += 1; + + if (i + 1 < str.len and str[i] == '-') { + i += 1; + max = str[i]; + i += 1; + } + + node.* = .{ + .range = .{ .min = min, .max = max }, + .next = null, + }; + + if (p) |tail| { + tail.next = node; + p = tail.next; + } else { + rl.head = node; + p = rl.head; + } + } + return rl; + } + + pub fn deinit(self: *const RangeList) void { + var itr = self.head; + + while (itr) |node| { + const prev = node; + itr = node.next; + self.allocator.destroy(prev); + } + } + + pub fn format(self: *const RangeList, comptime fmt: []const u8, _: std.fmt.FormatOptions, writer: anytype) !void { + var itr = self.head; + + if (fmt.len != 0) { // must be {} or {any} + return std.invalidFmtError(fmt, self); + } + + if (self.negated) { + try writer.print("^", .{}); + } + + try writer.print("[", .{}); + + while (itr) |node| : (itr = node.next) { + if (node.range.min == node.range.max) { + try writer.print("{c}", .{node.range.min}); + } else { + try writer.print("{c}-{c}", .{ node.range.min, node.range.max }); + } + + if (node.next) |_| { + try writer.print(", ", .{}); + } + } + try writer.print("]", .{}); + } +}; -- cgit v1.2.3