From 32d0ac56c27719c6fcde70c51238f11cf5f44141 Mon Sep 17 00:00:00 2001
From: Stefan Weigl-Bosker <stefan@s00.xyz>
Date: Fri, 20 Dec 2024 17:22:33 -0500
Subject: regex parsing

---
 src/main.zig   |  37 +++++++-------
 src/parser.zig | 153 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/regex.zig  |  60 +++++++++++++++++++++-
 3 files changed, 229 insertions(+), 21 deletions(-)
 create mode 100644 src/parser.zig

(limited to 'src')

diff --git a/src/main.zig b/src/main.zig
index 8964c3c..f28ab18 100644
--- a/src/main.zig
+++ b/src/main.zig
@@ -9,26 +9,23 @@ pub fn main() !void {
     defer _ = gpa.deinit();
 
     const lexer = try allocator.create(Regex.Lexer);
+
     lexer.* = Regex.Lexer.init("abc|123[a-Z]()+", &allocator);
-    defer allocator.destroy(lexer);
-
-    while (true) {
-        const token = lexer.advance() catch |err| switch (err) {
-            Regex.Lexer.Error.EndOfBuffer => {
-                break;
-            },
-            else => {
-                return err;
-            },
-        };
-
-        try stdio.print("{}\n", .{token});
-
-        if (token.value) |v| {
-            switch (v) {
-                .Class => |*rl| rl.deinit(),
-                else => {},
-            }
-        }
+
+    const tl = try lexer.scan();
+
+    var it = tl.first;
+    while (it) |node| : (it = node.next) {
+        try stdio.print("{}\n", .{&(node.data)});
     }
+
+    const parser = try allocator.create(Regex.Parser);
+
+    parser.* = Regex.Parser.init(&lexer.tokens, &allocator);
+
+    _ = try parser.parseRe();
+
+    lexer.deinit();
+    allocator.destroy(lexer);
+    allocator.destroy(parser);
 }
diff --git a/src/parser.zig b/src/parser.zig
new file mode 100644
index 0000000..49fcdb6
--- /dev/null
+++ b/src/parser.zig
@@ -0,0 +1,153 @@
+const std = @import("std");
+const Token = @import("regex.zig").Token;
+
+const TokenList = std.SinglyLinkedList(Token);
+
+pub const Rule = enum { Terminal, Re, Rer, Cat, Catr, E, L };
+
+pub const ParseTree = struct {
+    rule: Rule,
+    token: ?Token,
+    sibling: ?*ParseTree,
+    child: ?*ParseTree,
+
+    pub fn init(rule: Rule, allocator: *std.mem.Allocator) !*ParseTree {
+        const t = try allocator.create(ParseTree);
+
+        t.* = .{
+            .rule = rule,
+            .token = null,
+            .sibling = null,
+            .child = null,
+        };
+
+        return t;
+    }
+
+    pub fn appendChild(self: *ParseTree, child: *ParseTree) void {
+        if (self.child) |c| {
+            var p = c;
+            while (p.sibling) |sibling| {
+                p = sibling;
+            }
+            p.sibling = child;
+        } else {
+            self.child = child;
+        }
+    }
+};
+
+pub const Parser = struct {
+    tokens: *const TokenList,
+    itr: ?*TokenList.Node,
+    allocator: *std.mem.Allocator,
+
+    pub const Error = error{ UnexpectedToken, ExpectedToken };
+
+    pub fn init(tl: *const TokenList, allocator: *std.mem.Allocator) Parser {
+        return .{ .tokens = tl, .allocator = allocator, .itr = tl.first.? };
+    }
+
+    pub fn expectToken(self: *Parser, kind: Token.Kind) !*ParseTree {
+        if (self.itr) |node| {
+            if (node.data.kind == kind) {
+                const t = try ParseTree.init(Rule.Terminal, self.allocator);
+
+                t.token = node.data;
+
+                self.itr = node.next;
+
+                return t;
+            } else {
+                std.debug.print("{d} | error: expected token of kind {s} but found token of kind {s}", .{ node.data.pos, @tagName(kind), @tagName(node.data.kind) });
+                return Error.UnexpectedToken;
+            }
+        } else {
+            std.debug.print("error: expected token of kind {s} but reached end of stream!\n", .{@tagName(kind)}); // should use actual logging eventually
+            return Error.ExpectedToken;
+        }
+    }
+
+    pub fn parseRe(self: *Parser) error{ ExpectedToken, UnexpectedToken, OutOfMemory }!*ParseTree {
+        const t = try ParseTree.init(Rule.Re, self.allocator);
+
+        t.appendChild(try self.parseCat());
+        t.appendChild(try self.parseRer());
+
+        return t;
+    }
+
+    pub fn parseRer(self: *Parser) !*ParseTree {
+        const t = try ParseTree.init(Rule.Rer, self.allocator);
+
+        if (self.itr) |_| {
+            t.appendChild(try self.expectToken(Token.Kind.Or));
+            t.appendChild(try self.parseCat());
+        } // else -> epsilon
+
+        return t;
+    }
+
+    pub fn parseCat(self: *Parser) !*ParseTree {
+        const t = try ParseTree.init(Rule.Cat, self.allocator);
+
+        t.appendChild(try self.parseE());
+        t.appendChild(try self.parseCatr());
+
+        return t;
+    }
+
+    pub fn parseCatr(self: *Parser) !*ParseTree {
+        const t = try ParseTree.init(Rule.Catr, self.allocator);
+
+        if (self.itr) |_| {
+            t.appendChild(try self.parseE());
+            t.appendChild(try self.parseCatr());
+        }
+
+        return t;
+    }
+
+    pub fn parseE(self: *Parser) !*ParseTree {
+        const t = try ParseTree.init(Rule.E, self.allocator);
+
+        t.appendChild(try self.parseL());
+
+        if (self.itr) |node| {
+            switch (node.data.kind) {
+                .Plus, .Question, .Star => {
+                    t.appendChild(try self.expectToken(node.data.kind)); // bruh
+                },
+                else => {},
+            }
+        } else {
+            return Error.ExpectedToken;
+        }
+
+        return t;
+    }
+
+    pub fn parseL(self: *Parser) !*ParseTree {
+        const t = try ParseTree.init(Rule.L, self.allocator);
+
+        if (self.itr) |node| {
+            switch (node.data.kind) {
+                .Literal, .Class, .Dot => {
+                    t.appendChild(try self.expectToken(node.data.kind)); // bruh
+                },
+                .LParen => {
+                    t.appendChild(try self.expectToken(Token.Kind.LParen));
+                    t.appendChild(try self.parseRe());
+                    t.appendChild(try self.expectToken(Token.Kind.RParen));
+                },
+                else => {
+                    return Error.UnexpectedToken;
+                },
+            }
+        } else {
+            return Error.ExpectedToken;
+        }
+
+        return t;
+    }
+};
diff --git a/src/regex.zig b/src/regex.zig
index a2f9c79..de24dc0 100644
--- a/src/regex.zig
+++ b/src/regex.zig
@@ -1,4 +1,6 @@
 const std = @import("std");
+pub const Parser = @import("parser.zig").Parser;
+pub const ParseTree = @import("parser.zig").ParseTree;
 
 pub const Token = struct {
     pub const Kind = enum { Literal, LParen, RParen, Class, Dot, Star, Plus, Question, Or };
@@ -25,6 +27,15 @@ pub const Token = struct {
 
         try writer.print(" }}", .{});
     }
+
+    pub fn deinit(self: *const Token) void {
+        if (self.value) |val| {
+            switch (val) {
+                .Class => |*rl| rl.deinit(),
+                else => {},
+            }
+        }
+    }
 };
 
 pub const Lexer = struct {
@@ -32,11 +43,27 @@ pub const Lexer = struct {
     cursor: usize,
     regexp: []const u8,
     allocator: *std.mem.Allocator,
+    tokens: std.SinglyLinkedList(Token),
 
     pub const Error = error{ EndOfBuffer, InvalidCharacter, UnexpectedCharater };
 
     pub fn init(regexp: []const u8, allocator: *std.mem.Allocator) Lexer {
-        return .{ .cursor = 0, .start = 0, .regexp = regexp, .allocator = allocator };
+        return .{ .cursor = 0, .start = 0, .regexp = regexp, .allocator = allocator, .tokens = std.SinglyLinkedList(Token){ .first = null } };
+    }
+
+    pub fn deinit(self: *Lexer) void {
+        var it = self.tokens.first;
+
+        while (it) |node| {
+            if (node.data.value) |val| {
+                switch (val) {
+                    .Class => |*rl| rl.deinit(),
+                    else => {},
+                }
+            }
+            it = node.next;
+            self.allocator.destroy(node);
+        }
     }
 
     fn readChar(self: *Lexer) !u8 {
@@ -47,6 +74,37 @@ pub const Lexer = struct {
         return self.regexp[self.cursor];
     }
 
+    pub fn scan(self: *Lexer) !std.SinglyLinkedList(Token) {
+        if (self.tokens.first) |_| {
+            return self.tokens;
+        } else {
+            self.tokens.first = try self.allocator.create(std.SinglyLinkedList(Token).Node);
+        }
+
+        var tail: ?*std.SinglyLinkedList(Token).Node = null;
+        var node = self.tokens.first;
+
+        while (true) {
+            const token: Token = self.advance() catch |err| switch (err) {
+                Error.EndOfBuffer => {
+                    self.allocator.destroy(node.?);
+                    if (tail) |t| {
+                        t.next = null;
+                    }
+                    return self.tokens;
+                },
+                else => {
+                    return err;
+                },
+            };
+
+            node.?.data = token;
+            node.?.next = try self.allocator.create(std.SinglyLinkedList(Token).Node);
+            tail = node;
+            node = node.?.next;
+        }
+    }
+
     pub fn advance(self: *Lexer) !Token {
         self.start = self.cursor;
 
-- 
cgit v1.2.3