From 32d0ac56c27719c6fcde70c51238f11cf5f44141 Mon Sep 17 00:00:00 2001 From: Stefan Weigl-Bosker Date: Fri, 20 Dec 2024 17:22:33 -0500 Subject: regex parsing --- src/main.zig | 37 +++++++------- src/parser.zig | 153 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ src/regex.zig | 60 +++++++++++++++++++++- 3 files changed, 229 insertions(+), 21 deletions(-) create mode 100644 src/parser.zig (limited to 'src') diff --git a/src/main.zig b/src/main.zig index 8964c3c..f28ab18 100644 --- a/src/main.zig +++ b/src/main.zig @@ -9,26 +9,23 @@ pub fn main() !void { defer _ = gpa.deinit(); const lexer = try allocator.create(Regex.Lexer); + lexer.* = Regex.Lexer.init("abc|123[a-Z]()+", &allocator); - defer allocator.destroy(lexer); - - while (true) { - const token = lexer.advance() catch |err| switch (err) { - Regex.Lexer.Error.EndOfBuffer => { - break; - }, - else => { - return err; - }, - }; - - try stdio.print("{}\n", .{token}); - - if (token.value) |v| { - switch (v) { - .Class => |*rl| rl.deinit(), - else => {}, - } - } + + const tl = try lexer.scan(); + + var it = tl.first; + while (it) |node| : (it = node.next) { + try stdio.print("{}\n", .{&(node.data)}); } + + const parser = try allocator.create(Regex.Parser); + + parser.* = Regex.Parser.init(&lexer.tokens, &allocator); + + _ = try parser.parseRe(); + + lexer.deinit(); + allocator.destroy(lexer); + allocator.destroy(parser); } diff --git a/src/parser.zig b/src/parser.zig new file mode 100644 index 0000000..49fcdb6 --- /dev/null +++ b/src/parser.zig @@ -0,0 +1,153 @@ +const std = @import("std"); +const Token = @import("regex.zig").Token; + +const TokenList = std.SinglyLinkedList(Token); + +pub const Rule = enum { Terminal, Re, Rer, Cat, Catr, E, L }; + +pub const ParseTree = struct { + rule: Rule, + token: ?Token, + sibling: ?*ParseTree, + child: ?*ParseTree, + + pub fn init(rule: Rule, allocator: *std.mem.Allocator) !*ParseTree { + const t = try allocator.create(ParseTree); + + t.* = .{ + .rule = rule, + .token = null, + .sibling = null, + .child = null, + }; + + return t; + } + + pub fn appendChild(self: *ParseTree, child: *ParseTree) void { + if (self.child) |c| { + var p = c; + while (p.sibling) |sibling| { + p = sibling; + } + p.sibling = child; + } else { + self.child = child; + } + } +}; + +pub const Parser = struct { + tokens: *const TokenList, + itr: ?*TokenList.Node, + allocator: *std.mem.Allocator, + + pub const Error = error{ UnexpectedToken, ExpectedToken }; + + pub fn init(tl: *const TokenList, allocator: *std.mem.Allocator) Parser { + return .{ .tokens = tl, .allocator = allocator, .itr = tl.first.? }; + } + + pub fn expectToken(self: *Parser, kind: Token.Kind) !*ParseTree { + if (self.itr) |node| { + if (node.data.kind == kind) { + const t = try ParseTree.init(Rule.Terminal, self.allocator); + + t.token = node.data; + + self.itr = node.next; + + return t; + } else { + std.debug.print("{d} | error: expected token of kind {s} but found token of kind {s}", .{ node.data.pos, @tagName(kind), @tagName(node.data.kind) }); + return Error.UnexpectedToken; + } + } else { + std.debug.print("error: expected token of kind {s} but reached end of stream!\n", .{@tagName(kind)}); // should use actual logging eventually + return Error.ExpectedToken; + } + } + + pub fn parseRe(self: *Parser) error{ ExpectedToken, UnexpectedToken, OutOfMemory }!*ParseTree { + const t = try ParseTree.init(Rule.Re, self.allocator); + + t.appendChild(try self.parseCat()); + t.appendChild(try self.parseRer()); + + return t; + } + + pub fn parseRer(self: *Parser) !*ParseTree { + const t = try ParseTree.init(Rule.Rer, self.allocator); + + if (self.itr) |_| { + t.appendChild(try self.expectToken(Token.Kind.Or)); + t.appendChild(try self.parseCat()); + } // else -> epsilon + + return t; + } + + pub fn parseCat(self: *Parser) !*ParseTree { + const t = try ParseTree.init(Rule.Cat, self.allocator); + + t.appendChild(try self.parseE()); + t.appendChild(try self.parseCatr()); + + return t; + } + + pub fn parseCatr(self: *Parser) !*ParseTree { + const t = try ParseTree.init(Rule.Catr, self.allocator); + + if (self.itr) |_| { + t.appendChild(try self.parseE()); + t.appendChild(try self.parseCatr()); + } + + return t; + } + + pub fn parseE(self: *Parser) !*ParseTree { + const t = try ParseTree.init(Rule.E, self.allocator); + + t.appendChild(try self.parseL()); + + if (self.itr) |node| { + switch (node.data.kind) { + .Plus, .Question, .Star => { + t.appendChild(try self.expectToken(node.data.kind)); // bruh + }, + else => {}, + } + } else { + return Error.ExpectedToken; + } + + return t; + } + + pub fn parseL(self: *Parser) !*ParseTree { + const t = try ParseTree.init(Rule.L, self.allocator); + + if (self.itr) |node| { + switch (node.data.kind) { + .Literal, .Class, .Dot => { + t.appendChild(try self.expectToken(node.data.kind)); // bruh + }, + .LParen => { + t.appendChild(try self.expectToken(Token.Kind.LParen)); + t.appendChild(try self.parseRe()); + t.appendChild(try self.expectToken(Token.Kind.RParen)); + }, + else => { + return Error.UnexpectedToken; + }, + } + } else { + return Error.ExpectedToken; + } + + return t; + } +}; diff --git a/src/regex.zig b/src/regex.zig index a2f9c79..de24dc0 100644 --- a/src/regex.zig +++ b/src/regex.zig @@ -1,4 +1,6 @@ const std = @import("std"); +pub const Parser = @import("parser.zig").Parser; +pub const ParseTree = @import("parser.zig").ParseTree; pub const Token = struct { pub const Kind = enum { Literal, LParen, RParen, Class, Dot, Star, Plus, Question, Or }; @@ -25,6 +27,15 @@ pub const Token = struct { try writer.print(" }}", .{}); } + + pub fn deinit(self: *const Token) void { + if (self.value) |val| { + switch (val) { + .Class => |*rl| rl.deinit(), + else => {}, + } + } + } }; pub const Lexer = struct { @@ -32,11 +43,27 @@ pub const Lexer = struct { cursor: usize, regexp: []const u8, allocator: *std.mem.Allocator, + tokens: std.SinglyLinkedList(Token), pub const Error = error{ EndOfBuffer, InvalidCharacter, UnexpectedCharater }; pub fn init(regexp: []const u8, allocator: *std.mem.Allocator) Lexer { - return .{ .cursor = 0, .start = 0, .regexp = regexp, .allocator = allocator }; + return .{ .cursor = 0, .start = 0, .regexp = regexp, .allocator = allocator, .tokens = std.SinglyLinkedList(Token){ .first = null } }; + } + + pub fn deinit(self: *Lexer) void { + var it = self.tokens.first; + + while (it) |node| { + if (node.data.value) |val| { + switch (val) { + .Class => |*rl| rl.deinit(), + else => {}, + } + } + it = node.next; + self.allocator.destroy(node); + } } fn readChar(self: *Lexer) !u8 { @@ -47,6 +74,37 @@ pub const Lexer = struct { return self.regexp[self.cursor]; } + pub fn scan(self: *Lexer) !std.SinglyLinkedList(Token) { + if (self.tokens.first) |_| { + return self.tokens; + } else { + self.tokens.first = try self.allocator.create(std.SinglyLinkedList(Token).Node); + } + + var tail: ?*std.SinglyLinkedList(Token).Node = null; + var node = self.tokens.first; + + while (true) { + const token: Token = self.advance() catch |err| switch (err) { + Error.EndOfBuffer => { + self.allocator.destroy(node.?); + if (tail) |t| { + t.next = null; + } + return self.tokens; + }, + else => { + return err; + }, + }; + + node.?.data = token; + node.?.next = try self.allocator.create(std.SinglyLinkedList(Token).Node); + tail = node; + node = node.?.next; + } + } + pub fn advance(self: *Lexer) !Token { self.start = self.cursor; -- cgit v1.2.3