diff options
-rw-r--r-- | LICENSE | 16 | ||||
-rw-r--r-- | build.zig | 55 | ||||
-rw-r--r-- | build.zig.zon | 72 | ||||
-rw-r--r-- | src/main.zig | 34 | ||||
-rw-r--r-- | src/regex.zig | 200 |
5 files changed, 377 insertions, 0 deletions
@@ -0,0 +1,16 @@ +ISC License + +Copyright (c) 2024 Stefan Weigl-Bosker <stefan@s00.xyz> + +Permission to use, copy, modify, and/or distribute this software for any +purpose with or without fee is hereby granted, provided that the above +copyright notice and this permission notice appear in all copies. + +THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH +REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY +AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, +INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM +LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR +OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR +PERFORMANCE OF THIS SOFTWARE. + diff --git a/build.zig b/build.zig new file mode 100644 index 0000000..f7e28eb --- /dev/null +++ b/build.zig @@ -0,0 +1,55 @@ +const std = @import("std"); + +pub fn build(b: *std.Build) void { + const target = b.standardTargetOptions(.{}); + const optimize = b.standardOptimizeOption(.{}); + + const exe = b.addExecutable(.{ + .name = "lg", + .root_source_file = b.path("src/main.zig"), + .target = target, + .optimize = optimize, + }); + + // This declares intent for the executable to be installed into the + // standard location when the user invokes the "install" step (the default + // step when running `zig build`). + b.installArtifact(exe); + + // This *creates* a Run step in the build graph, to be executed when another + // step is evaluated that depends on it. The next line below will establish + // such a dependency. + const run_cmd = b.addRunArtifact(exe); + + // By making the run step depend on the install step, it will be run from the + // installation directory rather than directly from within the cache directory. + // This is not necessary, however, if the application depends on other installed + // files, this ensures they will be present and in the expected location. + run_cmd.step.dependOn(b.getInstallStep()); + + // This allows the user to pass arguments to the application in the build + // command itself, like this: `zig build run -- arg1 arg2 etc` + if (b.args) |args| { + run_cmd.addArgs(args); + } + + // This creates a build step. It will be visible in the `zig build --help` menu, + // and can be selected like this: `zig build run` + // This will evaluate the `run` step rather than the default, which is "install". + const run_step = b.step("run", "Run the app"); + run_step.dependOn(&run_cmd.step); + + const exe_unit_tests = b.addTest(.{ + .root_source_file = b.path("src/main.zig"), + .target = target, + .optimize = optimize, + }); + + const run_exe_unit_tests = b.addRunArtifact(exe_unit_tests); + + // Similar to creating the run step earlier, this exposes a `test` step to + // the `zig build --help` menu, providing a way for the user to request + // running the unit tests. + const test_step = b.step("test", "Run unit tests"); + test_step.dependOn(&run_exe_unit_tests.step); +} diff --git a/build.zig.zon b/build.zig.zon new file mode 100644 index 0000000..6d6fadb --- /dev/null +++ b/build.zig.zon @@ -0,0 +1,72 @@ +.{ + // This is the default name used by packages depending on this one. For + // example, when a user runs `zig fetch --save <url>`, this field is used + // as the key in the `dependencies` table. Although the user can choose a + // different name, most users will stick with this provided value. + // + // It is redundant to include "zig" in this name because it is already + // within the Zig package namespace. + .name = "lg", + + // This is a [Semantic Version](https://semver.org/). + // In a future version of Zig it will be used for package deduplication. + .version = "0.0.0", + + // This field is optional. + // This is currently advisory only; Zig does not yet do anything + // with this value. + //.minimum_zig_version = "0.11.0", + + // This field is optional. + // Each dependency must either provide a `url` and `hash`, or a `path`. + // `zig build --fetch` can be used to fetch all dependencies of a package, recursively. + // Once all dependencies are fetched, `zig build` no longer requires + // internet connectivity. + .dependencies = .{ + // See `zig fetch --save <url>` for a command-line interface for adding dependencies. + //.example = .{ + // // When updating this field to a new URL, be sure to delete the corresponding + // // `hash`, otherwise you are communicating that you expect to find the old hash at + // // the new URL. + // .url = "https://example.com/foo.tar.gz", + // + // // This is computed from the file contents of the directory of files that is + // // obtained after fetching `url` and applying the inclusion rules given by + // // `paths`. + // // + // // This field is the source of truth; packages do not come from a `url`; they + // // come from a `hash`. `url` is just one of many possible mirrors for how to + // // obtain a package matching this `hash`. + // // + // // Uses the [multihash](https://multiformats.io/multihash/) format. + // .hash = "...", + // + // // When this is provided, the package is found in a directory relative to the + // // build root. In this case the package's hash is irrelevant and therefore not + // // computed. This field and `url` are mutually exclusive. + // .path = "foo", + + // // When this is set to `true`, a package is declared to be lazily + // // fetched. This makes the dependency only get fetched if it is + // // actually used. + // .lazy = false, + //}, + }, + + // Specifies the set of files and directories that are included in this package. + // Only files and directories listed here are included in the `hash` that + // is computed for this package. Only files listed here will remain on disk + // when using the zig package manager. As a rule of thumb, one should list + // files required for compilation plus any license(s). + // Paths are relative to the build root. Use the empty string (`""`) to refer to + // the build root itself. + // A directory listed here means that all files within, recursively, are included. + .paths = .{ + "build.zig", + "build.zig.zon", + "src", + // For example... + //"LICENSE", + //"README.md", + }, +} diff --git a/src/main.zig b/src/main.zig new file mode 100644 index 0000000..8964c3c --- /dev/null +++ b/src/main.zig @@ -0,0 +1,34 @@ +const std = @import("std"); +const Regex = @import("regex.zig"); + +pub fn main() !void { + const stdio = std.io.getStdIn().writer(); + var gpa = std.heap.GeneralPurposeAllocator(.{}){}; + var allocator = gpa.allocator(); + + defer _ = gpa.deinit(); + + const lexer = try allocator.create(Regex.Lexer); + lexer.* = Regex.Lexer.init("abc|123[a-Z]()+", &allocator); + defer allocator.destroy(lexer); + + while (true) { + const token = lexer.advance() catch |err| switch (err) { + Regex.Lexer.Error.EndOfBuffer => { + break; + }, + else => { + return err; + }, + }; + + try stdio.print("{}\n", .{token}); + + if (token.value) |v| { + switch (v) { + .Class => |*rl| rl.deinit(), + else => {}, + } + } + } +} diff --git a/src/regex.zig b/src/regex.zig new file mode 100644 index 0000000..ed66df8 --- /dev/null +++ b/src/regex.zig @@ -0,0 +1,200 @@ +const std = @import("std"); + +pub const Token = struct { + pub const Kind = enum { Literal, LParen, RParen, Class, Dot, Star, Plus, Question, Or }; + pub const Value = union(Token.Kind) { Literal: u8, LParen, RParen, Class: RangeList, Dot, Star, Plus, Question, Or }; + + kind: Token.Kind, + value: ?Token.Value, + pos: usize, + + pub fn format(self: *const Token, comptime fmt: []const u8, _: std.fmt.FormatOptions, writer: anytype) !void { + if (fmt.len != 0) { // must be {} or {any} + return std.invalidFmtError(fmt, self); + } + + try writer.print("{{ kind: {s}", .{@tagName(self.kind)}); + + if (self.value) |val| { + switch (val) { + .Literal => |literal| try writer.print(", value: {c}", .{literal}), + .Class => |rl| try writer.print(", value: {}", .{rl}), + else => {}, + } + } + + try writer.print(" }}", .{}); + } +}; + +pub const Lexer = struct { + start: usize, + cursor: usize, + regexp: []const u8, + allocator: *std.mem.Allocator, + + pub const Error = error{ EndOfBuffer, InvalidCharacter, UnexpectedCharater }; + + pub fn init(regexp: []const u8, allocator: *std.mem.Allocator) Lexer { + return .{ .cursor = 0, .start = 0, .regexp = regexp, .allocator = allocator }; + } + + fn readChar(self: *Lexer) !u8 { + if (self.cursor >= self.regexp.len) { + return Error.EndOfBuffer; + } + + return self.regexp[self.cursor]; + } + + pub fn advance(self: *Lexer) !Token { + self.start = self.cursor; + + var c = try self.readChar(); + + const inferred_type: Token.Kind = switch (c) { + '(' => .LParen, + ')' => .RParen, + '[' => .Class, + '.' => .Dot, + '*' => .Star, + '+' => .Plus, + '?' => .Question, + '|' => .Or, + else => .Literal, + }; + + switch (inferred_type) { + .Literal => { + if (c == '\\') { + self.cursor += 1; + c = try self.readChar(); // needs more verbose error handling + c = escapedChar(c); + } + self.cursor += 1; + return .{ .kind = inferred_type, .value = .{ .Literal = c }, .pos = self.start }; + }, + .Class => { + while (c != ']') { // needs more verbose error handling + if (self.cursor >= self.regexp.len) { + return Error.EndOfBuffer; + } + self.cursor += 1; + } + return .{ .kind = inferred_type, .value = .{ .Class = try RangeList.init(self.regexp[self.start + 1 .. self.cursor - 1], self.allocator) }, .pos = self.start }; + }, + else => { + self.cursor += 1; + return .{ .kind = inferred_type, .pos = self.start, .value = null }; + }, + } + } + + fn escapedChar(c: u8) u8 { + return switch (c) { + 'n' => '\n', // placeholder for now + 't' => '\t', + 'r' => '\r', + else => c, + }; + } +}; + +// could use something like a bst, but thinking about what regex classes people actually write makes me think that a list is more efficient +pub const RangeList = struct { + negated: bool, + head: ?*Node, + allocator: *std.mem.Allocator, + + const Node = struct { + range: struct { min: u8, max: u8 }, + next: ?*Node, + }; + + // need to do more error handling + pub fn init(str: []const u8, allocator: *std.mem.Allocator) !RangeList { + var rl = RangeList{ + .negated = false, + .head = null, + .allocator = allocator, + }; + + var i: usize = 0; + + if (str.len == 0) { + return rl; + } + + if (str[0] == '^') { + rl.negated = true; + i += 1; + } + + var p = rl.head; + while (i < str.len) : (i += 1) { + const node = try rl.allocator.create(Node); + + const min = str[i]; + var max = min; + + i += 1; + + if (i + 1 < str.len and str[i] == '-') { + i += 1; + max = str[i]; + i += 1; + } + + node.* = .{ + .range = .{ .min = min, .max = max }, + .next = null, + }; + + if (p) |tail| { + tail.next = node; + p = tail.next; + } else { + rl.head = node; + p = rl.head; + } + } + return rl; + } + + pub fn deinit(self: *const RangeList) void { + var itr = self.head; + + while (itr) |node| { + const prev = node; + itr = node.next; + self.allocator.destroy(prev); + } + } + + pub fn format(self: *const RangeList, comptime fmt: []const u8, _: std.fmt.FormatOptions, writer: anytype) !void { + var itr = self.head; + + if (fmt.len != 0) { // must be {} or {any} + return std.invalidFmtError(fmt, self); + } + + if (self.negated) { + try writer.print("^", .{}); + } + + try writer.print("[", .{}); + + while (itr) |node| : (itr = node.next) { + if (node.range.min == node.range.max) { + try writer.print("{c}", .{node.range.min}); + } else { + try writer.print("{c}-{c}", .{ node.range.min, node.range.max }); + } + + if (node.next) |_| { + try writer.print(", ", .{}); + } + } + try writer.print("]", .{}); + } +}; |