5 files changed, 377 insertions, 0 deletions
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..1909081
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,16 @@
+ISC License
+     
+Copyright (c) 2024 Stefan Weigl-Bosker <stefan@s00.xyz>
+
+Permission to use, copy, modify, and/or distribute this software for any
+purpose with or without fee is hereby granted, provided that the above
+copyright notice and this permission notice appear in all copies.
+
+THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
+REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
+AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
+INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
+LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
+OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+PERFORMANCE OF THIS SOFTWARE.
+
diff --git a/build.zig b/build.zig
new file mode 100644
index 0000000..f7e28eb
--- /dev/null
+++ b/build.zig
@@ -0,0 +1,55 @@
+const std = @import("std");
+
+pub fn build(b: *std.Build) void {
+    const target = b.standardTargetOptions(.{});
+    const optimize = b.standardOptimizeOption(.{});
+
+    const exe = b.addExecutable(.{
+        .name = "lg",
+        .root_source_file = b.path("src/main.zig"),
+        .target = target,
+        .optimize = optimize,
+    });
+
+    // This declares intent for the executable to be installed into the
+    // standard location when the user invokes the "install" step (the default
+    // step when running `zig build`).
+    b.installArtifact(exe);
+
+    // This *creates* a Run step in the build graph, to be executed when another
+    // step is evaluated that depends on it. The next line below will establish
+    // such a dependency.
+    const run_cmd = b.addRunArtifact(exe);
+
+    // By making the run step depend on the install step, it will be run from the
+    // installation directory rather than directly from within the cache directory.
+    // This is not necessary, however, if the application depends on other installed
+    // files, this ensures they will be present and in the expected location.
+    run_cmd.step.dependOn(b.getInstallStep());
+
+    // This allows the user to pass arguments to the application in the build
+    // command itself, like this: `zig build run -- arg1 arg2 etc`
+    if (b.args) |args| {
+        run_cmd.addArgs(args);
+    }
+
+    // This creates a build step. It will be visible in the `zig build --help` menu,
+    // and can be selected like this: `zig build run`
+    // This will evaluate the `run` step rather than the default, which is "install".
+    const run_step = b.step("run", "Run the app");
+    run_step.dependOn(&run_cmd.step);
+
+    const exe_unit_tests = b.addTest(.{
+        .root_source_file = b.path("src/main.zig"),
+        .target = target,
+        .optimize = optimize,
+    });
+
+    const run_exe_unit_tests = b.addRunArtifact(exe_unit_tests);
+
+    // Similar to creating the run step earlier, this exposes a `test` step to
+    // the `zig build --help` menu, providing a way for the user to request
+    // running the unit tests.
+    const test_step = b.step("test", "Run unit tests");
+    test_step.dependOn(&run_exe_unit_tests.step);
+}
diff --git a/build.zig.zon b/build.zig.zon
new file mode 100644
index 0000000..6d6fadb
--- /dev/null
+++ b/build.zig.zon
@@ -0,0 +1,72 @@
+.{
+    // This is the default name used by packages depending on this one. For
+    // example, when a user runs `zig fetch --save <url>`, this field is used
+    // as the key in the `dependencies` table. Although the user can choose a
+    // different name, most users will stick with this provided value.
+    //
+    // It is redundant to include "zig" in this name because it is already
+    // within the Zig package namespace.
+    .name = "lg",
+
+    // This is a [Semantic Version](https://semver.org/).
+    // In a future version of Zig it will be used for package deduplication.
+    .version = "0.0.0",
+
+    // This field is optional.
+    // This is currently advisory only; Zig does not yet do anything
+    // with this value.
+    //.minimum_zig_version = "0.11.0",
+
+    // This field is optional.
+    // Each dependency must either provide a `url` and `hash`, or a `path`.
+    // `zig build --fetch` can be used to fetch all dependencies of a package, recursively.
+    // Once all dependencies are fetched, `zig build` no longer requires
+    // internet connectivity.
+    .dependencies = .{
+        // See `zig fetch --save <url>` for a command-line interface for adding dependencies.
+        //.example = .{
+        //    // When updating this field to a new URL, be sure to delete the corresponding
+        //    // `hash`, otherwise you are communicating that you expect to find the old hash at
+        //    // the new URL.
+        //    .url = "https://example.com/foo.tar.gz",
+        //
+        //    // This is computed from the file contents of the directory of files that is
+        //    // obtained after fetching `url` and applying the inclusion rules given by
+        //    // `paths`.
+        //    //
+        //    // This field is the source of truth; packages do not come from a `url`; they
+        //    // come from a `hash`. `url` is just one of many possible mirrors for how to
+        //    // obtain a package matching this `hash`.
+        //    //
+        //    // Uses the [multihash](https://multiformats.io/multihash/) format.
+        //    .hash = "...",
+        //
+        //    // When this is provided, the package is found in a directory relative to the
+        //    // build root. In this case the package's hash is irrelevant and therefore not
+        //    // computed. This field and `url` are mutually exclusive.
+        //    .path = "foo",
+
+        //    // When this is set to `true`, a package is declared to be lazily
+        //    // fetched. This makes the dependency only get fetched if it is
+        //    // actually used.
+        //    .lazy = false,
+        //},
+    },
+
+    // Specifies the set of files and directories that are included in this package.
+    // Only files and directories listed here are included in the `hash` that
+    // is computed for this package. Only files listed here will remain on disk
+    // when using the zig package manager. As a rule of thumb, one should list
+    // files required for compilation plus any license(s).
+    // Paths are relative to the build root. Use the empty string (`""`) to refer to
+    // the build root itself.
+    // A directory listed here means that all files within, recursively, are included.
+    .paths = .{
+        "build.zig",
+        "build.zig.zon",
+        "src",
+        // For example...
+        //"LICENSE",
+        //"README.md",
+    },
+}
diff --git a/src/main.zig b/src/main.zig
new file mode 100644
index 0000000..8964c3c
--- /dev/null
+++ b/src/main.zig
@@ -0,0 +1,34 @@
+const std = @import("std");
+const Regex = @import("regex.zig");
+
+pub fn main() !void {
+    const stdio = std.io.getStdIn().writer();
+    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
+    var allocator = gpa.allocator();
+
+    defer _ = gpa.deinit();
+
+    const lexer = try allocator.create(Regex.Lexer);
+    lexer.* = Regex.Lexer.init("abc|123[a-Z]()+", &allocator);
+    defer allocator.destroy(lexer);
+
+    while (true) {
+        const token = lexer.advance() catch |err| switch (err) {
+            Regex.Lexer.Error.EndOfBuffer => {
+                break;
+            },
+            else => {
+                return err;
+            },
+        };
+
+        try stdio.print("{}\n", .{token});
+
+        if (token.value) |v| {
+            switch (v) {
+                .Class => |*rl| rl.deinit(),
+                else => {},
+            }
+        }
+    }
+}
diff --git a/src/regex.zig b/src/regex.zig
new file mode 100644
index 0000000..ed66df8
--- /dev/null
+++ b/src/regex.zig
@@ -0,0 +1,200 @@
+const std = @import("std");
+
+pub const Token = struct {
+    pub const Kind = enum { Literal, LParen, RParen, Class, Dot, Star, Plus, Question, Or };
+    pub const Value = union(Token.Kind) { Literal: u8, LParen, RParen, Class: RangeList, Dot, Star, Plus, Question, Or };
+
+    kind: Token.Kind,
+    value: ?Token.Value,
+    pos: usize,
+
+    pub fn format(self: *const Token, comptime fmt: []const u8, _: std.fmt.FormatOptions, writer: anytype) !void {
+        if (fmt.len != 0) { // must be {} or {any}
+            return std.invalidFmtError(fmt, self);
+        }
+
+        try writer.print("{{ kind: {s}", .{@tagName(self.kind)});
+
+        if (self.value) |val| {
+            switch (val) {
+                .Literal => |literal| try writer.print(", value: {c}", .{literal}),
+                .Class => |rl| try writer.print(", value: {}", .{rl}),
+                else => {},
+            }
+        }
+
+        try writer.print(" }}", .{});
+    }
+};
+
+pub const Lexer = struct {
+    start: usize,
+    cursor: usize,
+    regexp: []const u8,
+    allocator: *std.mem.Allocator,
+
+    pub const Error = error{ EndOfBuffer, InvalidCharacter, UnexpectedCharater };
+
+    pub fn init(regexp: []const u8, allocator: *std.mem.Allocator) Lexer {
+        return .{ .cursor = 0, .start = 0, .regexp = regexp, .allocator = allocator };
+    }
+
+    fn readChar(self: *Lexer) !u8 {
+        if (self.cursor >= self.regexp.len) {
+            return Error.EndOfBuffer;
+        }
+
+        return self.regexp[self.cursor];
+    }
+
+    pub fn advance(self: *Lexer) !Token {
+        self.start = self.cursor;
+
+        var c = try self.readChar();
+
+        const inferred_type: Token.Kind = switch (c) {
+            '(' => .LParen,
+            ')' => .RParen,
+            '[' => .Class,
+            '.' => .Dot,
+            '*' => .Star,
+            '+' => .Plus,
+            '?' => .Question,
+            '|' => .Or,
+            else => .Literal,
+        };
+
+        switch (inferred_type) {
+            .Literal => {
+                if (c == '\\') {
+                    self.cursor += 1;
+                    c = try self.readChar(); // needs more verbose error handling
+                    c = escapedChar(c);
+                }
+                self.cursor += 1;
+                return .{ .kind = inferred_type, .value = .{ .Literal = c }, .pos = self.start };
+            },
+            .Class => {
+                while (c != ']') { // needs more verbose error handling
+                    if (self.cursor >= self.regexp.len) {
+                        return Error.EndOfBuffer;
+                    }
+                    self.cursor += 1;
+                }
+                return .{ .kind = inferred_type, .value = .{ .Class = try RangeList.init(self.regexp[self.start + 1 .. self.cursor - 1], self.allocator) }, .pos = self.start };
+            },
+            else => {
+                self.cursor += 1;
+                return .{ .kind = inferred_type, .pos = self.start, .value = null };
+            },
+        }
+    }
+
+    fn escapedChar(c: u8) u8 {
+        return switch (c) {
+            'n' => '\n', // placeholder for now
+            't' => '\t',
+            'r' => '\r',
+            else => c,
+        };
+    }
+};
+
+// could use something like a bst, but thinking about what regex classes people actually write makes me think that a list is more efficient
+pub const RangeList = struct {
+    negated: bool,
+    head: ?*Node,
+    allocator: *std.mem.Allocator,
+
+    const Node = struct {
+        range: struct { min: u8, max: u8 },
+        next: ?*Node,
+    };
+
+    // need to do more error handling
+    pub fn init(str: []const u8, allocator: *std.mem.Allocator) !RangeList {
+        var rl = RangeList{
+            .negated = false,
+            .head = null,
+            .allocator = allocator,
+        };
+
+        var i: usize = 0;
+
+        if (str.len == 0) {
+            return rl;
+        }
+
+        if (str[0] == '^') {
+            rl.negated = true;
+            i += 1;
+        }
+
+        var p = rl.head;
+        while (i < str.len) : (i += 1) {
+            const node = try rl.allocator.create(Node);
+
+            const min = str[i];
+            var max = min;
+
+            i += 1;
+
+            if (i + 1 < str.len and str[i] == '-') {
+                i += 1;
+                max = str[i];
+                i += 1;
+            }
+
+            node.* = .{
+                .range = .{ .min = min, .max = max },
+                .next = null,
+            };
+
+            if (p) |tail| {
+                tail.next = node;
+                p = tail.next;
+            } else {
+                rl.head = node;
+                p = rl.head;
+            }
+        }
+        return rl;
+    }
+
+    pub fn deinit(self: *const RangeList) void {
+        var itr = self.head;
+
+        while (itr) |node| {
+            const prev = node;
+            itr = node.next;
+            self.allocator.destroy(prev);
+        }
+    }
+
+    pub fn format(self: *const RangeList, comptime fmt: []const u8, _: std.fmt.FormatOptions, writer: anytype) !void {
+        var itr = self.head;
+
+        if (fmt.len != 0) { // must be {} or {any}
+            return std.invalidFmtError(fmt, self);
+        }
+
+        if (self.negated) {
+            try writer.print("^", .{});
+        }
+
+        try writer.print("[", .{});
+
+        while (itr) |node| : (itr = node.next) {
+            if (node.range.min == node.range.max) {
+                try writer.print("{c}", .{node.range.min});
+            } else {
+                try writer.print("{c}-{c}", .{ node.range.min, node.range.max });
+            }
+
+            if (node.next) |_| {
+                try writer.print(", ", .{});
+            }
+        }
+        try writer.print("]", .{});
+    }
+};