Unknown state [2025-08-04]

2025-08-04 22:28:49 -04:00
parent 6e4b76a6d9
commit 3b252de1ca
27 changed files with 24509 additions and 0 deletions
--- a/md/.envrc
+++ b/md/.envrc
@@ -0,0 +1 @@
+PATH_add zig-out/bin
--- a/md/.tool-versions
+++ b/md/.tool-versions
@@ -0,0 +1 @@
+zig 0.15.0-dev.905+edf785db0
--- a/md/CommonMark
+++ b/md/CommonMark
--- a/md/build.zig
+++ b/md/build.zig
@@ -0,0 +1,52 @@
+const std = @import("std");
+
+/// Declares the build graph: the reusable `md` module, the `md` executable
+/// that imports it, a `run` step, and a `test` step covering both modules.
+pub fn build(b: *std.Build) void {
+    const target = b.standardTargetOptions(.{});
+    const optimize = b.standardOptimizeOption(.{});
+
+    // Library module, registered under the name "md".
+    const lib_module = b.addModule("md", .{
+        .root_source_file = b.path("src/root.zig"),
+        .target = target,
+    });
+
+    // Executable module; it reaches the library through `@import("md")`.
+    const exe_module = b.createModule(.{
+        .root_source_file = b.path("src/main.zig"),
+        .target = target,
+        .optimize = optimize,
+        .imports = &.{
+            .{ .name = "md", .module = lib_module },
+        },
+    });
+    const exe = b.addExecutable(.{ .name = "md", .root_module = exe_module });
+    b.installArtifact(exe);
+
+    // `zig build run -- <args>`: run the executable after the install step,
+    // forwarding any extra command-line arguments.
+    const run_exe = b.addRunArtifact(exe);
+    run_exe.step.dependOn(b.getInstallStep());
+    if (b.args) |forwarded| run_exe.addArgs(forwarded);
+    b.step("run", "Run the app").dependOn(&run_exe.step);
+
+    // `zig build test`: unit tests of the library and the executable modules.
+    const lib_tests = b.addTest(.{ .root_module = lib_module });
+    const exe_tests = b.addTest(.{ .root_module = exe_module });
+    const test_step = b.step("test", "Run tests");
+    test_step.dependOn(&b.addRunArtifact(lib_tests).step);
+    test_step.dependOn(&b.addRunArtifact(exe_tests).step);
+}
--- a/md/build.zig.zon
+++ b/md/build.zig.zon
@@ -0,0 +1,81 @@
+.{
+    // This is the default name used by packages depending on this one. For
+    // example, when a user runs `zig fetch --save <url>`, this field is used
+    // as the key in the `dependencies` table. Although the user can choose a
+    // different name, most users will stick with this provided value.
+    //
+    // It is redundant to include "zig" in this name because it is already
+    // within the Zig package namespace.
+    .name = .md,
+    // This is a [Semantic Version](https://semver.org/).
+    // In a future version of Zig it will be used for package deduplication.
+    .version = "0.0.0",
+    // Together with name, this represents a globally unique package
+    // identifier. This field is generated by the Zig toolchain when the
+    // package is first created, and then *never changes*. This allows
+    // unambiguous detection of one package being an updated version of
+    // another.
+    //
+    // When forking a Zig project, this id should be regenerated (delete the
+    // field and run `zig build`) if the upstream project is still maintained.
+    // Otherwise, the fork is *hostile*, attempting to take control over the
+    // original project's identity. Thus it is recommended to leave the comment
+    // on the following line intact, so that it shows up in code reviews that
+    // modify the field.
+    .fingerprint = 0xdb55a2544fdebaae, // Changing this has security and trust implications.
+    // Tracks the earliest Zig version that the package considers to be a
+    // supported use case.
+    .minimum_zig_version = "0.15.0-dev.905+edf785db0",
+    // This field is optional.
+    // Each dependency must either provide a `url` and `hash`, or a `path`.
+    // `zig build --fetch` can be used to fetch all dependencies of a package, recursively.
+    // Once all dependencies are fetched, `zig build` no longer requires
+    // internet connectivity.
+    .dependencies = .{
+        // See `zig fetch --save <url>` for a command-line interface for adding dependencies.
+        //.example = .{
+        //    // When updating this field to a new URL, be sure to delete the corresponding
+        //    // `hash`, otherwise you are communicating that you expect to find the old hash at
+        //    // the new URL. If the contents of a URL change this will result in a hash mismatch
+        //    // which will prevent zig from using it.
+        //    .url = "https://example.com/foo.tar.gz",
+        //
+        //    // This is computed from the file contents of the directory of files that is
+        //    // obtained after fetching `url` and applying the inclusion rules given by
+        //    // `paths`.
+        //    //
+        //    // This field is the source of truth; packages do not come from a `url`; they
+        //    // come from a `hash`. `url` is just one of many possible mirrors for how to
+        //    // obtain a package matching this `hash`.
+        //    //
+        //    // Uses the [multihash](https://multiformats.io/multihash/) format.
+        //    .hash = "...",
+        //
+        //    // When this is provided, the package is found in a directory relative to the
+        //    // build root. In this case the package's hash is irrelevant and therefore not
+        //    // computed. This field and `url` are mutually exclusive.
+        //    .path = "foo",
+        //
+        //    // When this is set to `true`, a package is declared to be lazily
+        //    // fetched. This makes the dependency only get fetched if it is
+        //    // actually used.
+        //    .lazy = false,
+        //},
+    },
+    // Specifies the set of files and directories that are included in this package.
+    // Only files and directories listed here are included in the `hash` that
+    // is computed for this package. Only files listed here will remain on disk
+    // when using the zig package manager. As a rule of thumb, one should list
+    // files required for compilation plus any license(s).
+    // Paths are relative to the build root. Use the empty string (`""`) to refer to
+    // the build root itself.
+    // A directory listed here means that all files within, recursively, are included.
+    .paths = .{
+        "build.zig",
+        "build.zig.zon",
+        "src",
+        // For example...
+        //"LICENSE",
+        //"README.md",
+    },
+}
--- a/md/python.md
+++ b/md/python.md
@@ -0,0 +1,194 @@
+Title: Python from Scratch
+Date: 2025-02-01
+Summary: Building up the Python Data Model from scratch.
+
+ * ** * ** * ** * ** * ** * ** * ** * ** * ** * **
+
+# Learning to Read
+
+---
+
+**First:** A Python program is made up of _tokens_; you can think of these as
+"words". Some examples of tokens:
+
+- `"hello world"`
+- `6`
+- `(`
+- `while`
+- `print`
+
+Generally there are four types of token in Python, although in practice the
+lines between them get blurred a little bit.
+
+- _Literals_ literally represent some value. `"hello world"` and `6` and `4.2`
+  are examples of such literals; the first represents some text and the others
+  represent numbers. This is _literal_ as opposed to some indirect
+  representation like `4 + 2` or `"hello" + " " + "world"`.
+
+- _Operators_ include things like math operators `+`, `-`, `*`, but also things
+  like the function call operator `( )`, boolean operators `and`, and myriad
+  other operators. [There's a comprehensive list here][expressions] but beware -
+  there's a lot and some of them are pretty technical. The main point is that
+  `( )` and `+` are the same _kind of thing_ as far as the Python interpreter is
+  concerned.
+
+- _Keywords_ are special directives that tell Python how to behave. This
+  includes things like `if` and `def` and `while`. Technically, operators are
+  also keywords (for example `and` is a keyword) but that's not super relevant
+  here.
+
+- ___Names___ are the last - and most important - kind of token. `print` is a
+  name. Variable names are names. Function names are names. Class names are
+  names. Module names are names. In all cases, a name represents some _thing_,
+  and Python can fetch that thing if given its name.
+
+[expressions]: https://docs.python.org/3/reference/expressions.html
+
+So if I give Python this code:
+
+```py
+x = "world"
+print("hello " + x)
+```
+
+You should first identify the tokens:
+
+- _Name_ `x`
+- _Operator_ `=`
+- _Literal_ `"world"`
+- _Name_ `print`
+- _Operator_ `( )`
+- _Literal_ `"hello "`
+- _Operator_ `+`
+- _Name_ `x`
+
+The first line of code binds `"world"` to the name `x`.
+
+The expression `"hello " + x` looks up the value named by `x` and concatenates
+it with the literal value `"hello "`. This produces the string `"hello world"`.
+
+The expression `print( ... )` looks up the value - the function - named by
+`print` and uses the `( )` operator to call it with the string `"hello world"`.
+
+To be crystal clear: `x` and `print` _are the same kind of token_, it's just
+that their named values have different types. One is a string, the other a
+function. The string can be _operated on_ with the `+` operator, and the
+function can be _operated on_ with the `( )` operator.
+
+It is valid to write `print(print)`; here we are looking up the name `print`,
+and passing that value to the function named by `print`. This should be no more
+or less surprising than being able to write `x + x` or `5 * 4`.
+
+# Namespaces
+
+**First-and-a-half:** A _namespace_ is a collection of names.
+
+You might also hear this called a "scope". This is the reason I say "maybe three
+or four, depending how you count"; this is really part of that fundamental idea
+of a _name_, but I'll list it separately to be extra clear.
+
+There are some special structures in Python that introduce new namespaces. Each
+_module_ has a "global" namespace; these are names that can be referenced
+anywhere in a given file or script. Each _function_ has a "local" namespace;
+these are names that can only be accessed within the function.
+
+For example:
+
+```py
+x = "eggs"
+
+
+def spam():
+    y = "ham"
+
+    # I can print(x) here.
+
+# But I cannot print(y) here.
+```
+
+Objects also have namespaces. Names on objects are called "attributes", and they
+may be simple values or functions, just how regular names might be simple
+values (`x`, `y`) or functions (`print`, `spam`). You access attributes with the
+`.` operator.
+
+```py
+obj = range(10)
+print(
+    obj.stop)  # find the value named by `obj`, then find the value named by `stop`. 10.
+```
+
+Finally, there is the built-in namespace. These are names that are accessible
+always, from anywhere, by default. Names like `print` and `range` are defined
+here. [Here's a comprehensive list of built-in names](https://docs.python.org/3/library/functions.html).
+
+# Strings
+
+**Second:** you asked about characters and letters, so you may appreciate some
+background on strings.
+
+A _string_ is a sequence of characters. A _character_ is simply a number to
+which we, by convention, assign some meaning. For example, by convention, we've
+all agreed that the number `74` means `J`. This convention is called an
+_encoding_. The default encoding is called UTF-8 and is specified by a committee
+called the _Unicode Consortium_. This encoding includes characters from many
+current and ancient languages, various symbols and typographical marks, emojis,
+flags, etc. The important thing to remember is each one of these things, really,
+is just an integer. And all our devices just agree that when they see a given
+integer they will look up the appropriate symbol in an appropriate font.
+
+You can switch between the string representation and the numerical
+representation with the `encode` and `decode` methods on strings. Really, these
+are the same, you're just telling Python to tell your console to draw them
+differently.
+
+```py
+>>> list('Fizz'.encode())
+[70, 105, 122, 122]
+>>> bytes([66, 117, 122, 122]).decode()
+'Buzz'
+```
+
+For continuity: `list`, `encode`, `decode`, and `bytes` are all names. `( )`,
+`[ ]`, `,`, and `.` are all operators. The numbers and `'Fizz'` are literals.
+
+† Technically, `[66, 117, 122, 122]` in its entirety is a literal - `,` is a
+keyword, not an operator - but that's neither here nor there for these purposes.
+
+‡ The symbol `†` is number 8224 and the symbol `‡` is number 8225.
+
+# Names
+
+**Second-and-a-half:** names are strings.
+
+Names are just strings, and namespaces are just `dict`. You can access them with
+`locals()` and `globals()`, although in practice you almost never need to do
+this directly. It's better to just use the name itself.
+
+```py
+import pprint
+
+x = range(10)
+function = print
+pprint.pprint(globals())
+```
+
+This outputs:
+
+```
+{'__annotations__': {},
+ '__builtins__': <module 'builtins' (built-in)>,
+ '__cached__': None,
+ '__doc__': None,
+ '__file__': '<stdin>',
+ '__loader__': <class '_frozen_importlib.BuiltinImporter'>,
+ '__name__': '__main__',
+ '__package__': None,
+ '__spec__': None,
+ 'function': <built-in function print>,
+ 'pprint': <module 'pprint' from 'python3.12/pprint.py'>,
+ 'x': range(0, 10)}
+```
+
+For continuity: `import pprint` binds the name `pprint` to the module
+`pprint.py` from the standard library. The line `pprint.pprint( ... )` fetches
+the function `pprint` from that module, and calls it.
--- a/md/spec-0.31.2.md
+++ b/md/spec-0.31.2.md
--- a/md/src/main.zig
+++ b/md/src/main.zig
@@ -0,0 +1,156 @@
+//! Markdown Parser.
+//!
+//! Leaf blocks
+//!     Thematic Breaks
+//!     ATX headings
+//!     Setext headings
+//!     Indented code blocks
+//!     Fenced code blocks
+//!     HTML blocks
+//!     Link reference definitions
+//!     Paragraphs
+//!     Blank lines
+//!
+//! Container blocks
+//!     Block Quotes
+//!     List Items
+//!     Lists
+//!
+//! Inlines
+//!     Code spans
+//!     emph, strong
+//!     Links
+//!     Images
+//!     Autolinks
+//!     Raw HTML
+//!     Hard line breaks
+//!     Soft line breaks
+//!     Text
+
+const BlockType = enum { // kind tag stored in `Block.type`
+    // Leaf Blocks (only Break and Blank are produced by `render` so far)
+    Break, // appended by `render` for thematic-break lines
+    Heading,
+    Code,
+    HTML,
+    LinkDefinition,
+    Paragraph,
+    Blank, // appended by `render` for whitespace-only lines
+
+    // Container Blocks (none are produced by `render` yet)
+    Quote,
+    List,
+    // ListItem, (disabled variant, kept as a note)
+};
+
+const Block = struct { // one classified source line, as collected by `render`
+    type: BlockType, // which kind of block the line was recognized as
+    text: []const u8, // `render` stores the whole source line here
+    depth: u8 = 0, // defaults to 0; `render` never sets it
+};
+
+const std = @import("std");
+const md = @import("md");
+
+/// Classifies each line of `src` and prints the resulting blocks to stderr.
+///
+/// Only blank lines and thematic breaks are recognized so far; every other
+/// line is skipped. `alloc` backs a temporary block list that is freed before
+/// returning. Fails with `error.OutOfMemory` if that list cannot grow.
+pub fn render(src: []const u8, alloc: std.mem.Allocator) !void {
+    var it = std.mem.splitScalar(u8, src, '\n');
+
+    var blocks = std.ArrayList(Block).init(alloc);
+    defer blocks.deinit();
+
+    line_search: while (it.next()) |line| {
+        // early check for blank line
+        for (line) |ch| {
+            if (!std.ascii.isWhitespace(ch)) break;
+        } else {
+            try blocks.append(.{ .type = .Blank, .text = line });
+            continue :line_search;
+        }
+
+        // figure out indentation
+        //
+        // These are `usize`, not `u8`: line content is caller-controlled, and
+        // a `u8` counter overflows (a panic in safe builds) once a line has
+        // more than 255 columns of leading whitespace.
+        var depth: usize = 0;
+        var start: usize = 0;
+
+        for (line) |ch| {
+            switch (ch) {
+                ' ' => depth += 1,
+                '\t' => depth += 4,
+                else => break,
+            }
+            start += 1;
+        }
+        // at this point, start will not be line.len because we checked
+        // it is not blank
+
+        if (depth >= 4) {
+            // todo something special has to happen. lists and code blocks.
+        }
+
+        // test thematic break
+        switch (line[start]) {
+            '*', '-', '_' => |kind| {
+                // `usize` for the same reason as above: a rule made of more
+                // than 255 marker characters must not overflow the counter.
+                var count: usize = 0;
+                for (line[start..]) |ch| {
+                    if (ch == kind) {
+                        count += 1;
+                        continue;
+                    }
+
+                    if (std.ascii.isWhitespace(ch)) continue;
+
+                    // any other character means this is not a thematic break
+                    break;
+                } else {
+                    // reached only when the loop above never hit `break`
+                    if (count >= 3) {
+                        try blocks.append(.{ .type = .Break, .text = line });
+                        continue :line_search;
+                    }
+                }
+            },
+            else => {},
+        }
+    }
+
+    for (blocks.items) |block| {
+        std.debug.print("{any}\n", .{block});
+    }
+
+    // TODO split into paragraphs, list elements
+    //
+
+    // const State = enum {
+    //     Block,
+    // };
+
+    // mach: switch (State.Block) {
+    //     .Block => {
+    //         continue :mach .Block;
+    //     },
+    // }
+
+    // while (it.next()) |line| {}
+}
+
+/// Entry point: runs `render` on every file named on the command line.
+///
+/// Each file is read into one shared 1 MiB buffer; a file larger than that
+/// aborts the run with `error.FileTooBig`.
+pub fn main() !void {
+    var gpa_state = std.heap.GeneralPurposeAllocator(.{}){};
+    defer _ = gpa_state.deinit();
+    const alloc = gpa_state.allocator();
+
+    const argv = try std.process.argsAlloc(alloc);
+    defer std.process.argsFree(alloc, argv);
+
+    // One buffer, reused for every input file.
+    const scratch = try alloc.alloc(u8, 1 << 20);
+    defer alloc.free(scratch);
+
+    const cwd = std.fs.cwd();
+    for (argv[1..]) |path| {
+        const file = try cwd.openFile(path, .{ .mode = .read_only });
+        defer file.close();
+
+        // Reject oversized files up front instead of silently truncating.
+        const size = (try file.stat()).size;
+        if (size > scratch.len) return error.FileTooBig;
+
+        const len = try file.readAll(scratch);
+        try render(scratch[0..len], alloc);
+    }
+}
--- a/md/src/root.zig
+++ b/md/src/root.zig
@@ -0,0 +1,54 @@
+const std = @import("std");
+
+/// Splits `src` on '\n' and returns the lines in order.
+///
+/// The caller owns the returned outer slice and must free it with `alloc`.
+/// The line slices themselves point into `src` and are not copied. A trailing
+/// newline yields a final empty line. Fails with `error.OutOfMemory` when the
+/// list cannot be allocated.
+pub fn parse(src: []const u8, alloc: std.mem.Allocator) ![][]const u8 {
+    var it = std.mem.splitScalar(u8, src, '\n');
+
+    var lines = std.ArrayList([]const u8).init(alloc);
+    // Release the partially built list if an append (or the final
+    // `toOwnedSlice`) fails; without this the error path leaks.
+    errdefer lines.deinit();
+
+    while (it.next()) |line| {
+        try lines.append(line);
+    }
+
+    return try lines.toOwnedSlice();
+}
+
+// pub fn render(src: []const u8, alloc: std.mem.Allocator) ![]const u8 {
+//     // std.mem.tokenizeScalar(comptime T: type, buffer: []const T, delimiter: T)
+// }
+
+test "split-lines" {
+    const gpa = std.testing.allocator;
+
+    // Two content lines, each followed by an empty one; the trailing newline
+    // is what produces the final empty entry.
+    const input = "hello world!\n\nthis is some content!\n";
+    const want: []const []const u8 = &.{
+        "hello world!",
+        "",
+        "this is some content!",
+        "",
+    };
+
+    const got = try parse(input, gpa);
+    defer gpa.free(got);
+
+    try std.testing.expectEqualDeep(want, got);
+}
+
+test "2.2 Tabs" {
+    // TODO: enable once `render` exists; this test currently asserts nothing.
+    // const alloc = std.testing.allocator;
+    // const md = "\tfoo\tbaz\t\tbim";
+
+    // const expect = "<pre><code>foo\tbaz\t\tbim</code></pre>";
+    // const actual = try render(md, alloc);
+    // defer alloc.free(actual);
+    // try std.testing.expectEqualStrings(expect, actual);
+}
--- a/md/zig-out/bin/md
+++ b/md/zig-out/bin/md