Unknown state [2025-08-04]

This commit is contained in:
2025-08-04 22:28:49 -04:00
parent 6e4b76a6d9
commit 3b252de1ca
27 changed files with 24509 additions and 0 deletions

1
md/.envrc Normal file
View File

@@ -0,0 +1 @@
PATH_add zig-out/bin

1
md/.tool-versions Normal file
View File

@@ -0,0 +1 @@
zig 0.15.0-dev.905+edf785db0

13601
md/CommonMark Spec.html Normal file

File diff suppressed because it is too large Load Diff

52
md/build.zig Normal file
View File

@@ -0,0 +1,52 @@
const std = @import("std");
/// Declares the build graph for the `md` package.
///
/// Produces:
///   * a library module named "md" rooted at `src/root.zig`,
///   * an installed executable named "md" rooted at `src/main.zig` that
///     imports the library module,
///   * a `run` step that runs the installed executable, forwarding any
///     arguments given after `--`,
///   * a `test` step that runs the tests of both modules.
pub fn build(b: *std.Build) void {
    const target = b.standardTargetOptions(.{});
    const optimize = b.standardOptimizeOption(.{});

    // Library module, exposed to dependents under the name "md".
    const lib_module = b.addModule("md", .{
        .root_source_file = b.path("src/root.zig"),
        .target = target,
    });

    // Root module of the command-line tool; it pulls in the library as "md".
    const exe_module = b.createModule(.{
        .root_source_file = b.path("src/main.zig"),
        .target = target,
        .optimize = optimize,
        .imports = &.{
            .{ .name = "md", .module = lib_module },
        },
    });
    const exe = b.addExecutable(.{
        .name = "md",
        .root_module = exe_module,
    });
    b.installArtifact(exe);

    // `zig build run [-- args...]`: run from the install location.
    const run_exe = b.addRunArtifact(exe);
    run_exe.step.dependOn(b.getInstallStep());
    if (b.args) |extra_args| run_exe.addArgs(extra_args);
    b.step("run", "Run the app").dependOn(&run_exe.step);

    // `zig build test`: library tests first, then executable tests.
    const lib_tests = b.addTest(.{ .root_module = lib_module });
    const run_lib_tests = b.addRunArtifact(lib_tests);
    const exe_tests = b.addTest(.{ .root_module = exe.root_module });
    const run_exe_tests = b.addRunArtifact(exe_tests);

    const test_step = b.step("test", "Run tests");
    test_step.dependOn(&run_lib_tests.step);
    test_step.dependOn(&run_exe_tests.step);
}

81
md/build.zig.zon Normal file
View File

@@ -0,0 +1,81 @@
.{
// This is the default name used by packages depending on this one. For
// example, when a user runs `zig fetch --save <url>`, this field is used
// as the key in the `dependencies` table. Although the user can choose a
// different name, most users will stick with this provided value.
//
// It is redundant to include "zig" in this name because it is already
// within the Zig package namespace.
.name = .md,
// This is a [Semantic Version](https://semver.org/).
// In a future version of Zig it will be used for package deduplication.
.version = "0.0.0",
// Together with name, this represents a globally unique package
// identifier. This field is generated by the Zig toolchain when the
// package is first created, and then *never changes*. This allows
// unambiguous detection of one package being an updated version of
// another.
//
// When forking a Zig project, this id should be regenerated (delete the
// field and run `zig build`) if the upstream project is still maintained.
// Otherwise, the fork is *hostile*, attempting to take control over the
// original project's identity. Thus it is recommended to leave the comment
// on the following line intact, so that it shows up in code reviews that
// modify the field.
.fingerprint = 0xdb55a2544fdebaae, // Changing this has security and trust implications.
// Tracks the earliest Zig version that the package considers to be a
// supported use case.
.minimum_zig_version = "0.15.0-dev.905+edf785db0",
// This field is optional.
// Each dependency must either provide a `url` and `hash`, or a `path`.
// `zig build --fetch` can be used to fetch all dependencies of a package, recursively.
// Once all dependencies are fetched, `zig build` no longer requires
// internet connectivity.
.dependencies = .{
// See `zig fetch --save <url>` for a command-line interface for adding dependencies.
//.example = .{
// // When updating this field to a new URL, be sure to delete the corresponding
// // `hash`, otherwise you are communicating that you expect to find the old hash at
// // the new URL. If the contents of a URL change this will result in a hash mismatch
// // which will prevent zig from using it.
// .url = "https://example.com/foo.tar.gz",
//
// // This is computed from the file contents of the directory of files that is
// // obtained after fetching `url` and applying the inclusion rules given by
// // `paths`.
// //
// // This field is the source of truth; packages do not come from a `url`; they
// // come from a `hash`. `url` is just one of many possible mirrors for how to
// // obtain a package matching this `hash`.
// //
// // Uses the [multihash](https://multiformats.io/multihash/) format.
// .hash = "...",
//
// // When this is provided, the package is found in a directory relative to the
// // build root. In this case the package's hash is irrelevant and therefore not
// // computed. This field and `url` are mutually exclusive.
// .path = "foo",
//
// // When this is set to `true`, a package is declared to be lazily
// // fetched. This makes the dependency only get fetched if it is
// // actually used.
// .lazy = false,
//},
},
// Specifies the set of files and directories that are included in this package.
// Only files and directories listed here are included in the `hash` that
// is computed for this package. Only files listed here will remain on disk
// when using the zig package manager. As a rule of thumb, one should list
// files required for compilation plus any license(s).
// Paths are relative to the build root. Use the empty string (`""`) to refer to
// the build root itself.
// A directory listed here means that all files within, recursively, are included.
.paths = .{
"build.zig",
"build.zig.zon",
"src",
// For example...
//"LICENSE",
//"README.md",
},
}

194
md/python.md Normal file
View File

@@ -0,0 +1,194 @@
Title: Python from Scratch
Date: 2025-02-01
Summary: Building up the Python Data Model from scratch.
* ** * ** * ** * ** * ** * ** * ** * ** * ** * **
# Learning to Read
---
**First:** A Python program is made up of _tokens_; you can think of these as
"words". Some examples of tokens:
- `"hello world"`
- `6`
- `(`
- `while`
- `print`
Generally there are four types of token in Python, although in practice the
lines between them get blurred a little bit.
- _Literals_ literally represent some value. `"hello world"` and `6` and `4.2`
are examples of such literals; the first represents some text and the others
represent numbers. This is _literal_ as opposed to some indirect
representation like `4 + 2` or `"hello" + " " + "world"`.
- _Operators_ include things like math operators `+`, `-`, `*`, but also things
like the function call operator `( )`, boolean operators `and`, and myriad
other operators. [There's a comprehensive list here][expressions] but beware -
there's a lot and some of them are pretty technical. The main point is that
`( )` and `+` are the same _kind of thing_ as far as the Python interpreter is
concerned.
- _Keywords_ are special directives that tell Python how to behave. This
includes things like `if` and `def` and `while`. Technically, operators are
also keywords (for example `and` is a keyword) but that's not super relevant
here.
- ___Names___ are the last - and most important - kind of token. `print` is a
name. Variable names are names. Function names are names. Class names are
names. Module names are names. In all cases, a name represents some _thing_,
and Python can fetch that thing if given its name.
[expressions]: https://docs.python.org/3/reference/expressions.html
So if I give Python this code:
```py
x = "world"
print("hello " + x)
```
You should first identify the tokens:
- _Name_ `x`
- _Operator_ `=`
- _Literal_ `"world"`
- _Name_ `print`
- _Operator_ `( )`
- _Literal_ `"hello "`
- _Operator_ `+`
- _Name_ `x`
The first line of code binds `"world"` to the name `x`.
The expression `"hello " + x` looks up the value named by `x` and concatenates
it with the literal value `"hello "`. This produces the string `"hello world"`.
The expression `print( ... )` looks up the value - the function - named by
`print` and uses the `( )` operator to call it with the string `"hello world"`.
To be crystal clear: `x` and `print` _are the same kind of token_, it's just
that their named values have different types. One is a string, the other a
function. The string can be _operated on_ with the `+` operator, and the
function can be _operated on_ with the `( )` operator.
It is valid to write `print(print)`; here we are looking up the name `print`,
and passing that value to the function named by `print`. This should be no more
or less surprising than being able to write `x + x` or `5 * 4`.
# Namespaces
**First-and-a-half:** A _namespace_ is a collection of names.
You might also hear this called a "scope". This is the reason I say "maybe three
or four, depending how you count"; this is really part of that fundamental idea
of a _name_, but I'll list it separately to be extra clear.
There are some special structures in Python that introduce new namespaces. Each
_module_ has a "global" namespace; these are names that can be referenced
anywhere in a given file or script. Each _function_ has a "local" namespace;
these are names that can only be accessed within the function.
For example:
```py
x = "eggs"
def spam():
y = "ham"
# I can print(x) here.
# But I cannot print(y) here.
```
Objects also have namespaces. Names on objects are called "attributes", and they
may be simple values or functions, just how regular names might be simple
values (`x`, `y`) or functions (`print`, `spam`). You access attributes with the
`.` operator.
```py
obj = range(10)
print(
obj.stop) # find the value named by `obj`, then find the value named by `stop`. 10.
```
Finally, there is the built-in namespace. These are names that are accessible
always, from anywhere, by default. Names like `print` and `range` are defined
here. [Here's a comprehensive list of built-in names](https://docs.python.org/3/library/functions.html).
# Strings
**Second:** you asked about characters and letters, so you may appreciate some
background on strings.
A _string_ is a sequence of characters. A _character_ is simply a number to
which we, by convention, assign some meaning. For example, by convention, we've
all agreed that the number `74` means `J`. This convention is called an
_encoding_. The default encoding is called UTF-8 and is specified by a committee
called the _Unicode Consortium_. This encoding includes characters from many
current and ancient languages, various symbols and typographical marks, emojis,
flags, etc. The important thing to remember is each one of these things, really,
is just an integer. And all our devices just agree that when they see a given
integer they will look up the appropriate symbol in an appropriate font.
You can switch between the string representation and the numerical
representation with the `encode` method on strings and the `decode` method on
bytes. Really, these are the same, you're just telling Python to tell your
console to draw them differently.
```py
>>> list('Fizz'.encode())
[70, 105, 122, 122]
>>> bytes([66, 117, 122, 122]).decode()
'Buzz'
```
For continuity: `list`, `encode`, `decode`, and `bytes` are all names. `( )`,
`[ ]`, `,`, and `.` are all operators. The numbers and `'Fizz'` are literals.
† Technically, `[66, 117, 122, 122]` in its entirety is a literal - `,` is a
keyword, not an operator - but that's neither here nor there for these purposes.
‡ The symbol `†` is number 8224 and the symbol `‡` is number 8225.
# Names
**Second-and-a-half:** names are strings.
Names are just strings, and namespaces are just `dict`. You can access them with
`locals()` and `globals()`, although in practice you almost never need to do
this directly. It's better to just use the name itself.
```py
import pprint
x = range(10)
function = print
pprint.pprint(globals())
```
This outputs:
```
{'__annotations__': {},
'__builtins__': <module 'builtins' (built-in)>,
'__cached__': None,
'__doc__': None,
'__file__': '<stdin>',
'__loader__': <class '_frozen_importlib.BuiltinImporter'>,
'__name__': '__main__',
'__package__': None,
'__spec__': None,
'function': <built-in function print>,
'pprint': <module 'pprint' from 'python3.12/pprint.py'>,
'x': range(0, 10)}
```
For continuity: `import pprint` binds the name `pprint` to the module
`pprint.py` from the standard library. The line `pprint.pprint( ... )` fetches
the function `pprint` from that module, and calls it.

9756
md/spec-0.31.2.md Normal file

File diff suppressed because it is too large Load Diff

156
md/src/main.zig Normal file
View File

@@ -0,0 +1,156 @@
//! Markdown Parser.
//!
//! Leaf blocks
//! Thematic Breaks
//! ATX headings
//! Setext headings
//! Indented code blocks
//! Fenced code blocks
//! HTML blocks
//! Link reference definitions
//! Paragraphs
//! Blank lines
//!
//! Container blocks
//! Block Quotes
//! List Items
//! Lists
//!
//! Inlines
//! Code spans
//! emph, strong
//! Links
//! Images
//! Autolinks
//! Raw HTML
//! Hard line breaks
//! Soft line breaks
//! Text
/// Kind of block a piece of the input is classified as.
/// The variants mirror the leaf/container block categories listed in the
/// module doc comment. Only `Blank` and `Break` are produced by `render` so far.
const BlockType = enum {
// Leaf Blocks
/// Thematic break: `render` emits this for a line made only of three or more
/// identical `*`, `-` or `_` characters plus whitespace.
Break,
/// Heading. NOTE(review): presumably covers both ATX and Setext headings — confirm.
Heading,
/// Code block. NOTE(review): presumably covers both indented and fenced code — confirm.
Code,
/// HTML block.
HTML,
/// Link reference definition.
LinkDefinition,
/// Paragraph.
Paragraph,
/// Blank line: `render` emits this for a line containing only ASCII whitespace.
Blank,
// Container Blocks
/// Block quote.
Quote,
/// List.
List,
// ListItem,
};
/// One classified block of the input document.
const Block = struct {
/// Which kind of block this is.
type: BlockType,
/// Source text of the block; `render` stores the whole input line here, so
/// it is a view into the caller's buffer rather than a copy.
text: []const u8,
/// NOTE(review): presumably the indentation or nesting depth of the block.
/// No code in this file sets it yet, so it keeps the default 0 — confirm the
/// intended meaning before relying on it.
depth: u8 = 0,
};
const std = @import("std");
const md = @import("md");
/// Scans `src` line by line (split on '\n'), classifies the lines it
/// recognises as blocks, and dumps the collected blocks with
/// `std.debug.print`.
///
/// Only two kinds of line are recognised so far:
///   * `.Blank` — the line contains nothing but ASCII whitespace;
///   * `.Break` — after leading spaces/tabs, the line consists solely of three
///     or more identical `*`, `-` or `_` characters, optionally interleaved
///     with whitespace.
/// Every other line is currently skipped.
///
/// `alloc` backs the temporary block list, which is released before
/// returning. The only error returned is an allocation failure while growing
/// that list.
pub fn render(src: []const u8, alloc: std.mem.Allocator) !void {
    var it = std.mem.splitScalar(u8, src, '\n');
    var blocks = std.ArrayList(Block).init(alloc);
    defer blocks.deinit();

    line_search: while (it.next()) |line| {
        // Early check for a blank line: the `else` arm only runs when the
        // loop finished without meeting a non-whitespace byte.
        for (line) |ch| {
            if (!std.ascii.isWhitespace(ch)) break;
        } else {
            try blocks.append(.{ .type = .Blank, .text = line });
            continue :line_search;
        }

        // Figure out indentation. Both counters are `usize`: with narrower
        // integers a line with a few hundred leading spaces (or 64 tabs)
        // overflows and traps in safe builds.
        var depth: usize = 0; // indentation width in columns
        var start: usize = 0; // index of the first non-indentation byte
        for (line) |ch| {
            switch (ch) {
                ' ' => depth += 1,
                // A tab advances to the next multiple-of-4 column rather
                // than adding a flat 4 columns.
                '\t' => depth += 4 - (depth % 4),
                else => break,
            }
            start += 1;
        }

        // At this point `start < line.len`: the line is not blank, so it
        // contains a byte that is neither a space nor a tab.

        if (depth >= 4) {
            // todo something special has to happen. lists and code blocks.
        }

        // Test for a thematic break.
        switch (line[start]) {
            '*', '-', '_' => |kind| {
                // `usize` so that very long rules ("-----…") cannot overflow.
                var count: usize = 0;
                for (line[start..]) |ch| {
                    if (ch == kind) {
                        count += 1;
                        continue;
                    }
                    if (std.ascii.isWhitespace(ch)) continue;
                    // Any other byte disqualifies the line.
                    break;
                } else {
                    if (count >= 3) {
                        try blocks.append(.{ .type = .Break, .text = line });
                        continue :line_search;
                    }
                }
            },
            else => {},
        }
    }

    for (blocks.items) |block| {
        std.debug.print("{any}\n", .{block});
    }

    // TODO split into paragraphs, list elements
    //
    // const State = enum {
    //     Block,
    // };
    // mach: switch (State.Block) {
    //     .Block => {
    //         continue :mach .Block;
    //     },
    // }
    // while (it.next()) |line| {}
}
/// Program entry point: renders every file named on the command line.
///
/// Each argument after the program name is opened read-only relative to the
/// current working directory, read into a single reusable 1 MiB buffer, and
/// handed to `render`. A file larger than that buffer stops the program with
/// `error.FileTooBig`; allocation and I/O errors are propagated unchanged.
pub fn main() !void {
    var gpa_state = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = gpa_state.deinit();
    const alloc = gpa_state.allocator();

    const argv = try std.process.argsAlloc(alloc);
    defer std.process.argsFree(alloc, argv);

    // One scratch buffer, shared by all input files.
    const max_file_size = 1 << 20;
    const scratch = try alloc.alloc(u8, max_file_size);
    defer alloc.free(scratch);

    const cwd = std.fs.cwd();
    for (argv[1..]) |path| {
        const input = try cwd.openFile(path, .{ .mode = .read_only });
        // Runs at the end of each iteration, so at most one file is open.
        defer input.close();

        const info = try input.stat();
        if (info.size > scratch.len) return error.FileTooBig;

        const len = try input.readAll(scratch);
        try render(scratch[0..len], alloc);
    }
}

54
md/src/root.zig Normal file
View File

@@ -0,0 +1,54 @@
const std = @import("std");
/// Splits `src` on '\n' and returns the lines in order.
///
/// The returned lines are slices into `src` (nothing is copied), so `src`
/// must outlive the result. A trailing '\n' produces a final empty line, and
/// empty input produces a single empty line. '\r' is not stripped.
///
/// Caller owns the returned outer slice and must release it with
/// `alloc.free`. Fails with `error.OutOfMemory` if the line list cannot be
/// allocated; nothing is leaked in that case.
pub fn parse(src: []const u8, alloc: std.mem.Allocator) ![][]const u8 {
    var it = std.mem.splitScalar(u8, src, '\n');
    var lines = std.ArrayList([]const u8).init(alloc);
    // Release the partially built list if a later allocation fails. On
    // success, ownership moves to the caller through `toOwnedSlice`.
    errdefer lines.deinit();
    while (it.next()) |line| {
        try lines.append(line);
    }
    return try lines.toOwnedSlice();
}
// pub fn render(src: []const u8, alloc: std.mem.Allocator) ![]const u8 {
// // std.mem.tokenizeScalar(comptime T: type, buffer: []const T, delimiter: T)
// }
// Checks that `parse` splits on '\n' and keeps empty lines, including the
// empty line that follows a trailing newline.
test "split-lines" {
    const gpa = std.testing.allocator;

    const input = "hello world!\n\nthis is some content!\n";
    const want = [_][]const u8{
        "hello world!",
        "",
        "this is some content!",
        "",
    };

    const got = try parse(input, gpa);
    defer gpa.free(got);

    // Compare the count first so the pairwise loop below never sees
    // sequences of different lengths.
    try std.testing.expectEqual(want.len, got.len);
    for (want, got) |want_line, got_line| {
        try std.testing.expectEqualStrings(want_line, got_line);
    }
}
test "2.2 Tabs" {
// Placeholder: this test has no executable statements, so it always passes.
// The sketch below is meant to be enabled once this module has a `render`
// function that returns the generated HTML (the one in this file is still
// commented out). It checks that a leading tab opens a code block and that
// the remaining tabs are passed through unchanged.
//
// const alloc = std.testing.allocator;
//
// const md = "\tfoo\tbaz\t\tbim";
// const expect = "<pre><code>foo\tbaz\t\tbim</code></pre>";
// const actual = try render(md, alloc);
// defer alloc.free(actual);
// try std.testing.expectEqualStrings(expect, actual);
}

BIN
md/zig-out/bin/md Executable file

Binary file not shown.