From b49c64862ea34b1d235fc2ac653ed1d3d8ce8737 Mon Sep 17 00:00:00 2001 From: Robin Voetter Date: Fri, 25 Oct 2019 22:45:57 +0200 Subject: [PATCH] Dingen --- build.zig | 2 +- generator/main.zig | 5 + generator/utf8.zig | 109 ++++++++++++++ generator/xml.zig | 350 +++++++-------------------------------------- 4 files changed, 169 insertions(+), 297 deletions(-) create mode 100644 generator/utf8.zig diff --git a/build.zig b/build.zig index 3c0173a..00bdc62 100644 --- a/build.zig +++ b/build.zig @@ -6,7 +6,7 @@ pub fn build(b: *Builder) void { generator.setBuildMode(b.standardReleaseOptions()); var test_step = b.step("test", "Run all the tests"); - test_step.dependOn(&b.addTest("generator/xml.zig").step); + test_step.dependOn(&b.addTest("generator/main.zig").step); const run_cmd = generator.run(); diff --git a/generator/main.zig b/generator/main.zig index 55ca3ba..d47a6ae 100644 --- a/generator/main.zig +++ b/generator/main.zig @@ -3,3 +3,8 @@ const std = @import("std"); pub fn main() void { std.debug.warn("Test\n"); } + +test "main" { + _ = @import("xml.zig"); + _ = @import("utf8.zig"); +} diff --git a/generator/utf8.zig b/generator/utf8.zig new file mode 100644 index 0000000..387ab1f --- /dev/null +++ b/generator/utf8.zig @@ -0,0 +1,109 @@ +const std = @import("std"); +const unicode = std.unicode; +const testing = std.testing; + +fn Utf8Iterator(comptime ReadError: type) type { + return struct { + const Self = @This(); + pub const Stream = std.io.InStream(ReadError); + + in: *Stream, + + pub fn init(in: *Stream) Self { + return Self{ + .in = in + }; + } + + pub fn next(self: *Self) !?u32 { + var cp: [4]u8 = undefined; + if ((try self.in.readFull(cp[0 .. 1])) != 1) { + return null; + } + + const cp_len = try unicode.utf8ByteSequenceLength(cp[0]); + try self.in.readNoEof(cp[1 .. cp_len]); + return try unicode.utf8Decode(cp[0 .. cp_len]); + } + }; +} + +pub fn PeekUtf8Iterator(comptime buffer_size: usize, comptime ReadError: type) type { + return struct { + const Self = @This(); + pub const Stream = Utf8Iterator(ReadError).Stream; + + it: Utf8Iterator(ReadError), + buf: [buffer_size]u32, + head: usize, + size: usize, + + pub fn init(in: *Utf8Iterator(ReadError).Stream) Self { + return Self{ + .it = Utf8Iterator(ReadError).init(in), + .buf = undefined, + .head = 0, + .size = 0 + }; + } + + pub fn next(self: *Self) !?u32 { + if (self.size > 0) { + const cp = self.buf[self.head]; + self.head = (self.head + 1) % buffer_size; + self.size -= 1; + return cp; + } + + return try self.it.next(); + } + + pub fn peek(self: *Self, offset: usize) !?u32 { + std.debug.assert(offset < buffer_size); + while (self.size <= offset) { + const cp = (try self.it.next()) orelse return null; + self.buf[(self.head + self.size) % buffer_size] = cp; + self.size += 1; + } + + return self.buf[(self.head + offset) % buffer_size]; + } + + pub fn peekNoEof(self: *Self, offset: usize) !?u32 { + return (try self.peek(offset)) orelse return error.EndOfStream; + } + + pub fn discard(self: *Self, amount: usize) void { + std.debug.assert(amount < self.size); + + var i: usize = 0; + while (i < amount) : (i += 1) { + _ = self.next(); + } + } + }; +} + +test "PeekUtf8Iterator" { + var slice_in = std.io.SliceInStream.init("abcd"); + var it = PeekUtf8Iterator(4, std.io.SliceInStream.Error).init(&slice_in.stream); + testing.expect((try it.peek(0)).? == 'a'); + testing.expect((try it.peek(1)).? == 'b'); + testing.expect((try it.peek(2)).? == 'c'); + testing.expect((try it.peek(3)).? == 'd'); + + testing.expect((try it.next()).? == 'a'); + testing.expect((try it.peek(0)).? == 'b'); + testing.expect((try it.peek(1)).? == 'c'); + testing.expect((try it.peek(2)).? == 'd'); + + testing.expect((try it.next()).? == 'b'); + testing.expect((try it.peek(0)).? == 'c'); + testing.expect((try it.peek(1)).? == 'd'); + testing.expect((try it.peek(2)) == null); + + testing.expect((try it.next()).? == 'c'); + testing.expect((try it.peek(0)).? == 'd'); + testing.expect((try it.peek(1)) == null); + testing.expectError(error.EndOfStream, it.peekNoEof(1)); +} diff --git a/generator/xml.zig b/generator/xml.zig index 77f9ff1..9edc8ce 100644 --- a/generator/xml.zig +++ b/generator/xml.zig @@ -1,316 +1,74 @@ const std = @import("std"); -const unicode = std.unicode; -const testing = std.testing; +const Allocator = std.mem.Allocator; +const ArenaAllocator = std.heap.ArenaAllocator; -const Entity = struct { - substitute: []const u8, - char: u8 +pub const TagType = enum { + tag, + proc_instr }; -const entities = [_]Entity{ - Entity{.substitute = "amp;", .char = '&'}, - Entity{.substitute = "lt;", .char = '<'}, - Entity{.substitute = "gt;", .char = '>'}, - Entity{.substitute = "quot;", .char = '"'}, - Entity{.substitute = "apos;", .char = '\''}, +pub const Node = struct { + pub name: []const u8, + pub type: TagType, + pub attributes: []Attribute, + pub children: []Element }; -fn XmlUtf8Iterator(comptime ReadError: type) type { - return struct { - const Self = @This(); - pub const Stream = std.io.InStream(ReadError); +pub const Attribute = struct { + pub key: []const u8, + pub value: []const u8 +}; - in: *Stream, +pub const Element = union(enum) { + pub text: []const u8, + pub node: *Node +}; - pub fn init(in: *Stream) Self { - return Self{ - .in = in - }; - } +pub const Document = struct { + arena: ArenaAllocator, - pub fn next(self: *Self) !?u32 { - const cp = (try self.nextCodepoint()) orelse return null; - return if (cp == '&') try self.nextEntity() else cp; - } + pub xml_decl: ?*Node, + pub root: *Node, - fn nextEntity(self: *Self) !u32 { - var entity = [_]u8{0} ** 5; - - for (entity) |*c| { - const cp = (try self.nextCodepoint()) orelse return error.InvalidEntity; - c.* = std.math.cast(u8, cp) catch return error.InvalidEntity; - - if (cp == ';') { - break; - } - } else { - return error.InvalidEntity; - } - - for (entities) |*e| { - if (std.mem.startsWith(u8, entity, e.substitute)) { - return e.char; - } - } - - return error.InvalidEntity; - } - - fn nextCodepoint(self: *Self) !?u32 { - var cp: [4]u8 = undefined; - if ((try self.in.readFull(cp[0 .. 1])) != 1) { - return null; - } - - const cp_len = try unicode.utf8ByteSequenceLength(cp[0]); - try self.in.readNoEof(cp[1 .. cp_len]); - return try unicode.utf8Decode(cp[0 .. cp_len]); - } - }; -} - -fn testXmlUtf8Iterator(text: []const u8, expected: []const u8) !void { - var slice_in = std.io.SliceInStream.init(text); - var it = XmlUtf8Iterator(std.io.SliceInStream.Error).init(&slice_in.stream); - - var i: usize = 0; - defer testing.expect(i == expected.len); - - while (try it.next()) |cp| { - testing.expect(cp == expected[i]); - i += 1; + pub fn deinit(self: *Document) void { + self.arena.deinit(); } -} - -test "XmlUtf8Iterator" { - try testXmlUtf8Iterator("simpleabc", "simpleabc"); - try testXmlUtf8Iterator("a<b&c>d"e'f<", "ad\"e'f<"); - testing.expectError(error.InvalidEntity, testXmlUtf8Iterator("python&", "python")); - testing.expectError(error.InvalidEntity, testXmlUtf8Iterator("python&&", "python")); - testing.expectError(error.InvalidEntity, testXmlUtf8Iterator("python&test;", "python")); - testing.expectError(error.InvalidEntity, testXmlUtf8Iterator("python&boa", "python")); -} - -pub fn PeekXmlUtf8Iterator(comptime buffer_size: usize, comptime ReadError: type) type { - return struct { - const Self = @This(); - pub const Stream = XmlUtf8Iterator(ReadError).Stream; - - it: XmlUtf8Iterator(ReadError), - buf: [buffer_size]u32, - head: usize, - size: usize, - - pub fn init(in: *XmlUtf8Iterator(ReadError).Stream) Self { - return Self{ - .it = XmlUtf8Iterator(ReadError).init(in), - .buf = undefined, - .head = 0, - .size = 0 - }; - } - - pub fn next(self: *Self) !?u32 { - if (self.size > 0) { - const cp = self.buf[self.head]; - self.head = (self.head + 1) % buffer_size; - self.size -= 1; - return cp; - } - - return try self.it.next(); - } - - pub fn peek(self: *Self, offset: usize) !?u32 { - std.debug.assert(offset < buffer_size); - while (self.size <= offset) { - self.buf[(self.head + self.size) % buffer_size] = (try self.it.next()) orelse return null; - self.size += 1; - } - - return self.buf[(self.head + offset) % buffer_size]; - } - - pub fn peekCheckStr(self: *Self, str: []const u8) !bool { - for (str) |c, i| { - if (try self.peek(i) != c) { - return false; - } - } - - return true; - } - }; -} - -test "PeekXmlUtf8Iterator" { - var slice_in = std.io.SliceInStream.init("abcd"); - var it = PeekXmlUtf8Iterator(4, std.io.SliceInStream.Error).init(&slice_in.stream); - testing.expect((try it.peek(0)).? == 'a'); - testing.expect((try it.peek(1)).? == 'b'); - testing.expect((try it.peek(2)).? == 'c'); - testing.expect((try it.peek(3)).? == 'd'); - - testing.expect((try it.next()).? == 'a'); - testing.expect((try it.peek(0)).? == 'b'); - testing.expect((try it.peek(1)).? == 'c'); - testing.expect((try it.peek(2)).? == 'd'); - - testing.expect((try it.next()).? == 'b'); - testing.expect((try it.peek(0)).? == 'c'); - testing.expect((try it.peek(1)).? == 'd'); - testing.expect((try it.peek(2)) == null); - - testing.expect((try it.next()).? == 'c'); - testing.expect((try it.peek(0)).? == 'd'); - testing.expect((try it.peek(1)) == null); -} - -pub const Event = enum { - open, - open_pi, - open_comment, - close, - close_slash, - close_pi, - close_comment, - attrib_key, - attrib_value, - content }; -pub fn Parser(ReadError: type) type { - return struct { - const Self = @This(); - const Iterator = PeekXmlUtf8Iterator(4, ReadError); +const Parser = struct { + alloc: *Allocator, + source: []const u8, + offset: usize, - in: Iterator, - state: Event, + fn element(self: *Self) !Element { - pub fn init(in: *Iterator.Stream) Self { - return Self{ - .in = Iterator.init(in), - .state = .content - }; + } + + fn text(self: *Self) ![]const u8 { + const start = self.offset; + const end = if (std.mem.indexOfPos(self.source, self.offset, "<")) |offset| offset else self.source.len; + self.offset = end; + return self.source[start .. end]; // TODO: Decode + } + + fn node(self: *Self) !*Node { + std.debug.assert(try self.peekNoEof(0) == '<'); + } + + fn peekNoEof(self: *const Self, offset: usize) !u8 { + if (offset + self.offset >= self.source.len) { + return error.EndOfStream; } - pub fn nextEvent(self: *Self) !?Event { - const state = (try self.parseNextEvent()) orelse return null; - return state; - } + return self.source[offset + self.offset]; + } +}; - fn parseNextEvent(self: *Self) !?Event { - while (true) { - switch ((try self.in.peek(0)) orelse return null) { - ' ', '\t', '\n', '\r' => { - self.discard(1); - continue; - }, - '<' => { - switch (try self.peekNoEof(1)) { - '?' => { - self.discard(2); - return .open_pi; - }, - '!' => { - self.discard(2); - try self.matchStr("--"); - return .open_comment; - }, - else => { - self.discard(1); - return .open; - } - } - }, - '>' => { - self.discard(1); - return .close; - }, - '?' => { - self.discard(1); - try self.matchChar('>'); - return .close_pi; - }, - '/' => { - self.discard(1); - try self.matchChar('>'); - return .close_slash; - }, - '-' => { - self.discard(1); - try self.matchStr("-!>"); - return .close_comment; - }, - else => return .content - } - } - } - - pub fn next(self: *Self) !?u32 { - switch (self.state) { - .open => { - switch (try self.peekNoEof(0)) { - '>' => return null, - '/' => { - if ((try self.peekNoEof(1)) == '>') { - return null; - } - }, - else => |cp| { - self.discard(1); - return cp; - } - } - }, - .open_pi => { - switch (try self.peekNoEof(0)) { - '?' => { - if ((try self.peekNoEof(1)) == '>') { - return null; - } - }, - else => |cp| { - self.discard(1); - return cp; - } - } - }, - .open_comment => { - if (try self.in.peekCheckStr("--!>")) { - return null; - } - - const cp = try self.peekNoEof(0); - self.discard(1); - return cp; - } - } - } - - fn matchStr(self: *Self, seq: []const u8) !void { - for (seq) |c| { - try self.matchChar(c); - } - } - - fn matchChar(self: *Self, expected: u32) !void { - if ((try self.peekNoEof(0)) != expected) { - return error.Syntax; - } - - self.discard(1); - } - - fn peekNoEof(self: *Self, offset: usize) !u32 { - return (try self.in.peek(offset)) orelse error.EndOfStream; - } - - fn discard(self: *Self, num: usize) void { - var i: usize = 0; - while (i < num) : (i += 1) { - _ = self.in.next(); - } - } +pub fn parse(alloc: *Allocator, source: []const u8) !Document { + var arena = ArenaAllocator.init(alloc); + var parser = Parser { + .alloc = alloc, + .source = source, + .offset = 0 }; }