// Review notes for this patch (commentary only; this text sits ahead of the
// first `diff --git` header and is not part of any hunk).
//
// Files touched:
//   generator/main.zig - imports xml.zig; main() becomes `!void`, opens the
//                        path in std.os.argv[1], sizes a page_allocator buffer
//                        from getEndPos(), issues one read() into it, then
//                        calls xml.parse() and defers doc.deinit(). The "main"
//                        test stops importing utf8.zig.
//   generator/utf8.zig - deleted (Utf8Iterator, PeekUtf8Iterator and its test).
//   generator/xml.zig  - the Node/TagType/Parser skeleton is replaced by the
//                        Attribute, Content, Child, Element, XmlDecl and
//                        Document types, a byte-oriented ParseContext (offset
//                        plus line/column tracking), the ParseError set, and
//                        parse()/parseDocument() with helpers for attribute
//                        values, names, char data, comments, elements, the
//                        prolog and entity unescaping, plus tests for
//                        ParseContext, tryParseElement, tryParseProlog and
//                        dupeAndUnescape.
//
// NOTE(review): main() indexes std.os.argv[1] with no visible argument-count
//   check, and discards the byte count returned by read(); a short read would
//   leave the tail of `source` unfilled - confirm the stream's guarantees.
// NOTE(review): parseDocument() initialises the arena but no errdefer is
//   visible, so the arena looks unreleased when a later `try` fails - confirm.
// NOTE(review): tryParseElement() rewinds ctx.offset after a failed name parse
//   but not ctx.line/ctx.column, which parse() uses to place the error caret.
// NOTE(review): this copy is whitespace-collapsed and several string literals
//   appear truncated (for example the eatStr() arguments and test inputs that
//   should contain XML markup, and the entity table in unescapeEntity()).
//   Treat it as a lossy transcript; take the real patch from version control
//   before applying or editing it.
diff --git a/generator/main.zig b/generator/main.zig index d47a6ae..f162908 100644 --- a/generator/main.zig +++ b/generator/main.zig @@ -1,10 +1,20 @@ const std = @import("std"); +const xml = @import("xml.zig"); -pub fn main() void { - std.debug.warn("Test\n"); +pub fn main() !void { + const file = try std.fs.cwd().openFileC(std.os.argv[1], .{}); + defer file.close(); + + const size = try file.seekableStream().stream.getEndPos(); + const source = try std.heap.page_allocator.alloc(u8, size); + defer std.heap.page_allocator.free(source); + + _ = try file.inStream().stream.read(source); + + var doc = try xml.parse(std.heap.page_allocator, source); + defer doc.deinit(); } test "main" { _ = @import("xml.zig"); - _ = @import("utf8.zig"); } diff --git a/generator/utf8.zig b/generator/utf8.zig deleted file mode 100644 index 387ab1f..0000000 --- a/generator/utf8.zig +++ /dev/null @@ -1,109 +0,0 @@ -const std = @import("std"); -const unicode = std.unicode; -const testing = std.testing; - -fn Utf8Iterator(comptime ReadError: type) type { - return struct { - const Self = @This(); - pub const Stream = std.io.InStream(ReadError); - - in: *Stream, - - pub fn init(in: *Stream) Self { - return Self{ - .in = in - }; - } - - pub fn next(self: *Self) !?u32 { - var cp: [4]u8 = undefined; - if ((try self.in.readFull(cp[0 .. 1])) != 1) { - return null; - } - - const cp_len = try unicode.utf8ByteSequenceLength(cp[0]); - try self.in.readNoEof(cp[1 .. cp_len]); - return try unicode.utf8Decode(cp[0 .. 
cp_len]); - } - }; -} - -pub fn PeekUtf8Iterator(comptime buffer_size: usize, comptime ReadError: type) type { - return struct { - const Self = @This(); - pub const Stream = Utf8Iterator(ReadError).Stream; - - it: Utf8Iterator(ReadError), - buf: [buffer_size]u32, - head: usize, - size: usize, - - pub fn init(in: *Utf8Iterator(ReadError).Stream) Self { - return Self{ - .it = Utf8Iterator(ReadError).init(in), - .buf = undefined, - .head = 0, - .size = 0 - }; - } - - pub fn next(self: *Self) !?u32 { - if (self.size > 0) { - const cp = self.buf[self.head]; - self.head = (self.head + 1) % buffer_size; - self.size -= 1; - return cp; - } - - return try self.it.next(); - } - - pub fn peek(self: *Self, offset: usize) !?u32 { - std.debug.assert(offset < buffer_size); - while (self.size <= offset) { - const cp = (try self.it.next()) orelse return null; - self.buf[(self.head + self.size) % buffer_size] = cp; - self.size += 1; - } - - return self.buf[(self.head + offset) % buffer_size]; - } - - pub fn peekNoEof(self: *Self, offset: usize) !?u32 { - return (try self.peek(offset)) orelse return error.EndOfStream; - } - - pub fn discard(self: *Self, amount: usize) void { - std.debug.assert(amount < self.size); - - var i: usize = 0; - while (i < amount) : (i += 1) { - _ = self.next(); - } - } - }; -} - -test "PeekUtf8Iterator" { - var slice_in = std.io.SliceInStream.init("abcd"); - var it = PeekUtf8Iterator(4, std.io.SliceInStream.Error).init(&slice_in.stream); - testing.expect((try it.peek(0)).? == 'a'); - testing.expect((try it.peek(1)).? == 'b'); - testing.expect((try it.peek(2)).? == 'c'); - testing.expect((try it.peek(3)).? == 'd'); - - testing.expect((try it.next()).? == 'a'); - testing.expect((try it.peek(0)).? == 'b'); - testing.expect((try it.peek(1)).? == 'c'); - testing.expect((try it.peek(2)).? == 'd'); - - testing.expect((try it.next()).? == 'b'); - testing.expect((try it.peek(0)).? == 'c'); - testing.expect((try it.peek(1)).? 
== 'd'); - testing.expect((try it.peek(2)) == null); - - testing.expect((try it.next()).? == 'c'); - testing.expect((try it.peek(0)).? == 'd'); - testing.expect((try it.peek(1)) == null); - testing.expectError(error.EndOfStream, it.peekNoEof(1)); -} diff --git a/generator/xml.zig b/generator/xml.zig index 9edc8ce..7456efa 100644 --- a/generator/xml.zig +++ b/generator/xml.zig @@ -1,74 +1,553 @@ const std = @import("std"); -const Allocator = std.mem.Allocator; +const mem = std.mem; +const testing = std.testing; +const Allocator = mem.Allocator; const ArenaAllocator = std.heap.ArenaAllocator; - -pub const TagType = enum { - tag, - proc_instr -}; - -pub const Node = struct { - pub name: []const u8, - pub type: TagType, - pub attributes: []Attribute, - pub children: []Element -}; +const SegmentedList = std.SegmentedList; pub const Attribute = struct { - pub key: []const u8, - pub value: []const u8 + name: []const u8, + value: []const u8 }; -pub const Element = union(enum) { - pub text: []const u8, - pub node: *Node +pub const Content = union(enum) { + CharData: []const u8, + Comment: []const u8, + Element: *Element +}; + +// Wrapper to work around compiler crash +pub const Child = struct { + content: Content +}; + +pub const Element = struct { + tag: []const u8, + attributes: SegmentedList(*Attribute, 0), + children: SegmentedList(Child, 0), + + fn init(tag: []const u8, alloc: *Allocator) Element { + return .{ + .tag = tag, + .attributes = SegmentedList(*Attribute, 0).init(alloc), + .children = SegmentedList(Child, 0).init(alloc), + }; + } +}; + +pub const XmlDecl = struct { + version: []const u8, + encoding: ?[]const u8, + standalone: ?bool }; pub const Document = struct { arena: ArenaAllocator, - - pub xml_decl: ?*Node, - pub root: *Node, + xml_decl: ?*XmlDecl, + root: *Element, pub fn deinit(self: *Document) void { self.arena.deinit(); } }; -const Parser = struct { - alloc: *Allocator, +const ParseContext = struct { source: []const u8, offset: usize, + line: usize, 
+ column: usize, - fn element(self: *Self) !Element { - + fn init(source: []const u8) ParseContext { + return .{ + .source = source, + .offset = 0, + .line = 0, + .column = 0 + }; } - fn text(self: *Self) ![]const u8 { - const start = self.offset; - const end = if (std.mem.indexOfPos(self.source, self.offset, "<")) |offset| offset else self.source.len; - self.offset = end; - return self.source[start .. end]; // TODO: Decode + fn peek(self: *ParseContext) ?u8 { + return if (self.offset < self.source.len) self.source[self.offset] else null; } - fn node(self: *Self) !*Node { - std.debug.assert(try self.peekNoEof(0) == '<'); - } - - fn peekNoEof(self: *const Self, offset: usize) !u8 { - if (offset + self.offset >= self.source.len) { - return error.EndOfStream; + fn consume(self: *ParseContext) !u8 { + if (self.offset < self.source.len) { + return self.consumeNoEof(); } - return self.source[offset + self.offset]; + return error.UnexpectedEof; + } + + fn consumeNoEof(self: *ParseContext) u8 { + std.debug.assert(self.offset < self.source.len); + const c = self.source[self.offset]; + self.offset += 1; + + if (c == '\n') { + self.line += 1; + self.column = 0; + } else { + self.column += 1; + } + + return c; + } + + fn eat(self: *ParseContext, char: u8) bool { + self.expect(char) catch return false; + return true; + } + + fn expect(self: *ParseContext, expected: u8) !void { + if (self.peek()) |actual| { + if (expected != actual) { + return error.UnexpectedCharacter; + } + + _ = self.consumeNoEof(); + return; + } + + return error.UnexpectedEof; + } + + fn eatStr(self: *ParseContext, text: []const u8) bool { + self.expectStr(text) catch return false; + return true; + } + + fn expectStr(self: *ParseContext, text: []const u8) !void { + if (self.source.len < self.offset + text.len) { + return error.UnexpectedEof; + } else if (std.mem.startsWith(u8, self.source[self.offset ..], text)) { + var i: usize = 0; + while (i < text.len) : (i += 1) { + _ = self.consumeNoEof(); + } + + 
return; + } + + return error.UnexpectedCharacter; + } + + fn eatWs(self: *ParseContext) bool { + var ws = false; + + while (self.peek()) |ch| { + switch (ch) { + ' ', '\t', '\n', '\r' => { + ws = true; + _ = self.consumeNoEof(); + }, + else => break + } + } + + return ws; + } + + fn expectWs(self: *ParseContext) !void { + if (!self.eatWs()) return error.UnexpectedCharacter; + } + + fn currentLine(self: ParseContext) []const u8 { + var begin: usize = 0; + if (mem.lastIndexOf(u8, self.source[0 .. self.offset], "\n")) |prev_nl| { + begin = prev_nl + 1; + } + + var end = mem.indexOfPos(u8, self.source, self.offset, "\n") orelse self.source.len; + return self.source[begin .. end]; } }; -pub fn parse(alloc: *Allocator, source: []const u8) !Document { - var arena = ArenaAllocator.init(alloc); - var parser = Parser { - .alloc = alloc, - .source = source, - .offset = 0 +test "ParseContext" { + { + var ctx = ParseContext.init("I like pythons"); + testing.expectEqual(@as(?u8, 'I'), ctx.peek()); + testing.expectEqual(@as(u8, 'I'), ctx.consumeNoEof()); + testing.expectEqual(@as(?u8, ' '), ctx.peek()); + testing.expectEqual(@as(u8, ' '), try ctx.consume()); + + testing.expect(ctx.eat('l')); + testing.expectEqual(@as(?u8, 'i'), ctx.peek()); + testing.expectEqual(false, ctx.eat('a')); + testing.expectEqual(@as(?u8, 'i'), ctx.peek()); + + try ctx.expect('i'); + testing.expectEqual(@as(?u8, 'k'), ctx.peek()); + testing.expectError(error.UnexpectedCharacter, ctx.expect('a')); + testing.expectEqual(@as(?u8, 'k'), ctx.peek()); + + testing.expect(ctx.eatStr("ke")); + testing.expectEqual(@as(?u8, ' '), ctx.peek()); + + testing.expect(ctx.eatWs()); + testing.expectEqual(@as(?u8, 'p'), ctx.peek()); + testing.expectEqual(false, ctx.eatWs()); + testing.expectEqual(@as(?u8, 'p'), ctx.peek()); + + testing.expectEqual(false, ctx.eatStr("aaaaaaaaa")); + testing.expectEqual(@as(?u8, 'p'), ctx.peek()); + + testing.expectError(error.UnexpectedEof, ctx.expectStr("aaaaaaaaa")); + 
testing.expectEqual(@as(?u8, 'p'), ctx.peek()); + testing.expectError(error.UnexpectedCharacter, ctx.expectStr("pytn")); + testing.expectEqual(@as(?u8, 'p'), ctx.peek()); + try ctx.expectStr("python"); + testing.expectEqual(@as(?u8, 's'), ctx.peek()); + } + + { + var ctx = ParseContext.init(""); + testing.expectEqual(ctx.peek(), null); + testing.expectError(error.UnexpectedEof, ctx.consume()); + testing.expectEqual(ctx.eat('p'), false); + testing.expectError(error.UnexpectedEof, ctx.expect('p')); + } +} + +pub const ParseError = error { + IllegalCharacter, + UnexpectedEof, + UnexpectedCharacter, + UnclosedValue, + UnclosedComment, + InvalidName, + InvalidEntity, + InvalidStandaloneValue, + NonMatchingClosingTag, + InvalidDocument, + OutOfMemory +}; + +pub fn parse(backing_allocator: *Allocator, source: []const u8) !Document { + var ctx = ParseContext.init(source); + return parseDocument(&ctx, backing_allocator) catch |err| { + std.debug.warn("{}\n", .{ctx.currentLine()}); + + var i: usize = 0; + while (i < ctx.column) : (i += 1) { + std.debug.warn(" ", .{}); + } + + std.debug.warn("^\n", .{}); + + return err; }; } + +fn parseDocument(ctx: *ParseContext, backing_allocator: *Allocator) !Document { + var doc = Document{ + .arena = ArenaAllocator.init(backing_allocator), + .xml_decl = null, + .root = undefined + }; + + doc.xml_decl = try tryParseProlog(ctx, &doc.arena.allocator); + _ = ctx.eatWs(); + doc.root = (try tryParseElement(ctx, &doc.arena.allocator)) orelse return error.InvalidDocument; + _ = ctx.eatWs(); + + if (ctx.peek() != null) return error.InvalidDocument; + + return doc; +} + +fn parseAttrValue(ctx: *ParseContext, alloc: *Allocator) ![]const u8 { + const quote = try ctx.consume(); + if (quote != '"' and quote != '\'') return error.UnexpectedCharacter; + + const begin = ctx.offset; + + while (true) { + const c = ctx.consume() catch return error.UnclosedValue; + if (c == quote) break; + } + + const end = ctx.offset - 1; + + return try 
dupeAndUnescape(alloc, ctx.source[begin .. end]); +} + +fn parseEqAttrValue(ctx: *ParseContext, alloc: *Allocator) ![]const u8 { + _ = ctx.eatWs(); + try ctx.expect('='); + _ = ctx.eatWs(); + + return try parseAttrValue(ctx, alloc); +} + +fn parseNameNoDupe(ctx: *ParseContext) ![]const u8 { + // XML's spec on names is very long, so to make this easier + // we just take any character that is not special and not whitespace + + const begin = ctx.offset; + + while (ctx.peek()) |ch| { + switch (ch) { + ' ', '\t', '\n', '\r' => break, + '&', '"', '\'', '<', '>', '?', '=', '/' => break, + else => _ = ctx.consumeNoEof() + } + } + + const end = ctx.offset; + if (begin == end) return error.InvalidName; + + return ctx.source[begin .. end]; +} + +fn tryParseCharData(ctx: *ParseContext, alloc: *Allocator) !?[]const u8 { + const begin = ctx.offset; + + while (ctx.peek()) |ch| { + switch (ch) { + '<', '>' => break, + else => _ = ctx.consumeNoEof() + } + } + + const end = ctx.offset; + if (begin == end) return null; + + return try dupeAndUnescape(alloc, ctx.source[begin .. 
end]); +} + +fn parseContent(ctx: *ParseContext, alloc: *Allocator) ParseError!Content { + if (try tryParseCharData(ctx, alloc)) |cd| { + return Content{.CharData = cd}; + } else if (try tryParseComment(ctx, alloc)) |comment| { + return Content{.Comment = comment}; + } else if (try tryParseElement(ctx, alloc)) |elem| { + return Content{.Element = elem}; + } else { + return error.UnexpectedCharacter; + } +} + +fn tryParseAttr(ctx: *ParseContext, alloc: *Allocator) !?*Attribute { + const name = parseNameNoDupe(ctx) catch return null; + _ = ctx.eatWs(); + try ctx.expect('='); + _ = ctx.eatWs(); + const value = try parseAttrValue(ctx, alloc); + + const attr = try alloc.create(Attribute); + attr.name = try mem.dupe(alloc, u8, name); + attr.value = value; + return attr; +} + +fn tryParseElement(ctx: *ParseContext, alloc: *Allocator) !?*Element { + const start = ctx.offset; + if (!ctx.eat('<')) return null; + const tag = parseNameNoDupe(ctx) catch { + ctx.offset = start; + return null; + }; + + const element = try alloc.create(Element); + element.* = Element.init(try std.mem.dupe(alloc, u8, tag), alloc); + + while (ctx.eatWs()) { + const attr = (try tryParseAttr(ctx, alloc)) orelse break; + try element.attributes.push(attr); + } + + if (ctx.eatStr("/>")) { + return element; + } + + try ctx.expect('>'); + + while (true) { + if (ctx.peek() == null) { + return error.UnexpectedEof; + } else if (ctx.eatStr("'); + return element; +} + +test "tryParseElement" { + { + var ctx = ParseContext.init("<= a='b'/>"); + testing.expectEqual(@as(?*Element, null), try tryParseElement(&ctx, std.debug.global_allocator)); + testing.expectEqual(@as(?u8, '<'), ctx.peek()); + } + + { + var ctx = ParseContext.init(""); + const elem = try tryParseElement(&ctx, std.debug.global_allocator); + testing.expectEqualSlices(u8, elem.?.tag, "python"); + + const size_attr = elem.?.attributes.at(0).*; + testing.expectEqualSlices(u8, size_attr.name, "size"); + testing.expectEqualSlices(u8, size_attr.value, 
"15"); + + const color_attr = elem.?.attributes.at(1).*; + testing.expectEqualSlices(u8, color_attr.name, "color"); + testing.expectEqualSlices(u8, color_attr.value, "green"); + } + + { + var ctx = ParseContext.init("test"); + const elem = try tryParseElement(&ctx, std.debug.global_allocator); + testing.expectEqualSlices(u8, elem.?.tag, "python"); + testing.expectEqualSlices(u8, elem.?.children.at(0).content.CharData, "test"); + } + + { + var ctx = ParseContext.init("bdf"); + const elem = try tryParseElement(&ctx, std.debug.global_allocator); + testing.expectEqualSlices(u8, elem.?.tag, "a"); + testing.expectEqualSlices(u8, elem.?.children.at(0).content.CharData, "b"); + testing.expectEqualSlices(u8, elem.?.children.at(1).content.Element.tag, "c"); + testing.expectEqualSlices(u8, elem.?.children.at(2).content.CharData, "d"); + testing.expectEqualSlices(u8, elem.?.children.at(3).content.Element.tag, "e"); + testing.expectEqualSlices(u8, elem.?.children.at(4).content.CharData, "f"); + testing.expectEqualSlices(u8, elem.?.children.at(5).content.Comment, "g"); + } +} + +fn tryParseProlog(ctx: *ParseContext, alloc: *Allocator) !?*XmlDecl { + const start = ctx.offset; + if (!ctx.eatStr(""); + return decl; +} + +test "tryParseProlog" { + { + var ctx = ParseContext.init(""); + testing.expectEqual(@as(?*XmlDecl, null), try tryParseProlog(&ctx, std.debug.global_allocator)); + testing.expectEqual(@as(?u8, '<'), ctx.peek()); + } + + { + var ctx = ParseContext.init(""); + const decl = try tryParseProlog(&ctx, std.debug.global_allocator); + testing.expectEqualSlices(u8, "aa", decl.?.version); + testing.expectEqual(@as(?[]const u8, null), decl.?.encoding); + testing.expectEqual(@as(?bool, null), decl.?.standalone); + } + + { + var ctx = ParseContext.init(""); + const decl = try tryParseProlog(&ctx, std.debug.global_allocator); + testing.expectEqualSlices(u8, "aa", decl.?.version); + testing.expectEqualSlices(u8, "bbb", decl.?.encoding.?); + testing.expectEqual(@as(?bool, true), 
decl.?.standalone.?); + } +} + +fn tryParseComment(ctx: *ParseContext, alloc: *Allocator) !?[]const u8 { + if (!ctx.eatStr("")) { + _ = ctx.consume() catch return error.UnclosedComment; + } + + const end = ctx.offset - "-->".len; + return try mem.dupe(alloc, u8, ctx.source[begin .. end]); +} + +fn unescapeEntity(text: []const u8) !u8 { + const EntitySubstition = struct { + text: []const u8, + replacement: u8 + }; + + const entities = [_]EntitySubstition{ + .{.text = "<", .replacement = '<'}, + .{.text = ">", .replacement = '>'}, + .{.text = "&", .replacement = '&'}, + .{.text = "'", .replacement = '\''}, + .{.text = """, .replacement = '"'} + }; + + for (entities) |entity| { + if (std.mem.eql(u8, text, entity.text)) return entity.replacement; + } + + return error.InvalidEntity; +} + +fn dupeAndUnescape(alloc: *Allocator, text: []const u8) ![]const u8 { + const str = try alloc.alloc(u8, text.len); + + var j: usize = 0; + var i: usize = 0; + while (i < text.len) : (j += 1) { + if (text[i] == '&') { + const entity_end = 1 + (mem.indexOfPos(u8, text, i, ";") orelse return error.InvalidEntity); + str[j] = try unescapeEntity(text[i .. entity_end]); + i = entity_end; + } else { + str[j] = text[i]; + i += 1; + } + } + + return alloc.shrink(str, j); +} + +test "dupeAndUnescape" { + testing.expectEqualSlices(u8, "test", try dupeAndUnescape(std.debug.global_allocator, "test")); + testing.expectEqualSlices(u8, "ad\"e'f<", try dupeAndUnescape(std.debug.global_allocator, "a<b&c>d"e'f<")); + testing.expectError(error.InvalidEntity, dupeAndUnescape(std.debug.global_allocator, "python&")); + testing.expectError(error.InvalidEntity, dupeAndUnescape(std.debug.global_allocator, "python&&")); + testing.expectError(error.InvalidEntity, dupeAndUnescape(std.debug.global_allocator, "python&test;")); + testing.expectError(error.InvalidEntity, dupeAndUnescape(std.debug.global_allocator, "python&boa")); +}