diff --git a/generator/xml.zig b/generator/xml.zig index 2515e4a..77f9ff1 100644 --- a/generator/xml.zig +++ b/generator/xml.zig @@ -15,7 +15,7 @@ const entities = [_]Entity{ Entity{.substitute = "apos;", .char = '\''}, }; -fn XmlUtf8Parser(comptime ReadError: type) type { +fn XmlUtf8Iterator(comptime ReadError: type) type { return struct { const Self = @This(); pub const Stream = std.io.InStream(ReadError); @@ -69,24 +69,248 @@ fn XmlUtf8Parser(comptime ReadError: type) type { }; } -fn testXmlUtf8Parser(text: []const u8, expected: []const u8) !void { +fn testXmlUtf8Iterator(text: []const u8, expected: []const u8) !void { var slice_in = std.io.SliceInStream.init(text); - var p = XmlUtf8Parser(std.io.SliceInStream.Error).init(&slice_in.stream); + var it = XmlUtf8Iterator(std.io.SliceInStream.Error).init(&slice_in.stream); var i: usize = 0; defer testing.expect(i == expected.len); - while (try p.next()) |cp| { + while (try it.next()) |cp| { testing.expect(cp == expected[i]); i += 1; } } -test "XmlUtf8Parser" { - try testXmlUtf8Parser("simpleabc", "simpleabc"); - try testXmlUtf8Parser("a<b&c>d"e'f<", "ad\"e'f<"); - testing.expectError(error.InvalidEntity, testXmlUtf8Parser("python&", "python")); - testing.expectError(error.InvalidEntity, testXmlUtf8Parser("python&&", "python")); - testing.expectError(error.InvalidEntity, testXmlUtf8Parser("python&test;", "python")); - testing.expectError(error.InvalidEntity, testXmlUtf8Parser("python&boa", "python")); +test "XmlUtf8Iterator" { + try testXmlUtf8Iterator("simpleabc", "simpleabc"); + try testXmlUtf8Iterator("a<b&c>d"e'f<", "ad\"e'f<"); + testing.expectError(error.InvalidEntity, testXmlUtf8Iterator("python&", "python")); + testing.expectError(error.InvalidEntity, testXmlUtf8Iterator("python&&", "python")); + testing.expectError(error.InvalidEntity, testXmlUtf8Iterator("python&test;", "python")); + testing.expectError(error.InvalidEntity, testXmlUtf8Iterator("python&boa", "python")); +} + +pub fn PeekXmlUtf8Iterator(comptime buffer_size: usize, comptime ReadError: type) type { + return struct { + const Self = @This(); + pub const Stream = XmlUtf8Iterator(ReadError).Stream; + + it: XmlUtf8Iterator(ReadError), + buf: [buffer_size]u32, + head: usize, + size: usize, + + pub fn init(in: *XmlUtf8Iterator(ReadError).Stream) Self { + return Self{ + .it = XmlUtf8Iterator(ReadError).init(in), + .buf = undefined, + .head = 0, + .size = 0 + }; + } + + pub fn next(self: *Self) !?u32 { + if (self.size > 0) { + const cp = self.buf[self.head]; + self.head = (self.head + 1) % buffer_size; + self.size -= 1; + return cp; + } + + return try self.it.next(); + } + + pub fn peek(self: *Self, offset: usize) !?u32 { + std.debug.assert(offset < buffer_size); + while (self.size <= offset) { + self.buf[(self.head + self.size) % buffer_size] = (try self.it.next()) orelse return null; + self.size += 1; + } + + return self.buf[(self.head + offset) % buffer_size]; + } + + pub fn peekCheckStr(self: *Self, str: []const u8) !bool { + for (str) |c, i| { + if (try self.peek(i) != c) { + return false; + } + } + + return true; + } + }; +} + +test "PeekXmlUtf8Iterator" { + var slice_in = std.io.SliceInStream.init("abcd"); + var it = PeekXmlUtf8Iterator(4, std.io.SliceInStream.Error).init(&slice_in.stream); + testing.expect((try it.peek(0)).? == 'a'); + testing.expect((try it.peek(1)).? == 'b'); + testing.expect((try it.peek(2)).? == 'c'); + testing.expect((try it.peek(3)).? == 'd'); + + testing.expect((try it.next()).? == 'a'); + testing.expect((try it.peek(0)).? == 'b'); + testing.expect((try it.peek(1)).? == 'c'); + testing.expect((try it.peek(2)).? == 'd'); + + testing.expect((try it.next()).? == 'b'); + testing.expect((try it.peek(0)).? == 'c'); + testing.expect((try it.peek(1)).? == 'd'); + testing.expect((try it.peek(2)) == null); + + testing.expect((try it.next()).? == 'c'); + testing.expect((try it.peek(0)).? == 'd'); + testing.expect((try it.peek(1)) == null); +} + +pub const Event = enum { + open, + open_pi, + open_comment, + close, + close_slash, + close_pi, + close_comment, + attrib_key, + attrib_value, + content +}; + +pub fn Parser(ReadError: type) type { + return struct { + const Self = @This(); + const Iterator = PeekXmlUtf8Iterator(4, ReadError); + + in: Iterator, + state: Event, + + pub fn init(in: *Iterator.Stream) Self { + return Self{ + .in = Iterator.init(in), + .state = .content + }; + } + + pub fn nextEvent(self: *Self) !?Event { + const state = (try self.parseNextEvent()) orelse return null; + return state; + } + + fn parseNextEvent(self: *Self) !?Event { + while (true) { + switch ((try self.in.peek(0)) orelse return null) { + ' ', '\t', '\n', '\r' => { + self.discard(1); + continue; + }, + '<' => { + switch (try self.peekNoEof(1)) { + '?' => { + self.discard(2); + return .open_pi; + }, + '!' => { + self.discard(2); + try self.matchStr("--"); + return .open_comment; + }, + else => { + self.discard(1); + return .open; + } + } + }, + '>' => { + self.discard(1); + return .close; + }, + '?' => { + self.discard(1); + try self.matchChar('>'); + return .close_pi; + }, + '/' => { + self.discard(1); + try self.matchChar('>'); + return .close_slash; + }, + '-' => { + self.discard(1); + try self.matchStr("-!>"); + return .close_comment; + }, + else => return .content + } + } + } + + pub fn next(self: *Self) !?u32 { + switch (self.state) { + .open => { + switch (try self.peekNoEof(0)) { + '>' => return null, + '/' => { + if ((try self.peekNoEof(1)) == '>') { + return null; + } + }, + else => |cp| { + self.discard(1); + return cp; + } + } + }, + .open_pi => { + switch (try self.peekNoEof(0)) { + '?' => { + if ((try self.peekNoEof(1)) == '>') { + return null; + } + }, + else => |cp| { + self.discard(1); + return cp; + } + } + }, + .open_comment => { + if (try self.in.peekCheckStr("--!>")) { + return null; + } + + const cp = try self.peekNoEof(0); + self.discard(1); + return cp; + } + } + } + + fn matchStr(self: *Self, seq: []const u8) !void { + for (seq) |c| { + try self.matchChar(c); + } + } + + fn matchChar(self: *Self, expected: u32) !void { + if ((try self.peekNoEof(0)) != expected) { + return error.Syntax; + } + + self.discard(1); + } + + fn peekNoEof(self: *Self, offset: usize) !u32 { + return (try self.in.peek(offset)) orelse error.EndOfStream; + } + + fn discard(self: *Self, num: usize) void { + var i: usize = 0; + while (i < num) : (i += 1) { + _ = self.in.next(); + } + } + }; }