const std = @import("std");
const unicode = std.unicode;
const testing = std.testing;

/// One recognised entity reference.
/// `substitute` is the text that follows the '&' in the input, including the
/// terminating ';'. `char` is the byte the whole reference decodes to.
const Entity = struct {
    substitute: []const u8,
    char: u8,
};

/// The entity references `nextEntity` accepts. Anything else after an '&'
/// (including numeric references such as "&#60;") yields `error.InvalidEntity`.
const entities = [_]Entity{
    Entity{ .substitute = "amp;", .char = '&' },
    Entity{ .substitute = "lt;", .char = '<' },
    Entity{ .substitute = "gt;", .char = '>' },
    Entity{ .substitute = "quot;", .char = '"' },
    Entity{ .substitute = "apos;", .char = '\'' },
};

/// Returns a parser type that pulls UTF-8 code points out of a
/// `std.io.InStream(ReadError)`, replaces the entity references listed in
/// `entities` by the character they stand for, and offers one code point of
/// lookahead through `peek` / `consume`.
fn XmlUtf8Parser(comptime ReadError: type) type {
    return struct {
        pub const Self = @This();
        pub const Stream = std.io.InStream(ReadError);

        /// Stream the code points are read from.
        in: *Stream,
        /// Lookahead slot: the code point last produced by `peek` and not yet
        /// consumed, or null when the slot is empty.
        current: ?u32,

        /// Creates a parser reading from `in` with an empty lookahead slot.
        pub fn init(in: *Stream) Self {
            return Self{
                .in = in,
                .current = null,
            };
        }

        /// Drops the code point held in the lookahead slot, if any, so that the
        /// next `peek` reads from the stream again.
        pub fn consume(self: *Self) void {
            self.current = null;
        }

        /// Returns the current code point without consuming it.
        /// Repeated calls return the same value until `consume` is called.
        /// Returns null when the stream has no more bytes. Fails with the
        /// stream's read error, a UTF-8 decoding error, or
        /// `error.InvalidEntity` for an '&' that does not start a known entity.
        pub fn peek(self: *Self) !?u32 {
            if (self.current) |cp| {
                return cp;
            }

            const cp = (try self.nextCodepoint()) orelse return null;
            // An '&' is never returned raw: it always introduces an entity.
            self.current = if (cp == '&') try self.nextEntity() else cp;
            return self.current.?;
        }

        /// Equivalent to `consume()` followed by `peek()`.
        pub fn consumeAndPeek(self: *Self) !?u32 {
            self.consume();
            return try self.peek();
        }

        /// Decodes the entity whose leading '&' has already been read.
        /// Reads at most five code points, up to and including ';', and
        /// returns the character of the matching `entities` element.
        /// End of input, a code point that does not fit in a byte, a missing
        /// ';' within five code points, or an unknown name all produce
        /// `error.InvalidEntity`.
        fn nextEntity(self: *Self) !u32 {
            // Zero-filled so that the bytes after the ';' of a short entity
            // are well defined when the buffer is compared below.
            var entity = [_]u8{0} ** 5;

            for (entity) |*c| {
                const cp = (try self.nextCodepoint()) orelse return error.InvalidEntity;
                c.* = std.math.cast(u8, cp) catch return error.InvalidEntity;

                if (cp == ';') {
                    break;
                }
            } else {
                // Buffer filled without meeting the terminating ';'.
                return error.InvalidEntity;
            }

            for (entities) |*e| {
                if (std.mem.startsWith(u8, entity, e.substitute)) {
                    return e.char;
                }
            }

            return error.InvalidEntity;
        }

        /// Reads and decodes one UTF-8 code point from the stream.
        /// Returns null when no byte could be read for the first position.
        /// The continuation bytes are read with `readNoEof`, so a sequence cut
        /// short by the end of the stream surfaces as that call's error.
        fn nextCodepoint(self: *Self) !?u32 {
            var cp: [4]u8 = undefined;
            if ((try self.in.readFull(cp[0..1])) != 1) {
                return null;
            }

            const cp_len = try unicode.utf8ByteSequenceLength(cp[0]);
            try self.in.readNoEof(cp[1..cp_len]);
            return try unicode.utf8Decode(cp[0..cp_len]);
        }
    };
}

/// Runs an `XmlUtf8Parser` over `text` and checks that it produces exactly the
/// bytes of `expected`, followed by null.
/// The length check is deferred, so it also runs when the parser returns an
/// error: the error cases thereby verify that everything before the bad
/// entity was decoded.
fn testXmlUtf8Parser(text: []const u8, expected: []const u8) !void {
    var slice_in = std.io.SliceInStream.init(text);
    var p = XmlUtf8Parser(std.io.SliceInStream.Error).init(&slice_in.stream);

    var i: usize = 0;
    defer testing.expect(i == expected.len);

    while (try p.consumeAndPeek()) |cp| {
        testing.expect(cp == expected[i]);
        i += 1;
    }

    testing.expect((try p.consumeAndPeek()) == null);
}

test "XmlUtf8Parser" {
    try testXmlUtf8Parser("simpleabc", "simpleabc");
    // NOTE(review): this literal was found with its entity references already
    // decoded (which left an unescaped quote inside the string) and with
    // "<b&c>" missing from the expected text. Both were restored from the
    // `entities` table - confirm against the upstream file.
    try testXmlUtf8Parser("a&lt;b&amp;c&gt;d&quot;e&apos;f&lt;", "a<b&c>d\"e'f<");
    testing.expectError(error.InvalidEntity, testXmlUtf8Parser("python&", "python"));
    testing.expectError(error.InvalidEntity, testXmlUtf8Parser("python&&", "python"));
    testing.expectError(error.InvalidEntity, testXmlUtf8Parser("python&test;", "python"));
    testing.expectError(error.InvalidEntity, testXmlUtf8Parser("python&boa", "python"));

    var slice_in = std.io.SliceInStream.init("test");
    var p = XmlUtf8Parser(std.io.SliceInStream.Error).init(&slice_in.stream);

    // peek is idempotent until consume is called.
    testing.expect((try p.peek()).? == 't');
    testing.expect((try p.peek()).? == 't');
    p.consume();
    testing.expect((try p.peek()).? == 'e');
    testing.expect((try p.peek()).? == 'e');
    testing.expect((try p.consumeAndPeek()).? == 's');
    testing.expect((try p.consumeAndPeek()).? == 't');
    testing.expect((try p.consumeAndPeek()) == null);
    testing.expect((try p.peek()) == null);
}