improve xml parser

This commit is contained in:
Robin Voetter
2022-06-25 11:16:05 +02:00
parent f6f5f66f20
commit cc87740422
5 changed files with 261 additions and 292 deletions

View File

@@ -23,7 +23,6 @@ pub fn isZigPrimitiveType(name: []const u8) bool {
"f32", "f32",
"f64", "f64",
"f128", "f128",
"c_longdouble",
"noreturn", "noreturn",
"type", "type",
"anyerror", "anyerror",
@@ -35,6 +34,7 @@ pub fn isZigPrimitiveType(name: []const u8) bool {
"c_ulong", "c_ulong",
"c_longlong", "c_longlong",
"c_ulonglong", "c_ulonglong",
"c_longdouble",
// Removed in stage 2 in https://github.com/ziglang/zig/commit/05cf44933d753f7a5a53ab289ea60fd43761de57, // Removed in stage 2 in https://github.com/ziglang/zig/commit/05cf44933d753f7a5a53ab289ea60fd43761de57,
// but these are still invalid identifiers in stage 1. // but these are still invalid identifiers in stage 1.
"undefined", "undefined",
@@ -52,12 +52,12 @@ pub fn isZigPrimitiveType(name: []const u8) bool {
return false; return false;
} }
pub fn writeIdentifier(out: anytype, id: []const u8) !void { pub fn writeIdentifier(writer: anytype, id: []const u8) !void {
// https://github.com/ziglang/zig/issues/2897 // https://github.com/ziglang/zig/issues/2897
if (isZigPrimitiveType(id)) { if (isZigPrimitiveType(id)) {
try out.print("@\"{}\"", .{std.zig.fmtEscapes(id)}); try writer.print("@\"{}\"", .{std.zig.fmtEscapes(id)});
} else { } else {
try out.print("{}", .{std.zig.fmtId(id)}); try writer.print("{}", .{std.zig.fmtId(id)});
} }
} }

View File

@@ -164,11 +164,11 @@ pub const XmlCTokenizer = struct {
} }
fn elemToToken(elem: *xml.Element) !?Token { fn elemToToken(elem: *xml.Element) !?Token {
if (elem.children.items.len != 1 or elem.children.items[0] != .CharData) { if (elem.children.len != 1 or elem.children[0] != .char_data) {
return error.InvalidXml; return error.InvalidXml;
} }
const text = elem.children.items[0].CharData; const text = elem.children[0].char_data;
if (mem.eql(u8, elem.tag, "type")) { if (mem.eql(u8, elem.tag, "type")) {
return Token{ .kind = .type_name, .text = text }; return Token{ .kind = .type_name, .text = text };
} else if (mem.eql(u8, elem.tag, "enum")) { } else if (mem.eql(u8, elem.tag, "enum")) {
@@ -203,9 +203,9 @@ pub const XmlCTokenizer = struct {
if (self.it.next()) |child| { if (self.it.next()) |child| {
switch (child.*) { switch (child.*) {
.CharData => |cdata| self.ctok = CTokenizer{ .source = cdata, .in_comment = in_comment }, .char_data => |cdata| self.ctok = CTokenizer{ .source = cdata, .in_comment = in_comment },
.Comment => {}, // xml comment .comment => {}, // xml comment
.Element => |elem| if (!in_comment) if (try elemToToken(elem)) |tok| return tok, .element => |elem| if (!in_comment) if (try elemToToken(elem)) |tok| return tok,
} }
} else { } else {
return null; return null;

View File

@@ -24,7 +24,6 @@ pub fn parseXml(backing_allocator: Allocator, root: *xml.Element) !ParseResult {
const allocator = arena.allocator(); const allocator = arena.allocator();
var reg = registry.Registry{ var reg = registry.Registry{
.copyright = root.getCharData("comment") orelse return error.InvalidRegistry,
.decls = try parseDeclarations(allocator, root), .decls = try parseDeclarations(allocator, root),
.api_constants = try parseApiConstants(allocator, root), .api_constants = try parseApiConstants(allocator, root),
.tags = try parseTags(allocator, root), .tags = try parseTags(allocator, root),
@@ -42,7 +41,7 @@ fn parseDeclarations(allocator: Allocator, root: *xml.Element) ![]registry.Decla
var types_elem = root.findChildByTag("types") orelse return error.InvalidRegistry; var types_elem = root.findChildByTag("types") orelse return error.InvalidRegistry;
var commands_elem = root.findChildByTag("commands") orelse return error.InvalidRegistry; var commands_elem = root.findChildByTag("commands") orelse return error.InvalidRegistry;
const decl_upper_bound = types_elem.children.items.len + commands_elem.children.items.len; const decl_upper_bound = types_elem.children.len + commands_elem.children.len;
const decls = try allocator.alloc(registry.Declaration, decl_upper_bound); const decls = try allocator.alloc(registry.Declaration, decl_upper_bound);
var count: usize = 0; var count: usize = 0;
@@ -186,7 +185,7 @@ fn parseContainer(allocator: Allocator, ty: *xml.Element, is_union: bool) !regis
}; };
} }
var members = try allocator.alloc(registry.Container.Field, ty.children.items.len); var members = try allocator.alloc(registry.Container.Field, ty.children.len);
var i: usize = 0; var i: usize = 0;
var it = ty.findChildrenByTag("member"); var it = ty.findChildrenByTag("member");
@@ -356,7 +355,7 @@ fn parseEnumFields(allocator: Allocator, elem: *xml.Element) !registry.Enum {
else else
32; 32;
const fields = try allocator.alloc(registry.Enum.Field, elem.children.items.len); const fields = try allocator.alloc(registry.Enum.Field, elem.children.len);
var i: usize = 0; var i: usize = 0;
var it = elem.findChildrenByTag("enum"); var it = elem.findChildrenByTag("enum");
@@ -451,7 +450,7 @@ fn parseCommand(allocator: Allocator, elem: *xml.Element) !registry.Declaration
var proto_xctok = cparse.XmlCTokenizer.init(proto); var proto_xctok = cparse.XmlCTokenizer.init(proto);
const command_decl = try cparse.parseParamOrProto(allocator, &proto_xctok, false); const command_decl = try cparse.parseParamOrProto(allocator, &proto_xctok, false);
var params = try allocator.alloc(registry.Command.Param, elem.children.items.len); var params = try allocator.alloc(registry.Command.Param, elem.children.len);
var i: usize = 0; var i: usize = 0;
var it = elem.findChildrenByTag("param"); var it = elem.findChildrenByTag("param");
@@ -527,7 +526,7 @@ fn parseApiConstants(allocator: Allocator, root: *xml.Element) ![]registry.ApiCo
break :blk n_defines; break :blk n_defines;
}; };
const constants = try allocator.alloc(registry.ApiConstant, enums.children.items.len + n_defines); const constants = try allocator.alloc(registry.ApiConstant, enums.children.len + n_defines);
var i: usize = 0; var i: usize = 0;
var it = enums.findChildrenByTag("enum"); var it = enums.findChildrenByTag("enum");
@@ -564,7 +563,7 @@ fn parseDefines(types: *xml.Element, out: []registry.ApiConstant) !usize {
if (mem.eql(u8, name, "VK_HEADER_VERSION")) { if (mem.eql(u8, name, "VK_HEADER_VERSION")) {
out[i] = .{ out[i] = .{
.name = name, .name = name,
.value = .{ .expr = mem.trim(u8, ty.children.items[2].CharData, " ") }, .value = .{ .expr = mem.trim(u8, ty.children[2].char_data, " ") },
}; };
} else { } else {
var xctok = cparse.XmlCTokenizer.init(ty); var xctok = cparse.XmlCTokenizer.init(ty);
@@ -581,7 +580,7 @@ fn parseDefines(types: *xml.Element, out: []registry.ApiConstant) !usize {
fn parseTags(allocator: Allocator, root: *xml.Element) ![]registry.Tag { fn parseTags(allocator: Allocator, root: *xml.Element) ![]registry.Tag {
var tags_elem = root.findChildByTag("tags") orelse return error.InvalidRegistry; var tags_elem = root.findChildByTag("tags") orelse return error.InvalidRegistry;
const tags = try allocator.alloc(registry.Tag, tags_elem.children.items.len); const tags = try allocator.alloc(registry.Tag, tags_elem.children.len);
var i: usize = 0; var i: usize = 0;
var it = tags_elem.findChildrenByTag("tag"); var it = tags_elem.findChildrenByTag("tag");
@@ -620,7 +619,7 @@ fn parseFeature(allocator: Allocator, feature: *xml.Element) !registry.Feature {
break :blk try splitFeatureLevel(number, "."); break :blk try splitFeatureLevel(number, ".");
}; };
var requires = try allocator.alloc(registry.Require, feature.children.items.len); var requires = try allocator.alloc(registry.Require, feature.children.len);
var i: usize = 0; var i: usize = 0;
var it = feature.findChildrenByTag("require"); var it = feature.findChildrenByTag("require");
while (it.next()) |require| { while (it.next()) |require| {
@@ -745,7 +744,7 @@ fn parseRequire(allocator: Allocator, require: *xml.Element, extnumber: ?u31) !r
fn parseExtensions(allocator: Allocator, root: *xml.Element) ![]registry.Extension { fn parseExtensions(allocator: Allocator, root: *xml.Element) ![]registry.Extension {
const extensions_elem = root.findChildByTag("extensions") orelse return error.InvalidRegistry; const extensions_elem = root.findChildByTag("extensions") orelse return error.InvalidRegistry;
const extensions = try allocator.alloc(registry.Extension, extensions_elem.children.items.len); const extensions = try allocator.alloc(registry.Extension, extensions_elem.children.len);
var i: usize = 0; var i: usize = 0;
var it = extensions_elem.findChildrenByTag("extension"); var it = extensions_elem.findChildrenByTag("extension");
while (it.next()) |extension| { while (it.next()) |extension| {
@@ -823,7 +822,7 @@ fn parseExtension(allocator: Allocator, extension: *xml.Element) !registry.Exten
break :blk try splitCommaAlloc(allocator, requires_str); break :blk try splitCommaAlloc(allocator, requires_str);
}; };
var requires = try allocator.alloc(registry.Require, extension.children.items.len); var requires = try allocator.alloc(registry.Require, extension.children.len);
var i: usize = 0; var i: usize = 0;
var it = extension.findChildrenByTag("require"); var it = extension.findChildrenByTag("require");
while (it.next()) |require| { while (it.next()) |require| {

View File

@@ -1,5 +1,4 @@
pub const Registry = struct { pub const Registry = struct {
copyright: []const u8,
decls: []Declaration, decls: []Declaration,
api_constants: []ApiConstant, api_constants: []ApiConstant,
tags: []Tag, tags: []Tag,

View File

@@ -3,7 +3,6 @@ const mem = std.mem;
const testing = std.testing; const testing = std.testing;
const Allocator = mem.Allocator; const Allocator = mem.Allocator;
const ArenaAllocator = std.heap.ArenaAllocator; const ArenaAllocator = std.heap.ArenaAllocator;
const ArrayList = std.ArrayList;
pub const Attribute = struct { pub const Attribute = struct {
name: []const u8, name: []const u8,
@@ -11,29 +10,18 @@ pub const Attribute = struct {
}; };
pub const Content = union(enum) { pub const Content = union(enum) {
CharData: []const u8, char_data: []const u8,
Comment: []const u8, comment: []const u8,
Element: *Element, element: *Element,
}; };
pub const Element = struct { pub const Element = struct {
pub const AttributeList = ArrayList(*Attribute);
pub const ContentList = ArrayList(Content);
tag: []const u8, tag: []const u8,
attributes: AttributeList, attributes: []Attribute = &.{},
children: ContentList, children: []Content = &.{},
fn init(tag: []const u8, alloc: Allocator) Element { pub fn getAttribute(self: Element, attrib_name: []const u8) ?[]const u8 {
return .{ for (self.attributes) |child| {
.tag = tag,
.attributes = AttributeList.init(alloc),
.children = ContentList.init(alloc),
};
}
pub fn getAttribute(self: *Element, attrib_name: []const u8) ?[]const u8 {
for (self.attributes.items) |child| {
if (mem.eql(u8, child.name, attrib_name)) { if (mem.eql(u8, child.name, attrib_name)) {
return child.value; return child.value;
} }
@@ -42,36 +30,36 @@ pub const Element = struct {
return null; return null;
} }
pub fn getCharData(self: *Element, child_tag: []const u8) ?[]const u8 { pub fn getCharData(self: Element, child_tag: []const u8) ?[]const u8 {
const child = self.findChildByTag(child_tag) orelse return null; const child = self.findChildByTag(child_tag) orelse return null;
if (child.children.items.len != 1) { if (child.children.len != 1) {
return null; return null;
} }
return switch (child.children.items[0]) { return switch (child.children[0]) {
.CharData => |char_data| char_data, .char_data => |char_data| char_data,
else => null, else => null,
}; };
} }
pub fn iterator(self: *Element) ChildIterator { pub fn iterator(self: Element) ChildIterator {
return .{ return .{
.items = self.children.items, .items = self.children,
.i = 0, .i = 0,
}; };
} }
pub fn elements(self: *Element) ChildElementIterator { pub fn elements(self: Element) ChildElementIterator {
return .{ return .{
.inner = self.iterator(), .inner = self.iterator(),
}; };
} }
pub fn findChildByTag(self: *Element, tag: []const u8) ?*Element { pub fn findChildByTag(self: Element, tag: []const u8) ?*Element {
return self.findChildrenByTag(tag).next(); return self.findChildrenByTag(tag).next();
} }
pub fn findChildrenByTag(self: *Element, tag: []const u8) FindChildrenByTagIterator { pub fn findChildrenByTag(self: Element, tag: []const u8) FindChildrenByTagIterator {
return .{ return .{
.inner = self.elements(), .inner = self.elements(),
.tag = tag, .tag = tag,
@@ -97,11 +85,11 @@ pub const Element = struct {
pub fn next(self: *ChildElementIterator) ?*Element { pub fn next(self: *ChildElementIterator) ?*Element {
while (self.inner.next()) |child| { while (self.inner.next()) |child| {
if (child.* != .Element) { if (child.* != .element) {
continue; continue;
} }
return child.*.Element; return child.*.element;
} }
return null; return null;
@@ -126,15 +114,9 @@ pub const Element = struct {
}; };
}; };
pub const XmlDecl = struct {
version: []const u8,
encoding: ?[]const u8,
standalone: ?bool,
};
pub const Document = struct { pub const Document = struct {
arena: ArenaAllocator, arena: ArenaAllocator,
xml_decl: ?*XmlDecl, xml_decl: ?*Element,
root: *Element, root: *Element,
pub fn deinit(self: Document) void { pub fn deinit(self: Document) void {
@@ -143,13 +125,13 @@ pub const Document = struct {
} }
}; };
const ParseContext = struct { const Parser = struct {
source: []const u8, source: []const u8,
offset: usize, offset: usize,
line: usize, line: usize,
column: usize, column: usize,
fn init(source: []const u8) ParseContext { fn init(source: []const u8) Parser {
return .{ return .{
.source = source, .source = source,
.offset = 0, .offset = 0,
@@ -158,11 +140,11 @@ const ParseContext = struct {
}; };
} }
fn peek(self: *ParseContext) ?u8 { fn peek(self: *Parser) ?u8 {
return if (self.offset < self.source.len) self.source[self.offset] else null; return if (self.offset < self.source.len) self.source[self.offset] else null;
} }
fn consume(self: *ParseContext) !u8 { fn consume(self: *Parser) !u8 {
if (self.offset < self.source.len) { if (self.offset < self.source.len) {
return self.consumeNoEof(); return self.consumeNoEof();
} }
@@ -170,7 +152,7 @@ const ParseContext = struct {
return error.UnexpectedEof; return error.UnexpectedEof;
} }
fn consumeNoEof(self: *ParseContext) u8 { fn consumeNoEof(self: *Parser) u8 {
std.debug.assert(self.offset < self.source.len); std.debug.assert(self.offset < self.source.len);
const c = self.source[self.offset]; const c = self.source[self.offset];
self.offset += 1; self.offset += 1;
@@ -185,12 +167,12 @@ const ParseContext = struct {
return c; return c;
} }
fn eat(self: *ParseContext, char: u8) bool { fn eat(self: *Parser, char: u8) bool {
self.expect(char) catch return false; self.expect(char) catch return false;
return true; return true;
} }
fn expect(self: *ParseContext, expected: u8) !void { fn expect(self: *Parser, expected: u8) !void {
if (self.peek()) |actual| { if (self.peek()) |actual| {
if (expected != actual) { if (expected != actual) {
return error.UnexpectedCharacter; return error.UnexpectedCharacter;
@@ -203,15 +185,15 @@ const ParseContext = struct {
return error.UnexpectedEof; return error.UnexpectedEof;
} }
fn eatStr(self: *ParseContext, text: []const u8) bool { fn eatStr(self: *Parser, text: []const u8) bool {
self.expectStr(text) catch return false; self.expectStr(text) catch return false;
return true; return true;
} }
fn expectStr(self: *ParseContext, text: []const u8) !void { fn expectStr(self: *Parser, text: []const u8) !void {
if (self.source.len < self.offset + text.len) { if (self.source.len < self.offset + text.len) {
return error.UnexpectedEof; return error.UnexpectedEof;
} else if (std.mem.startsWith(u8, self.source[self.offset..], text)) { } else if (mem.startsWith(u8, self.source[self.offset..], text)) {
var i: usize = 0; var i: usize = 0;
while (i < text.len) : (i += 1) { while (i < text.len) : (i += 1) {
_ = self.consumeNoEof(); _ = self.consumeNoEof();
@@ -223,7 +205,7 @@ const ParseContext = struct {
return error.UnexpectedCharacter; return error.UnexpectedCharacter;
} }
fn eatWs(self: *ParseContext) bool { fn eatWs(self: *Parser) bool {
var ws = false; var ws = false;
while (self.peek()) |ch| { while (self.peek()) |ch| {
@@ -239,11 +221,11 @@ const ParseContext = struct {
return ws; return ws;
} }
fn expectWs(self: *ParseContext) !void { fn expectWs(self: *Parser) !void {
if (!self.eatWs()) return error.UnexpectedCharacter; if (!self.eatWs()) return error.UnexpectedCharacter;
} }
fn currentLine(self: ParseContext) []const u8 { fn currentLine(self: Parser) []const u8 {
var begin: usize = 0; var begin: usize = 0;
if (mem.lastIndexOfScalar(u8, self.source[0..self.offset], '\n')) |prev_nl| { if (mem.lastIndexOfScalar(u8, self.source[0..self.offset], '\n')) |prev_nl| {
begin = prev_nl + 1; begin = prev_nl + 1;
@@ -254,49 +236,49 @@ const ParseContext = struct {
} }
}; };
test "ParseContext" { test "xml: Parser" {
{ {
var ctx = ParseContext.init("I like pythons"); var parser = Parser.init("I like pythons");
try testing.expectEqual(@as(?u8, 'I'), ctx.peek()); try testing.expectEqual(@as(?u8, 'I'), parser.peek());
try testing.expectEqual(@as(u8, 'I'), ctx.consumeNoEof()); try testing.expectEqual(@as(u8, 'I'), parser.consumeNoEof());
try testing.expectEqual(@as(?u8, ' '), ctx.peek()); try testing.expectEqual(@as(?u8, ' '), parser.peek());
try testing.expectEqual(@as(u8, ' '), try ctx.consume()); try testing.expectEqual(@as(u8, ' '), try parser.consume());
try testing.expect(ctx.eat('l')); try testing.expect(parser.eat('l'));
try testing.expectEqual(@as(?u8, 'i'), ctx.peek()); try testing.expectEqual(@as(?u8, 'i'), parser.peek());
try testing.expectEqual(false, ctx.eat('a')); try testing.expectEqual(false, parser.eat('a'));
try testing.expectEqual(@as(?u8, 'i'), ctx.peek()); try testing.expectEqual(@as(?u8, 'i'), parser.peek());
try ctx.expect('i'); try parser.expect('i');
try testing.expectEqual(@as(?u8, 'k'), ctx.peek()); try testing.expectEqual(@as(?u8, 'k'), parser.peek());
try testing.expectError(error.UnexpectedCharacter, ctx.expect('a')); try testing.expectError(error.UnexpectedCharacter, parser.expect('a'));
try testing.expectEqual(@as(?u8, 'k'), ctx.peek()); try testing.expectEqual(@as(?u8, 'k'), parser.peek());
try testing.expect(ctx.eatStr("ke")); try testing.expect(parser.eatStr("ke"));
try testing.expectEqual(@as(?u8, ' '), ctx.peek()); try testing.expectEqual(@as(?u8, ' '), parser.peek());
try testing.expect(ctx.eatWs()); try testing.expect(parser.eatWs());
try testing.expectEqual(@as(?u8, 'p'), ctx.peek()); try testing.expectEqual(@as(?u8, 'p'), parser.peek());
try testing.expectEqual(false, ctx.eatWs()); try testing.expectEqual(false, parser.eatWs());
try testing.expectEqual(@as(?u8, 'p'), ctx.peek()); try testing.expectEqual(@as(?u8, 'p'), parser.peek());
try testing.expectEqual(false, ctx.eatStr("aaaaaaaaa")); try testing.expectEqual(false, parser.eatStr("aaaaaaaaa"));
try testing.expectEqual(@as(?u8, 'p'), ctx.peek()); try testing.expectEqual(@as(?u8, 'p'), parser.peek());
try testing.expectError(error.UnexpectedEof, ctx.expectStr("aaaaaaaaa")); try testing.expectError(error.UnexpectedEof, parser.expectStr("aaaaaaaaa"));
try testing.expectEqual(@as(?u8, 'p'), ctx.peek()); try testing.expectEqual(@as(?u8, 'p'), parser.peek());
try testing.expectError(error.UnexpectedCharacter, ctx.expectStr("pytn")); try testing.expectError(error.UnexpectedCharacter, parser.expectStr("pytn"));
try testing.expectEqual(@as(?u8, 'p'), ctx.peek()); try testing.expectEqual(@as(?u8, 'p'), parser.peek());
try ctx.expectStr("python"); try parser.expectStr("python");
try testing.expectEqual(@as(?u8, 's'), ctx.peek()); try testing.expectEqual(@as(?u8, 's'), parser.peek());
} }
{ {
var ctx = ParseContext.init(""); var parser = Parser.init("");
try testing.expectEqual(ctx.peek(), null); try testing.expectEqual(parser.peek(), null);
try testing.expectError(error.UnexpectedEof, ctx.consume()); try testing.expectError(error.UnexpectedEof, parser.consume());
try testing.expectEqual(ctx.eat('p'), false); try testing.expectEqual(parser.eat('p'), false);
try testing.expectError(error.UnexpectedEof, ctx.expect('p')); try testing.expectError(error.UnexpectedEof, parser.expect('p'));
} }
} }
@@ -315,11 +297,11 @@ pub const ParseError = error{
}; };
pub fn parse(backing_allocator: Allocator, source: []const u8) !Document { pub fn parse(backing_allocator: Allocator, source: []const u8) !Document {
var ctx = ParseContext.init(source); var parser = Parser.init(source);
return try parseDocument(&ctx, backing_allocator); return try parseDocument(&parser, backing_allocator);
} }
fn parseDocument(ctx: *ParseContext, backing_allocator: Allocator) !Document { fn parseDocument(parser: *Parser, backing_allocator: Allocator) !Document {
var doc = Document{ var doc = Document{
.arena = ArenaAllocator.init(backing_allocator), .arena = ArenaAllocator.init(backing_allocator),
.xml_decl = null, .xml_decl = null,
@@ -330,280 +312,269 @@ fn parseDocument(ctx: *ParseContext, backing_allocator: Allocator) !Document {
const allocator = doc.arena.allocator(); const allocator = doc.arena.allocator();
try trySkipComments(ctx, allocator); try skipComments(parser, allocator);
doc.xml_decl = try tryParseProlog(ctx, allocator); doc.xml_decl = try parseElement(parser, allocator, .xml_decl);
_ = ctx.eatWs(); _ = parser.eatWs();
try trySkipComments(ctx, allocator); try skipComments(parser, allocator);
doc.root = (try tryParseElement(ctx, allocator)) orelse return error.InvalidDocument; doc.root = (try parseElement(parser, allocator, .element)) orelse return error.InvalidDocument;
_ = ctx.eatWs(); _ = parser.eatWs();
try trySkipComments(ctx, allocator); try skipComments(parser, allocator);
if (ctx.peek() != null) return error.InvalidDocument; if (parser.peek() != null) return error.InvalidDocument;
return doc; return doc;
} }
fn parseAttrValue(ctx: *ParseContext, alloc: Allocator) ![]const u8 { fn parseAttrValue(parser: *Parser, alloc: Allocator) ![]const u8 {
const quote = try ctx.consume(); const quote = try parser.consume();
if (quote != '"' and quote != '\'') return error.UnexpectedCharacter; if (quote != '"' and quote != '\'') return error.UnexpectedCharacter;
const begin = ctx.offset; const begin = parser.offset;
while (true) { while (true) {
const c = ctx.consume() catch return error.UnclosedValue; const c = parser.consume() catch return error.UnclosedValue;
if (c == quote) break; if (c == quote) break;
} }
const end = ctx.offset - 1; const end = parser.offset - 1;
return try dupeAndUnescape(alloc, ctx.source[begin..end]); return try unescape(alloc, parser.source[begin..end]);
} }
fn parseEqAttrValue(ctx: *ParseContext, alloc: Allocator) ![]const u8 { fn parseEqAttrValue(parser: *Parser, alloc: Allocator) ![]const u8 {
_ = ctx.eatWs(); _ = parser.eatWs();
try ctx.expect('='); try parser.expect('=');
_ = ctx.eatWs(); _ = parser.eatWs();
return try parseAttrValue(ctx, alloc); return try parseAttrValue(parser, alloc);
} }
fn parseNameNoDupe(ctx: *ParseContext) ![]const u8 { fn parseNameNoDupe(parser: *Parser) ![]const u8 {
// XML's spec on names is very long, so to make this easier // XML's spec on names is very long, so to make this easier
// we just take any character that is not special and not whitespace // we just take any character that is not special and not whitespace
const begin = ctx.offset; const begin = parser.offset;
while (ctx.peek()) |ch| { while (parser.peek()) |ch| {
switch (ch) { switch (ch) {
' ', '\t', '\n', '\r' => break, ' ', '\t', '\n', '\r' => break,
'&', '"', '\'', '<', '>', '?', '=', '/' => break, '&', '"', '\'', '<', '>', '?', '=', '/' => break,
else => _ = ctx.consumeNoEof(), else => _ = parser.consumeNoEof(),
} }
} }
const end = ctx.offset; const end = parser.offset;
if (begin == end) return error.InvalidName; if (begin == end) return error.InvalidName;
return ctx.source[begin..end]; return parser.source[begin..end];
} }
fn tryParseCharData(ctx: *ParseContext, alloc: Allocator) !?[]const u8 { fn parseCharData(parser: *Parser, alloc: Allocator) !?[]const u8 {
const begin = ctx.offset; const begin = parser.offset;
while (ctx.peek()) |ch| { while (parser.peek()) |ch| {
switch (ch) { switch (ch) {
'<' => break, '<' => break,
else => _ = ctx.consumeNoEof(), else => _ = parser.consumeNoEof(),
} }
} }
const end = ctx.offset; const end = parser.offset;
if (begin == end) return null; if (begin == end) return null;
return try dupeAndUnescape(alloc, ctx.source[begin..end]); return try unescape(alloc, parser.source[begin..end]);
} }
fn parseContent(ctx: *ParseContext, alloc: Allocator) ParseError!Content { fn parseContent(parser: *Parser, alloc: Allocator) ParseError!Content {
if (try tryParseCharData(ctx, alloc)) |cd| { if (try parseCharData(parser, alloc)) |cd| {
return Content{ .CharData = cd }; return Content{ .char_data = cd };
} else if (try tryParseComment(ctx, alloc)) |comment| { } else if (try parseComment(parser, alloc)) |comment| {
return Content{ .Comment = comment }; return Content{ .comment = comment };
} else if (try tryParseElement(ctx, alloc)) |elem| { } else if (try parseElement(parser, alloc, .element)) |elem| {
return Content{ .Element = elem }; return Content{ .element = elem };
} else { } else {
return error.UnexpectedCharacter; return error.UnexpectedCharacter;
} }
} }
fn tryParseAttr(ctx: *ParseContext, alloc: Allocator) !?*Attribute { fn parseAttr(parser: *Parser, alloc: Allocator) !?Attribute {
const name = parseNameNoDupe(ctx) catch return null; const name = parseNameNoDupe(parser) catch return null;
_ = ctx.eatWs(); _ = parser.eatWs();
try ctx.expect('='); try parser.expect('=');
_ = ctx.eatWs(); _ = parser.eatWs();
const value = try parseAttrValue(ctx, alloc); const value = try parseAttrValue(parser, alloc);
const attr = try alloc.create(Attribute); const attr = Attribute{
attr.name = try alloc.dupe(u8, name); .name = try alloc.dupe(u8, name),
attr.value = value; .value = value,
};
return attr; return attr;
} }
fn tryParseElement(ctx: *ParseContext, alloc: Allocator) !?*Element { const ElementKind = enum {
const start = ctx.offset; xml_decl,
if (!ctx.eat('<')) return null; element,
const tag = parseNameNoDupe(ctx) catch {
ctx.offset = start;
return null;
}; };
const element = try alloc.create(Element); fn parseElement(parser: *Parser, alloc: Allocator, comptime kind: ElementKind) !?*Element {
element.* = Element.init(try alloc.dupe(u8, tag), alloc); const start = parser.offset;
while (ctx.eatWs()) { const tag = switch (kind) {
const attr = (try tryParseAttr(ctx, alloc)) orelse break; .xml_decl => blk: {
try element.attributes.append(attr); if (!parser.eatStr("<?") or !mem.eql(u8, try parseNameNoDupe(parser), "xml")) {
parser.offset = start;
return null;
}
break :blk "xml";
},
.element => blk: {
if (!parser.eat('<')) return null;
const tag = parseNameNoDupe(parser) catch {
parser.offset = start;
return null;
};
break :blk tag;
}
};
var attributes = std.ArrayList(Attribute).init(alloc);
defer attributes.deinit();
var children = std.ArrayList(Content).init(alloc);
defer children.deinit();
while (parser.eatWs()) {
const attr = (try parseAttr(parser, alloc)) orelse break;
try attributes.append(attr);
} }
if (ctx.eatStr("/>")) { switch (kind) {
return element; .xml_decl => try parser.expectStr("?>"),
} .element => {
if (!parser.eatStr("/>")) {
try ctx.expect('>'); try parser.expect('>');
while (true) { while (true) {
if (ctx.peek() == null) { if (parser.peek() == null) {
return error.UnexpectedEof; return error.UnexpectedEof;
} else if (ctx.eatStr("</")) { } else if (parser.eatStr("</")) {
break; break;
} }
const content = try parseContent(ctx, alloc); const content = try parseContent(parser, alloc);
try element.children.append(content); try children.append(content);
} }
const closing_tag = try parseNameNoDupe(ctx); const closing_tag = try parseNameNoDupe(parser);
if (!std.mem.eql(u8, tag, closing_tag)) { if (!mem.eql(u8, tag, closing_tag)) {
return error.NonMatchingClosingTag; return error.NonMatchingClosingTag;
} }
_ = ctx.eatWs(); _ = parser.eatWs();
try ctx.expect('>'); try parser.expect('>');
}
}
}
const element = try alloc.create(Element);
element.* = .{
.tag = try alloc.dupe(u8, tag),
.attributes = attributes.toOwnedSlice(),
.children = children.toOwnedSlice(),
};
return element; return element;
} }
test "tryParseElement" { test "xml: parseElement" {
var arena = std.heap.ArenaAllocator.init(testing.allocator); var arena = ArenaAllocator.init(testing.allocator);
defer arena.deinit(); defer arena.deinit();
const alloc = arena.allocator(); const alloc = arena.allocator();
{ {
var ctx = ParseContext.init("<= a='b'/>"); var parser = Parser.init("<= a='b'/>");
try testing.expectEqual(@as(?*Element, null), try tryParseElement(&ctx, alloc)); try testing.expectEqual(@as(?*Element, null), try parseElement(&parser, alloc, .element));
try testing.expectEqual(@as(?u8, '<'), ctx.peek()); try testing.expectEqual(@as(?u8, '<'), parser.peek());
} }
{ {
var ctx = ParseContext.init("<python size='15' color = \"green\"/>"); var parser = Parser.init("<python size='15' color = \"green\"/>");
const elem = try tryParseElement(&ctx, alloc); const elem = try parseElement(&parser, alloc, .element);
try testing.expectEqualSlices(u8, elem.?.tag, "python"); try testing.expectEqualSlices(u8, elem.?.tag, "python");
const size_attr = elem.?.attributes.items[0]; const size_attr = elem.?.attributes[0];
try testing.expectEqualSlices(u8, size_attr.name, "size"); try testing.expectEqualSlices(u8, size_attr.name, "size");
try testing.expectEqualSlices(u8, size_attr.value, "15"); try testing.expectEqualSlices(u8, size_attr.value, "15");
const color_attr = elem.?.attributes.items[1]; const color_attr = elem.?.attributes[1];
try testing.expectEqualSlices(u8, color_attr.name, "color"); try testing.expectEqualSlices(u8, color_attr.name, "color");
try testing.expectEqualSlices(u8, color_attr.value, "green"); try testing.expectEqualSlices(u8, color_attr.value, "green");
} }
{ {
var ctx = ParseContext.init("<python>test</python>"); var parser = Parser.init("<python>test</python>");
const elem = try tryParseElement(&ctx, alloc); const elem = try parseElement(&parser, alloc, .element);
try testing.expectEqualSlices(u8, elem.?.tag, "python"); try testing.expectEqualSlices(u8, elem.?.tag, "python");
try testing.expectEqualSlices(u8, elem.?.children.items[0].CharData, "test"); try testing.expectEqualSlices(u8, elem.?.children[0].char_data, "test");
} }
{ {
var ctx = ParseContext.init("<a>b<c/>d<e/>f<!--g--></a>"); var parser = Parser.init("<a>b<c/>d<e/>f<!--g--></a>");
const elem = try tryParseElement(&ctx, alloc); const elem = try parseElement(&parser, alloc, .element);
try testing.expectEqualSlices(u8, elem.?.tag, "a"); try testing.expectEqualSlices(u8, elem.?.tag, "a");
try testing.expectEqualSlices(u8, elem.?.children.items[0].CharData, "b"); try testing.expectEqualSlices(u8, elem.?.children[0].char_data, "b");
try testing.expectEqualSlices(u8, elem.?.children.items[1].Element.tag, "c"); try testing.expectEqualSlices(u8, elem.?.children[1].element.tag, "c");
try testing.expectEqualSlices(u8, elem.?.children.items[2].CharData, "d"); try testing.expectEqualSlices(u8, elem.?.children[2].char_data, "d");
try testing.expectEqualSlices(u8, elem.?.children.items[3].Element.tag, "e"); try testing.expectEqualSlices(u8, elem.?.children[3].element.tag, "e");
try testing.expectEqualSlices(u8, elem.?.children.items[4].CharData, "f"); try testing.expectEqualSlices(u8, elem.?.children[4].char_data, "f");
try testing.expectEqualSlices(u8, elem.?.children.items[5].Comment, "g"); try testing.expectEqualSlices(u8, elem.?.children[5].comment, "g");
} }
} }
// The dedicated prolog parser (`tryParseProlog`) is gone: the XML declaration
// is parsed through `parseElement` with the `.xml_decl` kind, and its
// `version` / `encoding` / `standalone` fields are read back as attributes.
test "xml: parse prolog" {
    var arena = ArenaAllocator.init(testing.allocator);
    defer arena.deinit();
    const a = arena.allocator();

    // A processing-instruction target that merely starts with "xml" is not a
    // declaration: the result is null and the parser is still positioned on
    // the leading '<'.
    {
        var parser = Parser.init("<?xmla version='aa'?>");
        try testing.expectEqual(@as(?*Element, null), try parseElement(&parser, a, .xml_decl));
        try testing.expectEqual(@as(?u8, '<'), parser.peek());
    }

    // Only the version is given; the other attributes are absent.
    {
        var parser = Parser.init("<?xml version='aa'?>");
        const decl = try parseElement(&parser, a, .xml_decl);
        try testing.expectEqualSlices(u8, "aa", decl.?.getAttribute("version").?);
        try testing.expectEqual(@as(?[]const u8, null), decl.?.getAttribute("encoding"));
        try testing.expectEqual(@as(?[]const u8, null), decl.?.getAttribute("standalone"));
    }

    // All three attributes, with mixed quoting and whitespace around '='.
    {
        var parser = Parser.init("<?xml version=\"ccc\" encoding = 'bbb' standalone \t = 'yes'?>");
        const decl = try parseElement(&parser, a, .xml_decl);
        try testing.expectEqualSlices(u8, "ccc", decl.?.getAttribute("version").?);
        try testing.expectEqualSlices(u8, "bbb", decl.?.getAttribute("encoding").?);
        try testing.expectEqualSlices(u8, "yes", decl.?.getAttribute("standalone").?);
    }
}
/// Consumes zero or more consecutive comments at the parser's current
/// position, eating any whitespace that follows each one.
///
/// The comment text returned by `parseComment` is discarded; it is still
/// allocated from `alloc`.
/// NOTE(review): presumably `alloc` is an arena so the discarded text is
/// reclaimed in bulk - confirm against the callers.
///
/// Propagates any error from `parseComment`.
fn skipComments(parser: *Parser, alloc: Allocator) !void {
    while ((try parseComment(parser, alloc)) != null) {
        _ = parser.eatWs();
    }
}
/// Parses a single `<!-- ... -->` comment at the parser's current position.
///
/// Returns `null` when the input does not start with `<!--`.
/// NOTE(review): this relies on `eatStr` leaving the offset untouched on a
/// mismatch - confirm in `Parser`.
///
/// On success the parser is positioned just past the closing `-->` and the
/// result is a copy, allocated from `alloc`, of the text between the two
/// delimiters.
///
/// Errors:
/// - `error.UnclosedComment` if `consume` fails before `-->` is found
///   (presumably end of input).
/// - any allocation error from `alloc.dupe`.
fn parseComment(parser: *Parser, alloc: Allocator) !?[]const u8 {
    if (!parser.eatStr("<!--")) return null;

    const begin = parser.offset;
    while (!parser.eatStr("-->")) {
        _ = parser.consume() catch return error.UnclosedComment;
    }

    // `eatStr` has already stepped over the terminator, so back up by its
    // length to find where the comment text ends.
    const end = parser.offset - "-->".len;
    return try alloc.dupe(u8, parser.source[begin..end]);
}
fn unescapeEntity(text: []const u8) !u8 { fn unescapeEntity(text: []const u8) !u8 {
@@ -618,49 +589,49 @@ fn unescapeEntity(text: []const u8) !u8 {
}; };
for (entities) |entity| { for (entities) |entity| {
if (std.mem.eql(u8, text, entity.text)) return entity.replacement; if (mem.eql(u8, text, entity.text)) return entity.replacement;
} }
return error.InvalidEntity; return error.InvalidEntity;
} }
/// Returns a copy of `text` in which every `&...;` entity reference has been
/// replaced by the single byte `unescapeEntity` maps it to.
///
/// The output buffer is allocated from `arena` with `text.len` bytes. The
/// buffer is not shrunk; the returned slice is only its used prefix.
/// NOTE(review): the parameter name suggests callers pass an arena allocator,
/// so the unused tail is reclaimed with the arena - confirm.
///
/// Errors:
/// - `error.InvalidEntity` if a `&` has no `;` after it, or if
///   `unescapeEntity` rejects the reference.
/// - any allocation error from `arena.alloc`.
fn unescape(arena: Allocator, text: []const u8) ![]const u8 {
    const unescaped = try arena.alloc(u8, text.len);

    var j: usize = 0; // write position in `unescaped`
    var i: usize = 0; // read position in `text`
    while (i < text.len) : (j += 1) {
        if (text[i] == '&') {
            // One past the terminating ';', so the slice below spans "&name;".
            const entity_end = 1 + (mem.indexOfScalarPos(u8, text, i, ';') orelse return error.InvalidEntity);
            unescaped[j] = try unescapeEntity(text[i..entity_end]);
            i = entity_end;
        } else {
            unescaped[j] = text[i];
            i += 1;
        }
    }

    return unescaped[0..j];
}
test "xml: unescape" {
    var arena = ArenaAllocator.init(testing.allocator);
    defer arena.deinit();
    const a = arena.allocator();

    // Text without entity references is copied through unchanged.
    try testing.expectEqualSlices(u8, "test", try unescape(a, "test"));

    // Each of the five entity references exercised here becomes one byte.
    try testing.expectEqualSlices(u8, "a<b&c>d\"e'f<", try unescape(a, "a&lt;b&amp;c&gt;d&quot;e&apos;f&lt;"));

    // Unterminated or unknown references are rejected.
    try testing.expectError(error.InvalidEntity, unescape(a, "python&"));
    try testing.expectError(error.InvalidEntity, unescape(a, "python&&"));
    try testing.expectError(error.InvalidEntity, unescape(a, "python&test;"));
    try testing.expectError(error.InvalidEntity, unescape(a, "python&boa"));
}
test "xml: top level comments" {
    var arena = ArenaAllocator.init(testing.allocator);
    defer arena.deinit();
    const a = arena.allocator();

    // Comments both before and after the root element must not prevent the
    // root from being found.
    const doc = try parse(a, "<?xml version='aa'?><!--comment--><python color='green'/><!--another comment-->");
    try testing.expectEqualSlices(u8, "python", doc.root.tag);
}