Xml parser

This commit is contained in:
Robin Voetter
2020-01-05 18:35:30 +01:00
parent b49c64862e
commit 7304493f51
3 changed files with 536 additions and 156 deletions

View File

@@ -1,10 +1,20 @@
const std = @import("std");
const xml = @import("xml.zig");
pub fn main() void {
std.debug.warn("Test\n");
/// Parses the XML file named by the first command-line argument.
///
/// The whole file is read into a buffer from the page allocator, handed to
/// `xml.parse`, and the resulting document is released again before returning.
/// Returns `error.MissingArgument` when no path was supplied; I/O, allocation
/// and parse errors are propagated unchanged.
pub fn main() !void {
    // Indexing argv[1] without this check is out of bounds when the program
    // is started without a path argument.
    if (std.os.argv.len < 2) return error.MissingArgument;

    const file = try std.fs.cwd().openFileC(std.os.argv[1], .{});
    defer file.close();

    const size = try file.seekableStream().stream.getEndPos();
    const source = try std.heap.page_allocator.alloc(u8, size);
    defer std.heap.page_allocator.free(source);

    // A single read() may deliver fewer bytes than requested, which would
    // leave the tail of `source` uninitialised. readNoEof either fills the
    // whole buffer or fails.
    try file.inStream().stream.readNoEof(source);

    var doc = try xml.parse(std.heap.page_allocator, source);
    defer doc.deinit();
}
// Imports the other source files from inside a test block; presumably this is
// what makes their own `test` blocks part of this test build — TODO confirm
// against the Zig version in use.
test "main" {
_ = @import("xml.zig");
_ = @import("utf8.zig");
}

View File

@@ -1,109 +0,0 @@
const std = @import("std");
const unicode = std.unicode;
const testing = std.testing;
/// Builds an iterator type that decodes UTF-8 code points from an
/// `std.io.InStream(ReadError)`.
fn Utf8Iterator(comptime ReadError: type) type {
    return struct {
        const Self = @This();
        pub const Stream = std.io.InStream(ReadError);

        in: *Stream,

        /// Wraps `in`; the stream is borrowed, not copied.
        pub fn init(in: *Stream) Self {
            return Self{ .in = in };
        }

        /// Decodes and returns the next code point, or null when the first
        /// byte of a sequence could not be read. Stream errors and invalid
        /// UTF-8 are returned as errors.
        pub fn next(self: *Self) !?u32 {
            var bytes: [4]u8 = undefined;

            const got = try self.in.readFull(bytes[0..1]);
            if (got != 1) return null;

            // The lead byte determines how many bytes the sequence occupies.
            const len = try unicode.utf8ByteSequenceLength(bytes[0]);
            const sequence = bytes[0..len];
            try self.in.readNoEof(sequence[1..]);

            return try unicode.utf8Decode(sequence);
        }
    };
}
/// Builds a UTF-8 code point iterator type with look-ahead.
///
/// Code points that were peeked but not yet consumed are kept in a ring
/// buffer of `buffer_size` entries, so at most `buffer_size` code points can
/// be looked ahead at any time.
pub fn PeekUtf8Iterator(comptime buffer_size: usize, comptime ReadError: type) type {
    return struct {
        const Self = @This();
        pub const Stream = Utf8Iterator(ReadError).Stream;

        it: Utf8Iterator(ReadError),
        // Ring buffer: `size` valid entries starting at index `head`.
        buf: [buffer_size]u32,
        head: usize,
        size: usize,

        /// Creates an iterator reading from `in` with an empty look-ahead buffer.
        pub fn init(in: *Utf8Iterator(ReadError).Stream) Self {
            return Self{
                .it = Utf8Iterator(ReadError).init(in),
                .buf = undefined,
                .head = 0,
                .size = 0,
            };
        }

        /// Consumes and returns the next code point, or null at end of input.
        /// Buffered (previously peeked) code points are handed out first.
        pub fn next(self: *Self) !?u32 {
            if (self.size > 0) {
                const cp = self.buf[self.head];
                self.head = (self.head + 1) % buffer_size;
                self.size -= 1;
                return cp;
            }

            return try self.it.next();
        }

        /// Returns the code point `offset` positions ahead without consuming
        /// it, or null if the input ends before that position.
        /// `offset` must be smaller than `buffer_size`.
        pub fn peek(self: *Self, offset: usize) !?u32 {
            std.debug.assert(offset < buffer_size);

            // Pull code points into the ring buffer until `offset` is covered.
            while (self.size <= offset) {
                const cp = (try self.it.next()) orelse return null;
                self.buf[(self.head + self.size) % buffer_size] = cp;
                self.size += 1;
            }

            return self.buf[(self.head + offset) % buffer_size];
        }

        /// Like `peek`, but reports end of input as `error.EndOfStream`.
        /// The optional return type is kept for existing callers; a
        /// successful result is never null.
        pub fn peekNoEof(self: *Self, offset: usize) !?u32 {
            return (try self.peek(offset)) orelse return error.EndOfStream;
        }

        /// Drops `amount` already-peeked code points.
        /// `amount` may be at most the number of buffered code points.
        pub fn discard(self: *Self, amount: usize) void {
            // `<=` rather than `<`: discarding everything that was peeked is
            // a legitimate request.
            std.debug.assert(amount <= self.size);

            // Only buffered entries are dropped, so the ring indices can be
            // advanced directly instead of calling next() and throwing away
            // its error union.
            self.head = (self.head + amount) % buffer_size;
            self.size -= amount;
        }
    };
}
test "PeekUtf8Iterator" {
    var source = std.io.SliceInStream.init("abcd");
    var iter = PeekUtf8Iterator(4, std.io.SliceInStream.Error).init(&source.stream);

    // Peeking fills the look-ahead buffer without consuming anything.
    testing.expectEqual(@as(?u32, 'a'), try iter.peek(0));
    testing.expectEqual(@as(?u32, 'b'), try iter.peek(1));
    testing.expectEqual(@as(?u32, 'c'), try iter.peek(2));
    testing.expectEqual(@as(?u32, 'd'), try iter.peek(3));

    // Consuming one code point shifts every peek offset by one.
    testing.expectEqual(@as(?u32, 'a'), try iter.next());
    testing.expectEqual(@as(?u32, 'b'), try iter.peek(0));
    testing.expectEqual(@as(?u32, 'c'), try iter.peek(1));
    testing.expectEqual(@as(?u32, 'd'), try iter.peek(2));

    // Peeking past the end of the input yields null.
    testing.expectEqual(@as(?u32, 'b'), try iter.next());
    testing.expectEqual(@as(?u32, 'c'), try iter.peek(0));
    testing.expectEqual(@as(?u32, 'd'), try iter.peek(1));
    testing.expectEqual(@as(?u32, null), try iter.peek(2));

    testing.expectEqual(@as(?u32, 'c'), try iter.next());
    testing.expectEqual(@as(?u32, 'd'), try iter.peek(0));
    testing.expectEqual(@as(?u32, null), try iter.peek(1));

    // The NoEof variant turns the same situation into an error.
    testing.expectError(error.EndOfStream, iter.peekNoEof(1));
}

View File

@@ -1,74 +1,553 @@
const std = @import("std");
const Allocator = std.mem.Allocator;
const mem = std.mem;
const testing = std.testing;
const Allocator = mem.Allocator;
const ArenaAllocator = std.heap.ArenaAllocator;
pub const TagType = enum {
tag,
proc_instr
};
pub const Node = struct {
pub name: []const u8,
pub type: TagType,
pub attributes: []Attribute,
pub children: []Element
};
const SegmentedList = std.SegmentedList;
pub const Attribute = struct {
pub key: []const u8,
pub value: []const u8
name: []const u8,
value: []const u8
};
pub const Element = union(enum) {
pub text: []const u8,
pub node: *Node
pub const Content = union(enum) {
CharData: []const u8,
Comment: []const u8,
Element: *Element
};
// Single-field wrapper around `Content`, used as the item type of
// `Element.children`. It exists only to work around a compiler crash.
pub const Child = struct {
content: Content
};
/// An XML element: its tag name together with its attributes and its
/// child content, both kept in segmented lists.
pub const Element = struct {
    tag: []const u8,
    attributes: SegmentedList(*Attribute, 0),
    children: SegmentedList(Child, 0),

    /// Creates an element named `tag` whose attribute and child lists are
    /// empty and allocate from `alloc`. `tag` is stored as given, not copied.
    fn init(tag: []const u8, alloc: *Allocator) Element {
        const AttributeList = SegmentedList(*Attribute, 0);
        const ChildList = SegmentedList(Child, 0);

        return Element{
            .tag = tag,
            .attributes = AttributeList.init(alloc),
            .children = ChildList.init(alloc),
        };
    }
};
/// Contents of the `<?xml ...?>` declaration, as filled in by `tryParseProlog`.
pub const XmlDecl = struct {
// Value of the mandatory `version` attribute.
version: []const u8,
// Value of the `encoding` attribute; null when it is absent.
encoding: ?[]const u8,
// `standalone` value: "yes" is stored as true, "no" as false; null when absent.
standalone: ?bool
};
pub const Document = struct {
arena: ArenaAllocator,
pub xml_decl: ?*Node,
pub root: *Node,
xml_decl: ?*XmlDecl,
root: *Element,
pub fn deinit(self: *Document) void {
self.arena.deinit();
}
};
const Parser = struct {
alloc: *Allocator,
const ParseContext = struct {
source: []const u8,
offset: usize,
line: usize,
column: usize,
fn element(self: *Self) !Element {
/// Creates a context over `source` positioned at its first byte, with the
/// line and column counters both starting at zero.
fn init(source: []const u8) ParseContext {
    return ParseContext{
        .source = source,
        .offset = 0,
        .line = 0,
        .column = 0,
    };
}
fn text(self: *Self) ![]const u8 {
const start = self.offset;
const end = if (std.mem.indexOfPos(self.source, self.offset, "<")) |offset| offset else self.source.len;
self.offset = end;
return self.source[start .. end]; // TODO: Decode
/// Returns the byte at the current offset without consuming it, or null
/// when the whole source has been consumed.
fn peek(self: *ParseContext) ?u8 {
    if (self.offset >= self.source.len) return null;
    return self.source[self.offset];
}
fn node(self: *Self) !*Node {
std.debug.assert(try self.peekNoEof(0) == '<');
}
fn peekNoEof(self: *const Self, offset: usize) !u8 {
if (offset + self.offset >= self.source.len) {
return error.EndOfStream;
fn consume(self: *ParseContext) !u8 {
if (self.offset < self.source.len) {
return self.consumeNoEof();
}
return self.source[offset + self.offset];
return error.UnexpectedEof;
}
/// Consumes and returns the current byte, keeping `line` and `column` in
/// step. The caller must already know that input remains; this is asserted.
fn consumeNoEof(self: *ParseContext) u8 {
    std.debug.assert(self.offset < self.source.len);

    const byte = self.source[self.offset];
    self.offset += 1;

    switch (byte) {
        // A newline starts a fresh line at column zero.
        '\n' => {
            self.line += 1;
            self.column = 0;
        },
        else => self.column += 1,
    }

    return byte;
}
/// Consumes the next byte if it equals `char` and reports whether it did.
/// Nothing is consumed on a mismatch or at end of input.
fn eat(self: *ParseContext, char: u8) bool {
    if (self.expect(char)) |_| {
        return true;
    } else |_| {
        return false;
    }
}
/// Consumes the next byte, which must equal `expected`.
/// Returns `error.UnexpectedEof` at end of input and
/// `error.UnexpectedCharacter` on a mismatch; neither case consumes anything.
fn expect(self: *ParseContext, expected: u8) !void {
    const actual = self.peek() orelse return error.UnexpectedEof;
    if (actual != expected) return error.UnexpectedCharacter;
    _ = self.consumeNoEof();
}
/// Consumes `text` if the remaining input starts with it and reports
/// whether it did. Nothing is consumed otherwise.
fn eatStr(self: *ParseContext, text: []const u8) bool {
    if (self.expectStr(text)) |_| {
        return true;
    } else |_| {
        return false;
    }
}
/// Consumes `text`, which the remaining input must start with.
/// Returns `error.UnexpectedEof` when fewer than `text.len` bytes remain and
/// `error.UnexpectedCharacter` when the bytes differ; neither case consumes
/// anything.
fn expectStr(self: *ParseContext, text: []const u8) !void {
    if (self.offset + text.len > self.source.len) return error.UnexpectedEof;

    const rest = self.source[self.offset..];
    if (!std.mem.startsWith(u8, rest, text)) return error.UnexpectedCharacter;

    // Go through consumeNoEof byte by byte so line and column stay correct.
    var remaining = text.len;
    while (remaining > 0) : (remaining -= 1) {
        _ = self.consumeNoEof();
    }
}
/// Consumes a run of spaces, tabs, newlines and carriage returns and
/// reports whether at least one byte was consumed.
fn eatWs(self: *ParseContext) bool {
    const begin = self.offset;

    while (self.peek()) |ch| {
        if (ch != ' ' and ch != '\t' and ch != '\n' and ch != '\r') break;
        _ = self.consumeNoEof();
    }

    return self.offset != begin;
}
/// Consumes a run of whitespace, which must be at least one byte long;
/// otherwise returns `error.UnexpectedCharacter`.
fn expectWs(self: *ParseContext) !void {
    if (self.eatWs()) return;
    return error.UnexpectedCharacter;
}
/// Returns the source line containing the current offset, without its
/// terminating newline.
fn currentLine(self: ParseContext) []const u8 {
    // The line starts right after the last newline before the offset, or at
    // the start of the source when there is none.
    const consumed = self.source[0..self.offset];
    const begin = if (mem.lastIndexOf(u8, consumed, "\n")) |prev_nl| prev_nl + 1 else 0;

    // It ends at the next newline at or after the offset, or at the end of
    // the source.
    const end = mem.indexOfPos(u8, self.source, self.offset, "\n") orelse self.source.len;

    return self.source[begin..end];
}
};
pub fn parse(alloc: *Allocator, source: []const u8) !Document {
var arena = ArenaAllocator.init(alloc);
var parser = Parser {
.alloc = alloc,
.source = source,
.offset = 0
// Walks a ParseContext through a short string, checking after every
// operation where the cursor ended up, then checks behaviour on empty input.
test "ParseContext" {
{
var ctx = ParseContext.init("I like pythons");
// peek does not advance; consumeNoEof and consume do.
testing.expectEqual(@as(?u8, 'I'), ctx.peek());
testing.expectEqual(@as(u8, 'I'), ctx.consumeNoEof());
testing.expectEqual(@as(?u8, ' '), ctx.peek());
testing.expectEqual(@as(u8, ' '), try ctx.consume());
// eat/expect advance only when the byte matches.
testing.expect(ctx.eat('l'));
testing.expectEqual(@as(?u8, 'i'), ctx.peek());
testing.expectEqual(false, ctx.eat('a'));
testing.expectEqual(@as(?u8, 'i'), ctx.peek());
try ctx.expect('i');
testing.expectEqual(@as(?u8, 'k'), ctx.peek());
testing.expectError(error.UnexpectedCharacter, ctx.expect('a'));
testing.expectEqual(@as(?u8, 'k'), ctx.peek());
testing.expect(ctx.eatStr("ke"));
testing.expectEqual(@as(?u8, ' '), ctx.peek());
// eatWs reports whether any whitespace was consumed.
testing.expect(ctx.eatWs());
testing.expectEqual(@as(?u8, 'p'), ctx.peek());
testing.expectEqual(false, ctx.eatWs());
testing.expectEqual(@as(?u8, 'p'), ctx.peek());
// A string longer than the remaining input is an EOF error, a mismatch
// is a character error; neither moves the cursor.
testing.expectEqual(false, ctx.eatStr("aaaaaaaaa"));
testing.expectEqual(@as(?u8, 'p'), ctx.peek());
testing.expectError(error.UnexpectedEof, ctx.expectStr("aaaaaaaaa"));
testing.expectEqual(@as(?u8, 'p'), ctx.peek());
testing.expectError(error.UnexpectedCharacter, ctx.expectStr("pytn"));
testing.expectEqual(@as(?u8, 'p'), ctx.peek());
try ctx.expectStr("python");
testing.expectEqual(@as(?u8, 's'), ctx.peek());
}
{
// Empty input: everything reports end of input.
var ctx = ParseContext.init("");
testing.expectEqual(ctx.peek(), null);
testing.expectError(error.UnexpectedEof, ctx.consume());
testing.expectEqual(ctx.eat('p'), false);
testing.expectError(error.UnexpectedEof, ctx.expect('p'));
}
}
/// Explicit error set for the parser. `parseContent` declares it as its
/// return error set instead of relying on inference.
/// NOTE(review): presumably needed because `parseContent` and
/// `tryParseElement` call each other — confirm.
pub const ParseError = error {
IllegalCharacter,
UnexpectedEof,
UnexpectedCharacter,
UnclosedValue,
UnclosedComment,
InvalidName,
InvalidEntity,
InvalidStandaloneValue,
NonMatchingClosingTag,
InvalidDocument,
OutOfMemory
};
/// Parses `source` as an XML document.
///
/// On failure the offending source line is written with `std.debug.warn`,
/// followed by a caret line indented by the current column, and the error is
/// returned to the caller.
pub fn parse(backing_allocator: *Allocator, source: []const u8) !Document {
    var ctx = ParseContext.init(source);

    if (parseDocument(&ctx, backing_allocator)) |doc| {
        return doc;
    } else |err| {
        std.debug.warn("{}\n", .{ctx.currentLine()});

        // Pad with one space per column so the caret lands under the position
        // where parsing stopped.
        var remaining = ctx.column;
        while (remaining > 0) : (remaining -= 1) {
            std.debug.warn(" ", .{});
        }
        std.debug.warn("^\n", .{});

        return err;
    }
}
/// Parses a complete document: an optional XML declaration, exactly one root
/// element, and nothing but whitespace after it.
///
/// All nodes are allocated from an arena created on `backing_allocator` and
/// owned by the returned `Document`. Returns `error.InvalidDocument` when the
/// root element is missing or when non-whitespace input follows it; other
/// parse and allocation errors are propagated.
///
/// NOTE(review): the document is returned by value while parsing hands out
/// `&doc.arena.allocator`, a pointer into this function's local. If the
/// segmented lists keep that pointer, it no longer refers to the returned
/// copy — confirm before appending to a parsed element.
fn parseDocument(ctx: *ParseContext, backing_allocator: *Allocator) !Document {
    var doc = Document{
        .arena = ArenaAllocator.init(backing_allocator),
        .xml_decl = null,
        .root = undefined,
    };
    // Everything parsed so far lives in the arena. Without this, any error
    // below would return without releasing it.
    errdefer doc.arena.deinit();

    doc.xml_decl = try tryParseProlog(ctx, &doc.arena.allocator);
    _ = ctx.eatWs();
    doc.root = (try tryParseElement(ctx, &doc.arena.allocator)) orelse return error.InvalidDocument;
    _ = ctx.eatWs();

    // Only whitespace may follow the root element.
    if (ctx.peek() != null) return error.InvalidDocument;

    return doc;
}
/// Parses a quoted attribute value and returns an unescaped copy allocated
/// from `alloc`.
///
/// The value must open with `"` or `'` (`error.UnexpectedCharacter`
/// otherwise) and run up to the next occurrence of that same quote;
/// reaching end of input first is `error.UnclosedValue`.
fn parseAttrValue(ctx: *ParseContext, alloc: *Allocator) ![]const u8 {
    const quote = try ctx.consume();
    switch (quote) {
        '"', '\'' => {},
        else => return error.UnexpectedCharacter,
    }

    const begin = ctx.offset;
    // Skip forward until the matching quote has been consumed.
    while ((ctx.consume() catch return error.UnclosedValue) != quote) {}

    // The closing quote itself is not part of the value.
    const raw = ctx.source[begin .. ctx.offset - 1];
    return try dupeAndUnescape(alloc, raw);
}
/// Parses `=` followed by a quoted attribute value, allowing whitespace on
/// either side of the `=`, and returns the unescaped value.
fn parseEqAttrValue(ctx: *ParseContext, alloc: *Allocator) ![]const u8 {
    _ = ctx.eatWs();
    try ctx.expect('=');
    _ = ctx.eatWs();

    const value = try parseAttrValue(ctx, alloc);
    return value;
}
/// Parses a name and returns it as a slice into the source; nothing is copied.
///
/// The XML specification's rules for names are long, so this is deliberately
/// simplified: a name is any run of bytes that are neither whitespace nor one
/// of the special characters `& " ' < > ? = /`. An empty run is
/// `error.InvalidName`, in which case nothing has been consumed.
fn parseNameNoDupe(ctx: *ParseContext) ![]const u8 {
    const begin = ctx.offset;

    while (ctx.peek()) |ch| {
        const terminates = switch (ch) {
            ' ', '\t', '\n', '\r' => true,
            '&', '"', '\'', '<', '>', '?', '=', '/' => true,
            else => false,
        };
        if (terminates) break;
        _ = ctx.consumeNoEof();
    }

    if (ctx.offset == begin) return error.InvalidName;
    return ctx.source[begin..ctx.offset];
}
/// Parses character data up to the next `<` or `>` (or end of input) and
/// returns an unescaped copy allocated from `alloc`. Returns null when there
/// is no character data at the current position.
fn tryParseCharData(ctx: *ParseContext, alloc: *Allocator) !?[]const u8 {
    const begin = ctx.offset;

    while (ctx.peek()) |ch| {
        if (ch == '<' or ch == '>') break;
        _ = ctx.consumeNoEof();
    }

    const raw = ctx.source[begin..ctx.offset];
    if (raw.len == 0) return null;
    return try dupeAndUnescape(alloc, raw);
}
/// Parses one piece of element content. Character data is tried first, then
/// a comment, then a nested element; if none of them matches the result is
/// `error.UnexpectedCharacter`.
fn parseContent(ctx: *ParseContext, alloc: *Allocator) ParseError!Content {
    if (try tryParseCharData(ctx, alloc)) |cd| {
        return Content{ .CharData = cd };
    }
    if (try tryParseComment(ctx, alloc)) |comment| {
        return Content{ .Comment = comment };
    }
    if (try tryParseElement(ctx, alloc)) |elem| {
        return Content{ .Element = elem };
    }
    return error.UnexpectedCharacter;
}
/// Parses a `name = "value"` attribute and returns it allocated from `alloc`,
/// with its own copy of the name and the unescaped value.
///
/// Returns null, without consuming anything, when no name starts at the
/// current position. Once a name was read, a missing `=` or a malformed
/// value is an error.
fn tryParseAttr(ctx: *ParseContext, alloc: *Allocator) !?*Attribute {
    const name = parseNameNoDupe(ctx) catch return null;

    // Optional whitespace, `=`, optional whitespace, quoted value.
    const value = try parseEqAttrValue(ctx, alloc);

    const attr = try alloc.create(Attribute);
    attr.name = try mem.dupe(alloc, u8, name);
    attr.value = value;
    return attr;
}
/// Tries to parse an element, including its attributes and children, at the
/// current position.
///
/// Returns null, with `ctx` exactly as it was on entry, when the input does
/// not start with `<` followed by a name. Once the tag name has been read the
/// element is committed to and every later problem is an error:
/// `error.UnexpectedEof` when the input ends inside the element,
/// `error.NonMatchingClosingTag` when the closing name differs from the
/// opening one, plus whatever attribute and content parsing report.
/// The element, a copy of its tag name, its attributes and its children are
/// all allocated from `alloc`.
fn tryParseElement(ctx: *ParseContext, alloc: *Allocator) !?*Element {
    // Snapshot offset, line and column together. Restoring only the offset
    // would leave the column one past the `<`, and `parse` uses the column to
    // place its error caret.
    const start = ctx.*;
    if (!ctx.eat('<')) return null;
    const tag = parseNameNoDupe(ctx) catch {
        ctx.* = start;
        return null;
    };

    const element = try alloc.create(Element);
    element.* = Element.init(try std.mem.dupe(alloc, u8, tag), alloc);

    // Each attribute must be preceded by whitespace.
    while (ctx.eatWs()) {
        const attr = (try tryParseAttr(ctx, alloc)) orelse break;
        try element.attributes.push(attr);
    }

    // Self-closing element: no content, no closing tag.
    if (ctx.eatStr("/>")) {
        return element;
    }

    try ctx.expect('>');

    // Collect content until the start of the closing tag.
    while (true) {
        if (ctx.peek() == null) {
            return error.UnexpectedEof;
        } else if (ctx.eatStr("</")) {
            break;
        }

        const content = try parseContent(ctx, alloc);
        try element.children.push(.{.content = content});
    }

    const closing_tag = try parseNameNoDupe(ctx);
    if (!std.mem.eql(u8, tag, closing_tag)) {
        return error.NonMatchingClosingTag;
    }

    _ = ctx.eatWs();
    try ctx.expect('>');
    return element;
}
test "tryParseElement" {
{
// `<` not followed by a name: no element, and the cursor is back on `<`.
var ctx = ParseContext.init("<= a='b'/>");
testing.expectEqual(@as(?*Element, null), try tryParseElement(&ctx, std.debug.global_allocator));
testing.expectEqual(@as(?u8, '<'), ctx.peek());
}
{
// Self-closing element with attributes in both quote styles and with
// whitespace around `=`.
var ctx = ParseContext.init("<python size='15' color = \"green\"/>");
const elem = try tryParseElement(&ctx, std.debug.global_allocator);
testing.expectEqualSlices(u8, elem.?.tag, "python");
const size_attr = elem.?.attributes.at(0).*;
testing.expectEqualSlices(u8, size_attr.name, "size");
testing.expectEqualSlices(u8, size_attr.value, "15");
const color_attr = elem.?.attributes.at(1).*;
testing.expectEqualSlices(u8, color_attr.name, "color");
testing.expectEqualSlices(u8, color_attr.value, "green");
}
{
// Element with a single character-data child.
var ctx = ParseContext.init("<python>test</python>");
const elem = try tryParseElement(&ctx, std.debug.global_allocator);
testing.expectEqualSlices(u8, elem.?.tag, "python");
testing.expectEqualSlices(u8, elem.?.children.at(0).content.CharData, "test");
}
{
// Mixed content: character data, nested elements and a comment, in order.
var ctx = ParseContext.init("<a>b<c/>d<e/>f<!--g--></a>");
const elem = try tryParseElement(&ctx, std.debug.global_allocator);
testing.expectEqualSlices(u8, elem.?.tag, "a");
testing.expectEqualSlices(u8, elem.?.children.at(0).content.CharData, "b");
testing.expectEqualSlices(u8, elem.?.children.at(1).content.Element.tag, "c");
testing.expectEqualSlices(u8, elem.?.children.at(2).content.CharData, "d");
testing.expectEqualSlices(u8, elem.?.children.at(3).content.Element.tag, "e");
testing.expectEqualSlices(u8, elem.?.children.at(4).content.CharData, "f");
testing.expectEqualSlices(u8, elem.?.children.at(5).content.Comment, "g");
}
}
/// Tries to parse an `<?xml version=... ?>` declaration at the current
/// position.
///
/// Returns null, with `ctx` exactly as it was on entry, when the input does
/// not start with `<?` or when the name after `<?` is not `xml`. Otherwise
/// the declaration is committed to: `version` is mandatory, `encoding` and
/// `standalone` are optional and must appear in that order. A `standalone`
/// value other than "yes" or "no" is `error.InvalidStandaloneValue`.
/// The declaration and its strings are allocated from `alloc`.
fn tryParseProlog(ctx: *ParseContext, alloc: *Allocator) !?*XmlDecl {
    // Snapshot offset, line and column together, so that rewinding does not
    // leave the position counters pointing past text that was handed back.
    const start = ctx.*;
    if (!ctx.eatStr("<?") or !mem.eql(u8, try parseNameNoDupe(ctx), "xml")) {
        ctx.* = start;
        return null;
    }

    const decl = try alloc.create(XmlDecl);
    decl.encoding = null;
    decl.standalone = null;

    // Version info is mandatory
    try ctx.expectWs();
    try ctx.expectStr("version");
    decl.version = try parseEqAttrValue(ctx, alloc);

    if (ctx.eatWs()) {
        // Optional encoding and standalone info
        var require_ws = false;

        if (ctx.eatStr("encoding")) {
            decl.encoding = try parseEqAttrValue(ctx, alloc);
            require_ws = true;
        }

        // After `encoding`, whitespace is required before `standalone`.
        // Without `encoding` the whitespace was already eaten above, so
        // eatWs() must report false here for the two sides to agree.
        if (require_ws == ctx.eatWs() and ctx.eatStr("standalone")) {
            const standalone = try parseEqAttrValue(ctx, alloc);
            if (std.mem.eql(u8, standalone, "yes")) {
                decl.standalone = true;
            } else if (std.mem.eql(u8, standalone, "no")) {
                decl.standalone = false;
            } else {
                return error.InvalidStandaloneValue;
            }
        }

        _ = ctx.eatWs();
    }

    try ctx.expectStr("?>");
    return decl;
}
test "tryParseProlog" {
{
// A processing instruction whose name is not exactly `xml` is not a
// declaration; the cursor is back on `<`.
var ctx = ParseContext.init("<?xmla version='aa'?>");
testing.expectEqual(@as(?*XmlDecl, null), try tryParseProlog(&ctx, std.debug.global_allocator));
testing.expectEqual(@as(?u8, '<'), ctx.peek());
}
{
// Version only: the optional fields stay null.
var ctx = ParseContext.init("<?xml version='aa'?>");
const decl = try tryParseProlog(&ctx, std.debug.global_allocator);
testing.expectEqualSlices(u8, "aa", decl.?.version);
testing.expectEqual(@as(?[]const u8, null), decl.?.encoding);
testing.expectEqual(@as(?bool, null), decl.?.standalone);
}
{
// All three fields, with mixed quotes and whitespace around `=`.
var ctx = ParseContext.init("<?xml version=\"aa\" encoding = 'bbb' standalone \t = 'yes'?>");
const decl = try tryParseProlog(&ctx, std.debug.global_allocator);
testing.expectEqualSlices(u8, "aa", decl.?.version);
testing.expectEqualSlices(u8, "bbb", decl.?.encoding.?);
testing.expectEqual(@as(?bool, true), decl.?.standalone.?);
}
}
/// Tries to parse a `<!-- ... -->` comment and returns a copy of its body
/// allocated from `alloc`. Returns null, without consuming anything, when
/// the input does not start with `<!--`; reaching end of input before `-->`
/// is `error.UnclosedComment`.
fn tryParseComment(ctx: *ParseContext, alloc: *Allocator) !?[]const u8 {
    const terminator = "-->";

    if (!ctx.eatStr("<!--")) return null;
    const begin = ctx.offset;

    while (true) {
        if (ctx.eatStr(terminator)) break;
        _ = ctx.consume() catch return error.UnclosedComment;
    }

    // The terminator has been consumed but is not part of the body.
    const body = ctx.source[begin .. ctx.offset - terminator.len];
    return try mem.dupe(alloc, u8, body);
}
/// Maps one of the five predefined XML entities, given in full including the
/// leading `&` and trailing `;`, to the character it stands for.
/// Any other text is `error.InvalidEntity`.
fn unescapeEntity(text: []const u8) !u8 {
    if (std.mem.eql(u8, text, "&lt;")) return '<';
    if (std.mem.eql(u8, text, "&gt;")) return '>';
    if (std.mem.eql(u8, text, "&amp;")) return '&';
    if (std.mem.eql(u8, text, "&apos;")) return '\'';
    if (std.mem.eql(u8, text, "&quot;")) return '"';
    return error.InvalidEntity;
}
/// Returns a copy of `text`, allocated from `alloc`, in which every
/// predefined entity (`&lt;`, `&gt;`, `&amp;`, `&apos;`, `&quot;`) is
/// replaced by the character it stands for.
///
/// Returns `error.InvalidEntity` when an `&` is not followed by a `;` or when
/// the text between them is not one of the predefined entities. In that case
/// the partially filled buffer is released before returning.
fn dupeAndUnescape(alloc: *Allocator, text: []const u8) ![]const u8 {
    // Every entity is longer than its replacement, so the output never needs
    // more room than the input.
    const str = try alloc.alloc(u8, text.len);
    // Without this the buffer would be lost when an invalid entity aborts
    // the copy below.
    errdefer alloc.free(str);

    var j: usize = 0; // write position in `str`
    var i: usize = 0; // read position in `text`
    while (i < text.len) : (j += 1) {
        if (text[i] == '&') {
            // The entity runs up to and including the next `;`.
            const entity_end = 1 + (mem.indexOfPos(u8, text, i, ";") orelse return error.InvalidEntity);
            str[j] = try unescapeEntity(text[i .. entity_end]);
            i = entity_end;
        } else {
            str[j] = text[i];
            i += 1;
        }
    }

    // Give back the bytes saved by collapsing entities.
    return alloc.shrink(str, j);
}
test "dupeAndUnescape" {
// Text without entities is copied unchanged; all five predefined entities
// are replaced.
testing.expectEqualSlices(u8, "test", try dupeAndUnescape(std.debug.global_allocator, "test"));
testing.expectEqualSlices(u8, "a<b&c>d\"e'f<", try dupeAndUnescape(std.debug.global_allocator, "a&lt;b&amp;c&gt;d&quot;e&apos;f&lt;"));
// An unterminated `&` and an unknown entity name are both rejected.
testing.expectError(error.InvalidEntity, dupeAndUnescape(std.debug.global_allocator, "python&"));
testing.expectError(error.InvalidEntity, dupeAndUnescape(std.debug.global_allocator, "python&&"));
testing.expectError(error.InvalidEntity, dupeAndUnescape(std.debug.global_allocator, "python&test;"));
testing.expectError(error.InvalidEntity, dupeAndUnescape(std.debug.global_allocator, "python&boa"));
}