forked from mirror/vulkan-zig
Replace UnescapeIterator with XmlUtf8Parser
This commit is contained in:
@@ -1,121 +1,123 @@
|
|||||||
const std = @import("std");
|
const std = @import("std");
|
||||||
const unicode = std.unicode;
|
const unicode = std.unicode;
|
||||||
const testing = std.testing;
|
const testing = std.testing;
|
||||||
const InStream = std.io.InStream;
|
|
||||||
|
|
||||||
pub const Text = struct {
|
const Entity = struct {
|
||||||
pub raw: []const u8,
|
substitute: []const u8,
|
||||||
|
char: u8
|
||||||
pub fn iter_unescaped(self: *Text) UnescapeIterator {
|
|
||||||
return UnescapeIterator{
|
|
||||||
.raw = self.raw,
|
|
||||||
.i = 0
|
|
||||||
};
|
|
||||||
}
|
|
||||||
};
|
};
|
||||||
|
|
||||||
pub const UnescapeIterator = struct{
|
const entities = [_]Entity{
|
||||||
raw: []const u8,
|
Entity{.substitute = "amp;", .char = '&'},
|
||||||
i: usize,
|
Entity{.substitute = "lt;", .char = '<'},
|
||||||
|
Entity{.substitute = "gt;", .char = '>'},
|
||||||
|
Entity{.substitute = "quot;", .char = '"'},
|
||||||
|
Entity{.substitute = "apos;", .char = '\''},
|
||||||
|
};
|
||||||
|
|
||||||
pub fn init(raw: []const u8) UnescapeIterator {
|
fn XmlUtf8Parser(comptime ReadError: type) type {
|
||||||
return UnescapeIterator {
|
return struct {
|
||||||
.raw = raw,
|
pub const Self = @This();
|
||||||
.i = 0
|
pub const Stream = std.io.InStream(ReadError);
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
fn nextCodepoint(self: *UnescapeIterator) !?u32 {
|
in: *Stream,
|
||||||
if (self.i >= self.raw.len) {
|
current: ?u32,
|
||||||
return null;
|
|
||||||
|
pub fn init(in: *Stream) Self {
|
||||||
|
return Self{
|
||||||
|
.in = in,
|
||||||
|
.current = null
|
||||||
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
const cp_len = try unicode.utf8ByteSequenceLength(self.raw[self.i]);
|
pub fn consume(self: *Self) void {
|
||||||
if (cp_len + self.i > self.raw.len) {
|
self.current = null;
|
||||||
return error.InvalidUtf8;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
const cp = try unicode.utf8Decode(self.raw[self.i .. self.i + cp_len]);
|
pub fn peek(self: *Self) !?u32 {
|
||||||
self.i += cp_len;
|
if (self.current) |cp| {
|
||||||
return cp;
|
return cp;
|
||||||
}
|
|
||||||
|
|
||||||
fn unescape(self: *UnescapeIterator) !u32 {
|
|
||||||
var entity: [5]u8 = undefined;
|
|
||||||
|
|
||||||
var offset: usize = 0;
|
|
||||||
while (try self.nextCodepoint()) |cp| {
|
|
||||||
entity[offset] = std.math.cast(u8, cp) catch return error.InvalidEntity;
|
|
||||||
if (cp == ';') {
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
offset += 1;
|
const cp = (try self.nextCodepoint()) orelse return null;
|
||||||
if (offset == entity.len) {
|
self.current = if (cp == '&') try self.nextEntity() else cp;
|
||||||
|
return self.current.?;
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn consumeAndPeek(self: *Self) !?u32 {
|
||||||
|
self.consume();
|
||||||
|
return try self.peek();
|
||||||
|
}
|
||||||
|
|
||||||
|
fn nextEntity(self: *Self) !u32 {
|
||||||
|
var entity = [_]u8{0} ** 5;
|
||||||
|
|
||||||
|
for (entity) |*c| {
|
||||||
|
const cp = (try self.nextCodepoint()) orelse return error.InvalidEntity;
|
||||||
|
c.* = std.math.cast(u8, cp) catch return error.InvalidEntity;
|
||||||
|
|
||||||
|
if (cp == ';') {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
return error.InvalidEntity;
|
return error.InvalidEntity;
|
||||||
}
|
}
|
||||||
} else {
|
|
||||||
|
for (entities) |*e| {
|
||||||
|
if (std.mem.startsWith(u8, entity, e.substitute)) {
|
||||||
|
return e.char;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return error.InvalidEntity;
|
return error.InvalidEntity;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (std.mem.startsWith(u8, entity, "lt;")) {
|
fn nextCodepoint(self: *Self) !?u32 {
|
||||||
return '<';
|
var cp: [4]u8 = undefined;
|
||||||
} else if (std.mem.startsWith(u8, entity, "gt;")) {
|
if ((try self.in.readFull(cp[0 .. 1])) != 1) {
|
||||||
return '>';
|
return null;
|
||||||
} else if (std.mem.startsWith(u8, entity, "amp;")) {
|
}
|
||||||
return '&';
|
|
||||||
} else if (std.mem.startsWith(u8, entity, "apos;")) {
|
const cp_len = try unicode.utf8ByteSequenceLength(cp[0]);
|
||||||
return '\'';
|
try self.in.readNoEof(cp[1 .. cp_len]);
|
||||||
} else if (std.mem.startsWith(u8, entity, "quot;")) {
|
return try unicode.utf8Decode(cp[0 .. cp_len]);
|
||||||
return '"';
|
|
||||||
} else {
|
|
||||||
return error.InvalidEntity;
|
|
||||||
}
|
}
|
||||||
}
|
};
|
||||||
|
}
|
||||||
|
|
||||||
pub fn next(self: *UnescapeIterator) !?u32 {
|
fn testXmlUtf8Parser(text: []const u8, expected: []const u8) !void {
|
||||||
const cp = (try self.nextCodepoint()) orelse return null;
|
var slice_in = std.io.SliceInStream.init(text);
|
||||||
|
var p = XmlUtf8Parser(std.io.SliceInStream.Error).init(&slice_in.stream);
|
||||||
|
|
||||||
if (cp == '&') {
|
|
||||||
return try self.unescape();
|
|
||||||
} else {
|
|
||||||
return cp;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
fn testUnescapeIterator(text: []const u8, expected: []const u8) !void {
|
|
||||||
var it = UnescapeIterator.init(text);
|
|
||||||
var i: usize = 0;
|
var i: usize = 0;
|
||||||
defer testing.expect(i == expected.len);
|
defer testing.expect(i == expected.len);
|
||||||
|
|
||||||
while (try it.next()) |cp| {
|
while (try p.consumeAndPeek()) |cp| {
|
||||||
testing.expect(cp == expected[i]);
|
testing.expect(cp == expected[i]);
|
||||||
i += 1;
|
i += 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
testing.expect((try p.consumeAndPeek()) == null);
|
||||||
}
|
}
|
||||||
|
|
||||||
test "unescape iterator" {
|
test "XmlUtf8Parser" {
|
||||||
try testUnescapeIterator("simpleabc", "simpleabc");
|
try testXmlUtf8Parser("simpleabc", "simpleabc");
|
||||||
try testUnescapeIterator("a<b&c>d"e'f<", "a<b&c>d\"e'f<");
|
try testXmlUtf8Parser("a<b&c>d"e'f<", "a<b&c>d\"e'f<");
|
||||||
testing.expectError(error.InvalidEntity, testUnescapeIterator("oof&&", "oof"));
|
testing.expectError(error.InvalidEntity, testXmlUtf8Parser("python&", "python"));
|
||||||
testing.expectError(error.InvalidEntity, testUnescapeIterator("oof&test;", "oof"));
|
testing.expectError(error.InvalidEntity, testXmlUtf8Parser("python&&", "python"));
|
||||||
testing.expectError(error.InvalidEntity, testUnescapeIterator("oof&pythons", "oof"));
|
testing.expectError(error.InvalidEntity, testXmlUtf8Parser("python&test;", "python"));
|
||||||
|
testing.expectError(error.InvalidEntity, testXmlUtf8Parser("python&boa", "python"));
|
||||||
|
|
||||||
|
var slice_in = std.io.SliceInStream.init("test");
|
||||||
|
var p = XmlUtf8Parser(std.io.SliceInStream.Error).init(&slice_in.stream);
|
||||||
|
|
||||||
|
testing.expect((try p.peek()).? == 't');
|
||||||
|
testing.expect((try p.peek()).? == 't');
|
||||||
|
p.consume();
|
||||||
|
testing.expect((try p.peek()).? == 'e');
|
||||||
|
testing.expect((try p.peek()).? == 'e');
|
||||||
|
testing.expect((try p.consumeAndPeek()).? == 's');
|
||||||
|
testing.expect((try p.consumeAndPeek()).? == 't');
|
||||||
|
testing.expect((try p.consumeAndPeek()) == null);
|
||||||
|
testing.expect((try p.peek()) == null);
|
||||||
}
|
}
|
||||||
|
|
||||||
pub const Event = union(enum) {
|
|
||||||
open_tag,
|
|
||||||
close_tag,
|
|
||||||
attribute,
|
|
||||||
text,
|
|
||||||
};
|
|
||||||
|
|
||||||
pub const Parser = struct {
|
|
||||||
source: []const u8,
|
|
||||||
|
|
||||||
pub fn init(source: []const u8) Parser {
|
|
||||||
return Parser{
|
|
||||||
.source = source
|
|
||||||
};
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|||||||
Reference in New Issue
Block a user