Skip to content

Commit

Permalink
Add string parsing + fuzzed string test set
Browse files Browse the repository at this point in the history
I followed mostly the same procedure outlined here: https://www.ryanliptak.com/blog/fuzzing-as-test-case-generator/
but used a combination of Zua and fuzzing-lua to ultimately create the sets of inputs/outputs.

- First, a giant corpus (16k+) of fuzzed string literals was created by iterating through all the fuzzed lexer inputs and outputting the source of every <string> token to a separate file (see test/fuzz_strings_gen.zig). This step would be difficult to do with Lua's API because strings are parsed as they are lexed, meaning any relationship to the original source is lost once the token is parsed.
- Then, I used libFuzzer and fuzzing-lua to minimize the string corpus (via the -merge=1 flag).
- Then, I used Lua to generate corresponding output files containing the parsed version of each input string (this code will be committed to fuzzing-lua once I clean it up).

Kind of convoluted, but it ended up working well--there were a lot of bugs in my initial string parsing implementation that the fuzzed set allowed me to find.
  • Loading branch information
squeek502 committed Jan 8, 2020
1 parent 0735e62 commit 5de41fd
Show file tree
Hide file tree
Showing 176 changed files with 503 additions and 0 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ Goals, in order of priority:
+ [x] Improve tests, perhaps use fuzz testing
- See [Fuzzing As a Test Case Generator](https://www.ryanliptak.com/blog/fuzzing-as-test-case-generator/) and [squeek502/fuzzing-lua](https://github.com/squeek502/fuzzing-lua/)
+ [ ] Cleanup implementation
- [x] String parsing (in Lua this was done at lex-time) -> [parse.zig](src/parse.zig)
- [ ] Number parsing (in Lua this was done at lex-time)
- [ ] Parser (lparser.c/.h)
- [ ] ...

Expand Down
24 changes: 24 additions & 0 deletions build.zig
Original file line number Diff line number Diff line change
Expand Up @@ -38,4 +38,28 @@ pub fn build(b: *Builder) void {
bench_lex_tests.addPackagePath("zua", "src/zua.zig");
const bench_lex_test_step = b.step("bench_lex", "Bench lexer against a fuzzed corpus from fuzzing-lua");
bench_lex_test_step.dependOn(&bench_lex_tests.step);

const fuzz_strings_inputs_dir_default = "test/strings/inputs";
const fuzz_strings_outputs_dir_default = "test/strings/outputs";
const fuzz_strings_gen_dir_default = "test/strings/generated";
const fuzz_strings_inputs_dir = b.option([]const u8, "fuzz_strings_inputs_dir", "Directory with input strings for string parsing tests") orelse fuzz_strings_inputs_dir_default;
const fuzz_strings_outputs_dir = b.option([]const u8, "fuzz_strings_outputs_dir", "Directory with output strings for string parsing tests") orelse fuzz_strings_outputs_dir_default;
const fuzz_strings_gen_dir = b.option([]const u8, "fuzz_strings_gen_dir", "Directory to output generated string inputs to") orelse fuzz_strings_gen_dir_default;

var fuzz_strings = b.addTest("test/fuzz_strings.zig");
fuzz_strings.setBuildMode(mode);
fuzz_strings.addBuildOption([]const u8, "fuzz_strings_inputs_dir", b.fmt("\"{}\"", .{fuzz_strings_inputs_dir}));
fuzz_strings.addBuildOption([]const u8, "fuzz_strings_outputs_dir", b.fmt("\"{}\"", .{fuzz_strings_outputs_dir}));
fuzz_strings.addPackagePath("zua", "src/zua.zig");
const fuzz_strings_step = b.step("fuzz_strings", "Test string parsing against a fuzzed corpus from fuzzing-lua");
fuzz_strings_step.dependOn(&fuzz_strings.step);

var fuzz_strings_gen = b.addExecutable("fuzz_strings_gen", "test/fuzz_strings_gen.zig");
fuzz_strings_gen.setBuildMode(mode);
fuzz_strings_gen.addBuildOption([]const u8, "fuzz_lex_inputs_dir", b.fmt("\"{}\"", .{fuzz_lex_inputs_dir}));
fuzz_strings_gen.addBuildOption([]const u8, "fuzz_strings_gen_dir", b.fmt("\"{}\"", .{fuzz_strings_gen_dir}));
fuzz_strings_gen.addPackagePath("zua", "src/zua.zig");

const fuzz_strings_gen_run_step = b.step("fuzz_strings_gen_run", "Generate string inputs from a fuzzed corpus of lexer inputs");
fuzz_strings_gen_run_step.dependOn(&fuzz_strings_gen.run().step);
}
175 changes: 175 additions & 0 deletions src/parse.zig
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
const std = @import("std");
const lex = @import("lex.zig");

// Notes:
//
// The Lua parser always parses into a function (called the 'main' function), which
// is always vararg (the values in the varargs differ depending on the Lua version)

pub const Parser = struct {
    /// Minimal byte sink over a caller-provided slice.
    /// Like std.io.SliceOutStream, but it can only append one byte at a time,
    /// never reports running out of space (callers guarantee the capacity up
    /// front), and does not implement Stream.
    const SliceWriter = struct {
        const Self = @This();

        pos: usize = 0,
        slice: []u8,

        /// Appends `char` at the current position and advances by one.
        fn write(self: *Self, char: u8) void {
            self.slice[self.pos] = char;
            self.pos += 1;
        }

        /// Returns the prefix of the slice that has been written so far.
        fn getWritten(self: Self) []u8 {
            return self.slice[0..self.pos];
        }
    };

    /// Converts the source text of a string literal token (including its
    /// delimiters) into the string value it denotes, writing the result into
    /// `dest_buf` and returning the written prefix of `dest_buf`.
    ///
    /// Two literal forms are handled:
    ///  - short strings ('...' or "..."), where backslash escapes are decoded
    ///  - long bracket strings ([[...]], [=[...]=], ...), which are taken
    ///    verbatim apart from newline normalization
    ///
    /// Because the lexer has already validated that strings don't contain
    /// any invalid characters, this function can be implemented without
    /// the possibility of failure. Any failures are a bug in the lexer.
    ///
    /// dest_buf must be at least as big as source_raw to ensure it is large
    /// enough to hold the parsed string (the parsed form is never longer than
    /// the source).
    /// TODO: should this function be part of lex.Token instead?
    pub fn parseStringLiteral(source_raw: []const u8, dest_buf: []u8) []u8 {
        std.debug.assert(dest_buf.len >= source_raw.len);

        // Work out how many bytes each delimiter occupies so that both the
        // opening and the closing delimiter can be trimmed.
        var delim_len: usize = 1;
        var is_long_string = false;
        switch (source_raw[0]) {
            '\'', '"' => {},
            '[' => {
                // long bracket: '[' followed by zero or more '=' and another '['
                var num_sep: usize = 0;
                while (source_raw[1 + num_sep] == '=') : (num_sep += 1) {}
                std.debug.assert(source_raw[1 + num_sep] == '[');
                delim_len = 2 + num_sep;
                is_long_string = true;
            },
            else => unreachable,
        }
        const content = source_raw[delim_len .. source_raw.len - delim_len];

        return if (is_long_string)
            parseLongString(content, dest_buf)
        else
            parseShortString(content, dest_buf);
    }

    /// Parses the contents (delimiters already removed) of a long bracket
    /// string. Long strings do not interpret escape sequences, so a backslash
    /// is an ordinary byte here. The only transformations are:
    ///  - a newline sequence directly after the opening bracket is dropped
    ///  - every other newline sequence (\r, \n, \r\n or \n\r) becomes one \n
    fn parseLongString(content: []const u8, dest_buf: []u8) []u8 {
        var writer = SliceWriter{ .slice = dest_buf };
        var index: usize = 0;

        // if the first line of a long string is a newline, it gets skipped
        if (content.len > 0 and isNewline(content[0])) {
            index = newlineLen(content, 0);
        }

        while (index < content.len) {
            const c = content[index];
            if (isNewline(c)) {
                writer.write('\n');
                index += newlineLen(content, index);
            } else {
                writer.write(c);
                index += 1;
            }
        }
        return writer.getWritten();
    }

    /// Parses the contents (quotes already removed) of a short string,
    /// decoding backslash escapes:
    ///  - \a \b \f \n \r \t \v map to their control characters
    ///  - \ddd (one to three decimal digits) maps to the byte with that value
    ///  - backslash followed by a newline sequence maps to a single \n
    ///  - backslash followed by any other byte maps to that byte
    fn parseShortString(content: []const u8, dest_buf: []u8) []u8 {
        var writer = SliceWriter{ .slice = dest_buf };
        var index: usize = 0;
        while (index < content.len) {
            const c = content[index];
            index += 1;
            switch (c) {
                // Lua's string parser transforms all \r to \n
                '\r' => writer.write('\n'),
                '\\' => {
                    // A short string can't end in a lone backslash, since that
                    // backslash would have escaped the closing quote.
                    if (index >= content.len) unreachable;
                    const escaped = content[index];
                    index += 1;
                    switch (escaped) {
                        '0'...'9' => {
                            // Greedily take up to three digits in total. The
                            // lexer has already rejected values above 255, so
                            // the accumulator can't overflow on valid input.
                            var value: u8 = escaped - '0';
                            var num_digits: usize = 1;
                            while (num_digits < 3 and index < content.len) : (num_digits += 1) {
                                const digit = content[index];
                                if (digit < '0' or digit > '9') break;
                                value = 10 * value + (digit - '0');
                                index += 1;
                            }
                            writer.write(value);
                        },
                        '\r', '\n' => {
                            // escaped \r and \n get transformed to \n, and the
                            // second half of a \r\n or \n\r pair is swallowed
                            writer.write('\n');
                            index += newlineLen(content, index - 1) - 1;
                        },
                        'a' => writer.write('\x07'),
                        'b' => writer.write('\x08'),
                        'f' => writer.write('\x0C'),
                        'n' => writer.write('\n'),
                        'r' => writer.write('\r'),
                        't' => writer.write('\t'),
                        'v' => writer.write('\x0B'),
                        else => writer.write(escaped),
                    }
                },
                else => writer.write(c),
            }
        }
        return writer.getWritten();
    }

    /// Returns true if `c` is one of the two newline bytes Lua recognizes.
    fn isNewline(c: u8) bool {
        return c == '\r' or c == '\n';
    }

    /// Given that bytes[index] is a newline byte, returns the length of the
    /// newline sequence starting there: 2 if it is immediately followed by
    /// the *other* newline byte (\r\n or \n\r), otherwise 1. Two identical
    /// newline bytes in a row are two separate newlines.
    fn newlineLen(bytes: []const u8, index: usize) usize {
        const following = index + 1;
        if (following < bytes.len and isNewline(bytes[following]) and bytes[following] != bytes[index]) {
            return 2;
        }
        return 1;
    }
};

test "parseStringLiteral" {
    // Each case pairs the raw source of a string literal token with the
    // string it is expected to parse to.
    const Case = struct {
        source: []const u8,
        expected: []const u8,
    };
    const cases = [_]Case{
        // every delimiter style yields the same contents
        Case{ .source = "'hello'", .expected = "hello" },
        Case{ .source = "\"hello\"", .expected = "hello" },
        Case{ .source = "[[hello]]", .expected = "hello" },
        Case{ .source = "[=[hello]=]", .expected = "hello" },
        Case{ .source = "[===[hello]===]", .expected = "hello" },
        // simple backslash escapes
        Case{ .source = "'\\\\ \\n \\v'", .expected = "\\ \n \x0B" },

        // long strings skip initial newline
        Case{ .source = "[[\nhello]]", .expected = "hello" },
        Case{ .source = "[[\r\rhello]]", .expected = "\nhello" },

        // escaped \r gets transformed into \n
        Case{ .source = "\"\\\r\"", .expected = "\n" },

        // escaped newlines and newline pairs
        Case{ .source = "\"\\\r\\\\ \"", .expected = "\n\\ " },
        Case{ .source = "\"\\\r\n\\\\ \"", .expected = "\n\\ " },
        Case{ .source = "\"\\\n\r\"", .expected = "\n" },

        // escaped numerals
        Case{ .source = "\"\\1-\\2\"", .expected = "\x01-\x02" },
    };

    // one scratch buffer, comfortably larger than any source above, is reused
    var scratch: [100]u8 = undefined;
    for (cases) |case| {
        const parsed = Parser.parseStringLiteral(case.source, scratch[0..]);
        std.testing.expectEqualSlices(u8, case.expected, parsed);
    }
}
2 changes: 2 additions & 0 deletions src/zua.zig
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
const std = @import("std");

pub const lex = @import("lex.zig");
pub const parse = @import("parse.zig");

/// Executable entry point. Currently a placeholder that does nothing.
pub fn main() void {}

// Umbrella test: references lex.zig and parse.zig from inside a test block,
// presumably so that their own `test` blocks are picked up when this file is
// the root of a test build.
test "zua" {
_ = @import("lex.zig");
_ = @import("parse.zig");
}
72 changes: 72 additions & 0 deletions test/fuzz_strings.zig
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
const std = @import("std");
const zua = @import("zua");
const lex = zua.lex;
const parse = zua.parse;

// Tests for comparing parsed strings between Zua and Lua.
// Expects @import("build_options").fuzz_strings_inputs_dir to be a path to
// a directory containing a corpus of inputs to test and
// @import("build_options").fuzz_strings_outputs_dir to be a path to a
// directory containing the corresponding expected string after
// parsing.
//
// A usable inputs/outputs pair can be obtained from
// https://github.com/squeek502/fuzzing-lua

const verboseTestPrinting = false;

const build_options = @import("build_options");
const inputs_dir_opt = build_options.fuzz_strings_inputs_dir;
const outputs_dir_opt = build_options.fuzz_strings_outputs_dir;

// Walks every file in the inputs directory, lexes it, parses each string
// token found, and compares the parsed bytes against the contents of the
// file with the same basename in the outputs directory.
test "string input/output pairs" {
    // everything allocated below lives in one arena that is released on exit
    var arena_allocator = std.heap.ArenaAllocator.init(std.heap.page_allocator);
    defer arena_allocator.deinit();
    const allocator = &arena_allocator.allocator;

    // resolve these now since Zig's std lib on Windows rejects paths with / as the path sep
    const inputs_dir = try std.fs.path.resolve(allocator, &[_][]const u8{inputs_dir_opt});
    const outputs_dir = try std.fs.path.resolve(allocator, &[_][]const u8{outputs_dir_opt});

    var walker = try std.fs.walkPath(allocator, inputs_dir);
    defer walker.deinit();
    // holds "<outputs_dir><sep><basename>"; only the part after outputs_dir
    // is rewritten for each entry
    var path_buffer = try std.Buffer.init(allocator, outputs_dir);
    defer path_buffer.deinit();

    var n: usize = 0;
    while (try walker.next()) |entry| {
        if (verboseTestPrinting) {
            std.debug.warn("\n{}\n", .{entry.basename});
        }
        const contents = try entry.dir.readFileAlloc(allocator, entry.basename, std.math.maxInt(usize));
        defer allocator.free(contents);

        // the expected output lives under the same basename in outputs_dir
        path_buffer.shrink(outputs_dir.len);
        try path_buffer.appendByte(std.fs.path.sep);
        try path_buffer.append(entry.basename);
        const expectedContents = try std.io.readFileAlloc(allocator, path_buffer.toSliceConst());
        defer allocator.free(expectedContents);

        var lexer = lex.DefaultLexer.init(contents);
        while (true) {
            // a lexer error ends processing of this input without failing the test
            const token = lexer.next() catch break;
            if (token.id == lex.Token.Id.Eof) break;
            if (token.id != lex.Token.Id.String) continue;

            const string_source = contents[token.start..token.end];
            // the parsed string is never longer than its source
            const buf = try allocator.alloc(u8, string_source.len);
            defer allocator.free(buf);
            const parsed = parse.Parser.parseStringLiteral(string_source, buf);
            if (verboseTestPrinting) {
                std.debug.warn("got\n{x}\n", .{parsed});
                std.debug.warn("expected\n{x}\n", .{expectedContents});
            }
            std.testing.expectEqualSlices(u8, expectedContents, parsed);
        }
        // counts input files visited, including any whose lexing ended early
        n += 1;
    }
    std.debug.warn("{} input/output pairs checked...", .{n});
}
65 changes: 65 additions & 0 deletions test/fuzz_strings_gen.zig
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
const std = @import("std");
const lex = @import("zua").lex;

// Code for generating a potentially huge collection of
// files containing the source of every string literal token
// in the corpus provided in @import("build_options").fuzz_lex_inputs_dir
// and outputting them to @import("build_options").fuzz_strings_gen_dir
//
// This is a building block for use later with fuzz_strings.zig,
// after minimizing/generating outputs with https://github.com/squeek502/fuzzing-lua

const build_options = @import("build_options");
const inputs_dir_opt = build_options.fuzz_lex_inputs_dir;
const outputs_dir_opt = build_options.fuzz_strings_gen_dir;

/// Lexes every file in the lexer inputs directory and writes the source text
/// of each string token to its own file in the outputs directory. Output
/// files are named with a running counter (0, 1, 2, ...). The outputs
/// directory is deleted and recreated first, so earlier results are discarded.
pub fn main() !void {
    // everything allocated below lives in one arena that is released on exit
    var arena_allocator = std.heap.ArenaAllocator.init(std.heap.page_allocator);
    defer arena_allocator.deinit();
    const allocator = &arena_allocator.allocator;

    // resolve these now since Zig's std lib on Windows rejects paths with / as the path sep
    const inputs_dir = try std.fs.path.resolve(allocator, &[_][]const u8{inputs_dir_opt});
    const outputs_dir = try std.fs.path.resolve(allocator, &[_][]const u8{outputs_dir_opt});

    // clean the outputs dir (a missing directory is fine, anything else is fatal)
    std.fs.deleteTree(outputs_dir) catch |err| switch (err) {
        error.FileNotFound => {},
        else => |e| return e,
    };
    try std.fs.makePath(allocator, outputs_dir);

    var walker = try std.fs.walkPath(allocator, inputs_dir);
    defer walker.deinit();
    // holds "<outputs_dir><sep><n>"; only the part after outputs_dir is
    // rewritten for each string token
    var path_buffer = try std.Buffer.init(allocator, outputs_dir);
    defer path_buffer.deinit();

    var n: usize = 0;
    while (try walker.next()) |entry| {
        const contents = try entry.dir.readFileAlloc(allocator, entry.basename, std.math.maxInt(usize));
        defer allocator.free(contents);

        var lexer = lex.DefaultLexer.init(contents);
        while (true) {
            // a lexer error stops processing of this input; string tokens
            // found before the error have already been written
            const token = lexer.next() catch break;
            if (token.id == lex.Token.Id.Eof) break;
            if (token.id != lex.Token.Id.String) continue;

            // build the output path from the running counter
            path_buffer.shrink(outputs_dir.len);
            try path_buffer.appendByte(std.fs.path.sep);
            var buffer_out_stream = std.io.BufferOutStream.init(&path_buffer);
            try buffer_out_stream.stream.print("{}", .{n});

            try std.io.writeFile(path_buffer.toSliceConst(), contents[token.start..token.end]);

            n += 1;
            // progress indicator, rewritten in place every 100 files
            if (n % 100 == 0) {
                std.debug.warn("{}...\r", .{n});
            }
        }
    }
    std.debug.warn("{} files written to '{}'\n", .{ n, outputs_dir });
}
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
[[[�]Hk*[�]]
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"\\\tr\tr\tr\t"
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"Ea*G"
6 changes: 6 additions & 0 deletions test/strings/inputs/0cb91f63eb37118dc0b172a3ef62967b41b8ce03
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
[[




]]
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"\1-\2"
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
[[e�]]
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"\""
Binary file not shown.
Loading

0 comments on commit 5de41fd

Please sign in to comment.