From 69ff89fd1207bc95adf7a349319973c6070ce540 Mon Sep 17 00:00:00 2001
From: Andrew Kelley
Date: Mon, 25 May 2020 15:12:23 -0400
Subject: [PATCH] stage2 parser: heuristics to pre-allocate token arrays

throughput: 72.2 MiB/s => 75.3 MiB/s

I also tried the idea from the deleted comment in this commit and it
made the throughput worse.
---
 lib/std/zig/parse.zig | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/lib/std/zig/parse.zig b/lib/std/zig/parse.zig
index 0cb3103b2260..612f2b54ad56 100644
--- a/lib/std/zig/parse.zig
+++ b/lib/std/zig/parse.zig
@@ -14,13 +14,16 @@ pub const Error = error{ParseError} || Allocator.Error;
 /// Result should be freed with tree.deinit() when there are
 /// no more references to any of the tokens or nodes.
 pub fn parse(gpa: *Allocator, source: []const u8) Allocator.Error!*Tree {
-    // TODO optimization idea: ensureCapacity on the tokens list and
-    // then appendAssumeCapacity inside the loop.
     var token_ids = std.ArrayList(Token.Id).init(gpa);
     defer token_ids.deinit();
     var token_locs = std.ArrayList(Token.Loc).init(gpa);
     defer token_locs.deinit();
 
+    // Empirically, the zig std lib has an 8:1 ratio of source bytes to token count.
+    const estimated_token_count = source.len / 8;
+    try token_ids.ensureCapacity(estimated_token_count);
+    try token_locs.ensureCapacity(estimated_token_count);
+
     var tokenizer = std.zig.Tokenizer.init(source);
     while (true) {
         const token = tokenizer.next();
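
The 8:1 bytes-to-token ratio above is an empirical figure, and the heuristic
degrades gracefully if it is off: ensureCapacity only sets a starting
capacity, so an underestimate means the lists grow as usual, while an
overestimate wastes a little memory. Below is a minimal sketch, not part of
the patch, of how such a ratio could be measured; it assumes the same
2020-era std.zig.Tokenizer API the patch itself uses, and bytesPerToken is a
hypothetical helper name, not anything in the Zig standard library:

    const std = @import("std");

    /// Hypothetical helper: tokenizes `source` and returns the observed
    /// source-bytes-per-token ratio, the figure behind the patch's
    /// `source.len / 8` estimate. Assumes the 2020-era tokenizer API,
    /// where `next()` returns a token whose `id` is `.Eof` at end of input.
    fn bytesPerToken(source: []const u8) f64 {
        var tokenizer = std.zig.Tokenizer.init(source);
        var token_count: usize = 0;
        while (true) {
            const token = tokenizer.next();
            if (token.id == .Eof) break;
            token_count += 1;
        }
        if (token_count == 0) return 0;
        return @intToFloat(f64, source.len) / @intToFloat(f64, token_count);
    }

Running a helper like this over a corpus such as lib/std is one way the
deleted TODO comment could be turned into the concrete
`ensureCapacity(source.len / 8)` call the patch adds.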