Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Speed up module parsing with hand-written parser #2897

Merged
merged 5 commits into from
Apr 22, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
111 changes: 94 additions & 17 deletions source/dub/internal/utils.d
Original file line number Diff line number Diff line change
Expand Up @@ -609,34 +609,102 @@ string determineModuleName(BuildSettings settings, NativePath file, NativePath b

/**
* Search for module keyword in D Code
* A primitive parser to skip comments and whitespace to get
* the module's name from the module declaration.
*/
string getModuleNameFromContent(string content) {
import std.regex;
import std.string;
import std.ascii: isAlpha, isAlphaNum, isWhite;
import std.algorithm: among;
import core.exception: RangeError;

content = content.strip;
if (!content.length) return null;
enum keyword = "module";

static bool regex_initialized = false;
static Regex!char comments_pattern, module_pattern;
size_t i = 0;
size_t startIndex = 0, endIndex = 0;
auto foundKeyword = false;

if (!regex_initialized) {
comments_pattern = regex(`//[^\r\n]*\r?\n?|/\*.*?\*/|/\+.*\+/`, "gs");
module_pattern = regex(`module\s+([\w\.]+)\s*;`, "g");
regex_initialized = true;
auto ch() {
return content[i];
}

content = replaceAll(content, comments_pattern, " ");
auto result = matchFirst(content, module_pattern);

if (!result.empty) return result[1];
static bool isIdentChar(in char c) {
return !isWhite(c) && c != '/' && c != ';';
}

return null;
try {
while(i < content.length) {
if(ch == keyword[0] && content[i .. i + keyword.length] == keyword) {
// -1 because the end of the loop will advance by 1
i += keyword.length - 1;
foundKeyword = true;
}
else if(ch == '/') {
++i;
// line comment?
if(ch == '/') {
while(ch != '\n')
++i;
}
// block comment?
else if(ch == '*') {
++i;
while(ch != '*' || content[i + 1] != '/')
++i;
++i; // skip over closing '/'
}
// nested comment?
else if(ch == '+') {
++i;

size_t level = 1;

while(level > 0) {
if(ch == '/') {
++i;
if(ch == '+') {
++i;
++level;
}
}
if(ch == '+') {
++i;
if(ch == '/') {
--level;
} else continue;
}
++i;
}
}
}
else if(isIdentChar(ch) && foundKeyword) {
if(startIndex == 0)
startIndex = i;
++i; // skip the first char of the name
while(isIdentChar(ch)) {
++i;
}
// when we get here, either we're at the end of the module's identifier,
// or there are comments afterwards
if(endIndex == 0) {
endIndex = i;
}
if(!isIdentChar(ch))
return content[startIndex .. endIndex];
else continue;
} else if(!isIdentChar(ch) && foundKeyword && startIndex != 0) {
return content[startIndex .. endIndex];
}
++i;
}
return "";
} catch(RangeError) {
atilaneves marked this conversation as resolved.
Show resolved Hide resolved
return "";
}
}

unittest {
assert(getModuleNameFromContent("") == "");
assert(getModuleNameFromContent("module myPackage.myModule;") == "myPackage.myModule");
assert(getModuleNameFromContent("module myPackage.myModule;") == "myPackage.myModule", getModuleNameFromContent("module myPackage.myModule;"));
assert(getModuleNameFromContent("module \t\n myPackage.myModule \t\r\n;") == "myPackage.myModule");
assert(getModuleNameFromContent("// foo\nmodule bar;") == "bar");
assert(getModuleNameFromContent("/*\nfoo\n*/\nmodule bar;") == "bar");
Expand All @@ -648,11 +716,20 @@ unittest {
assert(getModuleNameFromContent("/+ module foo; +/\nmodule bar;") == "bar");
assert(getModuleNameFromContent("/+ /+ module foo; +/ +/\nmodule bar;") == "bar");
assert(getModuleNameFromContent("// module foo;\nmodule bar; // module foo;") == "bar");

assert(getModuleNameFromContent("// module foo;\nmodule// module foo;\nbar//module foo;\n;// module foo;") == "bar");

assert(getModuleNameFromContent("/* module foo; */\nmodule/*module foo;*/bar/*module foo;*/;") == "bar", getModuleNameFromContent("/* module foo; */\nmodule/*module foo;*/bar/*module foo;*/;"));
assert(getModuleNameFromContent("/+ /+ module foo; +/ module foo; +/ module bar;") == "bar");
//assert(getModuleNameFromContent("/+ /+ module foo; +/ module foo; +/ module bar/++/;") == "bar"); // nested comments require a context-free parser!
assert(getModuleNameFromContent("/+ /+ module foo; +/ module foo; +/ module bar/++/;") == "bar");
assert(getModuleNameFromContent("/*\nmodule sometest;\n*/\n\nmodule fakemath;\n") == "fakemath");
assert(getModuleNameFromContent("module foo_bar;") == "foo_bar");
assert(getModuleNameFromContent("module _foo_bar;") == "_foo_bar");
assert(getModuleNameFromContent("/++ ++/\nmodule foo;") == "foo");
assert(getModuleNameFromContent("module pokémon;") == "pokémon");
assert(getModuleNameFromContent("module éclair;") == "éclair");
assert(getModuleNameFromContent("/** module foo*/ module bar;") == "bar");
assert(getModuleNameFromContent("/* / module foo*/ module bar;") == "bar");
}

/**
Expand Down
Loading