From 61a9d16d5c1d4088981f7d0ca08655f9155cc015 Mon Sep 17 00:00:00 2001 From: pennae Date: Tue, 21 Dec 2021 09:17:31 +0100 Subject: [PATCH 1/4] don't strdup tokens in the lexer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit every stringy token the lexer returns is turned into a Symbol and not used further, so we don't have to strdup. using a string_view is sufficient, but due to limitations of the current parser we have to use a POD type that holds the same information. gives ~2% on system build, 6% on search, 8% on parsing alone # before Benchmark 1: nix search --offline nixpkgs hello Time (mean ± σ): 610.6 ms ± 2.4 ms [User: 602.5 ms, System: 7.8 ms] Range (min … max): 606.6 ms … 617.3 ms 50 runs Benchmark 2: nix eval -f hackage-packages.nix Time (mean ± σ): 430.1 ms ± 1.4 ms [User: 393.1 ms, System: 36.7 ms] Range (min … max): 428.2 ms … 434.2 ms 50 runs Benchmark 3: nix eval --raw --impure --expr 'with import {}; system' Time (mean ± σ): 3.032 s ± 0.005 s [User: 2.808 s, System: 0.223 s] Range (min … max): 3.023 s … 3.041 s 50 runs # after Benchmark 1: nix search --offline nixpkgs hello Time (mean ± σ): 574.7 ms ± 2.8 ms [User: 566.3 ms, System: 8.0 ms] Range (min … max): 569.2 ms … 580.7 ms 50 runs Benchmark 2: nix eval -f hackage-packages.nix Time (mean ± σ): 394.4 ms ± 0.8 ms [User: 361.8 ms, System: 32.3 ms] Range (min … max): 392.7 ms … 395.7 ms 50 runs Benchmark 3: nix eval --raw --impure --expr 'with import {}; system' Time (mean ± σ): 2.976 s ± 0.005 s [User: 2.757 s, System: 0.218 s] Range (min … max): 2.966 s … 2.990 s 50 runs --- src/libexpr/lexer.l | 14 +++++++------- src/libexpr/parser.y | 25 ++++++++++++++++--------- 2 files changed, 23 insertions(+), 16 deletions(-) diff --git a/src/libexpr/lexer.l b/src/libexpr/lexer.l index c18877e29..70e99d2d2 100644 --- a/src/libexpr/lexer.l +++ b/src/libexpr/lexer.l @@ -139,7 +139,7 @@ or { return OR_KW; } \/\/ { return UPDATE; } \+\+ { return CONCAT; } -{ID} { 
yylval->id = strdup(yytext); return ID; } +{ID} { yylval->id = {yytext, (size_t) yyleng}; return ID; } {INT} { errno = 0; try { yylval->n = boost::lexical_cast(yytext); @@ -221,14 +221,14 @@ or { return OR_KW; } {PATH_SEG} { POP_STATE(); PUSH_STATE(INPATH_SLASH); - yylval->path = strdup(yytext); + yylval->path = {yytext, (size_t) yyleng}; return PATH; } {HPATH_START} { POP_STATE(); PUSH_STATE(INPATH_SLASH); - yylval->path = strdup(yytext); + yylval->path = {yytext, (size_t) yyleng}; return HPATH; } @@ -237,7 +237,7 @@ or { return OR_KW; } PUSH_STATE(INPATH_SLASH); else PUSH_STATE(INPATH); - yylval->path = strdup(yytext); + yylval->path = {yytext, (size_t) yyleng}; return PATH; } {HPATH} { @@ -245,7 +245,7 @@ or { return OR_KW; } PUSH_STATE(INPATH_SLASH); else PUSH_STATE(INPATH); - yylval->path = strdup(yytext); + yylval->path = {yytext, (size_t) yyleng}; return HPATH; } @@ -280,8 +280,8 @@ or { return OR_KW; } throw ParseError("path has a trailing slash"); } -{SPATH} { yylval->path = strdup(yytext); return SPATH; } -{URI} { yylval->uri = strdup(yytext); return URI; } +{SPATH} { yylval->path = {yytext, (size_t) yyleng}; return SPATH; } +{URI} { yylval->uri = {yytext, (size_t) yyleng}; return URI; } [ \t\r\n]+ /* eat up whitespace */ \#[^\r\n]* /* single-line comments */ diff --git a/src/libexpr/parser.y b/src/libexpr/parser.y index f8aaea582..049a149cc 100644 --- a/src/libexpr/parser.y +++ b/src/libexpr/parser.y @@ -273,9 +273,16 @@ void yyerror(YYLTYPE * loc, yyscan_t scanner, ParseData * data, const char * err nix::Formal * formal; nix::NixInt n; nix::NixFloat nf; - const char * id; // !!! -> Symbol - char * path; - char * uri; + // using a C struct allows us to avoid having to define the special + // members that using string_view here would implicitly delete. + struct StringToken { + const char * p; + size_t l; + operator std::string_view() const { return {p, l}; } + }; + StringToken id; // !!! 
-> Symbol + StringToken path; + StringToken uri; std::vector * attrNames; std::vector > * string_parts; } @@ -397,7 +404,7 @@ expr_select expr_simple : ID { - if (strcmp($1, "__curPos") == 0) + if (std::string_view($1) == "__curPos") // exact match; a length-bounded strncmp would also accept prefixes such as _ $$ = new ExprPos(CUR_POS); else $$ = new ExprVar(CUR_POS, data->symbols.create($1)); @@ -414,7 +421,7 @@ expr_simple $$ = new ExprConcatStrings(CUR_POS, false, $2); } | SPATH { - string path($1 + 1, strlen($1) - 2); + string path($1.p + 1, $1.l - 2); $$ = new ExprCall(CUR_POS, new ExprVar(data->symbols.create("__findFile")), {new ExprVar(data->symbols.create("__nixPath")), @@ -460,14 +467,14 @@ string_parts_interpolated path_start : PATH { - Path path(absPath($1, data->basePath)); + Path path(absPath({$1.p, $1.l}, data->basePath)); /* add back in the trailing '/' to the first segment */ - if ($1[strlen($1)-1] == '/' && strlen($1) > 1) + if ($1.p[$1.l-1] == '/' && $1.l > 1) path += "/"; $$ = new ExprPath(path); } | HPATH { - Path path(getHome() + string($1 + 1)); + Path path(getHome() + string($1.p + 1, $1.l - 1)); $$ = new ExprPath(path); } ; @@ -543,7 +550,7 @@ attrpath attr : ID { $$ = $1; } - | OR_KW { $$ = "or"; } + | OR_KW { $$ = {"or", 2}; } ; string_attr From eee0bcee227f6a1b46116efc8915545feb5a2e86 Mon Sep 17 00:00:00 2001 From: pennae Date: Mon, 20 Dec 2021 11:29:14 +0100 Subject: [PATCH 2/4] avoid allocations in SymbolTable::create MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit speeds up parsing by ~3%, system builds by a bit more than 1% # before Benchmark 1: nix search --offline nixpkgs hello Time (mean ± σ): 574.7 ms ± 2.8 ms [User: 566.3 ms, System: 8.0 ms] Range (min … max): 569.2 ms … 580.7 ms 50 runs Benchmark 2: nix eval -f ../nixpkgs/pkgs/development/haskell-modules/hackage-packages.nix Time (mean ± σ): 394.4 ms ± 0.8 ms [User: 361.8 ms, System: 32.3 ms] Range (min … max): 392.7 ms … 395.7 ms 50 runs Benchmark 3: nix eval --raw --impure --expr 'with import {}; system' Time 
(mean ± σ): 2.976 s ± 0.005 s [User: 2.757 s, System: 0.218 s] Range (min … max): 2.966 s … 2.990 s 50 runs # after Benchmark 1: nix search --offline nixpkgs hello Time (mean ± σ): 572.4 ms ± 2.3 ms [User: 563.4 ms, System: 8.6 ms] Range (min … max): 566.9 ms … 579.1 ms 50 runs Benchmark 2: nix eval -f ../nixpkgs/pkgs/development/haskell-modules/hackage-packages.nix Time (mean ± σ): 381.7 ms ± 1.0 ms [User: 348.3 ms, System: 33.1 ms] Range (min … max): 380.2 ms … 387.7 ms 50 runs Benchmark 3: nix eval --raw --impure --expr 'with import {}; system' Time (mean ± σ): 2.936 s ± 0.005 s [User: 2.715 s, System: 0.221 s] Range (min … max): 2.923 s … 2.946 s 50 runs --- src/libexpr/nixexpr.cc | 2 +- src/libexpr/symbol-table.hh | 21 ++++++++++++++------- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/src/libexpr/nixexpr.cc b/src/libexpr/nixexpr.cc index a75357871..640c44c01 100644 --- a/src/libexpr/nixexpr.cc +++ b/src/libexpr/nixexpr.cc @@ -473,7 +473,7 @@ string ExprLambda::showNamePos() const size_t SymbolTable::totalSize() const { size_t n = 0; - for (auto & i : symbols) + for (auto & i : store) n += i.size(); return n; } diff --git a/src/libexpr/symbol-table.hh b/src/libexpr/symbol-table.hh index 4eb6dac81..a090ebae5 100644 --- a/src/libexpr/symbol-table.hh +++ b/src/libexpr/symbol-table.hh @@ -1,7 +1,8 @@ #pragma once +#include #include -#include +#include #include "types.hh" @@ -70,15 +71,21 @@ public: class SymbolTable { private: - typedef std::unordered_set Symbols; - Symbols symbols; + std::unordered_map symbols; + std::list store; public: Symbol create(std::string_view s) { - // FIXME: avoid allocation if 's' already exists in the symbol table. - std::pair res = symbols.emplace(std::string(s)); - return Symbol(&*res.first); + // Most symbols are looked up more than once, so we trade off insertion performance + // for lookup performance. 
+ // TODO: could probably be done more efficiently with transparent Hash and Equals + // on the original implementation using unordered_set + auto it = symbols.find(s); + if (it != symbols.end()) return it->second; + + const string & rawSym = store.emplace_back(s); + return symbols.emplace(rawSym, Symbol(&rawSym)).first->second; } size_t size() const @@ -91,7 +98,7 @@ public: template void dump(T callback) { - for (auto & s : symbols) + for (auto & s : store) callback(s); } }; From 34e3bd10e3891afc965a7fb8fdcaacbdc900b2d5 Mon Sep 17 00:00:00 2001 From: pennae Date: Tue, 21 Dec 2021 13:56:57 +0100 Subject: [PATCH 3/4] avoid copies of parser input data MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit when given a string yacc will copy the entire input to a newly allocated location so that it can add a second terminating NUL byte. since the parser is a very internal thing to EvalState we can ensure that having two terminating NUL bytes is always possible without copying, and have the parser itself merely check that the expected NULs are present. 
# before Benchmark 1: nix search --offline nixpkgs hello Time (mean ± σ): 572.4 ms ± 2.3 ms [User: 563.4 ms, System: 8.6 ms] Range (min … max): 566.9 ms … 579.1 ms 50 runs Benchmark 2: nix eval -f ../nixpkgs/pkgs/development/haskell-modules/hackage-packages.nix Time (mean ± σ): 381.7 ms ± 1.0 ms [User: 348.3 ms, System: 33.1 ms] Range (min … max): 380.2 ms … 387.7 ms 50 runs Benchmark 3: nix eval --raw --impure --expr 'with import {}; system' Time (mean ± σ): 2.936 s ± 0.005 s [User: 2.715 s, System: 0.221 s] Range (min … max): 2.923 s … 2.946 s 50 runs # after Benchmark 1: nix search --offline nixpkgs hello Time (mean ± σ): 571.7 ms ± 2.4 ms [User: 563.3 ms, System: 8.0 ms] Range (min … max): 566.7 ms … 579.7 ms 50 runs Benchmark 2: nix eval -f ../nixpkgs/pkgs/development/haskell-modules/hackage-packages.nix Time (mean ± σ): 376.6 ms ± 1.0 ms [User: 345.8 ms, System: 30.5 ms] Range (min … max): 374.5 ms … 379.1 ms 50 runs Benchmark 3: nix eval --raw --impure --expr 'with import {}; system' Time (mean ± σ): 2.922 s ± 0.006 s [User: 2.707 s, System: 0.215 s] Range (min … max): 2.906 s … 2.934 s 50 runs --- src/libexpr/eval.hh | 6 +++--- src/libexpr/parser.y | 23 +++++++++++++++-------- src/libexpr/primops.cc | 9 ++++++--- src/libutil/util.cc | 4 +++- src/nix-build/nix-build.cc | 2 +- src/nix/repl.cc | 2 +- 6 files changed, 29 insertions(+), 17 deletions(-) diff --git a/src/libexpr/eval.hh b/src/libexpr/eval.hh index cc63294c6..15925a6b4 100644 --- a/src/libexpr/eval.hh +++ b/src/libexpr/eval.hh @@ -181,8 +181,8 @@ public: Expr * parseExprFromFile(const Path & path, StaticEnv & staticEnv); /* Parse a Nix expression from the specified string. 
*/ - Expr * parseExprFromString(std::string_view s, const Path & basePath, StaticEnv & staticEnv); - Expr * parseExprFromString(std::string_view s, const Path & basePath); + Expr * parseExprFromString(std::string s, const Path & basePath, StaticEnv & staticEnv); + Expr * parseExprFromString(std::string s, const Path & basePath); Expr * parseStdin(); @@ -310,7 +310,7 @@ private: friend struct ExprAttrs; friend struct ExprLet; - Expr * parse(const char * text, FileOrigin origin, const Path & path, + Expr * parse(char * text, size_t length, FileOrigin origin, const Path & path, const Path & basePath, StaticEnv & staticEnv); public: diff --git a/src/libexpr/parser.y b/src/libexpr/parser.y index 049a149cc..a3e713937 100644 --- a/src/libexpr/parser.y +++ b/src/libexpr/parser.y @@ -596,7 +596,7 @@ formal namespace nix { -Expr * EvalState::parse(const char * text, FileOrigin origin, +Expr * EvalState::parse(char * text, size_t length, FileOrigin origin, const Path & path, const Path & basePath, StaticEnv & staticEnv) { yyscan_t scanner; @@ -616,7 +616,7 @@ Expr * EvalState::parse(const char * text, FileOrigin origin, data.basePath = basePath; yylex_init(&scanner); - yy_scan_string(text, scanner); + yy_scan_buffer(text, length, scanner); int res = yyparse(scanner, &data); yylex_destroy(scanner); @@ -662,26 +662,33 @@ Expr * EvalState::parseExprFromFile(const Path & path) Expr * EvalState::parseExprFromFile(const Path & path, StaticEnv & staticEnv) { - return parse(readFile(path).c_str(), foFile, path, dirOf(path), staticEnv); + auto buffer = readFile(path); + // readFile should have left some extra space for terminators + buffer.append("\0\0", 2); + return parse(buffer.data(), buffer.size(), foFile, path, dirOf(path), staticEnv); } -Expr * EvalState::parseExprFromString(std::string_view s, const Path & basePath, StaticEnv & staticEnv) +Expr * EvalState::parseExprFromString(std::string s, const Path & basePath, StaticEnv & staticEnv) { - return parse(s.data(), foString, "", 
basePath, staticEnv); + s.append("\0\0", 2); + return parse(s.data(), s.size(), foString, "", basePath, staticEnv); } -Expr * EvalState::parseExprFromString(std::string_view s, const Path & basePath) +Expr * EvalState::parseExprFromString(std::string s, const Path & basePath) { - return parseExprFromString(s, basePath, staticBaseEnv); + return parseExprFromString(std::move(s), basePath, staticBaseEnv); } Expr * EvalState::parseStdin() { //Activity act(*logger, lvlTalkative, format("parsing standard input")); - return parse(drainFD(0).data(), foStdin, "", absPath("."), staticBaseEnv); + auto buffer = drainFD(0); + // drainFD should have left some extra space for terminators + buffer.append("\0\0", 2); + return parse(buffer.data(), buffer.size(), foStdin, "", absPath("."), staticBaseEnv); } diff --git a/src/libexpr/primops.cc b/src/libexpr/primops.cc index 66af373d7..852317aa3 100644 --- a/src/libexpr/primops.cc +++ b/src/libexpr/primops.cc @@ -350,7 +350,7 @@ void prim_exec(EvalState & state, const Pos & pos, Value * * args, Value & v) auto output = runProgram(program, true, commandArgs); Expr * parsed; try { - parsed = state.parseExprFromString(output, pos.file); + parsed = state.parseExprFromString(std::move(output), pos.file); } catch (Error & e) { e.addTrace(pos, "While parsing the output from '%1%'", program); throw; @@ -3800,9 +3800,12 @@ void EvalState::createBaseEnv() /* Note: we have to initialize the 'derivation' constant *after* building baseEnv/staticBaseEnv because it uses 'builtins'. */ - eval(parse( + char code[] = #include "primops/derivation.nix.gen.hh" - , foFile, sDerivationNix, "/", staticBaseEnv), *vDerivation); + // the parser needs two NUL bytes as terminators; one of them + // is implied by being a C string. 
+ "\0"; + eval(parse(code, sizeof(code), foFile, sDerivationNix, "/", staticBaseEnv), *vDerivation); } diff --git a/src/libutil/util.cc b/src/libutil/util.cc index 43fea1b1e..f15a617b0 100644 --- a/src/libutil/util.cc +++ b/src/libutil/util.cc @@ -669,7 +669,9 @@ void writeFull(int fd, std::string_view s, bool allowInterrupts) string drainFD(int fd, bool block, const size_t reserveSize) { - StringSink sink(reserveSize); + // the parser needs two extra bytes to append terminating characters, other users will + // not care very much about the extra memory. + StringSink sink(reserveSize + 2); drainFD(fd, sink, block); return std::move(*sink.s); } diff --git a/src/nix-build/nix-build.cc b/src/nix-build/nix-build.cc index e2325c91f..b5d0f4813 100755 --- a/src/nix-build/nix-build.cc +++ b/src/nix-build/nix-build.cc @@ -296,7 +296,7 @@ static void main_nix_build(int argc, char * * argv) else for (auto i : left) { if (fromArgs) - exprs.push_back(state->parseExprFromString(i, absPath("."))); + exprs.push_back(state->parseExprFromString(std::move(i), absPath("."))); else { auto absolute = i; try { diff --git a/src/nix/repl.cc b/src/nix/repl.cc index f453343f3..c8bb5a90f 100644 --- a/src/nix/repl.cc +++ b/src/nix/repl.cc @@ -683,7 +683,7 @@ void NixRepl::addVarToScope(const Symbol & name, Value & v) Expr * NixRepl::parseString(string s) { - Expr * e = state->parseExprFromString(s, curDir, staticEnv); + Expr * e = state->parseExprFromString(std::move(s), curDir, staticEnv); return e; } From 72f42093e711db1ab43c920688bb5e59df33935d Mon Sep 17 00:00:00 2001 From: pennae Date: Tue, 21 Dec 2021 10:28:05 +0100 Subject: [PATCH 4/4] optimize unescapeStr MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mainly to avoid an allocation and a copy of a string that can be modified in place (ever since EvalState holds on to the buffer, not the generated parser itself). 
# before Benchmark 1: nix search --offline nixpkgs hello Time (mean ± σ): 571.7 ms ± 2.4 ms [User: 563.3 ms, System: 8.0 ms] Range (min … max): 566.7 ms … 579.7 ms 50 runs Benchmark 2: nix eval -f ../nixpkgs/pkgs/development/haskell-modules/hackage-packages.nix Time (mean ± σ): 376.6 ms ± 1.0 ms [User: 345.8 ms, System: 30.5 ms] Range (min … max): 374.5 ms … 379.1 ms 50 runs Benchmark 3: nix eval --raw --impure --expr 'with import {}; system' Time (mean ± σ): 2.922 s ± 0.006 s [User: 2.707 s, System: 0.215 s] Range (min … max): 2.906 s … 2.934 s 50 runs # after Benchmark 1: nix search --offline nixpkgs hello Time (mean ± σ): 570.4 ms ± 2.8 ms [User: 561.3 ms, System: 8.6 ms] Range (min … max): 564.6 ms … 578.1 ms 50 runs Benchmark 2: nix eval -f ../nixpkgs/pkgs/development/haskell-modules/hackage-packages.nix Time (mean ± σ): 375.4 ms ± 1.3 ms [User: 343.2 ms, System: 31.7 ms] Range (min … max): 373.4 ms … 378.2 ms 50 runs Benchmark 3: nix eval --raw --impure --expr 'with import {}; system' Time (mean ± σ): 2.925 s ± 0.006 s [User: 2.704 s, System: 0.219 s] Range (min … max): 2.910 s … 2.942 s 50 runs --- src/libexpr/lexer.l | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/src/libexpr/lexer.l b/src/libexpr/lexer.l index 70e99d2d2..a0e7a1877 100644 --- a/src/libexpr/lexer.l +++ b/src/libexpr/lexer.l @@ -64,29 +64,32 @@ static void adjustLoc(YYLTYPE * loc, const char * s, size_t len) } -// FIXME: optimize -static Expr * unescapeStr(SymbolTable & symbols, const char * s, size_t length) +// we make use of the fact that the parser receives a private copy of the input +// string and can munge around in it. +static Expr * unescapeStr(SymbolTable & symbols, char * s, size_t length) { - string t; - t.reserve(length); + char * result = s; + char * t = s; char c; + // the input string is terminated with *two* NULs, so we can safely take + // *one* character after the one being checked against. 
while ((c = *s++)) { if (c == '\\') { - assert(*s); c = *s++; - if (c == 'n') t += '\n'; - else if (c == 'r') t += '\r'; - else if (c == 't') t += '\t'; - else t += c; + if (c == 'n') *t = '\n'; + else if (c == 'r') *t = '\r'; + else if (c == 't') *t = '\t'; + else *t = c; } else if (c == '\r') { /* Normalise CR and CR/LF into LF. */ - t += '\n'; + *t = '\n'; if (*s == '\n') s++; /* cr/lf */ } - else t += c; + else *t = c; + t++; } - return new ExprString(symbols.create(t)); + return new ExprString(symbols.create({result, size_t(t - result)})); }