From c5fdbdae321903740e0e735aa89fab5647992687 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Mon, 19 Jun 2023 16:54:05 +0200 Subject: [PATCH 1/6] LocalStore::addTempRoot(): Handle ENOENT If the garbage collector has acquired the global GC lock, but hasn't created the GC socket yet, then a client attempting to connect would get ENOENT. Note that this only happens when the GC runs for the first time on a machine. Subsequently clients will get ECONNREFUSED which was already handled. Fixes #7370. --- src/libstore/gc.cc | 13 +++++++++---- tests/gc-non-blocking.sh | 7 ++++++- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/src/libstore/gc.cc b/src/libstore/gc.cc index 0038ec802..b5b9e2049 100644 --- a/src/libstore/gc.cc +++ b/src/libstore/gc.cc @@ -138,9 +138,9 @@ void LocalStore::addTempRoot(const StorePath & path) try { nix::connect(fdRootsSocket->get(), socketPath); } catch (SysError & e) { - /* The garbage collector may have exited, so we need to - restart. */ - if (e.errNo == ECONNREFUSED) { + /* The garbage collector may have exited or not + created the socket yet, so we need to restart. */ + if (e.errNo == ECONNREFUSED || e.errNo == ENOENT) { debug("GC socket connection refused"); fdRootsSocket->close(); goto restart; @@ -503,6 +503,11 @@ void LocalStore::collectGarbage(const GCOptions & options, GCResults & results) auto fdGCLock = openGCLock(); FdLock gcLock(fdGCLock.get(), ltWrite, true, "waiting for the big garbage collector lock..."); + /* Synchronisation point to test ENOENT handling in + addTempRoot(), see tests/gc-non-blocking.sh. */ + if (auto p = getEnv("_NIX_TEST_GC_SYNC_2")) + readFile(*p); + /* Start the server for receiving new roots. */ auto socketPath = stateDir.get() + gcSocketPath; createDirs(dirOf(socketPath)); @@ -772,7 +777,7 @@ void LocalStore::collectGarbage(const GCOptions & options, GCResults & results) } }; - /* Synchronisation point for testing, see tests/gc-concurrent.sh. */ + /* Synchronisation point for testing, see tests/gc-non-blocking.sh. */ if (auto p = getEnv("_NIX_TEST_GC_SYNC")) readFile(*p); diff --git a/tests/gc-non-blocking.sh b/tests/gc-non-blocking.sh index 0d781485d..da6dbdf5d 100644 --- a/tests/gc-non-blocking.sh +++ b/tests/gc-non-blocking.sh @@ -9,16 +9,21 @@ clearStore fifo=$TEST_ROOT/test.fifo mkfifo "$fifo" +fifo2=$TEST_ROOT/test2.fifo +mkfifo "$fifo2" + dummy=$(nix store add-path ./simple.nix) running=$TEST_ROOT/running touch $running -(_NIX_TEST_GC_SYNC=$fifo nix-store --gc -vvvvv; rm $running) & +(_NIX_TEST_GC_SYNC=$fifo _NIX_TEST_GC_SYNC_2=$fifo2 nix-store --gc -vvvvv; rm $running) & pid=$! sleep 2 +(sleep 1; echo > $fifo2) & + outPath=$(nix-build --max-silent-time 60 -o "$TEST_ROOT/result" -E " with import ./config.nix; mkDerivation { From 3859b425975d0347e724b6abb513662667b3e8c7 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Wed, 21 Jun 2023 16:17:21 +0200 Subject: [PATCH 2/6] Wait for pid --- tests/gc-non-blocking.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/gc-non-blocking.sh b/tests/gc-non-blocking.sh index da6dbdf5d..7f2aebb8b 100644 --- a/tests/gc-non-blocking.sh +++ b/tests/gc-non-blocking.sh @@ -23,6 +23,7 @@ pid=$! sleep 2 (sleep 1; echo > $fifo2) & +pid2=$! outPath=$(nix-build --max-silent-time 60 -o "$TEST_ROOT/result" -E " with import ./config.nix; @@ -32,6 +33,7 @@ outPath=$(nix-build --max-silent-time 60 -o "$TEST_ROOT/result" -E " }") wait $pid +wait $pid2 (! test -e $running) (! test -e $dummy) From faf87b51f76ba9794e65e1d17dc3debf759052cd Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Mon, 8 Jan 2024 14:14:36 +0100 Subject: [PATCH 3/6] Show why GC socket connection was refused Co-authored-by: John Ericson --- src/libstore/gc.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/libstore/gc.cc b/src/libstore/gc.cc index b5b9e2049..38a9c708b 100644 --- a/src/libstore/gc.cc +++ b/src/libstore/gc.cc @@ -141,7 +141,7 @@ void LocalStore::addTempRoot(const StorePath & path) /* The garbage collector may have exited or not created the socket yet, so we need to restart. */ if (e.errNo == ECONNREFUSED || e.errNo == ENOENT) { - debug("GC socket connection refused"); + debug("GC socket connection refused: %s", e.msg()) fdRootsSocket->close(); goto restart; } From 3e237598342dee46188c83fba49cc30d509ee553 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Fri, 12 Jan 2024 12:38:55 +0100 Subject: [PATCH 4/6] gc-non-blocking.sh: Add explanation Also name the _NIX_TEST_GC_SYNC environment variables logically. --- src/libstore/gc.cc | 10 +++++----- tests/functional/gc-non-blocking.sh | 20 ++++++++++++++------ 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/src/libstore/gc.cc b/src/libstore/gc.cc index bd64e238d..80e036e7e 100644 --- a/src/libstore/gc.cc +++ b/src/libstore/gc.cc @@ -511,7 +511,7 @@ void LocalStore::collectGarbage(const GCOptions & options, GCResults & results) /* Synchronisation point to test ENOENT handling in addTempRoot(), see tests/gc-non-blocking.sh. */ - if (auto p = getEnv("_NIX_TEST_GC_SYNC_2")) + if (auto p = getEnv("_NIX_TEST_GC_SYNC_1")) readFile(*p); /* Start the server for receiving new roots. */ @@ -637,6 +637,10 @@ void LocalStore::collectGarbage(const GCOptions & options, GCResults & results) roots.insert(root.first); } + /* Synchronisation point for testing, see tests/functional/gc-non-blocking.sh. */ + if (auto p = getEnv("_NIX_TEST_GC_SYNC_2")) + readFile(*p); + /* Helper function that deletes a path from the store and throws GCLimitReached if we've deleted enough garbage. */ auto deleteFromStore = [&](std::string_view baseName) @@ -783,10 +787,6 @@ void LocalStore::collectGarbage(const GCOptions & options, GCResults & results) } }; - /* Synchronisation point for testing, see tests/functional/gc-non-blocking.sh. */ - if (auto p = getEnv("_NIX_TEST_GC_SYNC")) - readFile(*p); - /* Either delete all garbage paths, or just the specified paths (for gcDeleteSpecific). */ if (options.action == GCOptions::gcDeleteSpecific) { diff --git a/tests/functional/gc-non-blocking.sh b/tests/functional/gc-non-blocking.sh index 7f2aebb8b..ec280badb 100644 --- a/tests/functional/gc-non-blocking.sh +++ b/tests/functional/gc-non-blocking.sh @@ -6,10 +6,14 @@ needLocalStore "the GC test needs a synchronisation point" clearStore -fifo=$TEST_ROOT/test.fifo -mkfifo "$fifo" +# This FIFO is read just after the global GC lock has been acquired, +# but before the root server is started. +fifo1=$TEST_ROOT/test2.fifo +mkfifo "$fifo1" -fifo2=$TEST_ROOT/test2.fifo +# This FIFO is read just after the roots have been read, but before +# the actual GC starts. +fifo2=$TEST_ROOT/test.fifo mkfifo "$fifo2" dummy=$(nix store add-path ./simple.nix) @@ -17,19 +21,23 @@ dummy=$(nix store add-path ./simple.nix) running=$TEST_ROOT/running touch $running -(_NIX_TEST_GC_SYNC=$fifo _NIX_TEST_GC_SYNC_2=$fifo2 nix-store --gc -vvvvv; rm $running) & +# Start GC. +(_NIX_TEST_GC_SYNC_1=$fifo1 _NIX_TEST_GC_SYNC_2=$fifo2 nix-store --gc -vvvvv; rm $running) & pid=$! sleep 2 -(sleep 1; echo > $fifo2) & +# Delay the start of the root server to check that the build below +# correctly handles ENOENT when connecting to the root server. +(sleep 1; echo > $fifo1) & pid2=$! +# Start a build. This should not be blocked by the GC in progress. outPath=$(nix-build --max-silent-time 60 -o "$TEST_ROOT/result" -E " with import ./config.nix; mkDerivation { name = \"non-blocking\"; - buildCommand = \"set -x; test -e $running; mkdir \$out; echo > $fifo\"; + buildCommand = \"set -x; test -e $running; mkdir \$out; echo > $fifo2\"; }") wait $pid From 0b1d93d2bae5fda9924f13246d7a667ce4392a4d Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Tue, 16 Jan 2024 15:23:22 +0100 Subject: [PATCH 5/6] Sleep a bit between attempts to connect to the root server --- src/libstore/gc.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/libstore/gc.cc b/src/libstore/gc.cc index f60011f95..cb820e2d5 100644 --- a/src/libstore/gc.cc +++ b/src/libstore/gc.cc @@ -154,6 +154,7 @@ void LocalStore::addTempRoot(const StorePath & path) if (e.errNo == ECONNREFUSED || e.errNo == ENOENT) { debug("GC socket connection refused: %s", e.msg()); fdRootsSocket->close(); + std::this_thread::sleep_for(std::chrono::milliseconds(100)); goto restart; } throw; From d005bade7f3339cc68bee12ce13d863d51d54dc4 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Tue, 16 Jan 2024 15:23:46 +0100 Subject: [PATCH 6/6] connect(): Propagate errno from the child process This is necessary on macOS since addTempRoot() relies on errno. --- src/libutil/unix-domain-socket.cc | 39 +++++++++++++++++++++---------- 1 file changed, 27 insertions(+), 12 deletions(-) diff --git a/src/libutil/unix-domain-socket.cc b/src/libutil/unix-domain-socket.cc index 8949461d2..05bbb5ba3 100644 --- a/src/libutil/unix-domain-socket.cc +++ b/src/libutil/unix-domain-socket.cc @@ -1,6 +1,7 @@ #include "file-system.hh" #include "processes.hh" #include "unix-domain-socket.hh" +#include "util.hh" #include #include @@ -75,21 +76,35 @@ void connect(int fd, const std::string & path) addr.sun_family = AF_UNIX; if (path.size() + 1 >= sizeof(addr.sun_path)) { + Pipe pipe; + pipe.create(); Pid pid = startProcess([&]() { - Path dir = dirOf(path); - if (chdir(dir.c_str()) == -1) - throw SysError("chdir to '%s' failed", dir); - std::string base(baseNameOf(path)); - if (base.size() + 1 >= sizeof(addr.sun_path)) - throw Error("socket path '%s' is too long", base); - memcpy(addr.sun_path, base.c_str(), base.size() + 1); - if (connect(fd, (struct sockaddr *) &addr, sizeof(addr)) == -1) - throw SysError("cannot connect to socket at '%s'", path); - _exit(0); + try { + pipe.readSide.close(); + Path dir = dirOf(path); + if (chdir(dir.c_str()) == -1) + throw SysError("chdir to '%s' failed", dir); + std::string base(baseNameOf(path)); + if (base.size() + 1 >= sizeof(addr.sun_path)) + throw Error("socket path '%s' is too long", base); + memcpy(addr.sun_path, base.c_str(), base.size() + 1); + if (connect(fd, (struct sockaddr *) &addr, sizeof(addr)) == -1) + throw SysError("cannot connect to socket at '%s'", path); + writeFull(pipe.writeSide.get(), "0\n"); + } catch (SysError & e) { + writeFull(pipe.writeSide.get(), fmt("%d\n", e.errNo)); + } catch (...) { + writeFull(pipe.writeSide.get(), "-1\n"); + } }); - int status = pid.wait(); - if (status != 0) + pipe.writeSide.close(); + auto errNo = string2Int(chomp(drainFD(pipe.readSide.get()))); + if (!errNo || *errNo == -1) throw Error("cannot connect to socket at '%s'", path); + else if (*errNo > 0) { + errno = *errNo; + throw SysError("cannot connect to socket at '%s'", path); + } } else { memcpy(addr.sun_path, path.c_str(), path.size() + 1); if (connect(fd, (struct sockaddr *) &addr, sizeof(addr)) == -1)