Compare commits

...

7 commits

Author SHA1 Message Date
Félix Baylac Jacqué 891dd0bf95
Cache cold build: prevent race condition
We had a race condition on cold cache builds. When several nix-gl-host
instances were called, they were trying to concurrently build the
cache.

This led to some weird errors and busted caches.

We introduce a file lock preventing any concurrent access to the
cache.

We take this bug as an opportunity to rethink the way we build the cache
and do it in a more robust way. Instead of building it in place, we're
now building it first in a temporary directory (while making sure to
patchelf the DSOs according to their final destination). We then move
this directory to the actual cache destination iff the cache has been
built successfully.
2023-01-17 16:14:16 +01:00
Félix Baylac Jacqué 4f51b37c29
nvidia_main: refactor out metadata generation into a separate function
This nvidia_main function got out of control. Let's chop the metadata
generation out of it.
2023-01-17 12:41:07 +01:00
Félix Baylac Jacqué c1680e0c7c
flake: add meta.mainProgram
Allow us to run the program through `nix run`.
2023-01-05 16:48:38 +01:00
Félix Baylac Jacqué 3876bce7a1
Flake: overlays removal leftovers.
I forgot to remove an instance of the nixpkgs overlay I removed two
commits ago…
2022-12-22 19:14:23 +01:00
Félix Baylac Jacqué 37d4a1b248
Bugfix: do not try to copy and patch nonexistent DSOs
It seems like some driver installations are missing some NVidia
subsystems. We stumbled upon the case of somebody missing the Cuda
libraries.

This ended up making the patchelf call fail.

Prevent the copy/patch routine from running when we do not have any
DSO to copy/patch.
2022-12-22 19:01:31 +01:00
Félix Baylac Jacqué 09fda6b9bd
Remove unused nixpkgs overlay
That nixpkgs overlay was used to inject a patched libglvnd. We're not
using that patch anymore, there's no reason to keep this overlay
around.
2022-12-22 10:15:46 +01:00
Félix Baylac Jacqué eb782d3fd5 Bug: use mtime instead of atime as timestamp
atime == last time a file has been accessed.
mtime == last time a file has been modified.

Facepalm on that one…
2022-12-19 20:53:14 +01:00
4 changed files with 97 additions and 49 deletions

View file

@ -30,4 +30,6 @@ pkgs.stdenvNoCC.mkDerivation {
installPhase = ''
install -D -m0755 src/nixglhost.py $out/bin/nixglhost
'';
meta.mainProgram = "nixglhost";
}

View file

@ -13,14 +13,11 @@
forAllSystems = f: nixpkgs.lib.genAttrs systems (system: f system);
pkgs = system: import nixpkgs {
inherit system;
overlays = [ self.overlays.default ];
};
in
{
defaultPackage = forAllSystems (system: import ./default.nix { pkgs = pkgs system; });
overlays.default = import ./overlays/nixpkgs.nix;
legacyPackages = forAllSystems (system: (pkgs system));
devShell = forAllSystems (system:

View file

@ -1,11 +0,0 @@
self: super:
{
libglvnd = super.libglvnd.overrideAttrs (old: {
src = super.fetchFromGitHub {
owner = "NinjaTrappeur";
repo = "libglvnd";
rev = "f4dff011f78ecd5a69871d4a8ddf3c742de5f621";
sha256 = "sha256-57awDiR9DaFTGe8J4ed89Xm3Fc4/OM6qflsuHqx9mxE=";
};
});
}

View file

@ -1,14 +1,16 @@
#!/usr/bin/env python3
import argparse
import fcntl
import hashlib
import json
import os
import re
import shutil
import subprocess
import stat
import subprocess
import sys
import tempfile
import time
from glob import glob
from typing import List, Literal, Dict, Tuple, TypedDict, TextIO, Optional
@ -42,7 +44,7 @@ class ResolvedLib:
self.fullpath: str = fullpath
if size is None or last_modification is None:
stat = os.stat(fullpath)
self.last_modification: float = stat.st_atime
self.last_modification: float = stat.st_mtime
self.size: int = stat.st_size
else:
self.last_modification = last_modification
@ -108,6 +110,9 @@ class LibraryPath:
and self.path == other.path
)
def __repr__(self):
return f"LibraryPath<{self.path}>"
def __hash__(self):
return hash(
(
@ -409,22 +414,25 @@ def scan_dsos_from_dir(path: str) -> Optional[LibraryPath]:
return None
def cache_library_path(library_path: LibraryPath, cache_dir_root: str) -> str:
def cache_library_path(
library_path: LibraryPath, temp_cache_dir_root: str, final_cache_dir_root: str
) -> str:
"""Generate a cache directory for the LIBRARY_PATH host directory.
This cache directory is mirroring the host directory containing
the graphics card drivers. Its full name is hashed: it's an
attempt to keep the final LD_LIBRARY_PATH reasonably sized.
Returns the full path of the cache directory created by this
function."""
Returns the name of the cache directory created by this
function to CACHE_DIR_ROOT."""
# Hash Computation
h = hashlib.sha256()
h.update(library_path.path.encode("utf8"))
path_hash: str = h.hexdigest()
# Paths
cache_path_root: str = os.path.join(cache_dir_root, path_hash)
cache_path_root: str = os.path.join(temp_cache_dir_root, path_hash)
lib_dir = os.path.join(cache_path_root, "lib")
rpath_lib_dir = os.path.join(final_cache_dir_root, path_hash, "lib")
cuda_dir = os.path.join(cache_path_root, "cuda")
egl_dir = os.path.join(cache_path_root, "egl")
glx_dir = os.path.join(cache_path_root, "glx")
@ -436,8 +444,11 @@ def cache_library_path(library_path: LibraryPath, cache_dir_root: str) -> str:
(library_path.glx, glx_dir),
]:
os.makedirs(d, exist_ok=True)
copy_and_patch_libs(dsos=dsos, dest_dir=d, rpath=lib_dir)
return cache_path_root
if len(dsos) > 0:
copy_and_patch_libs(dsos=dsos, dest_dir=d, rpath=rpath_lib_dir)
else:
log_info(f"Did not find any DSO to put in {d}, skipping copy and patching.")
return path_hash
def generate_cache_ld_library_path(cache_paths: List[str]) -> str:
@ -457,6 +468,32 @@ def generate_cache_ld_library_path(cache_paths: List[str]) -> str:
return ":".join(ld_library_paths)
def generate_cache_metadata(
    cache_dir: str, cache_content: CacheDirContent, cache_paths: List[str]
) -> str:
    """Write the cache metadata describing CACHE_CONTENT and CACHE_PATHS
    into CACHE_DIR, then return the LD_LIBRARY_PATH computed for
    CACHE_PATHS.

    Files and directories produced:

    - CACHE_DIR/cache.json: JSON serialization of CACHE_CONTENT.
    - CACHE_DIR/ld_library_path: the LD_LIBRARY_PATH string to inject
      for CACHE_PATHS.
    - CACHE_DIR/egl-confs: directory handed to the EGL configuration
      generator.
    """
    # 1. Persist the JSON description of the cached paths.
    json_path = os.path.join(cache_dir, "cache.json")
    with open(json_path, "w", encoding="utf8") as json_file:
        json_file.write(cache_content.to_json())

    # 2. Compute the LD_LIBRARY_PATH and store it next to the JSON file.
    ld_library_path = generate_cache_ld_library_path(cache_paths)
    log_info(f"Caching LD_LIBRARY_PATH: {ld_library_path}")
    ld_library_path_file = os.path.join(cache_dir, "ld_library_path")
    with open(ld_library_path_file, "w", encoding="utf8") as ld_file:
        ld_file.write(ld_library_path)

    # 3. Emit the EGL configuration files.
    generate_nvidia_egl_config_files(os.path.join(cache_dir, "egl-confs"))

    return ld_library_path
def nvidia_main(
cache_dir: str, dso_vendor_paths: List[str], print_ld_library_path: bool = False
) -> Dict:
@ -488,40 +525,63 @@ def nvidia_main(
log_info("Searching for the host DSOs")
cache_content: CacheDirContent = CacheDirContent(paths=[])
cache_file_path = os.path.join(cache_dir, "cache.json")
lock_path = os.path.join(os.path.split(cache_dir)[0], "nix-gl-host.lock")
cached_ld_library_path = os.path.join(cache_dir, "ld_library_path")
paths = get_ld_paths()
egl_conf_dir = os.path.join(cache_dir, "egl-confs")
nix_gl_ld_library_path: Optional[str] = None
# Cache/Patch DSOs
for path in paths:
res = scan_dsos_from_dir(path)
if res:
cache_content.paths.append(res)
if not is_dso_cache_up_to_date(
cache_content, cache_file_path
) or not os.path.isfile(cached_ld_library_path):
log_info("The cache is not up to date, regenerating it")
shutil.rmtree(cache_dir)
cache_paths: List[str] = []
for p in cache_content.paths:
log_info(f"Caching {p}")
cache_paths.append(cache_library_path(p, cache_dir))
log_info(f"Caching ")
with open(cache_file_path, "w", encoding="utf8") as f:
f.write(cache_content.to_json())
nix_gl_ld_library_path = generate_cache_ld_library_path(cache_paths)
log_info(f"Caching LD_LIBRARY_PATH: {nix_gl_ld_library_path}")
with open(cached_ld_library_path, "w", encoding="utf8") as f:
f.write(nix_gl_ld_library_path)
generate_nvidia_egl_config_files(egl_conf_dir)
else:
log_info("The cache is up to date, re-using it.")
with open(cached_ld_library_path, "r", encoding="utf8") as f:
nix_gl_ld_library_path = f.read()
#
# We need to be super careful about race conditions here. We're
# using a file lock to make sure only one nix-gl-host instance can
# access the cache at a time.
#
# If the cache is locked, we'll wait until the said lock is
# released. The lock will always be released when the lock FD get
# closed, IE. when we'll get out of this block.
with open(lock_path, "w") as lock:
log_info("Acquiring the cache lock")
fcntl.flock(lock, fcntl.LOCK_EX)
log_info("Cache lock acquired")
for path in paths:
res = scan_dsos_from_dir(path)
if res:
cache_content.paths.append(res)
if not is_dso_cache_up_to_date(
cache_content, cache_file_path
) or not os.path.isfile(cached_ld_library_path):
log_info("The cache is not up to date, regenerating it")
# We're building first the cache in a temporary directory
# to make sure we won't end up with a partially
# populated/corrupted nix-gl-host cache.
with tempfile.TemporaryDirectory() as tmp_cache:
tmp_cache_dir = os.path.join(tmp_cache, "nix-gl-host")
os.makedirs(tmp_cache_dir)
cache_paths: List[str] = []
for p in cache_content.paths:
log_info(f"Caching {p}")
cache_paths.append(cache_library_path(p, tmp_cache_dir, cache_dir))
# Pointing the LD_LIBRARY_PATH to the final destination
# instead of the tmp dir.
cache_absolute_paths = [os.path.join(cache_dir, p) for p in cache_paths]
nix_gl_ld_library_path = generate_cache_metadata(
tmp_cache_dir, cache_content, cache_absolute_paths
)
# The temporary cache has been successfully populated,
# let's mv it to the actual nix-gl-host cache.
# Note: The move operation is atomic on linux.
log_info(f"Mv {tmp_cache_dir} to {cache_dir}")
if os.path.exists(cache_dir):
shutil.rmtree(cache_dir)
shutil.move(tmp_cache_dir, os.path.split(cache_dir)[0])
else:
log_info("The cache is up to date, re-using it.")
with open(cached_ld_library_path, "r", encoding="utf8") as f:
nix_gl_ld_library_path = f.read()
log_info("Cache lock released")
assert nix_gl_ld_library_path, "The nix-host-gl LD_LIBRARY_PATH is not set"
log_info(f"Injecting LD_LIBRARY_PATH: {nix_gl_ld_library_path}")
os.makedirs(egl_conf_dir, exist_ok=True)
new_env = {}
log_info(f"__GLX_VENDOR_LIBRARY_NAME = nvidia")
new_env["__GLX_VENDOR_LIBRARY_NAME"] = "nvidia"
@ -555,8 +615,8 @@ def main(args):
home = os.path.expanduser("~")
xdg_cache_home = os.environ.get("XDG_CACHE_HOME", os.path.join(home, ".cache"))
cache_dir = os.path.join(xdg_cache_home, "nix-gl-host")
log_info(f'Using "{cache_dir}" as cache dir.')
os.makedirs(cache_dir, exist_ok=True)
log_info(f'Using "{cache_dir}" as cache dir.')
if args.driver_directory:
log_info(
f"Retreiving DSOs from the specified directory: {args.driver_directory}"