Compare commits

...

2 commits

Author SHA1 Message Date
Félix Baylac Jacqué 891dd0bf95
Cache cold build: prevent race condition
We had a race condition on cold cache builds. When several nix-gl-host
instances were called, they were trying to concurrently build the
cache.

This led to some weird errors and busted caches.

We introduce a file lock preventing any concurrent access to the
cache.

We take advantage of this bug to rethink the way we build the cache
and do it in a more robust way. Instead of building it in place, we're
now building it first in a temporary directory (while making sure to
patchelf the DSOs according to their final destination). We then move
this directory to the actual cache destination iff the cache has been
built successfully.
2023-01-17 16:14:16 +01:00
Félix Baylac Jacqué 4f51b37c29
nvidia_main: refactor out metadata generation in separate function
This nvidia_main function got out of control. Let's chop the metadata
generation out of it.
2023-01-17 12:41:07 +01:00

View file

@ -1,14 +1,16 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
import argparse import argparse
import fcntl
import hashlib import hashlib
import json import json
import os import os
import re import re
import shutil import shutil
import subprocess
import stat import stat
import subprocess
import sys import sys
import tempfile
import time import time
from glob import glob from glob import glob
from typing import List, Literal, Dict, Tuple, TypedDict, TextIO, Optional from typing import List, Literal, Dict, Tuple, TypedDict, TextIO, Optional
@ -412,22 +414,25 @@ def scan_dsos_from_dir(path: str) -> Optional[LibraryPath]:
return None return None
def cache_library_path(library_path: LibraryPath, cache_dir_root: str) -> str: def cache_library_path(
library_path: LibraryPath, temp_cache_dir_root: str, final_cache_dir_root: str
) -> str:
"""Generate a cache directory for the LIBRARY_PATH host directory. """Generate a cache directory for the LIBRARY_PATH host directory.
This cache directory is mirroring the host directory containing This cache directory is mirroring the host directory containing
the graphics card drivers. Its full name is hashed: it's an the graphics card drivers. Its full name is hashed: it's an
attempt to keep the final LD_LIBRARY_PATH reasonably sized. attempt to keep the final LD_LIBRARY_PATH reasonably sized.
Returns the full path of the cache directory created by this Returns the name of the cache directory created by this
function.""" function to CACHE_DIR_ROOT."""
# Hash Computation # Hash Computation
h = hashlib.sha256() h = hashlib.sha256()
h.update(library_path.path.encode("utf8")) h.update(library_path.path.encode("utf8"))
path_hash: str = h.hexdigest() path_hash: str = h.hexdigest()
# Paths # Paths
cache_path_root: str = os.path.join(cache_dir_root, path_hash) cache_path_root: str = os.path.join(temp_cache_dir_root, path_hash)
lib_dir = os.path.join(cache_path_root, "lib") lib_dir = os.path.join(cache_path_root, "lib")
rpath_lib_dir = os.path.join(final_cache_dir_root, path_hash, "lib")
cuda_dir = os.path.join(cache_path_root, "cuda") cuda_dir = os.path.join(cache_path_root, "cuda")
egl_dir = os.path.join(cache_path_root, "egl") egl_dir = os.path.join(cache_path_root, "egl")
glx_dir = os.path.join(cache_path_root, "glx") glx_dir = os.path.join(cache_path_root, "glx")
@ -440,10 +445,10 @@ def cache_library_path(library_path: LibraryPath, cache_dir_root: str) -> str:
]: ]:
os.makedirs(d, exist_ok=True) os.makedirs(d, exist_ok=True)
if len(dsos) > 0: if len(dsos) > 0:
copy_and_patch_libs(dsos=dsos, dest_dir=d, rpath=lib_dir) copy_and_patch_libs(dsos=dsos, dest_dir=d, rpath=rpath_lib_dir)
else: else:
log_info(f"Did not find any DSO to put in {d}, skipping copy and patching.") log_info(f"Did not find any DSO to put in {d}, skipping copy and patching.")
return cache_path_root return path_hash
def generate_cache_ld_library_path(cache_paths: List[str]) -> str: def generate_cache_ld_library_path(cache_paths: List[str]) -> str:
@ -463,6 +468,32 @@ def generate_cache_ld_library_path(cache_paths: List[str]) -> str:
return ":".join(ld_library_paths) return ":".join(ld_library_paths)
def generate_cache_metadata(
    cache_dir: str, cache_content: CacheDirContent, cache_paths: List[str]
) -> str:
    """Write the cache metadata describing CACHE_CONTENT into CACHE_DIR.

    The following entries are produced, in this order:
    - CACHE_DIR/cache.json: the JSON serialization of CACHE_CONTENT.
    - CACHE_DIR/ld_library_path: the LD_LIBRARY_PATH computed from
      CACHE_PATHS.
    - CACHE_DIR/egl-confs: the location handed over to
      generate_nvidia_egl_config_files.

    Returns the LD_LIBRARY_PATH string that was written to the
    ld_library_path file."""

    def in_cache(name: str) -> str:
        # Every metadata entry lives directly under CACHE_DIR.
        return os.path.join(cache_dir, name)

    with open(in_cache("cache.json"), "w", encoding="utf8") as json_out:
        json_out.write(cache_content.to_json())

    ld_library_path = generate_cache_ld_library_path(cache_paths)
    log_info(f"Caching LD_LIBRARY_PATH: {ld_library_path}")
    with open(in_cache("ld_library_path"), "w", encoding="utf8") as ld_out:
        ld_out.write(ld_library_path)

    generate_nvidia_egl_config_files(in_cache("egl-confs"))
    return ld_library_path
def nvidia_main( def nvidia_main(
cache_dir: str, dso_vendor_paths: List[str], print_ld_library_path: bool = False cache_dir: str, dso_vendor_paths: List[str], print_ld_library_path: bool = False
) -> Dict: ) -> Dict:
@ -494,40 +525,63 @@ def nvidia_main(
log_info("Searching for the host DSOs") log_info("Searching for the host DSOs")
cache_content: CacheDirContent = CacheDirContent(paths=[]) cache_content: CacheDirContent = CacheDirContent(paths=[])
cache_file_path = os.path.join(cache_dir, "cache.json") cache_file_path = os.path.join(cache_dir, "cache.json")
lock_path = os.path.join(os.path.split(cache_dir)[0], "nix-gl-host.lock")
cached_ld_library_path = os.path.join(cache_dir, "ld_library_path") cached_ld_library_path = os.path.join(cache_dir, "ld_library_path")
paths = get_ld_paths() paths = get_ld_paths()
egl_conf_dir = os.path.join(cache_dir, "egl-confs") egl_conf_dir = os.path.join(cache_dir, "egl-confs")
nix_gl_ld_library_path: Optional[str] = None nix_gl_ld_library_path: Optional[str] = None
# Cache/Patch DSOs # Cache/Patch DSOs
for path in paths: #
res = scan_dsos_from_dir(path) # We need to be super careful about race conditions here. We're
if res: # using a file lock to make sure only one nix-gl-host instance can
cache_content.paths.append(res) # access the cache at a time.
if not is_dso_cache_up_to_date( #
cache_content, cache_file_path # If the cache is locked, we'll wait until the said lock is
) or not os.path.isfile(cached_ld_library_path): # released. The lock will always be released when the lock FD get
log_info("The cache is not up to date, regenerating it") # closed, IE. when we'll get out of this block.
shutil.rmtree(cache_dir) with open(lock_path, "w") as lock:
cache_paths: List[str] = [] log_info("Acquiring the cache lock")
for p in cache_content.paths: fcntl.flock(lock, fcntl.LOCK_EX)
log_info(f"Caching {p}") log_info("Cache lock acquired")
cache_paths.append(cache_library_path(p, cache_dir)) for path in paths:
log_info(f"Caching ") res = scan_dsos_from_dir(path)
with open(cache_file_path, "w", encoding="utf8") as f: if res:
f.write(cache_content.to_json()) cache_content.paths.append(res)
nix_gl_ld_library_path = generate_cache_ld_library_path(cache_paths) if not is_dso_cache_up_to_date(
log_info(f"Caching LD_LIBRARY_PATH: {nix_gl_ld_library_path}") cache_content, cache_file_path
with open(cached_ld_library_path, "w", encoding="utf8") as f: ) or not os.path.isfile(cached_ld_library_path):
f.write(nix_gl_ld_library_path) log_info("The cache is not up to date, regenerating it")
generate_nvidia_egl_config_files(egl_conf_dir) # We're building first the cache in a temporary directory
else: # to make sure we won't end up with a partially
log_info("The cache is up to date, re-using it.") # populated/corrupted nix-gl-host cache.
with open(cached_ld_library_path, "r", encoding="utf8") as f: with tempfile.TemporaryDirectory() as tmp_cache:
nix_gl_ld_library_path = f.read() tmp_cache_dir = os.path.join(tmp_cache, "nix-gl-host")
os.makedirs(tmp_cache_dir)
cache_paths: List[str] = []
for p in cache_content.paths:
log_info(f"Caching {p}")
cache_paths.append(cache_library_path(p, tmp_cache_dir, cache_dir))
# Pointing the LD_LIBRARY_PATH to the final destination
# instead of the tmp dir.
cache_absolute_paths = [os.path.join(cache_dir, p) for p in cache_paths]
nix_gl_ld_library_path = generate_cache_metadata(
tmp_cache_dir, cache_content, cache_absolute_paths
)
# The temporary cache has been successfully populated,
# let's mv it to the actual nix-gl-host cache.
# Note: The move operation is atomic on linux.
log_info(f"Mv {tmp_cache_dir} to {cache_dir}")
if os.path.exists(cache_dir):
shutil.rmtree(cache_dir)
shutil.move(tmp_cache_dir, os.path.split(cache_dir)[0])
else:
log_info("The cache is up to date, re-using it.")
with open(cached_ld_library_path, "r", encoding="utf8") as f:
nix_gl_ld_library_path = f.read()
log_info("Cache lock released")
assert nix_gl_ld_library_path, "The nix-host-gl LD_LIBRARY_PATH is not set" assert nix_gl_ld_library_path, "The nix-host-gl LD_LIBRARY_PATH is not set"
log_info(f"Injecting LD_LIBRARY_PATH: {nix_gl_ld_library_path}") log_info(f"Injecting LD_LIBRARY_PATH: {nix_gl_ld_library_path}")
os.makedirs(egl_conf_dir, exist_ok=True)
new_env = {} new_env = {}
log_info(f"__GLX_VENDOR_LIBRARY_NAME = nvidia") log_info(f"__GLX_VENDOR_LIBRARY_NAME = nvidia")
new_env["__GLX_VENDOR_LIBRARY_NAME"] = "nvidia" new_env["__GLX_VENDOR_LIBRARY_NAME"] = "nvidia"
@ -561,8 +615,8 @@ def main(args):
home = os.path.expanduser("~") home = os.path.expanduser("~")
xdg_cache_home = os.environ.get("XDG_CACHE_HOME", os.path.join(home, ".cache")) xdg_cache_home = os.environ.get("XDG_CACHE_HOME", os.path.join(home, ".cache"))
cache_dir = os.path.join(xdg_cache_home, "nix-gl-host") cache_dir = os.path.join(xdg_cache_home, "nix-gl-host")
log_info(f'Using "{cache_dir}" as cache dir.')
os.makedirs(cache_dir, exist_ok=True) os.makedirs(cache_dir, exist_ok=True)
log_info(f'Using "{cache_dir}" as cache dir.')
if args.driver_directory: if args.driver_directory:
log_info( log_info(
f"Retreiving DSOs from the specified directory: {args.driver_directory}" f"Retreiving DSOs from the specified directory: {args.driver_directory}"