Compare commits

...

2 Commits

Author SHA1 Message Date
Félix Baylac Jacqué 891dd0bf95
Cache cold build: prevent race condition
We had a race condition on cold cache builds. When several nix-gl-host
instances were called, they were trying to concurrently build the
cache.

This lead to some weird errors and busted caches.

We introduce a file lock preventing any concurrent access to the
cache.

We take advantage of this bug to rethink the way we build the cache
and do it in a more robust way. Instead of building it in place, we're
now building it first in a temporary directory (while making sure to
patchelf the DSOs according their final destination). We then move
this directory to the actual cache destination iff the cache has been
built successfully.
2023-01-17 16:14:16 +01:00
Félix Baylac Jacqué 4f51b37c29
nvidia_main: refactor out metadata generation in separate function
This nvidia_main function got out of control. Let's chop the metadata
generation out of it.
2023-01-17 12:41:07 +01:00
1 changed files with 88 additions and 34 deletions

View File

@ -1,14 +1,16 @@
#!/usr/bin/env python3
import argparse
import fcntl
import hashlib
import json
import os
import re
import shutil
import subprocess
import stat
import subprocess
import sys
import tempfile
import time
from glob import glob
from typing import List, Literal, Dict, Tuple, TypedDict, TextIO, Optional
@ -412,22 +414,25 @@ def scan_dsos_from_dir(path: str) -> Optional[LibraryPath]:
return None
def cache_library_path(library_path: LibraryPath, cache_dir_root: str) -> str:
def cache_library_path(
library_path: LibraryPath, temp_cache_dir_root: str, final_cache_dir_root: str
) -> str:
"""Generate a cache directory for the LIBRARY_PATH host directory.
This cache directory is mirroring the host directory containing
the graphics card drivers. Its full name is hashed: it's an
attempt to keep the final LD_LIBRARY_PATH reasonably sized.
Returns the full path of the cache directory created by this
function."""
Returns the name of the cache directory created by this
function to CACHE_DIR_ROOT."""
# Hash Computation
h = hashlib.sha256()
h.update(library_path.path.encode("utf8"))
path_hash: str = h.hexdigest()
# Paths
cache_path_root: str = os.path.join(cache_dir_root, path_hash)
cache_path_root: str = os.path.join(temp_cache_dir_root, path_hash)
lib_dir = os.path.join(cache_path_root, "lib")
rpath_lib_dir = os.path.join(final_cache_dir_root, path_hash, "lib")
cuda_dir = os.path.join(cache_path_root, "cuda")
egl_dir = os.path.join(cache_path_root, "egl")
glx_dir = os.path.join(cache_path_root, "glx")
@ -440,10 +445,10 @@ def cache_library_path(library_path: LibraryPath, cache_dir_root: str) -> str:
]:
os.makedirs(d, exist_ok=True)
if len(dsos) > 0:
copy_and_patch_libs(dsos=dsos, dest_dir=d, rpath=lib_dir)
copy_and_patch_libs(dsos=dsos, dest_dir=d, rpath=rpath_lib_dir)
else:
log_info(f"Did not find any DSO to put in {d}, skipping copy and patching.")
return cache_path_root
return path_hash
def generate_cache_ld_library_path(cache_paths: List[str]) -> str:
@ -463,6 +468,32 @@ def generate_cache_ld_library_path(cache_paths: List[str]) -> str:
return ":".join(ld_library_paths)
def generate_cache_metadata(
cache_dir: str, cache_content: CacheDirContent, cache_paths: List[str]
) -> str:
"""Generates the various cache metadata for a given CACHE_CONTENT
and CACHE_PATHS in CACHE_DIR. Return the associated LD_LIBRARY_PATH.
The metadata being:
- CACHE_DIR/cache.json: json file containing all the paths info.
- CACHE_DIR/ld_library_path: file containing the LD_LIBRARY_PATH
to inject for the CACHE_PATHS.
- CACHE_DIR/egl-confs: directory containing the various EGL
confs."""
cache_file_path = os.path.join(cache_dir, "cache.json")
cached_ld_library_path = os.path.join(cache_dir, "ld_library_path")
egl_conf_dir = os.path.join(cache_dir, "egl-confs")
with open(cache_file_path, "w", encoding="utf8") as f:
f.write(cache_content.to_json())
nix_gl_ld_library_path = generate_cache_ld_library_path(cache_paths)
log_info(f"Caching LD_LIBRARY_PATH: {nix_gl_ld_library_path}")
with open(cached_ld_library_path, "w", encoding="utf8") as f:
f.write(nix_gl_ld_library_path)
generate_nvidia_egl_config_files(egl_conf_dir)
return nix_gl_ld_library_path
def nvidia_main(
cache_dir: str, dso_vendor_paths: List[str], print_ld_library_path: bool = False
) -> Dict:
@ -494,40 +525,63 @@ def nvidia_main(
log_info("Searching for the host DSOs")
cache_content: CacheDirContent = CacheDirContent(paths=[])
cache_file_path = os.path.join(cache_dir, "cache.json")
lock_path = os.path.join(os.path.split(cache_dir)[0], "nix-gl-host.lock")
cached_ld_library_path = os.path.join(cache_dir, "ld_library_path")
paths = get_ld_paths()
egl_conf_dir = os.path.join(cache_dir, "egl-confs")
nix_gl_ld_library_path: Optional[str] = None
# Cache/Patch DSOs
for path in paths:
res = scan_dsos_from_dir(path)
if res:
cache_content.paths.append(res)
if not is_dso_cache_up_to_date(
cache_content, cache_file_path
) or not os.path.isfile(cached_ld_library_path):
log_info("The cache is not up to date, regenerating it")
shutil.rmtree(cache_dir)
cache_paths: List[str] = []
for p in cache_content.paths:
log_info(f"Caching {p}")
cache_paths.append(cache_library_path(p, cache_dir))
log_info(f"Caching ")
with open(cache_file_path, "w", encoding="utf8") as f:
f.write(cache_content.to_json())
nix_gl_ld_library_path = generate_cache_ld_library_path(cache_paths)
log_info(f"Caching LD_LIBRARY_PATH: {nix_gl_ld_library_path}")
with open(cached_ld_library_path, "w", encoding="utf8") as f:
f.write(nix_gl_ld_library_path)
generate_nvidia_egl_config_files(egl_conf_dir)
else:
log_info("The cache is up to date, re-using it.")
with open(cached_ld_library_path, "r", encoding="utf8") as f:
nix_gl_ld_library_path = f.read()
#
# We need to be super careful about race conditions here. We're
# using a file lock to make sure only one nix-gl-host instance can
# access the cache at a time.
#
# If the cache is locked, we'll wait until the said lock is
# released. The lock will always be released when the lock FD get
# closed, IE. when we'll get out of this block.
with open(lock_path, "w") as lock:
log_info("Acquiring the cache lock")
fcntl.flock(lock, fcntl.LOCK_EX)
log_info("Cache lock acquired")
for path in paths:
res = scan_dsos_from_dir(path)
if res:
cache_content.paths.append(res)
if not is_dso_cache_up_to_date(
cache_content, cache_file_path
) or not os.path.isfile(cached_ld_library_path):
log_info("The cache is not up to date, regenerating it")
# We're building first the cache in a temporary directory
# to make sure we won't end up with a partially
# populated/corrupted nix-gl-host cache.
with tempfile.TemporaryDirectory() as tmp_cache:
tmp_cache_dir = os.path.join(tmp_cache, "nix-gl-host")
os.makedirs(tmp_cache_dir)
cache_paths: List[str] = []
for p in cache_content.paths:
log_info(f"Caching {p}")
cache_paths.append(cache_library_path(p, tmp_cache_dir, cache_dir))
# Pointing the LD_LIBRARY_PATH to the final destination
# instead of the tmp dir.
cache_absolute_paths = [os.path.join(cache_dir, p) for p in cache_paths]
nix_gl_ld_library_path = generate_cache_metadata(
tmp_cache_dir, cache_content, cache_absolute_paths
)
# The temporary cache has been successfully populated,
# let's mv it to the actual nix-gl-host cache.
# Note: The move operation is atomic on linux.
log_info(f"Mv {tmp_cache_dir} to {cache_dir}")
if os.path.exists(cache_dir):
shutil.rmtree(cache_dir)
shutil.move(tmp_cache_dir, os.path.split(cache_dir)[0])
else:
log_info("The cache is up to date, re-using it.")
with open(cached_ld_library_path, "r", encoding="utf8") as f:
nix_gl_ld_library_path = f.read()
log_info("Cache lock released")
assert nix_gl_ld_library_path, "The nix-host-gl LD_LIBRARY_PATH is not set"
log_info(f"Injecting LD_LIBRARY_PATH: {nix_gl_ld_library_path}")
os.makedirs(egl_conf_dir, exist_ok=True)
new_env = {}
log_info(f"__GLX_VENDOR_LIBRARY_NAME = nvidia")
new_env["__GLX_VENDOR_LIBRARY_NAME"] = "nvidia"
@ -561,8 +615,8 @@ def main(args):
home = os.path.expanduser("~")
xdg_cache_home = os.environ.get("XDG_CACHE_HOME", os.path.join(home, ".cache"))
cache_dir = os.path.join(xdg_cache_home, "nix-gl-host")
log_info(f'Using "{cache_dir}" as cache dir.')
os.makedirs(cache_dir, exist_ok=True)
log_info(f'Using "{cache_dir}" as cache dir.')
if args.driver_directory:
log_info(
f"Retreiving DSOs from the specified directory: {args.driver_directory}"