Cache cold build: prevent race condition
We had a race condition on cold cache builds. When several nix-gl-host instances were called, they were trying to concurrently build the cache. This led to some weird errors and busted caches. We introduce a file lock preventing any concurrent access to the cache. We take advantage of this bug fix to rethink the way we build the cache and do it in a more robust way. Instead of building it in place, we're now building it first in a temporary directory (while making sure to patchelf the DSOs according to their final destination). We then move this directory to the actual cache destination iff the cache has been built successfully.
This commit is contained in:
parent
4f51b37c29
commit
891dd0bf95
|
@ -1,14 +1,16 @@
|
||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
|
import fcntl
|
||||||
import hashlib
|
import hashlib
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
import shutil
|
import shutil
|
||||||
import subprocess
|
|
||||||
import stat
|
import stat
|
||||||
|
import subprocess
|
||||||
import sys
|
import sys
|
||||||
|
import tempfile
|
||||||
import time
|
import time
|
||||||
from glob import glob
|
from glob import glob
|
||||||
from typing import List, Literal, Dict, Tuple, TypedDict, TextIO, Optional
|
from typing import List, Literal, Dict, Tuple, TypedDict, TextIO, Optional
|
||||||
|
@ -412,7 +414,9 @@ def scan_dsos_from_dir(path: str) -> Optional[LibraryPath]:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def cache_library_path(library_path: LibraryPath, cache_dir_root: str) -> str:
|
def cache_library_path(
|
||||||
|
library_path: LibraryPath, temp_cache_dir_root: str, final_cache_dir_root: str
|
||||||
|
) -> str:
|
||||||
"""Generate a cache directory for the LIBRARY_PATH host directory.
|
"""Generate a cache directory for the LIBRARY_PATH host directory.
|
||||||
|
|
||||||
This cache directory is mirroring the host directory containing
|
This cache directory is mirroring the host directory containing
|
||||||
|
@ -426,8 +430,9 @@ def cache_library_path(library_path: LibraryPath, cache_dir_root: str) -> str:
|
||||||
h.update(library_path.path.encode("utf8"))
|
h.update(library_path.path.encode("utf8"))
|
||||||
path_hash: str = h.hexdigest()
|
path_hash: str = h.hexdigest()
|
||||||
# Paths
|
# Paths
|
||||||
cache_path_root: str = os.path.join(cache_dir_root, path_hash)
|
cache_path_root: str = os.path.join(temp_cache_dir_root, path_hash)
|
||||||
lib_dir = os.path.join(cache_path_root, "lib")
|
lib_dir = os.path.join(cache_path_root, "lib")
|
||||||
|
rpath_lib_dir = os.path.join(final_cache_dir_root, path_hash, "lib")
|
||||||
cuda_dir = os.path.join(cache_path_root, "cuda")
|
cuda_dir = os.path.join(cache_path_root, "cuda")
|
||||||
egl_dir = os.path.join(cache_path_root, "egl")
|
egl_dir = os.path.join(cache_path_root, "egl")
|
||||||
glx_dir = os.path.join(cache_path_root, "glx")
|
glx_dir = os.path.join(cache_path_root, "glx")
|
||||||
|
@ -440,7 +445,7 @@ def cache_library_path(library_path: LibraryPath, cache_dir_root: str) -> str:
|
||||||
]:
|
]:
|
||||||
os.makedirs(d, exist_ok=True)
|
os.makedirs(d, exist_ok=True)
|
||||||
if len(dsos) > 0:
|
if len(dsos) > 0:
|
||||||
copy_and_patch_libs(dsos=dsos, dest_dir=d, rpath=lib_dir)
|
copy_and_patch_libs(dsos=dsos, dest_dir=d, rpath=rpath_lib_dir)
|
||||||
else:
|
else:
|
||||||
log_info(f"Did not find any DSO to put in {d}, skipping copy and patching.")
|
log_info(f"Did not find any DSO to put in {d}, skipping copy and patching.")
|
||||||
return path_hash
|
return path_hash
|
||||||
|
@ -520,33 +525,60 @@ def nvidia_main(
|
||||||
log_info("Searching for the host DSOs")
|
log_info("Searching for the host DSOs")
|
||||||
cache_content: CacheDirContent = CacheDirContent(paths=[])
|
cache_content: CacheDirContent = CacheDirContent(paths=[])
|
||||||
cache_file_path = os.path.join(cache_dir, "cache.json")
|
cache_file_path = os.path.join(cache_dir, "cache.json")
|
||||||
|
lock_path = os.path.join(os.path.split(cache_dir)[0], "nix-gl-host.lock")
|
||||||
cached_ld_library_path = os.path.join(cache_dir, "ld_library_path")
|
cached_ld_library_path = os.path.join(cache_dir, "ld_library_path")
|
||||||
paths = get_ld_paths()
|
paths = get_ld_paths()
|
||||||
egl_conf_dir = os.path.join(cache_dir, "egl-confs")
|
egl_conf_dir = os.path.join(cache_dir, "egl-confs")
|
||||||
nix_gl_ld_library_path: Optional[str] = None
|
nix_gl_ld_library_path: Optional[str] = None
|
||||||
# Cache/Patch DSOs
|
# Cache/Patch DSOs
|
||||||
# TODO: extract
|
#
|
||||||
for path in paths:
|
# We need to be super careful about race conditions here. We're
|
||||||
res = scan_dsos_from_dir(path)
|
# using a file lock to make sure only one nix-gl-host instance can
|
||||||
if res:
|
# access the cache at a time.
|
||||||
cache_content.paths.append(res)
|
#
|
||||||
if not is_dso_cache_up_to_date(
|
# If the cache is locked, we'll wait until the said lock is
|
||||||
cache_content, cache_file_path
|
# released. The lock will always be released when the lock FD gets
|
||||||
) or not os.path.isfile(cached_ld_library_path):
|
# closed, i.e. when we get out of this block.
|
||||||
log_info("The cache is not up to date, regenerating it")
|
with open(lock_path, "w") as lock:
|
||||||
shutil.rmtree(cache_dir)
|
log_info("Acquiring the cache lock")
|
||||||
cache_paths: List[str] = []
|
fcntl.flock(lock, fcntl.LOCK_EX)
|
||||||
for p in cache_content.paths:
|
log_info("Cache lock acquired")
|
||||||
log_info(f"Caching {p}")
|
for path in paths:
|
||||||
cache_paths.append(cache_library_path(p, cache_dir))
|
res = scan_dsos_from_dir(path)
|
||||||
cache_absolute_paths = [os.path.join(cache_dir, p) for p in cache_paths]
|
if res:
|
||||||
nix_gl_ld_library_path = generate_cache_metadata(
|
cache_content.paths.append(res)
|
||||||
cache_dir, cache_content, cache_absolute_paths
|
if not is_dso_cache_up_to_date(
|
||||||
)
|
cache_content, cache_file_path
|
||||||
else:
|
) or not os.path.isfile(cached_ld_library_path):
|
||||||
log_info("The cache is up to date, re-using it.")
|
log_info("The cache is not up to date, regenerating it")
|
||||||
with open(cached_ld_library_path, "r", encoding="utf8") as f:
|
# We're first building the cache in a temporary directory
|
||||||
nix_gl_ld_library_path = f.read()
|
# to make sure we won't end up with a partially
|
||||||
|
# populated/corrupted nix-gl-host cache.
|
||||||
|
with tempfile.TemporaryDirectory() as tmp_cache:
|
||||||
|
tmp_cache_dir = os.path.join(tmp_cache, "nix-gl-host")
|
||||||
|
os.makedirs(tmp_cache_dir)
|
||||||
|
cache_paths: List[str] = []
|
||||||
|
for p in cache_content.paths:
|
||||||
|
log_info(f"Caching {p}")
|
||||||
|
cache_paths.append(cache_library_path(p, tmp_cache_dir, cache_dir))
|
||||||
|
# Pointing the LD_LIBRARY_PATH to the final destination
|
||||||
|
# instead of the tmp dir.
|
||||||
|
cache_absolute_paths = [os.path.join(cache_dir, p) for p in cache_paths]
|
||||||
|
nix_gl_ld_library_path = generate_cache_metadata(
|
||||||
|
tmp_cache_dir, cache_content, cache_absolute_paths
|
||||||
|
)
|
||||||
|
# The temporary cache has been successfully populated,
|
||||||
|
# let's mv it to the actual nix-gl-host cache.
|
||||||
|
# Note: The move is only atomic on Linux when both directories are on the same filesystem; otherwise shutil.move copies then deletes.
|
||||||
|
log_info(f"Mv {tmp_cache_dir} to {cache_dir}")
|
||||||
|
if os.path.exists(cache_dir):
|
||||||
|
shutil.rmtree(cache_dir)
|
||||||
|
shutil.move(tmp_cache_dir, os.path.split(cache_dir)[0])
|
||||||
|
else:
|
||||||
|
log_info("The cache is up to date, re-using it.")
|
||||||
|
with open(cached_ld_library_path, "r", encoding="utf8") as f:
|
||||||
|
nix_gl_ld_library_path = f.read()
|
||||||
|
log_info("Cache lock released")
|
||||||
|
|
||||||
assert nix_gl_ld_library_path, "The nix-host-gl LD_LIBRARY_PATH is not set"
|
assert nix_gl_ld_library_path, "The nix-host-gl LD_LIBRARY_PATH is not set"
|
||||||
log_info(f"Injecting LD_LIBRARY_PATH: {nix_gl_ld_library_path}")
|
log_info(f"Injecting LD_LIBRARY_PATH: {nix_gl_ld_library_path}")
|
||||||
|
@ -583,8 +615,8 @@ def main(args):
|
||||||
home = os.path.expanduser("~")
|
home = os.path.expanduser("~")
|
||||||
xdg_cache_home = os.environ.get("XDG_CACHE_HOME", os.path.join(home, ".cache"))
|
xdg_cache_home = os.environ.get("XDG_CACHE_HOME", os.path.join(home, ".cache"))
|
||||||
cache_dir = os.path.join(xdg_cache_home, "nix-gl-host")
|
cache_dir = os.path.join(xdg_cache_home, "nix-gl-host")
|
||||||
log_info(f'Using "{cache_dir}" as cache dir.')
|
|
||||||
os.makedirs(cache_dir, exist_ok=True)
|
os.makedirs(cache_dir, exist_ok=True)
|
||||||
|
log_info(f'Using "{cache_dir}" as cache dir.')
|
||||||
if args.driver_directory:
|
if args.driver_directory:
|
||||||
log_info(
|
log_info(
|
||||||
f"Retreiving DSOs from the specified directory: {args.driver_directory}"
|
f"Retreiving DSOs from the specified directory: {args.driver_directory}"
|
||||||
|
|
Loading…
Reference in a new issue