nspawn: add file system locks for controlling access to container images

This adds three kinds of file system locks for container images:

a) a file system lock next to the actual image, in a .lck file in the
   same directory the image is located. This lock has the benefit of
   usually being located on the same NFS share as the image itself, and
   thus allows locking container images across NFS shares.

b) a file system lock in /run, named after st_dev and st_ino of the
   root of the image. This lock has the advantage that it is unique even
   if the same image is bind mounted to two different places at the same
   time, as the ino/dev stays constant for them.

c) a file system lock that is only taken when a new disk image is about
   to be created, that ensures that checking whether the name is already
   used across the search path, and actually placing the image is not
   interrupted by other code taking the name.

a + b are read-write locks. When a container is booted in read-only mode
a read lock is taken, otherwise a write lock.

Lock b is always taken after a, to avoid ABBA problems.

Lock c is mostly relevant when renaming or cloning images.
This commit is contained in:
Lennart Poettering 2015-01-14 23:09:02 +01:00
parent 805e5dda0a
commit 30535c1692
5 changed files with 304 additions and 32 deletions

View file

@ -3337,6 +3337,7 @@ int main(int argc, char *argv[]) {
pid_t pid = 0;
int ret = EXIT_SUCCESS;
union in_addr_union exposed = {};
_cleanup_release_lock_file_ LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
log_parse_environment();
log_open();
@ -3382,20 +3383,8 @@ int main(int argc, char *argv[]) {
goto finish;
}
if (arg_template) {
r = btrfs_subvol_snapshot(arg_template, arg_directory, arg_read_only, true);
if (r == -EEXIST) {
if (!arg_quiet)
log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
} else if (r < 0) {
log_error_errno(r, "Couldn't create snapshort %s from %s: %m", arg_directory, arg_template);
goto finish;
} else {
if (!arg_quiet)
log_info("Populated %s from template %s.", arg_directory, arg_template);
}
} else if (arg_ephemeral) {
if (arg_ephemeral) {
_cleanup_release_lock_file_ LockFile original_lock = LOCK_FILE_INIT;
char *np;
/* If the specified path is a mount point we
@ -3418,6 +3407,12 @@ int main(int argc, char *argv[]) {
goto finish;
}
r = image_path_lock(np, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
if (r < 0) {
log_error_errno(r, "Failed to lock %s: %m", np);
goto finish;
}
r = btrfs_subvol_snapshot(arg_directory, np, arg_read_only, true);
if (r < 0) {
free(np);
@ -3429,6 +3424,31 @@ int main(int argc, char *argv[]) {
arg_directory = np;
remove_subvol = true;
} else {
r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
if (r == -EBUSY) {
log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
goto finish;
}
if (r < 0) {
log_error_errno(r, "Failed to lock %s: %m", arg_directory);
return r;
}
if (arg_template) {
r = btrfs_subvol_snapshot(arg_template, arg_directory, arg_read_only, true);
if (r == -EEXIST) {
if (!arg_quiet)
log_info("Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
} else if (r < 0) {
log_error_errno(r, "Couldn't create snapshort %s from %s: %m", arg_directory, arg_template);
goto finish;
} else {
if (!arg_quiet)
log_info("Populated %s from template %s.", arg_directory, arg_template);
}
}
}
if (arg_boot) {
@ -3455,6 +3475,16 @@ int main(int argc, char *argv[]) {
assert(arg_image);
assert(!arg_template);
r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
if (r == -EBUSY) {
r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
goto finish;
}
if (r < 0) {
r = log_error_errno(r, "Failed to create image lock: %m");
goto finish;
}
if (!mkdtemp(template)) {
log_error_errno(errno, "Failed to create temporary directory: %m");
r = -errno;

View file

@ -28,6 +28,7 @@
#include "btrfs-util.h"
#include "path-util.h"
#include "copy.h"
#include "mkdir.h"
#include "machine-image.h"
static const char image_search_path[] =
@ -340,12 +341,20 @@ void image_hashmap_free(Hashmap *map) {
}
int image_remove(Image *i) {
_cleanup_release_lock_file_ LockFile global_lock = LOCK_FILE_INIT, local_lock = LOCK_FILE_INIT;
int r;
assert(i);
if (path_equal(i->path, "/") ||
path_startswith(i->path, "/usr"))
return -EROFS;
/* Make sure we don't interfere with a running nspawn */
r = image_path_lock(i->path, LOCK_EX|LOCK_NB, &global_lock, &local_lock);
if (r < 0)
return r;
switch (i->type) {
case IMAGE_SUBVOLUME:
@ -366,6 +375,7 @@ int image_remove(Image *i) {
}
int image_rename(Image *i, const char *new_name) {
_cleanup_release_lock_file_ LockFile global_lock = LOCK_FILE_INIT, local_lock = LOCK_FILE_INIT, name_lock = LOCK_FILE_INIT;
_cleanup_free_ char *new_path = NULL, *nn = NULL;
unsigned file_attr = 0;
int r;
@ -379,6 +389,18 @@ int image_rename(Image *i, const char *new_name) {
path_startswith(i->path, "/usr"))
return -EROFS;
/* Make sure we don't interfere with a running nspawn */
r = image_path_lock(i->path, LOCK_EX|LOCK_NB, &global_lock, &local_lock);
if (r < 0)
return r;
/* Make sure nobody takes the new name, between the time we
* checked it is currently unused in all search paths, and the
* time we take possesion of it */
r = image_name_lock(new_name, LOCK_EX|LOCK_NB, &name_lock);
if (r < 0)
return r;
r = image_find(new_name, NULL);
if (r < 0)
return r;
@ -438,6 +460,7 @@ int image_rename(Image *i, const char *new_name) {
}
int image_clone(Image *i, const char *new_name, bool read_only) {
_cleanup_release_lock_file_ LockFile name_lock = LOCK_FILE_INIT;
const char *new_path;
int r;
@ -446,6 +469,13 @@ int image_clone(Image *i, const char *new_name, bool read_only) {
if (!image_name_is_valid(new_name))
return -EINVAL;
/* Make sure nobody takes the new name, between the time we
* checked it is currently unused in all search paths, and the
* time we take possesion of it */
r = image_name_lock(new_name, LOCK_EX|LOCK_NB, &name_lock);
if (r < 0)
return r;
r = image_find(new_name, NULL);
if (r < 0)
return r;
@ -478,6 +508,7 @@ int image_clone(Image *i, const char *new_name, bool read_only) {
}
int image_read_only(Image *i, bool b) {
_cleanup_release_lock_file_ LockFile global_lock = LOCK_FILE_INIT, local_lock = LOCK_FILE_INIT;
int r;
assert(i);
@ -485,6 +516,11 @@ int image_read_only(Image *i, bool b) {
path_startswith(i->path, "/usr"))
return -EROFS;
/* Make sure we don't interfere with a running nspawn */
r = image_path_lock(i->path, LOCK_EX|LOCK_NB, &global_lock, &local_lock);
if (r < 0)
return r;
switch (i->type) {
case IMAGE_SUBVOLUME:
@ -533,6 +569,88 @@ int image_read_only(Image *i, bool b) {
return 0;
}
int image_path_lock(const char *path, int operation, LockFile *global, LockFile *local) {
_cleanup_free_ char *p = NULL;
LockFile t = LOCK_FILE_INIT;
struct stat st;
int r;
assert(path);
assert(global);
assert(local);
/* Locks an image path. This actually creates two locks: one
* "local" one, next to the image path itself, which might be
* shared via NFS. And another "global" one, in /run, that
* uses the device/inode number. This has the benefit that we
* can even lock a tree that is a mount point, correctly. */
if (path_equal(path, "/"))
return -EBUSY;
if (!path_is_absolute(path))
return -EINVAL;
if (stat(path, &st) >= 0) {
if (asprintf(&p, "/run/systemd/nspawn/locks/inode-%lu:%lu", (unsigned long) st.st_dev, (unsigned long) st.st_ino) < 0)
return -ENOMEM;
}
r = make_lock_file_for(path, operation, &t);
if (r < 0)
return r;
if (p) {
mkdir_p("/run/systemd/nspawn/locks", 0600);
r = make_lock_file(p, operation, global);
if (r < 0) {
release_lock_file(&t);
return r;
}
}
*local = t;
return 0;
}
int image_name_lock(const char *name, int operation, LockFile *ret) {
const char *p;
assert(name);
assert(ret);
/* Locks an image name, regardless of the precise path used. */
if (!image_name_is_valid(name))
return -EINVAL;
if (streq(name, ".host"))
return -EBUSY;
mkdir_p("/run/systemd/nspawn/locks", 0600);
p = strappenda("/run/systemd/nspawn/locks/name-", name);
return make_lock_file(p, operation, ret);
}
bool image_name_is_valid(const char *s) {
if (!filename_is_valid(s))
return false;
if (string_has_cc(s, NULL))
return false;
if (!utf8_is_valid(s))
return false;
/* Temporary files for atomically creating new files */
if (startswith(s, ".#"))
return false;
return true;
}
static const char* const image_type_table[_IMAGE_TYPE_MAX] = {
[IMAGE_DIRECTORY] = "directory",
[IMAGE_SUBVOLUME] = "subvolume",

View file

@ -63,3 +63,8 @@ int image_read_only(Image *i, bool b);
const char* image_type_to_string(ImageType t) _const_;
ImageType image_type_from_string(const char *s) _pure_;
bool image_name_is_valid(const char *s) _pure_;
int image_path_lock(const char *path, int operation, LockFile *global, LockFile *local);
int image_name_lock(const char *name, int operation, LockFile *ret);

View file

@ -62,6 +62,7 @@
#include <sys/xattr.h>
#include <libgen.h>
#include <sys/statvfs.h>
#include <sys/file.h>
#include <linux/fs.h>
#undef basename
@ -4303,23 +4304,6 @@ bool machine_name_is_valid(const char *s) {
return true;
}
bool image_name_is_valid(const char *s) {
if (!filename_is_valid(s))
return false;
if (string_has_cc(s, NULL))
return false;
if (!utf8_is_valid(s))
return false;
/* Temporary files for atomically creating new files */
if (startswith(s, ".#"))
return false;
return true;
}
int pipe_eof(int fd) {
struct pollfd pollfd = {
.fd = fd,
@ -7819,3 +7803,125 @@ int read_attr_path(const char *p, unsigned *ret) {
return read_attr_fd(fd, ret);
}
int make_lock_file(const char *p, int operation, LockFile *ret) {
_cleanup_close_ int fd = -1;
_cleanup_free_ char *t = NULL;
int r;
/*
* We use UNPOSIX locks if they are available. They have nice
* semantics, and are mostly compatible with NFS. However,
* they are only available on new kernels. When we detect we
* are running on an older kernel, then we fall back to good
* old BSD locks. They also have nice semantics, but are
* slightly problematic on NFS, where they are upgraded to
* POSIX locks, even though locally they are orthogonal to
* POSIX locks.
*/
t = strdup(p);
if (!t)
return -ENOMEM;
for (;;) {
struct flock fl = {
.l_type = (operation & ~LOCK_NB) == LOCK_EX ? F_WRLCK : F_RDLCK,
.l_whence = SEEK_SET,
};
struct stat st;
fd = open(p, O_CREAT|O_RDWR|O_NOFOLLOW|O_CLOEXEC|O_NOCTTY, 0600);
if (fd < 0)
return -errno;
r = fcntl(fd, (operation & LOCK_NB) ? F_OFD_SETLK : F_OFD_SETLKW, &fl);
if (r < 0) {
/* If the kernel is too old, use good old BSD locks */
if (errno == EINVAL)
r = flock(fd, operation);
if (r < 0)
return errno == EAGAIN ? -EBUSY : -errno;
}
/* If we acquired the lock, let's check if the file
* still exists in the file system. If not, then the
* previous exclusive owner removed it and then closed
* it. In such a case our acquired lock is worthless,
* hence try again. */
r = fstat(fd, &st);
if (r < 0)
return -errno;
if (st.st_nlink > 0)
break;
fd = safe_close(fd);
}
ret->path = t;
ret->fd = fd;
ret->operation = operation;
fd = -1;
t = NULL;
return r;
}
int make_lock_file_for(const char *p, int operation, LockFile *ret) {
const char *fn;
char *t;
assert(p);
assert(ret);
fn = basename(p);
if (!filename_is_valid(fn))
return -EINVAL;
t = newa(char, strlen(p) + 2 + 4 + 1);
stpcpy(stpcpy(stpcpy(mempcpy(t, p, fn - p), ".#"), fn), ".lck");
return make_lock_file(t, operation, ret);
}
void release_lock_file(LockFile *f) {
int r;
if (!f)
return;
if (f->path) {
/* If we are the exclusive owner we can safely delete
* the lock file itself. If we are not the exclusive
* owner, we can try becoming it. */
if (f->fd >= 0 &&
(f->operation & ~LOCK_NB) == LOCK_SH) {
static const struct flock fl = {
.l_type = F_WRLCK,
.l_whence = SEEK_SET,
};
r = fcntl(f->fd, F_OFD_SETLK, &fl);
if (r < 0 && errno == EINVAL)
r = flock(f->fd, LOCK_EX|LOCK_NB);
if (r >= 0)
f->operation = LOCK_EX|LOCK_NB;
}
if ((f->operation & ~LOCK_NB) == LOCK_EX)
unlink_noerrno(f->path);
free(f->path);
f->path = NULL;
}
f->fd = safe_close(f->fd);
f->operation = 0;
}

View file

@ -550,7 +550,6 @@ bool hostname_is_valid(const char *s) _pure_;
char* hostname_cleanup(char *s, bool lowercase);
bool machine_name_is_valid(const char *s) _pure_;
bool image_name_is_valid(const char *s) _pure_;
char* strshorten(char *s, size_t l);
@ -1080,4 +1079,18 @@ int chattr_path(const char *p, bool b, unsigned mask);
int read_attr_fd(int fd, unsigned *ret);
int read_attr_path(const char *p, unsigned *ret);
typedef struct LockFile {
char *path;
int fd;
int operation;
} LockFile;
int make_lock_file(const char *p, int operation, LockFile *ret);
int make_lock_file_for(const char *p, int operation, LockFile *ret);
void release_lock_file(LockFile *f);
#define _cleanup_release_lock_file_ _cleanup_(release_lock_file)
#define LOCK_FILE_INIT { .fd = -1, .path = NULL }
#define RLIMIT_MAKE_CONST(lim) ((struct rlimit) { lim, lim })