nspawn: rework custom mount point order, and add support for overlayfs

Previously all bind mounts were applied in the order specified,
followed by all tmpfs mounts in the order specified. This is
problematic if bind mounts shall be placed within tmpfs mounts.

This patch hence reworks the custom mount point logic, and always applies
them in strict prefix-first order. This means the order of mounts
specified on the command line becomes irrelevant, the right operation
will always be executed.

While we are at it this commit also adds native support for overlayfs
mounts, as supported by recent kernels.
This commit is contained in:
Lennart Poettering 2015-05-13 14:04:55 +02:00
parent 4df3277881
commit 5a8af538ae
3 changed files with 383 additions and 110 deletions

View File

@ -564,6 +564,52 @@
combined with <option>--read-only</option>.</para></listitem>
</varlistentry>
<varlistentry>
<term><option>--overlay=</option></term>
<term><option>--overlay-ro=</option></term>
<listitem><para>Combine multiple directory trees into one
overlay file system and mount it into the container. Takes a
list of colon-separated paths to the directory trees to
combine and the destination mount point.</para>
<para>If three or more paths are specified, then the last
specified path is the destination mount point in the
container, all paths specified before refer to directory trees
on the host and are combined in the specified order into one
overlay file system. The left-most path is hence the lowest
directory tree, the second-to-last path the highest directory
tree in the stacking order. If <option>--overlay-ro=</option>
is used instead of <option>--overlay=</option> a read-only
overlay file system is created. If a writable overlay file
system is created all changes made to it are written to the
highest directory tree in the stacking order, i.e. the
second-to-last specified.</para>
<para>If only two paths are specified, then the second
specified path is used both as the top-level directory tree in
the stacking order as seen from the host, as well as the mount
point for the overlay file system in the container. At least
two paths have to be specified.</para>
<para>For details about overlay file systems, see <ulink
url="https://www.kernel.org/doc/Documentation/filesystems/overlayfs.txt">overlayfs.txt</ulink>. Note
that the semantics of overlay file systems are substantially
different from normal file systems, in particular regarding
reported device and inode information. Device and inode
information may change for a file while it is being written
to, and processes might see out-of-date versions of files at
times. Note that this switch automatically derives the
<literal>workdir=</literal> mount option for the overlay file
system from the top-level directory tree, making it a sibling
of it. It is hence essential that the top-level directory tree
is not a mount point itself (since the working directory must
be on the same file system as the top-most directory
tree). Also note that the <literal>lowerdir=</literal> mount
option receives the paths to stack in the opposite order of
this switch.</para></listitem>
</varlistentry>
<varlistentry>
<term><option>--setenv=</option></term>

View File

@ -87,7 +87,7 @@ static int mount_path_compare(const void *a, const void *b) {
d = path_compare(p->path, q->path);
if (!d) {
if (d == 0) {
/* If the paths are equal, check the mode */
if (p->mode < q->mode)
return -1;

View File

@ -125,6 +125,22 @@ typedef enum Volatile {
VOLATILE_STATE,
} Volatile;
/* Kinds of custom mounts that can be set up for the container. The numeric
 * order of the values matters: custom_mount_compare() uses it as the
 * tie-breaker when two mounts have the same destination. */
typedef enum CustomMountType {
CUSTOM_MOUNT_BIND,
CUSTOM_MOUNT_TMPFS,
CUSTOM_MOUNT_OVERLAY,
} CustomMountType;
/* One custom mount requested on the command line (bind, tmpfs or overlay).
 * Which of the string fields are set depends on the type. */
typedef struct CustomMount {
CustomMountType type;
bool read_only; /* bind and overlay mounts: make the resulting mount read-only */
char *source; /* bind: the path that gets bind mounted; for overlayfs this is the upper directory */
char *destination; /* mount point path; appended to the container root directory when mounting */
char *options; /* tmpfs: data string handed to mount() */
char *work_dir; /* writable overlayfs: work directory derived from source; removed again on cleanup */
char **lower; /* overlayfs: lower directory trees, in the order given on the command line */
} CustomMount;
static char *arg_directory = NULL;
static char *arg_template = NULL;
static char *arg_user = NULL;
@ -166,9 +182,8 @@ static uint64_t arg_retain =
(1ULL << CAP_AUDIT_WRITE) |
(1ULL << CAP_AUDIT_CONTROL) |
(1ULL << CAP_MKNOD);
static char **arg_bind = NULL;
static char **arg_bind_ro = NULL;
static char **arg_tmpfs = NULL;
static CustomMount *arg_custom_mounts = NULL;
static unsigned arg_n_custom_mounts = 0;
static char **arg_setenv = NULL;
static bool arg_quiet = false;
static bool arg_share_system = false;
@ -244,6 +259,11 @@ static void help(void) {
" the container\n"
" --bind-ro=PATH[:PATH] Similar, but creates a read-only bind mount\n"
" --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
" --overlay=PATH[:PATH...]:PATH\n"
" Create an overlay mount from the host to \n"
" the container\n"
" --overlay-ro=PATH[:PATH...]:PATH\n"
" Similar, but creates a read-only overlay mount\n"
" --setenv=NAME=VALUE Pass an environment variable to PID 1\n"
" --share-system Share system namespaces with host\n"
" --register=BOOLEAN Register container as machine\n"
@ -253,6 +273,89 @@ static void help(void) {
, program_invocation_short_name);
}
/* Grows the global custom mount array by one slot and returns a pointer to
 * the new entry, which has its type set to t and all other fields zeroed.
 * Returns NULL if the array could not be enlarged; the existing array is
 * left untouched in that case. */
static CustomMount* custom_mount_add(CustomMountType t) {
        CustomMount *grown, *slot;

        grown = realloc(arg_custom_mounts, (arg_n_custom_mounts + 1) * sizeof(CustomMount));
        if (!grown)
                return NULL;

        arg_custom_mounts = grown;

        /* The new entry sits at what used to be the end of the array. */
        slot = &arg_custom_mounts[arg_n_custom_mounts++];
        *slot = (CustomMount) { .type = t };

        return slot;
}
/* Releases every entry of the global custom mount array and the array
 * itself, then resets the globals to the empty state. Work directories
 * that were allocated for overlay mounts are removed from disk as well. */
static void custom_mount_free_all(void) {
        unsigned idx;

        for (idx = 0; idx != arg_n_custom_mounts; idx++) {
                CustomMount *entry = arg_custom_mounts + idx;

                /* Remove the work directory from disk before dropping its name. */
                if (entry->work_dir)
                        (void) rm_rf(entry->work_dir, REMOVE_ROOT|REMOVE_PHYSICAL);
                free(entry->work_dir);

                strv_free(entry->lower);
                free(entry->options);
                free(entry->destination);
                free(entry->source);
        }

        free(arg_custom_mounts);
        arg_n_custom_mounts = 0;
        arg_custom_mounts = NULL;
}
/* qsort()-style comparator for CustomMount entries: orders primarily by
 * destination path (via path_compare()) and, for identical destinations,
 * by mount type. Returns a negative, zero or positive value. */
static int custom_mount_compare(const void *a, const void *b) {
        const CustomMount *lhs = a, *rhs = b;
        int order;

        order = path_compare(lhs->destination, rhs->destination);
        if (order == 0)
                /* Same destination: fall back to the type, yielding -1, 0 or 1. */
                order = (lhs->type > rhs->type) - (lhs->type < rhs->type);

        return order;
}
static int custom_mounts_prepare(void) {
unsigned i;
int r;
/* Ensure the mounts are applied prefix first. */
qsort_safe(arg_custom_mounts, arg_n_custom_mounts, sizeof(CustomMount), custom_mount_compare);
/* Allocate working directories for the overlay file systems that need it */
for (i = 0; i < arg_n_custom_mounts; i++) {
CustomMount *m = &arg_custom_mounts[i];
if (m->type != CUSTOM_MOUNT_OVERLAY)
continue;
if (m->work_dir)
continue;
if (m->read_only)
continue;
r = tempfn_random(m->source, &m->work_dir);
if (r < 0)
return log_error_errno(r, "Failed to generate work directory from %s: %m", m->source);
}
return 0;
}
static int set_sanitized_path(char **b, const char *path) {
char *p;
@ -287,6 +390,8 @@ static int parse_argv(int argc, char *argv[]) {
ARG_BIND,
ARG_BIND_RO,
ARG_TMPFS,
ARG_OVERLAY,
ARG_OVERLAY_RO,
ARG_SETENV,
ARG_SHARE_SYSTEM,
ARG_REGISTER,
@ -320,6 +425,8 @@ static int parse_argv(int argc, char *argv[]) {
{ "bind", required_argument, NULL, ARG_BIND },
{ "bind-ro", required_argument, NULL, ARG_BIND_RO },
{ "tmpfs", required_argument, NULL, ARG_TMPFS },
{ "overlay", required_argument, NULL, ARG_OVERLAY },
{ "overlay-ro", required_argument, NULL, ARG_OVERLAY_RO },
{ "machine", required_argument, NULL, 'M' },
{ "slice", required_argument, NULL, 'S' },
{ "setenv", required_argument, NULL, ARG_SETENV },
@ -544,72 +651,131 @@ static int parse_argv(int argc, char *argv[]) {
case ARG_BIND:
case ARG_BIND_RO: {
_cleanup_free_ char *a = NULL, *b = NULL;
_cleanup_free_ char *source = NULL, *destination = NULL;
CustomMount *m;
char *e;
char ***x;
x = c == ARG_BIND ? &arg_bind : &arg_bind_ro;
e = strchr(optarg, ':');
if (e) {
a = strndup(optarg, e - optarg);
b = strdup(e + 1);
source = strndup(optarg, e - optarg);
destination = strdup(e + 1);
} else {
a = strdup(optarg);
b = strdup(optarg);
source = strdup(optarg);
destination = strdup(optarg);
}
if (!a || !b)
if (!source || !destination)
return log_oom();
if (!path_is_absolute(a) || !path_is_absolute(b)) {
if (!path_is_absolute(source) || !path_is_absolute(destination)) {
log_error("Invalid bind mount specification: %s", optarg);
return -EINVAL;
}
r = strv_extend(x, a);
if (r < 0)
m = custom_mount_add(CUSTOM_MOUNT_BIND);
if (!m)
return log_oom();
r = strv_extend(x, b);
if (r < 0)
return log_oom();
m->source = source;
m->destination = destination;
m->read_only = c == ARG_BIND_RO;
source = destination = NULL;
break;
}
case ARG_TMPFS: {
_cleanup_free_ char *a = NULL, *b = NULL;
_cleanup_free_ char *path = NULL, *opts = NULL;
CustomMount *m;
char *e;
e = strchr(optarg, ':');
if (e) {
a = strndup(optarg, e - optarg);
b = strdup(e + 1);
path = strndup(optarg, e - optarg);
opts = strdup(e + 1);
} else {
a = strdup(optarg);
b = strdup("mode=0755");
path = strdup(optarg);
opts = strdup("mode=0755");
}
if (!a || !b)
if (!path || !opts)
return log_oom();
if (!path_is_absolute(a)) {
if (!path_is_absolute(path)) {
log_error("Invalid tmpfs specification: %s", optarg);
return -EINVAL;
}
r = strv_push(&arg_tmpfs, a);
if (r < 0)
m = custom_mount_add(CUSTOM_MOUNT_TMPFS);
if (!m)
return log_oom();
a = NULL;
m->destination = path;
m->options = opts;
r = strv_push(&arg_tmpfs, b);
if (r < 0)
path = opts = NULL;
break;
}
case ARG_OVERLAY:
case ARG_OVERLAY_RO: {
_cleanup_free_ char *upper = NULL, *destination = NULL;
_cleanup_strv_free_ char **lower = NULL;
CustomMount *m;
unsigned n = 0;
char **i;
lower = strv_split(optarg, ":");
if (!lower)
return log_oom();
b = NULL;
STRV_FOREACH(i, lower) {
if (!path_is_absolute(*i)) {
log_error("Overlay path %s is not absolute.", *i);
return -EINVAL;
}
n++;
}
if (n < 2) {
log_error("--overlay= needs at least two colon-separated directories specified.");
return -EINVAL;
}
if (n == 2) {
/* If two parameters are specified,
* the first one is the lower, the
* second one the upper directory. And
* we'll also define the
* destination mount point the same as
* the upper. */
upper = lower[1];
lower[1] = NULL;
destination = strdup(upper);
if (!destination)
return log_oom();
} else {
upper = lower[n - 2];
destination = lower[n - 1];
lower[n - 2] = NULL;
}
m = custom_mount_add(CUSTOM_MOUNT_OVERLAY);
if (!m)
return log_oom();
m->destination = destination;
m->source = upper;
m->lower = lower;
m->read_only = c == ARG_OVERLAY_RO;
upper = destination = NULL;
lower = NULL;
break;
}
@ -964,62 +1130,149 @@ static int mount_all(const char *dest) {
return r;
}
static int mount_binds(const char *dest, char **l, bool ro) {
char **x, **y;
static int mount_bind(const char *dest, CustomMount *m) {
struct stat source_st, dest_st;
char *where;
int r;
STRV_FOREACH_PAIR(x, y, l) {
_cleanup_free_ char *where = NULL;
struct stat source_st, dest_st;
int r;
assert(dest);
assert(m);
if (stat(*x, &source_st) < 0)
return log_error_errno(errno, "Failed to stat %s: %m", *x);
if (stat(m->source, &source_st) < 0)
return log_error_errno(errno, "Failed to stat %s: %m", m->source);
where = strappend(dest, *y);
if (!where)
return log_oom();
where = strjoina(dest, m->destination);
r = stat(where, &dest_st);
if (r == 0) {
if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode)) {
log_error("Cannot bind mount directory %s on file %s.", *x, where);
return -EINVAL;
}
if (!S_ISDIR(source_st.st_mode) && S_ISDIR(dest_st.st_mode)) {
log_error("Cannot bind mount file %s on directory %s.", *x, where);
return -EINVAL;
}
} else if (errno == ENOENT) {
r = mkdir_parents_label(where, 0755);
if (r < 0)
return log_error_errno(r, "Failed to bind mount %s: %m", *x);
} else {
log_error_errno(errno, "Failed to bind mount %s: %m", *x);
return -errno;
r = stat(where, &dest_st);
if (r >= 0) {
if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode)) {
log_error("Cannot bind mount directory %s on file %s.", m->source, where);
return -EINVAL;
}
/* Create the mount point. Any non-directory file can be
* mounted on any non-directory file (regular, fifo, socket,
* char, block).
*/
if (S_ISDIR(source_st.st_mode)) {
r = mkdir_label(where, 0755);
if (r < 0 && errno != EEXIST)
return log_error_errno(r, "Failed to create mount point %s: %m", where);
} else {
r = touch(where);
if (r < 0)
return log_error_errno(r, "Failed to create mount point %s: %m", where);
if (!S_ISDIR(source_st.st_mode) && S_ISDIR(dest_st.st_mode)) {
log_error("Cannot bind mount file %s on directory %s.", m->source, where);
return -EINVAL;
}
if (mount(*x, where, NULL, MS_BIND, NULL) < 0)
return log_error_errno(errno, "mount(%s) failed: %m", where);
} else if (errno == ENOENT) {
r = mkdir_parents_label(where, 0755);
if (r < 0)
return log_error_errno(r, "Failed to make parents of %s: %m", where);
} else {
log_error_errno(errno, "Failed to stat %s: %m", where);
return -errno;
}
if (ro) {
r = bind_remount_recursive(where, true);
if (r < 0)
return log_error_errno(r, "Read-Only bind mount failed: %m");
/* Create the mount point. Any non-directory file can be
* mounted on any non-directory file (regular, fifo, socket,
* char, block).
*/
if (S_ISDIR(source_st.st_mode))
r = mkdir_label(where, 0755);
else
r = touch(where);
if (r < 0 && r != -EEXIST)
return log_error_errno(r, "Failed to create mount point %s: %m", where);
if (mount(m->source, where, NULL, MS_BIND, NULL) < 0)
return log_error_errno(errno, "mount(%s) failed: %m", where);
if (m->read_only) {
r = bind_remount_recursive(where, true);
if (r < 0)
return log_error_errno(r, "Read-only bind mount failed: %m");
}
return 0;
}
/* Mounts a tmpfs for the custom mount m below the container root dest.
 * The mount point is created if missing, and m->options is passed to
 * mount() as the data string. Returns 0 on success, a negative error
 * code otherwise. */
static int mount_tmpfs(const char *dest, CustomMount *m) {
        char *target;
        int k;

        assert(dest);
        assert(m);

        target = strjoina(dest, m->destination);

        /* An already existing mount point is fine. */
        k = mkdir_label(target, 0755);
        if (!(k >= 0 || k == -EEXIST))
                return log_error_errno(k, "Creating mount point for tmpfs %s failed: %m", target);

        if (mount("tmpfs", target, "tmpfs", MS_NODEV|MS_STRICTATIME, m->options) < 0)
                return log_error_errno(errno, "tmpfs mount to %s failed: %m", target);

        return 0;
}
/* Mounts an overlay file system for the custom mount m below the
 * container root dest. The mount point and the upper directory are
 * created if missing. For read-only overlays the upper directory is
 * simply stacked on top of the lower ones via lowerdir=; writable
 * overlays additionally get upperdir= and workdir=. Returns 0 on
 * success, a negative error code otherwise. */
static int mount_overlay(const char *dest, CustomMount *m) {
        _cleanup_free_ char *stack = NULL;
        char *target, *data;
        unsigned long flags;
        int k;

        assert(dest);
        assert(m);

        target = strjoina(dest, m->destination);

        k = mkdir_label(target, 0755);
        if (!(k >= 0 || k == -EEXIST))
                return log_error_errno(k, "Creating mount point for overlay %s failed: %m", target);

        (void) mkdir_p_label(m->source, 0755);

        /* lowerdir= takes the trees in the opposite order of m->lower, hence
         * join a temporarily reversed list and restore the order afterwards. */
        strv_reverse(m->lower);
        stack = strv_join(m->lower, ":");
        strv_reverse(m->lower);

        if (!stack)
                return log_oom();

        if (!m->read_only) {
                assert(m->work_dir);
                (void) mkdir_label(m->work_dir, 0700);

                data = strjoina("lowerdir=", stack, ",upperdir=", m->source, ",workdir=", m->work_dir);
                flags = 0;
        } else {
                data = strjoina("lowerdir=", m->source, ":", stack);
                flags = MS_RDONLY;
        }

        if (mount("overlay", target, "overlay", flags, data) < 0)
                return log_error_errno(errno, "overlay mount to %s failed: %m", target);

        return 0;
}
/* Applies all entries of the global custom mount list below the container
 * root dest, in array order, dispatching each entry to the helper matching
 * its type. Stops at the first failing mount and returns its negative
 * error code; returns 0 if every mount succeeded. */
static int mount_custom(const char *dest) {
unsigned i;
int r;
assert(dest);
for (i = 0; i < arg_n_custom_mounts; i++) {
CustomMount *m = &arg_custom_mounts[i];
/* Pick the mount routine for this entry's type. */
switch (m->type) {
case CUSTOM_MOUNT_BIND:
r = mount_bind(dest, m);
break;
case CUSTOM_MOUNT_TMPFS:
r = mount_tmpfs(dest, m);
break;
case CUSTOM_MOUNT_OVERLAY:
r = mount_overlay(dest, m);
break;
default:
assert_not_reached("Unknown custom mount type");
}
/* Abort on the first failure and pass the error up to the caller. */
if (r < 0)
return r;
}
return 0;
@ -1139,28 +1392,6 @@ static int mount_cgroup(const char *dest) {
return 0;
}
/* Mounts a tmpfs for every (path, options) pair stored in arg_tmpfs,
 * below the container root dest. Each mount point is created if missing.
 * Returns 0 on success, or a negative error code for the first failure. */
static int mount_tmpfs(const char *dest) {
        char **path, **opts;

        STRV_FOREACH_PAIR(path, opts, arg_tmpfs) {
                _cleanup_free_ char *target = NULL;
                int k;

                target = strappend(dest, *path);
                if (!target)
                        return log_oom();

                /* An already existing mount point is fine. */
                k = mkdir_label(target, 0755);
                if (!(k >= 0 || k == -EEXIST))
                        return log_error_errno(k, "Creating mount point for tmpfs %s failed: %m", target);

                if (mount("tmpfs", target, "tmpfs", MS_NODEV|MS_STRICTATIME, *opts) < 0)
                        return log_error_errno(errno, "tmpfs mount to %s failed: %m", target);
        }

        return 0;
}
static int setup_timezone(const char *dest) {
_cleanup_free_ char *where = NULL, *p = NULL, *q = NULL, *check = NULL, *what = NULL;
char *z, *y;
@ -3889,6 +4120,10 @@ int main(int argc, char *argv[]) {
if (r < 0)
goto finish;
r = custom_mounts_prepare();
if (r < 0)
goto finish;
interactive = isatty(STDIN_FILENO) > 0 && isatty(STDOUT_FILENO) > 0;
master = posix_openpt(O_RDWR|O_NOCTTY|O_CLOEXEC|O_NDELAY);
@ -4126,13 +4361,7 @@ int main(int argc, char *argv[]) {
if (setup_journal(arg_directory) < 0)
_exit(EXIT_FAILURE);
if (mount_binds(arg_directory, arg_bind, false) < 0)
_exit(EXIT_FAILURE);
if (mount_binds(arg_directory, arg_bind_ro, true) < 0)
_exit(EXIT_FAILURE);
if (mount_tmpfs(arg_directory) < 0)
if (mount_custom(arg_directory) < 0)
_exit(EXIT_FAILURE);
/* Wait until we are cgroup-ified, so that we
@ -4505,9 +4734,7 @@ finish:
strv_free(arg_network_interfaces);
strv_free(arg_network_macvlan);
strv_free(arg_network_ipvlan);
strv_free(arg_bind);
strv_free(arg_bind_ro);
strv_free(arg_tmpfs);
custom_mount_free_all();
flush_ports(&exposed);