2020-11-09 05:23:58 +01:00
/* SPDX-License-Identifier: LGPL-2.1-or-later */
2015-09-07 15:59:52 +02:00
2015-10-23 18:52:53 +02:00
# include <sys/mount.h>
2015-10-24 22:58:24 +02:00
# include <linux/magic.h>
2015-09-07 15:59:52 +02:00
2015-10-27 03:01:06 +01:00
# include "alloc-util.h"
2015-10-23 18:52:53 +02:00
# include "escape.h"
2016-06-23 13:41:56 +02:00
# include "fd-util.h"
2019-03-27 11:32:41 +01:00
# include "format-util.h"
2015-10-26 21:16:26 +01:00
# include "fs-util.h"
2015-09-07 15:59:52 +02:00
# include "label.h"
2015-10-23 18:52:53 +02:00
# include "mkdir.h"
2015-10-26 18:44:13 +01:00
# include "mount-util.h"
2018-11-29 10:24:39 +01:00
# include "mountpoint-util.h"
2015-10-26 16:18:16 +01:00
# include "nspawn-mount.h"
# include "parse-util.h"
2015-10-23 18:52:53 +02:00
# include "path-util.h"
# include "rm-rf.h"
2015-09-07 15:59:52 +02:00
# include "set.h"
2019-03-13 12:14:47 +01:00
# include "sort-util.h"
2015-10-26 22:01:44 +01:00
# include "stat-util.h"
2015-10-24 22:58:24 +02:00
# include "string-util.h"
2015-10-23 18:52:53 +02:00
# include "strv.h"
2018-11-30 21:05:27 +01:00
# include "tmpfile-util.h"
2015-10-27 00:42:07 +01:00
# include "user-util.h"
2015-09-07 15:59:52 +02:00
2018-04-27 22:01:54 +02:00
/* Appends one entry of type 't' to the array '*l', which currently holds '*n' entries.
 *
 * On success '*l' is updated to the (possibly moved) array, '*n' is incremented, and a pointer to the new
 * entry is returned. Every field of the new entry other than 'type' is zero-initialized. The returned
 * pointer refers into the array, so it may be invalidated by a later call that grows the array again.
 *
 * Returns NULL if the array could not be grown; '*l' and '*n' are left untouched in that case. */
CustomMount *custom_mount_add(CustomMount **l, size_t *n, CustomMountType t) {
        CustomMount *grown, *slot;

        assert(l);
        assert(n);
        assert(t >= 0);
        assert(t < _CUSTOM_MOUNT_TYPE_MAX);

        grown = reallocarray(*l, *n + 1, sizeof(CustomMount));
        if (!grown)
                return NULL;

        /* Publish the new array first: the old pointer must not be used after a successful resize. */
        *l = grown;

        /* The new entry is the former one-past-the-end element. */
        slot = &grown[(*n)++];
        *slot = (CustomMount) {
                .type = t,
        };

        return slot;
}
2018-04-27 22:01:54 +02:00
/* Releases an array of 'n' custom mounts, including the array itself.
 *
 * Besides freeing the memory owned by each entry, this also removes the entry's work directory and its
 * throw-away temporary directory from disk, if set. Removal is best-effort: the result of rm_rf() is
 * deliberately ignored. */
void custom_mount_free_all(CustomMount *l, size_t n) {
        CustomMount *m = l;
        size_t remaining;

        for (remaining = n; remaining > 0; remaining--, m++) {
                /* On-disk state first: work directory, then the temporary directory. */
                if (m->work_dir) {
                        (void) rm_rf(m->work_dir, REMOVE_ROOT|REMOVE_PHYSICAL);
                        free(m->work_dir);
                }

                if (m->rm_rf_tmpdir) {
                        (void) rm_rf(m->rm_rf_tmpdir, REMOVE_ROOT|REMOVE_PHYSICAL);
                        free(m->rm_rf_tmpdir);
                }

                /* Then everything that only lives in memory. */
                free(m->source);
                free(m->destination);
                free(m->options);
                strv_free(m->lower);
                free(m->type_argument);
        }

        free(l);
}
2018-09-18 01:39:24 +02:00
static int custom_mount_compare ( const CustomMount * a , const CustomMount * b ) {
2015-09-07 15:59:52 +02:00
int r ;
2018-09-18 01:39:24 +02:00
r = path_compare ( a - > destination , b - > destination ) ;
2015-09-07 15:59:52 +02:00
if ( r ! = 0 )
return r ;
2018-09-18 01:39:24 +02:00
return CMP ( a - > type , b - > type ) ;
2015-09-07 15:59:52 +02:00
}
2016-11-30 16:02:47 +01:00
/* Returns true if 'p' is acceptable as a mount source: an absolute path, optionally preceded by a single
 * '+' character. (resolve_source_path() later joins '+'-prefixed paths onto its 'dest' argument.) */
static bool source_path_is_valid(const char *p) {
        assert(p);

        /* Skip the optional '+' marker and validate what follows it. */
        return path_is_absolute(p[0] == '+' ? p + 1 : p);
}
/* Turns a user-supplied source path into a newly allocated path:
 *
 *   - "+/some/path" becomes 'dest' joined with "/some/path",
 *   - anything else is duplicated verbatim.
 *
 * Returns NULL if 'source' is NULL or if allocation fails; callers that need to tell the two apart must
 * check 'source' themselves before calling. The caller owns the returned string. */
static char *resolve_source_path(const char *dest, const char *source) {
        if (!source)
                return NULL;

        return source[0] == '+' ? path_join(dest, source + 1) : strdup(source);
}
2019-12-12 17:05:21 +01:00
/* Creates a throw-away directory below /var/tmp for a mount that has no source configured, and points
 * m->source at a "src" subdirectory inside it.
 *
 * On success m->rm_rf_tmpdir holds the temporary directory (which custom_mount_free_all() removes again)
 * and m->source the subdirectory to mount. Both fields must be unset on entry.
 *
 * Returns 0 on success, or the negative value returned by the logging helper on failure. */
static int allocate_temporary_source(CustomMount *m) {
        int r;

        assert(m);
        assert(!m->source);
        assert(!m->rm_rf_tmpdir);

        m->rm_rf_tmpdir = strdup("/var/tmp/nspawn-temp-XXXXXX");
        if (!m->rm_rf_tmpdir)
                return log_oom();

        if (!mkdtemp(m->rm_rf_tmpdir)) {
                /* Log while errno still belongs to mkdtemp(): releasing the template afterwards may
                 * overwrite errno on C libraries whose free() does not preserve it. */
                r = log_error_errno(errno, "Failed to acquire temporary directory: %m");

                /* Nothing was created, so drop the template; otherwise custom_mount_free_all() would
                 * try to remove a path we never owned. */
                m->rm_rf_tmpdir = mfree(m->rm_rf_tmpdir);
                return r;
        }

        /* From here on the temporary directory stays recorded in m->rm_rf_tmpdir even on failure, so
         * that it gets cleaned up together with the mount entry. */
        m->source = path_join(m->rm_rf_tmpdir, "src");
        if (!m->source)
                return log_oom();

        if (mkdir(m->source, 0755) < 0)
                return log_error_errno(errno, "Failed to create %s: %m", m->source);

        return 0;
}
2018-04-27 22:01:54 +02:00
/* Prepares all 'n' custom mounts in 'l' for use with the container tree at 'dest'.
 *
 * This sorts the mounts, resolves all configured source paths, and allocates the temporary directories for
 * mounts that need one. It makes sure we know all temporary directories: it is called in the parent
 * process, so that the directories to remove on exit are known before the children are forked off.
 *
 * Returns 0 on success, a negative error value otherwise. Entries processed before a failure keep their
 * already-prepared state. */
int custom_mount_prepare_all(const char *dest, CustomMount *l, size_t n) {
        size_t idx;
        int r;

        assert(l || n == 0);

        /* Order the custom mounts before touching any of them. */
        typesafe_qsort(l, n, custom_mount_compare);

        for (idx = 0; idx < n; idx++) {
                CustomMount *m = &l[idx];
                char *resolved, **layer;
                bool overlay;

                /* /proc is mounted in the inner child, i.e. once CLONE_NEWPID was acquired. All other
                 * mounts are established in the outer child already, before CLONE_NEWPID and in
                 * particular CLONE_NEWUSER. Hence any custom mount below /proc has to be mounted in the
                 * inner child too, not the outer one. Record that here. */
                m->in_userns = path_startswith(m->destination, "/proc");

                /* Only bind and overlay mounts carry paths that need preparing. */
                overlay = m->type == CUSTOM_MOUNT_OVERLAY;
                if (!overlay && m->type != CUSTOM_MOUNT_BIND)
                        continue;

                /* Overlay only: resolve each lower directory in place. */
                if (overlay)
                        STRV_FOREACH(layer, m->lower) {
                                resolved = resolve_source_path(dest, *layer);
                                if (!resolved)
                                        return log_oom();

                                free_and_replace(*layer, resolved);
                        }

                /* Bind source / overlay upper directory: resolve it if one was specified, otherwise use a
                 * throw-away temporary directory in /var/tmp. */
                if (!m->source) {
                        r = allocate_temporary_source(m);
                        if (r < 0)
                                return r;
                } else {
                        resolved = resolve_source_path(dest, m->source);
                        if (!resolved)
                                return log_oom();

                        free_and_replace(m->source, resolved);
                }

                if (!overlay)
                        continue;

                /* Overlay only: make sure we have a working directory, either the configured one or a
                 * randomly named path derived from the upper directory. */
                if (!m->work_dir) {
                        r = tempfn_random(m->source, NULL, &m->work_dir);
                        if (r < 0)
                                return log_error_errno(r, "Failed to acquire working directory: %m");
                } else {
                        resolved = resolve_source_path(dest, m->work_dir);
                        if (!resolved)
                                return log_oom();

                        free_and_replace(m->work_dir, resolved);
                }

                /* Best effort: a failure to create the directory is not reported from here. */
                (void) mkdir_label(m->work_dir, 0700);
        }

        return 0;
}
2018-04-27 22:01:54 +02:00
/* Parses a bind mount specification of the form "SOURCE[:DESTINATION[:OPTIONS]]" and appends the result to
 * the custom mount array '*l' / '*n'.
 *
 *   - If only SOURCE is given, the destination is the same path, minus a leading '+' if present.
 *   - An empty SOURCE is stored as "no source" (NULL).
 *   - Everything after the second ':' is taken verbatim as the options string.
 *
 * Returns 0 on success, -EINVAL if the specification is empty or a path is not valid, -ENOMEM on allocation
 * failure, or the negative error returned by the word extraction. */
int bind_mount_parse(CustomMount **l, size_t *n, const char *s, bool read_only) {
        _cleanup_free_ char *source = NULL, *destination = NULL, *opts = NULL;
        const char *p = s;
        CustomMount *m;
        int r;

        assert(l);
        assert(n);

        r = extract_many_words(&p, ":", EXTRACT_DONT_COALESCE_SEPARATORS, &source, &destination, NULL);
        if (r < 0)
                return r;

        switch (r) {

        case 0:
                return -EINVAL;

        case 1:
                /* Source only: mount it at the same location, ignoring the '+' marker. */
                destination = strdup(source[0] == '+' ? source + 1 : source);
                if (!destination)
                        return -ENOMEM;
                break;

        case 2:
                /* Whatever remains unparsed is the options string. */
                if (!isempty(p)) {
                        opts = strdup(p);
                        if (!opts)
                                return -ENOMEM;
                }
                break;

        default:
                break;
        }

        if (!isempty(source)) {
                if (!source_path_is_valid(source))
                        return -EINVAL;
        } else
                source = mfree(source);

        if (!path_is_absolute(destination))
                return -EINVAL;

        m = custom_mount_add(l, n, CUSTOM_MOUNT_BIND);
        if (!m)
                return -ENOMEM;

        /* Hand ownership of the parsed strings over to the new entry. */
        m->source = TAKE_PTR(source);
        m->destination = TAKE_PTR(destination);
        m->options = TAKE_PTR(opts);
        m->read_only = read_only;

        return 0;
}
2018-04-27 22:01:54 +02:00
/* Parses a tmpfs mount specification of the form "PATH[:OPTIONS]" and appends the result to the custom
 * mount array '*l' / '*n'. If no options are given, "mode=0755" is used.
 *
 * Returns 0 on success, -EINVAL if the specification is empty or PATH is not absolute, -ENOMEM on
 * allocation failure, or the negative error returned by the word extraction. */
int tmpfs_mount_parse(CustomMount **l, size_t *n, const char *s) {
        _cleanup_free_ char *path = NULL, *opts = NULL;
        const char *rest = s;
        CustomMount *entry;
        int r;

        assert(l);
        assert(n);
        assert(s);

        r = extract_first_word(&rest, &path, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
        if (r < 0)
                return r;
        if (r == 0)
                return -EINVAL;

        /* Whatever follows the first ':' is the options string; fall back to a default if absent. */
        opts = strdup(isempty(rest) ? "mode=0755" : rest);
        if (!opts)
                return -ENOMEM;

        if (!path_is_absolute(path))
                return -EINVAL;

        entry = custom_mount_add(l, n, CUSTOM_MOUNT_TMPFS);
        if (!entry)
                return -ENOMEM;

        entry->destination = TAKE_PTR(path);
        entry->options = TAKE_PTR(opts);

        return 0;
}
2018-04-27 22:01:54 +02:00
/* Parses an overlay mount specification, a ':'-separated list of at least two paths, and appends the result
 * to the custom mount array '*l' / '*n'.
 *
 *   - Exactly two paths: "LOWER:UPPER". The destination is the upper path minus a leading '+' if present.
 *   - More than two paths: "LOWER...:UPPER:DESTINATION". An empty UPPER is stored as "no source" (NULL), so
 *     that a throw-away directory in /var/tmp can be created for it automatically later.
 *
 * Returns 0 on success, -EADDRNOTAVAIL if fewer than two paths are given, -EINVAL if a path is not valid,
 * -ENOMEM on allocation failure, or the negative error returned by the splitting. */
int overlay_mount_parse(CustomMount **l, size_t *n, const char *s, bool read_only) {
        _cleanup_free_ char *upper = NULL, *destination = NULL;
        _cleanup_strv_free_ char **lower = NULL;
        CustomMount *entry;
        int count;

        count = strv_split_full(&lower, s, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
        if (count < 0)
                return count;
        if (count < 2)
                return -EADDRNOTAVAIL;

        if (count > 2) {
                char **layer;

                /* The last element is the destination, the second to last one the "upper" directory, and
                 * everything before that the "lower" directories.
                 *
                 * Ownership note: clearing slot count-2 terminates the vector there, so the destination
                 * string in slot count-1 is no longer reachable through 'lower' and is owned solely by
                 * 'destination' from here on. */
                destination = lower[count - 1];
                upper = TAKE_PTR(lower[count - 2]);

                STRV_FOREACH(layer, lower)
                        if (!source_path_is_valid(*layer))
                                return -EINVAL;

                /* An unspecified upper directory is created automatically as a throw-away directory in
                 * /var/tmp later on; mark that by dropping the empty string. */
                if (!isempty(upper)) {
                        if (!source_path_is_valid(upper))
                                return -EINVAL;
                } else
                        upper = mfree(upper);

                if (!path_is_absolute(destination))
                        return -EINVAL;
        } else {
                /* Two elements: the first is the lower, the second the upper directory, and the upper
                 * directory doubles as the destination mount point. */
                if (!(source_path_is_valid(lower[0]) && source_path_is_valid(lower[1])))
                        return -EINVAL;

                upper = TAKE_PTR(lower[1]);

                /* Take the destination without the "+" prefix. */
                destination = strdup(upper[0] == '+' ? upper + 1 : upper);
                if (!destination)
                        return -ENOMEM;
        }

        entry = custom_mount_add(l, n, CUSTOM_MOUNT_OVERLAY);
        if (!entry)
                return -ENOMEM;

        entry->destination = TAKE_PTR(destination);
        entry->source = TAKE_PTR(upper);
        entry->lower = TAKE_PTR(lower);
        entry->read_only = read_only;

        return 0;
}
nspawn: add support for executing OCI runtime bundles with nspawn
This is a pretty large patch, and adds support for OCI runtime bundles
to nspawn. A new switch --oci-bundle= is added that takes a path to an
OCI bundle. The JSON file included therein is read similar to a .nspawn
settings files, however with a different feature set.
Implementation-wise this mostly extends the pre-existing Settings object
to carry additional properties for OCI. However, OCI supports some
concepts .nspawn files did not support yet, which this patch also adds:
1. Support for "masking" files and directories. This functionality is now
also available via the new --inaccessible= cmdline switch, and
Inaccessible= in .nspawn files.
2. Support for mounting arbitrary file systems. (not exposed through
nspawn cmdline nor .nspawn files, because probably not a good idea)
3. Ability to configure the console settings for a container. This
functionality is now also available on the nspawn cmdline in the new
--console= switch (not added to .nspawn for now, as it is something
specific to the invocation really, not a property of the container)
4. Console width/height configuration. Not exposed through
.nspawn/cmdline, but this may be controlled through $COLUMNS and
$LINES like in most other UNIX tools.
5. UID/GID configuration by raw numbers. (not exposed in .nspawn and on
the cmdline, since containers likely have different user tables, and
the existing --user= switch appears to be the better option)
6. OCI hook commands (not exposed in .nspawn/cmdline, as very specific to
OCI)
7. Creation of additional devices nodes in /dev. Most likely not a good
idea, hence not exposed in .nspawn/cmdline. There's already --bind=
to achieve the same, which is the better alternative.
8. Explicit syscall filters. This is not a good idea, due to the skewed
arch support, hence not exposed through .nspawn/cmdline.
9. Configuration of some sysctls on a whitelist. Questionnable, not
supported in .nspawn/cmdline for now.
10. Configuration of all 5 types of capabilities. Not a useful concept,
since the kernel will reduce the caps on execve() anyway. Not
exposed through .nspawn/cmdline as this is not very useful hence.
Note that this only implements the OCI runtime logic itself. It does not
provide a runc-compatible command line tool. This is left for a later
PR. Only with that in place tools such as "buildah" can use the OCI
support in nspawn as drop-in replacement.
Currently still missing is OCI hook support, but it's already parsed and
everything, and should be easy to add. Other than that, OCI is
implemented pretty comprehensively.
There's a list of incompatibilities in the nspawn-oci.c file. In a later
PR I'd like to convert this into proper markdown and add it to the
documentation directory.
2018-04-25 11:23:37 +02:00
/* Registers the absolute path 's' as an "inaccessible" custom mount by appending a new
 * CUSTOM_MOUNT_INACCESSIBLE entry to the array *l (whose element count is *n).
 *
 * Returns 0 on success, -EINVAL if 's' is not an absolute path, and -ENOMEM if either the copy of the
 * path or the new array entry cannot be allocated. */
int inaccessible_mount_parse(CustomMount **l, size_t *n, const char *s) {
        _cleanup_free_ char *destination = NULL;
        CustomMount *entry;

        assert(l);
        assert(n);
        assert(s);

        /* Relative paths are rejected outright. */
        if (!path_is_absolute(s))
                return -EINVAL;

        /* Copy the string before growing the array, so a failed copy leaves *l and *n untouched. */
        destination = strdup(s);
        if (!destination)
                return -ENOMEM;

        entry = custom_mount_add(l, n, CUSTOM_MOUNT_INACCESSIBLE);
        if (!entry)
                return -ENOMEM;

        /* The entry now owns the copy (TAKE_PTR() is expected to reset the local so that the
         * _cleanup_free_ handler does not release it). */
        entry->destination = TAKE_PTR(destination);
        return 0;
}
2017-07-08 00:57:08 +02:00
int tmpfs_patch_options (
2015-09-07 15:59:52 +02:00
const char * options ,
nspawn: Simplify tmpfs_patch_options() usage, and trickle that up
One of the things that tmpfs_patch_options does is take an (optional) UID,
and insert "uid=${UID},gid=${UID}" into the options string. So we need a
uid_t argument, and a way of telling if we should use it. Fortunately,
that is built in to the uid_t type by having UID_INVALID as a possible
value.
So this is really a feature that requires one argument. Yet, it is somehow
taking 4! That is absurd. Simplify it to only take one argument, and have
that trickle all the way up to mount_all()'s usage.
Now, in may of the uses, the argument becomes
uid_shift == 0 ? UID_INVALID : uid_shift
because it used to treat uid_shift=0 as invalid unless the patch_ids flag
was also set. This keeps the behavior the same. Note that in all cases
where it is invoked, if !use_userns (sometimes called !userns), then
uid_shift is 0; we don't have to add any checks for that.
That said, I'm pretty sure that "uid=0" and not setting "uid=" are the
same, but Christian Brauner seemed to not think so when implementing the
cgns support. https://github.com/systemd/systemd/pull/3589
2017-06-14 00:06:09 +02:00
uid_t uid_shift ,
2015-09-07 15:59:52 +02:00
const char * selinux_apifs_context ,
char * * ret ) {
char * buf = NULL ;
nspawn: Simplify tmpfs_patch_options() usage, and trickle that up
One of the things that tmpfs_patch_options does is take an (optional) UID,
and insert "uid=${UID},gid=${UID}" into the options string. So we need a
uid_t argument, and a way of telling if we should use it. Fortunately,
that is built in to the uid_t type by having UID_INVALID as a possible
value.
So this is really a feature that requires one argument. Yet, it is somehow
taking 4! That is absurd. Simplify it to only take one argument, and have
that trickle all the way up to mount_all()'s usage.
Now, in may of the uses, the argument becomes
uid_shift == 0 ? UID_INVALID : uid_shift
because it used to treat uid_shift=0 as invalid unless the patch_ids flag
was also set. This keeps the behavior the same. Note that in all cases
where it is invoked, if !use_userns (sometimes called !userns), then
uid_shift is 0; we don't have to add any checks for that.
That said, I'm pretty sure that "uid=0" and not setting "uid=" are the
same, but Christian Brauner seemed to not think so when implementing the
cgns support. https://github.com/systemd/systemd/pull/3589
2017-06-14 00:06:09 +02:00
if ( uid_shift ! = UID_INVALID ) {
2016-10-23 17:57:55 +02:00
if ( asprintf ( & buf , " %s%suid= " UID_FMT " ,gid= " UID_FMT ,
2017-11-24 10:31:08 +01:00
strempty ( options ) , options ? " , " : " " ,
2016-10-23 17:57:55 +02:00
uid_shift , uid_shift ) < 0 )
2015-09-07 15:59:52 +02:00
return - ENOMEM ;
options = buf ;
}
2017-10-03 10:41:51 +02:00
# if HAVE_SELINUX
2015-09-07 15:59:52 +02:00
if ( selinux_apifs_context ) {
char * t ;
2017-11-24 10:31:08 +01:00
t = strjoin ( strempty ( options ) , options ? " , " : " " ,
2016-10-23 17:57:55 +02:00
" context= \" " , selinux_apifs_context , " \" " ) ;
free ( buf ) ;
if ( ! t )
2015-09-07 15:59:52 +02:00
return - ENOMEM ;
buf = t ;
}
# endif
2016-06-23 13:41:56 +02:00
if ( ! buf & & options ) {
buf = strdup ( options ) ;
if ( ! buf )
return - ENOMEM ;
}
2015-09-07 15:59:52 +02:00
* ret = buf ;
2016-06-23 13:41:56 +02:00
2015-09-07 15:59:52 +02:00
return ! ! buf ;
}
2016-10-14 14:00:15 +02:00
/* Sets up a restricted sysfs hierarchy at <dest>/sys.
 *
 * If <dest>/sys is already a sysfs mount nothing is done. Otherwise a fresh sysfs instance is mounted
 * temporarily at <dest>/sys/full, the sub-directories "block", "bus", "class", "dev", "devices" and
 * "kernel" are bind mounted from there to <dest>/sys/<name> and remounted nosuid/noexec/nodev, and the
 * temporary mount is unmounted and its directory removed again. Finally <dest>/sys/fs/cgroup is created
 * and <dest>/sys itself is remounted with the same flags.
 *
 * If MOUNT_APPLY_APIVFS_RO is set in 'mount_settings', MS_RDONLY is added to the sysfs mount and to all
 * remounts.
 *
 * Returns 0 if /sys was already sysfs, a negative errno-style value on failure, and otherwise the
 * result of the final remount call.
 *
 * NOTE(review): on the error paths after the temporary sysfs was mounted, <dest>/sys/full is left
 * mounted; no cleanup is attempted here. */
int mount_sysfs(const char *dest, MountSettingsMask mount_settings) {
        const char *full, *top, *x, *cgroup_dir;
        unsigned long extra_flags = 0;
        int r;

        top = prefix_roota(dest, "/sys");
        r = path_is_fs_type(top, SYSFS_MAGIC);
        if (r < 0)
                return log_error_errno(r, "Failed to determine filesystem type of %s: %m", top);
        /* /sys might already be mounted as sysfs by the outer child in the
         * !netns case. In this case, it's all good. Don't touch it because we
         * don't have the right to do so, see https://github.com/systemd/systemd/issues/1555.
         */
        if (r > 0)
                return 0;

        full = prefix_roota(top, "/full");

        /* Best effort: if this fails the mount call below fails and reports the problem. */
        (void) mkdir(full, 0755);

        if (FLAGS_SET(mount_settings, MOUNT_APPLY_APIVFS_RO))
                extra_flags |= MS_RDONLY;

        r = mount_nofollow_verbose(LOG_ERR, "sysfs", full, "sysfs",
                                   MS_NOSUID|MS_NOEXEC|MS_NODEV|extra_flags, NULL);
        if (r < 0)
                return r;

        FOREACH_STRING(x, "block", "bus", "class", "dev", "devices", "kernel") {
                _cleanup_free_ char *from = NULL, *to = NULL;

                from = path_join(full, x);
                if (!from)
                        return log_oom();

                to = path_join(top, x);
                if (!to)
                        return log_oom();

                (void) mkdir(to, 0755);

                /* Bind mount first ... */
                r = mount_nofollow_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
                if (r < 0)
                        return r;

                /* ... then apply the restrictive flags with a separate bind remount. */
                r = mount_nofollow_verbose(LOG_ERR, NULL, to, NULL,
                                           MS_BIND|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT|extra_flags, NULL);
                if (r < 0)
                        return r;
        }

        /* The full sysfs instance was only needed as source of the bind mounts above. */
        r = umount_verbose(LOG_ERR, full, UMOUNT_NOFOLLOW);
        if (r < 0)
                return r;

        if (rmdir(full) < 0)
                return log_error_errno(errno, "Failed to remove %s: %m", full);

        /* Create mountpoint for cgroups. Otherwise we are not allowed since we
         * remount /sys read-only.
         */
        cgroup_dir = prefix_roota(top, "/fs/cgroup");
        (void) mkdir_p(cgroup_dir, 0755);

        return mount_nofollow_verbose(LOG_ERR, NULL, top, NULL,
                                      MS_BIND|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT|extra_flags, NULL);
}
2015-09-07 15:59:52 +02:00
int mount_all ( const char * dest ,
2016-10-14 14:00:15 +02:00
MountSettingsMask mount_settings ,
nspawn: Simplify tmpfs_patch_options() usage, and trickle that up
One of the things that tmpfs_patch_options does is take an (optional) UID,
and insert "uid=${UID},gid=${UID}" into the options string. So we need a
uid_t argument, and a way of telling if we should use it. Fortunately,
that is built in to the uid_t type by having UID_INVALID as a possible
value.
So this is really a feature that requires one argument. Yet, it is somehow
taking 4! That is absurd. Simplify it to only take one argument, and have
that trickle all the way up to mount_all()'s usage.
Now, in many of the uses, the argument becomes
uid_shift == 0 ? UID_INVALID : uid_shift
because it used to treat uid_shift=0 as invalid unless the patch_ids flag
was also set. This keeps the behavior the same. Note that in all cases
where it is invoked, if !use_userns (sometimes called !userns), then
uid_shift is 0; we don't have to add any checks for that.
That said, I'm pretty sure that "uid=0" and not setting "uid=" are the
same, but Christian Brauner seemed to not think so when implementing the
cgns support. https://github.com/systemd/systemd/pull/3589
2017-06-14 00:06:09 +02:00
uid_t uid_shift ,
2015-09-07 15:59:52 +02:00
const char * selinux_apifs_context ) {
nspawn: add support for executing OCI runtime bundles with nspawn
This is a pretty large patch, and adds support for OCI runtime bundles
to nspawn. A new switch --oci-bundle= is added that takes a path to an
OCI bundle. The JSON file included therein is read similar to a .nspawn
settings files, however with a different feature set.
Implementation-wise this mostly extends the pre-existing Settings object
to carry additional properties for OCI. However, OCI supports some
concepts .nspawn files did not support yet, which this patch also adds:
1. Support for "masking" files and directories. This functionality is now
also available via the new --inaccessible= cmdline command, and
Inaccessible= in .nspawn files.
2. Support for mounting arbitrary file systems. (not exposed through
nspawn cmdline nor .nspawn files, because probably not a good idea)
3. Ability to configure the console settings for a container. This
functionality is now also available on the nspawn cmdline in the new
--console= switch (not added to .nspawn for now, as it is something
specific to the invocation really, not a property of the container)
4. Console width/height configuration. Not exposed through
.nspawn/cmdline, but this may be controlled through $COLUMNS and
$LINES like in most other UNIX tools.
5. UID/GID configuration by raw numbers. (not exposed in .nspawn and on
the cmdline, since containers likely have different user tables, and
the existing --user= switch appears to be the better option)
6. OCI hook commands (not exposed in .nspawn/cmdline, as very specific to
OCI)
7. Creation of additional device nodes in /dev. Most likely not a good
idea, hence not exposed in .nspawn/cmdline. There's already --bind=
to achieve the same, which is the better alternative.
8. Explicit syscall filters. This is not a good idea, due to the skewed
arch support, hence not exposed through .nspawn/cmdline.
9. Configuration of some sysctls on a whitelist. Questionable, not
supported in .nspawn/cmdline for now.
10. Configuration of all 5 types of capabilities. Not a useful concept,
since the kernel will reduce the caps on execve() anyway. Not
exposed through .nspawn/cmdline as this is not very useful hence.
Note that this only implements the OCI runtime logic itself. It does not
provide a runc-compatible command line tool. This is left for a later
PR. Only with that in place tools such as "buildah" can use the OCI
support in nspawn as drop-in replacement.
Currently still missing is OCI hook support, but it's already parsed and
everything, and should be easy to add. Other than that, OCI is
implemented pretty comprehensively.
There's a list of incompatibilities in the nspawn-oci.c file. In a later
PR I'd like to convert this into proper markdown and add it to the
documentation directory.
2018-04-25 11:23:37 +02:00
# define PROC_INACCESSIBLE_REG(path) \
{ " /run/systemd/inaccessible/reg " , ( path ) , NULL , NULL , MS_BIND , \
MOUNT_IN_USERNS | MOUNT_APPLY_APIVFS_RO } , /* Bind mount first ... */ \
2018-04-30 12:22:41 +02:00
{ NULL , ( path ) , NULL , NULL , MS_BIND | MS_RDONLY | MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_REMOUNT , \
MOUNT_IN_USERNS | MOUNT_APPLY_APIVFS_RO } /* Then, make it r/o */
# define PROC_READ_ONLY(path) \
{ ( path ) , ( path ) , NULL , NULL , MS_BIND , \
MOUNT_IN_USERNS | MOUNT_APPLY_APIVFS_RO } , /* Bind mount first ... */ \
{ NULL , ( path ) , NULL , NULL , MS_BIND | MS_RDONLY | MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_REMOUNT , \
MOUNT_IN_USERNS | MOUNT_APPLY_APIVFS_RO } /* Then, make it r/o */
2015-09-07 15:59:52 +02:00
typedef struct MountPoint {
const char * what ;
const char * where ;
const char * type ;
const char * options ;
unsigned long flags ;
2016-10-14 14:00:15 +02:00
MountSettingsMask mount_settings ;
2015-09-07 15:59:52 +02:00
} MountPoint ;
static const MountPoint mount_table [ ] = {
2018-04-30 12:22:41 +02:00
/* First we list inner child mounts (i.e. mounts applied *after* entering user namespacing) */
{ " proc " , " /proc " , " proc " , NULL , MS_NOSUID | MS_NOEXEC | MS_NODEV ,
2020-09-22 15:51:17 +02:00
MOUNT_FATAL | MOUNT_IN_USERNS | MOUNT_MKDIR | MOUNT_FOLLOW_SYMLINKS } , /* we follow symlinks here since not following them requires /proc/ already being mounted, which we don't have here. */
2018-04-30 12:22:41 +02:00
{ " /proc/sys " , " /proc/sys " , NULL , NULL , MS_BIND ,
MOUNT_FATAL | MOUNT_IN_USERNS | MOUNT_APPLY_APIVFS_RO } , /* Bind mount first ... */
{ " /proc/sys/net " , " /proc/sys/net " , NULL , NULL , MS_BIND ,
MOUNT_FATAL | MOUNT_IN_USERNS | MOUNT_APPLY_APIVFS_RO | MOUNT_APPLY_APIVFS_NETNS } , /* (except for this) */
{ NULL , " /proc/sys " , NULL , NULL , MS_BIND | MS_RDONLY | MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_REMOUNT ,
MOUNT_FATAL | MOUNT_IN_USERNS | MOUNT_APPLY_APIVFS_RO } , /* ... then, make it r/o */
/* Make these files inaccessible to container payloads: they potentially leak information about kernel
* internals or the host ' s execution environment to the container */
nspawn: add support for executing OCI runtime bundles with nspawn
This is a pretty large patch, and adds support for OCI runtime bundles
to nspawn. A new switch --oci-bundle= is added that takes a path to an
OCI bundle. The JSON file included therein is read similar to a .nspawn
settings files, however with a different feature set.
Implementation-wise this mostly extends the pre-existing Settings object
to carry additional properties for OCI. However, OCI supports some
concepts .nspawn files did not support yet, which this patch also adds:
1. Support for "masking" files and directories. This functionality is now
also available via the new --inaccessible= cmdline command, and
Inaccessible= in .nspawn files.
2. Support for mounting arbitrary file systems. (not exposed through
nspawn cmdline nor .nspawn files, because probably not a good idea)
3. Ability to configure the console settings for a container. This
functionality is now also available on the nspawn cmdline in the new
--console= switch (not added to .nspawn for now, as it is something
specific to the invocation really, not a property of the container)
4. Console width/height configuration. Not exposed through
.nspawn/cmdline, but this may be controlled through $COLUMNS and
$LINES like in most other UNIX tools.
5. UID/GID configuration by raw numbers. (not exposed in .nspawn and on
the cmdline, since containers likely have different user tables, and
the existing --user= switch appears to be the better option)
6. OCI hook commands (not exposed in .nspawn/cmdline, as very specific to
OCI)
7. Creation of additional device nodes in /dev. Most likely not a good
idea, hence not exposed in .nspawn/cmdline. There's already --bind=
to achieve the same, which is the better alternative.
8. Explicit syscall filters. This is not a good idea, due to the skewed
arch support, hence not exposed through .nspawn/cmdline.
9. Configuration of some sysctls on a whitelist. Questionable, not
supported in .nspawn/cmdline for now.
10. Configuration of all 5 types of capabilities. Not a useful concept,
since the kernel will reduce the caps on execve() anyway. Not
exposed through .nspawn/cmdline as this is not very useful hence.
Note that this only implements the OCI runtime logic itself. It does not
provide a runc-compatible command line tool. This is left for a later
PR. Only with that in place tools such as "buildah" can use the OCI
support in nspawn as drop-in replacement.
Currently still missing is OCI hook support, but it's already parsed and
everything, and should be easy to add. Other than that, OCI is
implemented pretty comprehensively.
There's a list of incompatibilities in the nspawn-oci.c file. In a later
PR I'd like to convert this into proper markdown and add it to the
documentation directory.
2018-04-25 11:23:37 +02:00
PROC_INACCESSIBLE_REG ( " /proc/kallsyms " ) ,
PROC_INACCESSIBLE_REG ( " /proc/kcore " ) ,
PROC_INACCESSIBLE_REG ( " /proc/keys " ) ,
PROC_INACCESSIBLE_REG ( " /proc/sysrq-trigger " ) ,
PROC_INACCESSIBLE_REG ( " /proc/timer_list " ) ,
2018-04-30 12:22:41 +02:00
/* Make these directories read-only to container payloads: they show hardware information, and in some
* cases contain tunables the container really shouldn ' t have access to . */
PROC_READ_ONLY ( " /proc/acpi " ) ,
PROC_READ_ONLY ( " /proc/apm " ) ,
PROC_READ_ONLY ( " /proc/asound " ) ,
PROC_READ_ONLY ( " /proc/bus " ) ,
PROC_READ_ONLY ( " /proc/fs " ) ,
PROC_READ_ONLY ( " /proc/irq " ) ,
PROC_READ_ONLY ( " /proc/scsi " ) ,
2020-05-22 17:06:54 +02:00
{ " mqueue " , " /dev/mqueue " , " mqueue " , NULL , MS_NOSUID | MS_NOEXEC | MS_NODEV ,
2020-04-22 16:35:32 +02:00
MOUNT_IN_USERNS | MOUNT_MKDIR } ,
2019-03-25 19:42:47 +01:00
2018-04-30 12:22:41 +02:00
/* Then we list outer child mounts (i.e. mounts applied *before* entering user namespacing) */
2020-07-30 13:36:10 +02:00
{ " tmpfs " , " /tmp " , " tmpfs " , " mode=1777 " NESTED_TMPFS_LIMITS , MS_NOSUID | MS_NODEV | MS_STRICTATIME ,
2020-04-22 16:35:32 +02:00
MOUNT_FATAL | MOUNT_APPLY_TMPFS_TMP | MOUNT_MKDIR } ,
2020-05-22 17:06:54 +02:00
{ " tmpfs " , " /sys " , " tmpfs " , " mode=555 " TMPFS_LIMITS_SYS , MS_NOSUID | MS_NOEXEC | MS_NODEV ,
2020-04-22 16:35:32 +02:00
MOUNT_FATAL | MOUNT_APPLY_APIVFS_NETNS | MOUNT_MKDIR } ,
2020-05-22 17:06:54 +02:00
{ " sysfs " , " /sys " , " sysfs " , NULL , MS_RDONLY | MS_NOSUID | MS_NOEXEC | MS_NODEV ,
2020-04-22 16:35:32 +02:00
MOUNT_FATAL | MOUNT_APPLY_APIVFS_RO | MOUNT_MKDIR } , /* skipped if above was mounted */
2020-05-22 17:06:54 +02:00
{ " sysfs " , " /sys " , " sysfs " , NULL , MS_NOSUID | MS_NOEXEC | MS_NODEV ,
2020-04-22 16:35:32 +02:00
MOUNT_FATAL | MOUNT_MKDIR } , /* skipped if above was mounted */
2020-05-22 17:06:54 +02:00
{ " tmpfs " , " /dev " , " tmpfs " , " mode=755 " TMPFS_LIMITS_DEV , MS_NOSUID | MS_STRICTATIME ,
2020-04-22 16:35:32 +02:00
MOUNT_FATAL | MOUNT_MKDIR } ,
2020-07-30 13:36:10 +02:00
{ " tmpfs " , " /dev/shm " , " tmpfs " , " mode=1777 " NESTED_TMPFS_LIMITS , MS_NOSUID | MS_NODEV | MS_STRICTATIME ,
2020-04-22 16:35:32 +02:00
MOUNT_FATAL | MOUNT_MKDIR } ,
2020-05-22 17:06:54 +02:00
{ " tmpfs " , " /run " , " tmpfs " , " mode=755 " TMPFS_LIMITS_RUN , MS_NOSUID | MS_NODEV | MS_STRICTATIME ,
2020-04-22 16:35:32 +02:00
MOUNT_FATAL | MOUNT_MKDIR } ,
2020-07-23 16:49:13 +02:00
{ " /run/host " , " /run/host " , NULL , NULL , MS_BIND ,
MOUNT_FATAL | MOUNT_MKDIR | MOUNT_PREFIX_ROOT } , /* Prepare this so that we can make it read-only when we are done */
{ " /etc/os-release " , " /run/host/os-release " , NULL , NULL , MS_BIND ,
MOUNT_TOUCH } , /* As per kernel interface requirements, bind mount first (creating mount points) and make read-only later */
{ " /usr/lib/os-release " , " /run/host/os-release " , NULL , NULL , MS_BIND ,
MOUNT_FATAL } , /* If /etc/os-release doesn't exist use the version in /usr/lib as fallback */
{ NULL , " /run/host/os-release " , NULL , NULL , MS_BIND | MS_RDONLY | MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_REMOUNT ,
MOUNT_FATAL } ,
{ NULL , " /run/host " , NULL , NULL , MS_BIND | MS_RDONLY | MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_REMOUNT ,
MOUNT_FATAL | MOUNT_IN_USERNS } ,
2017-10-03 10:41:51 +02:00
# if HAVE_SELINUX
2020-05-22 17:06:54 +02:00
{ " /sys/fs/selinux " , " /sys/fs/selinux " , NULL , NULL , MS_BIND ,
2020-06-16 16:50:46 +02:00
MOUNT_MKDIR } , /* Bind mount first (mkdir/chown the mount point in case /sys/ is mounted as minimal skeleton tmpfs) */
2020-05-22 17:06:54 +02:00
{ NULL , " /sys/fs/selinux " , NULL , NULL , MS_BIND | MS_RDONLY | MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_REMOUNT ,
2020-06-16 16:50:46 +02:00
0 } , /* Then, make it r/o (don't mkdir/chown the mount point here, the previous entry already did that) */
2015-09-07 15:59:52 +02:00
# endif
} ;
2019-12-12 20:18:37 +01:00
bool use_userns = FLAGS_SET ( mount_settings , MOUNT_USE_USERNS ) ;
bool netns = FLAGS_SET ( mount_settings , MOUNT_APPLY_APIVFS_NETNS ) ;
bool ro = FLAGS_SET ( mount_settings , MOUNT_APPLY_APIVFS_RO ) ;
bool in_userns = FLAGS_SET ( mount_settings , MOUNT_IN_USERNS ) ;
bool tmpfs_tmp = FLAGS_SET ( mount_settings , MOUNT_APPLY_TMPFS_TMP ) ;
2018-04-30 12:22:41 +02:00
size_t k ;
2018-04-27 22:01:54 +02:00
int r ;
2015-09-07 15:59:52 +02:00
for ( k = 0 ; k < ELEMENTSOF ( mount_table ) ; k + + ) {
2020-07-23 16:49:13 +02:00
_cleanup_free_ char * where = NULL , * options = NULL , * prefixed = NULL ;
2019-12-12 20:18:37 +01:00
bool fatal = FLAGS_SET ( mount_table [ k ] . mount_settings , MOUNT_FATAL ) ;
2020-07-23 16:49:13 +02:00
const char * o ;
2016-10-14 14:00:15 +02:00
2019-12-12 20:18:37 +01:00
if ( in_userns ! = FLAGS_SET ( mount_table [ k ] . mount_settings , MOUNT_IN_USERNS ) )
2016-10-14 14:00:15 +02:00
continue ;
2015-09-07 15:59:52 +02:00
2019-12-12 20:18:37 +01:00
if ( ! netns & & FLAGS_SET ( mount_table [ k ] . mount_settings , MOUNT_APPLY_APIVFS_NETNS ) )
2015-10-15 12:13:13 +02:00
continue ;
2019-12-12 20:18:37 +01:00
if ( ! ro & & FLAGS_SET ( mount_table [ k ] . mount_settings , MOUNT_APPLY_APIVFS_RO ) )
2015-09-07 15:59:52 +02:00
continue ;
2019-12-12 20:18:37 +01:00
if ( ! tmpfs_tmp & & FLAGS_SET ( mount_table [ k ] . mount_settings , MOUNT_APPLY_TMPFS_TMP ) )
2018-10-08 18:32:03 +02:00
continue ;
2019-10-24 10:33:20 +02:00
r = chase_symlinks ( mount_table [ k ] . where , dest , CHASE_NONEXISTENT | CHASE_PREFIX_ROOT , & where , NULL ) ;
2016-11-29 18:13:11 +01:00
if ( r < 0 )
2016-12-01 12:40:23 +01:00
return log_error_errno ( r , " Failed to resolve %s/%s: %m " , dest , mount_table [ k ] . where ) ;
2015-09-07 15:59:52 +02:00
/* Skip this entry if it is not a remount. */
nspawn: add support for executing OCI runtime bundles with nspawn
This is a pretty large patch, and adds support for OCI runtime bundles
to nspawn. A new switch --oci-bundle= is added that takes a path to an
OCI bundle. The JSON file included therein is read similar to a .nspawn
settings files, however with a different feature set.
Implementation-wise this mostly extends the pre-existing Settings object
to carry additional properties for OCI. However, OCI supports some
concepts .nspawn files did not support yet, which this patch also adds:
1. Support for "masking" files and directories. This functionality is now
also available via the new --inaccessible= cmdline command, and
Inaccessible= in .nspawn files.
2. Support for mounting arbitrary file systems. (not exposed through
nspawn cmdline nor .nspawn files, because probably not a good idea)
3. Ability to configure the console settings for a container. This
functionality is now also available on the nspawn cmdline in the new
--console= switch (not added to .nspawn for now, as it is something
specific to the invocation really, not a property of the container)
4. Console width/height configuration. Not exposed through
.nspawn/cmdline, but this may be controlled through $COLUMNS and
$LINES like in most other UNIX tools.
5. UID/GID configuration by raw numbers. (not exposed in .nspawn and on
the cmdline, since containers likely have different user tables, and
the existing --user= switch appears to be the better option)
6. OCI hook commands (not exposed in .nspawn/cmdline, as very specific to
OCI)
7. Creation of additional devices nodes in /dev. Most likely not a good
idea, hence not exposed in .nspawn/cmdline. There's already --bind=
to achieve the same, which is the better alternative.
8. Explicit syscall filters. This is not a good idea, due to the skewed
arch support, hence not exposed through .nspawn/cmdline.
9. Configuration of some sysctls on a whitelist. Questionable, not
supported in .nspawn/cmdline for now.
10. Configuration of all 5 types of capabilities. Not a useful concept,
since the kernel will reduce the caps on execve() anyway. Not
exposed through .nspawn/cmdline as this is not very useful hence.
Note that this only implements the OCI runtime logic itself. It does not
provide a runc-compatible command line tool. This is left for a later
PR. Only with that in place tools such as "buildah" can use the OCI
support in nspawn as drop-in replacement.
Currently still missing is OCI hook support, but it's already parsed and
everything, and should be easy to add. Other than that, OCI is
implemented pretty comprehensively.
There's a list of incompatibilities in the nspawn-oci.c file. In a later
PR I'd like to convert this into proper markdown and add it to the
documentation directory.
2018-04-25 11:23:37 +02:00
if ( mount_table [ k ] . what ) {
r = path_is_mount_point ( where , NULL , 0 ) ;
if ( r < 0 & & r ! = - ENOENT )
return log_error_errno ( r , " Failed to detect whether %s is a mount point: %m " , where ) ;
if ( r > 0 )
continue ;
}
2015-09-07 15:59:52 +02:00
2020-07-23 16:49:13 +02:00
if ( ( mount_table [ k ] . mount_settings & ( MOUNT_MKDIR | MOUNT_TOUCH ) ) ! = 0 ) {
2020-06-15 16:59:44 +02:00
uid_t u = ( use_userns & & ! in_userns ) ? uid_shift : UID_INVALID ;
2020-05-22 17:06:54 +02:00
if ( FLAGS_SET ( mount_table [ k ] . mount_settings , MOUNT_TOUCH ) )
r = mkdir_parents_safe ( dest , where , 0755 , u , u , 0 ) ;
else
r = mkdir_p_safe ( dest , where , 0755 , u , u , 0 ) ;
2020-04-22 16:35:32 +02:00
if ( r < 0 & & r ! = - EEXIST ) {
if ( fatal & & r ! = - EROFS )
return log_error_errno ( r , " Failed to create directory %s: %m " , where ) ;
2015-09-07 15:59:52 +02:00
2020-04-22 16:35:32 +02:00
log_debug_errno ( r , " Failed to create directory %s: %m " , where ) ;
/* If we failed mkdir() or chown() due to the root directory being read only,
* attempt to mount this fs anyway and let mount_verbose log any errors */
if ( r ! = - EROFS )
continue ;
}
2020-07-23 16:49:13 +02:00
}
if ( FLAGS_SET ( mount_table [ k ] . mount_settings , MOUNT_TOUCH ) ) {
r = touch ( where ) ;
if ( r < 0 & & r ! = - EEXIST ) {
if ( fatal & & r ! = - EROFS )
return log_error_errno ( r , " Failed to create file %s: %m " , where ) ;
log_debug_errno ( r , " Failed to create file %s: %m " , where ) ;
if ( r ! = - EROFS )
continue ;
2020-05-22 17:06:54 +02:00
}
2015-09-07 15:59:52 +02:00
}
o = mount_table [ k ] . options ;
if ( streq_ptr ( mount_table [ k ] . type , " tmpfs " ) ) {
nspawn: Simplify tmpfs_patch_options() usage, and trickle that up
One of the things that tmpfs_patch_options does is take an (optional) UID,
and insert "uid=${UID},gid=${UID}" into the options string. So we need a
uid_t argument, and a way of telling if we should use it. Fortunately,
that is built in to the uid_t type by having UID_INVALID as a possible
value.
So this is really a feature that requires one argument. Yet, it is somehow
taking 4! That is absurd. Simplify it to only take one argument, and have
that trickle all the way up to mount_all()'s usage.
Now, in many of the uses, the argument becomes
uid_shift == 0 ? UID_INVALID : uid_shift
because it used to treat uid_shift=0 as invalid unless the patch_ids flag
was also set. This keeps the behavior the same. Note that in all cases
where it is invoked, if !use_userns (sometimes called !userns), then
uid_shift is 0; we don't have to add any checks for that.
That said, I'm pretty sure that "uid=0" and not setting "uid=" are the
same, but Christian Brauner seemed to not think so when implementing the
cgns support. https://github.com/systemd/systemd/pull/3589
2017-06-14 00:06:09 +02:00
r = tmpfs_patch_options ( o , in_userns ? 0 : uid_shift , selinux_apifs_context , & options ) ;
2015-09-07 15:59:52 +02:00
if ( r < 0 )
return log_oom ( ) ;
if ( r > 0 )
o = options ;
}
2020-07-23 16:49:13 +02:00
if ( FLAGS_SET ( mount_table [ k ] . mount_settings , MOUNT_PREFIX_ROOT ) ) {
/* Optionally prefix the mount source with the root dir. This is useful in bind
* mounts to be created within the container image before we transition into it. Note
* that MOUNT_IN_USERNS is run after we transitioned hence prefixing is not necessary
* for those. */
r = chase_symlinks ( mount_table [ k ] . what , dest , CHASE_PREFIX_ROOT , & prefixed , NULL ) ;
if ( r < 0 )
return log_error_errno ( r , " Failed to resolve %s/%s: %m " , dest , mount_table [ k ] . what ) ;
}
2020-09-22 15:51:17 +02:00
r = mount_verbose_full (
fatal ? LOG_ERR : LOG_DEBUG ,
prefixed ? : mount_table [ k ] . what ,
where ,
mount_table [ k ] . type ,
mount_table [ k ] . flags ,
o ,
FLAGS_SET ( mount_table [ k ] . mount_settings , MOUNT_FOLLOW_SYMLINKS ) ) ;
2016-10-14 14:00:15 +02:00
if ( r < 0 & & fatal )
nspawn,mount-util: add [u]mount_verbose and use it in nspawn
This makes it easier to debug failed nspawn invocations:
Mounting sysfs on /var/lib/machines/fedora-rawhide/sys (MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV "")...
Mounting tmpfs on /var/lib/machines/fedora-rawhide/dev (MS_NOSUID|MS_STRICTATIME "mode=755,uid=1450901504,gid=1450901504")...
Mounting tmpfs on /var/lib/machines/fedora-rawhide/dev/shm (MS_NOSUID|MS_NODEV|MS_STRICTATIME "mode=1777,uid=1450901504,gid=1450901504")...
Mounting tmpfs on /var/lib/machines/fedora-rawhide/run (MS_NOSUID|MS_NODEV|MS_STRICTATIME "mode=755,uid=1450901504,gid=1450901504")...
Bind-mounting /sys/fs/selinux on /var/lib/machines/fedora-rawhide/sys/fs/selinux (MS_BIND "")...
Remounting /var/lib/machines/fedora-rawhide/sys/fs/selinux (MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_BIND|MS_REMOUNT "")...
Mounting proc on /proc (MS_NOSUID|MS_NOEXEC|MS_NODEV "")...
Bind-mounting /proc/sys on /proc/sys (MS_BIND "")...
Remounting /proc/sys (MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_BIND|MS_REMOUNT "")...
Bind-mounting /proc/sysrq-trigger on /proc/sysrq-trigger (MS_BIND "")...
Remounting /proc/sysrq-trigger (MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_BIND|MS_REMOUNT "")...
Mounting tmpfs on /tmp (MS_STRICTATIME "mode=1777,uid=0,gid=0")...
Mounting tmpfs on /sys/fs/cgroup (MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME "mode=755,uid=0,gid=0")...
Mounting cgroup on /sys/fs/cgroup/systemd (MS_NOSUID|MS_NOEXEC|MS_NODEV "none,name=systemd,xattr")...
Failed to mount cgroup on /sys/fs/cgroup/systemd (MS_NOSUID|MS_NOEXEC|MS_NODEV "none,name=systemd,xattr"): No such file or directory
2016-10-10 21:55:20 +02:00
return r ;
2015-09-07 15:59:52 +02:00
}
return 0 ;
}
2019-07-24 16:41:29 +02:00
/* Parses a comma-separated list of bind mount options and folds them into a mount(2) flag set.
 *
 * Recognized words:
 *   "rbind"   - sets MS_REC
 *   "norbind" - clears MS_REC
 * Any other word is rejected with a logged EINVAL error.
 *
 * options:     the option string to parse; must not be NULL
 * mount_flags: in/out flag set; read as the starting value and written back only on success
 * mount_opts:  out; currently always set to NULL on success (reserved for string options)
 *
 * Returns 0 on success. On failure returns whatever log_error_errno() yields for the error
 * from extract_first_word(), or for the synthetic EINVAL raised on an unknown option. */
static int parse_mount_bind_options(const char *options, unsigned long *mount_flags, char **mount_opts) {
        const char *p = options;
        unsigned long flags;
        int r;

        assert(options);
        assert(mount_flags);
        assert(mount_opts);

        /* Operate on a copy so that the caller's flags are left untouched if parsing fails half-way. */
        flags = *mount_flags;

        for (;;) {
                _cleanup_free_ char *word = NULL;

                r = extract_first_word(&p, &word, ",", 0);
                if (r < 0)
                        return log_error_errno(r, "Failed to extract mount option: %m");
                if (r == 0) /* No more words */
                        break;

                if (streq(word, "rbind"))
                        flags |= MS_REC;
                else if (streq(word, "norbind"))
                        flags &= ~MS_REC;
                else
                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
                                               "Invalid bind mount option: %s",
                                               word);
        }

        *mount_flags = flags;
        /* in the future mount_opts will hold string options for mount(2) */
        *mount_opts = NULL;
        return 0;
}
2015-09-07 15:59:52 +02:00
/* Bind mounts m->source onto m->destination, the latter resolved relative to the root 'dest'.
 *
 * Steps:
 *   1. If m->options is set, parse it to adjust the mount flags (default: MS_BIND|MS_REC).
 *   2. stat() the source to learn whether it is a directory.
 *   3. Resolve the destination below 'dest'. If it already exists, refuse directory/non-directory
 *      mismatches; if it does not exist, create its parents and then a directory or an empty
 *      file to serve as the mount point, matching the kind of the source.
 *   4. Perform the bind mount, and if m->read_only is set, remount it read-only recursively.
 *
 * Returns 0 on success, or the failing step's error result (errors are logged here or by the
 * helper that failed). */
static int mount_bind(const char *dest, CustomMount *m) {
        _cleanup_free_ char *mount_opts = NULL, *where = NULL;
        unsigned long mount_flags = MS_BIND|MS_REC;
        struct stat source_st, dest_st;
        int r;

        assert(dest);
        assert(m);

        if (m->options) {
                r = parse_mount_bind_options(m->options, &mount_flags, &mount_opts);
                if (r < 0)
                        return r;
        }

        if (stat(m->source, &source_st) < 0)
                return log_error_errno(errno, "Failed to stat %s: %m", m->source);

        r = chase_symlinks(m->destination, dest, CHASE_PREFIX_ROOT|CHASE_NONEXISTENT, &where, NULL);
        if (r < 0)
                return log_error_errno(r, "Failed to resolve %s/%s: %m", dest, m->destination);
        if (r > 0) { /* Path exists already? */

                if (stat(where, &dest_st) < 0)
                        return log_error_errno(errno, "Failed to stat %s: %m", where);

                /* Source and destination must agree on being a directory or not. */
                if (S_ISDIR(source_st.st_mode) && !S_ISDIR(dest_st.st_mode))
                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
                                               "Cannot bind mount directory %s on file %s.",
                                               m->source, where);

                if (!S_ISDIR(source_st.st_mode) && S_ISDIR(dest_st.st_mode))
                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
                                               "Cannot bind mount file %s on directory %s.",
                                               m->source, where);

        } else { /* Path doesn't exist yet? */
                r = mkdir_parents_label(where, 0755);
                if (r < 0)
                        return log_error_errno(r, "Failed to make parents of %s: %m", where);

                /* Create the mount point. Any non-directory file can be
                 * mounted on any non-directory file (regular, fifo, socket,
                 * char, block).
                 */
                if (S_ISDIR(source_st.st_mode))
                        r = mkdir_label(where, 0755);
                else
                        r = touch(where);
                if (r < 0)
                        return log_error_errno(r, "Failed to create mount point %s: %m", where);
        }

        r = mount_nofollow_verbose(LOG_ERR, m->source, where, NULL, mount_flags, mount_opts);
        if (r < 0)
                return r;

        if (m->read_only) {
                r = bind_remount_recursive(where, MS_RDONLY, MS_RDONLY, NULL);
                if (r < 0)
                        return log_error_errno(r, "Read-only bind mount failed: %m");
        }

        return 0;
}
2019-12-07 11:59:59 +01:00
/* Mounts a fresh tmpfs on m->destination, resolved relative to the root 'dest'.
 *
 * The mount point (including missing parents) is created with mode 0755 if it does not exist
 * yet. m->options is run through tmpfs_patch_options() together with the UID shift and the
 * SELinux API file system context; a uid_shift of 0 is passed on as UID_INVALID. If the helper
 * reports that it produced a patched string (return value > 0) that string is used, otherwise
 * m->options is used as is.
 *
 * Returns the result of mount_nofollow_verbose() on the final mount, or an earlier failure's
 * error result (resolution failure, mkdir failure, or log_oom() if patching the options fails). */
static int mount_tmpfs(const char *dest, CustomMount *m, uid_t uid_shift, const char *selinux_apifs_context) {
        _cleanup_free_ char *buf = NULL, *where = NULL;
        const char *options;
        int r;

        assert(dest);
        assert(m);

        r = chase_symlinks(m->destination, dest, CHASE_PREFIX_ROOT|CHASE_NONEXISTENT, &where, NULL);
        if (r < 0)
                return log_error_errno(r, "Failed to resolve %s/%s: %m", dest, m->destination);
        if (r == 0) { /* Doesn't exist yet? */
                r = mkdir_p_label(where, 0755);
                if (r < 0)
                        return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", where);
        }

        r = tmpfs_patch_options(m->options, uid_shift == 0 ? UID_INVALID : uid_shift, selinux_apifs_context, &buf);
        if (r < 0)
                return log_oom();
        options = r > 0 ? buf : m->options;

        return mount_nofollow_verbose(LOG_ERR, "tmpfs", where, "tmpfs", MS_NODEV|MS_STRICTATIME, options);
}
2016-11-30 16:02:47 +01:00
static char * joined_and_escaped_lower_dirs ( char * * lower ) {
2015-09-07 15:59:52 +02:00
_cleanup_strv_free_ char * * sv = NULL ;
sv = strv_copy ( lower ) ;
if ( ! sv )
return NULL ;
strv_reverse ( sv ) ;
if ( ! strv_shell_escape ( sv , " ,: " ) )
return NULL ;
return strv_join ( sv , " : " ) ;
}
/* Mounts an overlay file system on m->destination, resolved relative to the root 'dest'.
 *
 * The mount point is created (mode 0755, parents are not created) if it does not exist yet.
 * Creating m->source and its parents is attempted as well, but a failure there is deliberately
 * ignored. The option string depends on m->read_only:
 *
 *   read-only: "lowerdir=<source>:<lower...>" and the mount is done with MS_RDONLY
 *   writable:  "lowerdir=<lower...>,upperdir=<source>,workdir=<work_dir>"
 *
 * All paths placed into the option string are shell-escaped with respect to "," and ":".
 *
 * Returns the result of mount_nofollow_verbose() on the final mount, or an earlier failure's
 * error result (resolution failure, mkdir failure, or log_oom() on allocation failure). */
static int mount_overlay(const char *dest, CustomMount *m) {
        _cleanup_free_ char *lower = NULL, *where = NULL, *escaped_source = NULL;
        const char *options;
        int r;

        assert(dest);
        assert(m);

        r = chase_symlinks(m->destination, dest, CHASE_PREFIX_ROOT|CHASE_NONEXISTENT, &where, NULL);
        if (r < 0)
                return log_error_errno(r, "Failed to resolve %s/%s: %m", dest, m->destination);
        if (r == 0) { /* Doesn't exist yet? */
                r = mkdir_label(where, 0755);
                if (r < 0)
                        return log_error_errno(r, "Creating mount point for overlay %s failed: %m", where);
        }

        /* Best effort only: the result is intentionally discarded. */
        (void) mkdir_p_label(m->source, 0755);

        lower = joined_and_escaped_lower_dirs(m->lower);
        if (!lower)
                return log_oom();

        escaped_source = shell_escape(m->source, ",:");
        if (!escaped_source)
                return log_oom();

        if (m->read_only)
                /* No upper layer: the source becomes the first entry of the lower list. */
                options = strjoina("lowerdir=", escaped_source, ":", lower);
        else {
                _cleanup_free_ char *escaped_work_dir = NULL;

                escaped_work_dir = shell_escape(m->work_dir, ",:");
                if (!escaped_work_dir)
                        return log_oom();

                options = strjoina("lowerdir=", lower, ",upperdir=", escaped_source, ",workdir=", escaped_work_dir);
        }

        return mount_nofollow_verbose(LOG_ERR, "overlay", where, "overlay", m->read_only ? MS_RDONLY : 0, options);
}
nspawn: add support for executing OCI runtime bundles with nspawn
This is a pretty large patch, and adds support for OCI runtime bundles
to nspawn. A new switch --oci-bundle= is added that takes a path to an
OCI bundle. The JSON file included therein is read similar to a .nspawn
settings files, however with a different feature set.
Implementation-wise this mostly extends the pre-existing Settings object
to carry additional properties for OCI. However, OCI supports some
concepts .nspawn files did not support yet, which this patch also adds:
1. Support for "masking" files and directories. This functionality is now
also available via the new --inaccessible= cmdline command, and
Inaccessible= in .nspawn files.
2. Support for mounting arbitrary file systems. (not exposed through
nspawn cmdline nor .nspawn files, because probably not a good idea)
3. Ability to configure the console settings for a container. This
functionality is now also available on the nspawn cmdline in the new
--console= switch (not added to .nspawn for now, as it is something
specific to the invocation really, not a property of the container)
4. Console width/height configuration. Not exposed through
.nspawn/cmdline, but this may be controlled through $COLUMNS and
$LINES like in most other UNIX tools.
5. UID/GID configuration by raw numbers. (not exposed in .nspawn and on
the cmdline, since containers likely have different user tables, and
the existing --user= switch appears to be the better option)
6. OCI hook commands (not exposed in .nspawn/cmdline, as very specific to
OCI)
7. Creation of additional devices nodes in /dev. Most likely not a good
idea, hence not exposed in .nspawn/cmdline. There's already --bind=
to achieve the same, which is the better alternative.
8. Explicit syscall filters. This is not a good idea, due to the skewed
arch support, hence not exposed through .nspawn/cmdline.
9. Configuration of some sysctls on a whitelist. Questionable, not
supported in .nspawn/cmdline for now.
10. Configuration of all 5 types of capabilities. Not a useful concept,
since the kernel will reduce the caps on execve() anyway. Not
exposed through .nspawn/cmdline as this is not very useful hence.
Note that this only implements the OCI runtime logic itself. It does not
provide a runc-compatible command line tool. This is left for a later
PR. Only with that in place tools such as "buildah" can use the OCI
support in nspawn as drop-in replacement.
Currently still missing is OCI hook support, but it's already parsed and
everything, and should be easy to add. Other than that, OCI is
implemented pretty comprehensively.
There's a list of incompatibilities in the nspawn-oci.c file. In a later
PR I'd like to convert this into proper markdown and add it to the
documentation directory.
2018-04-25 11:23:37 +02:00
static int mount_inaccessible ( const char * dest , CustomMount * m ) {
2019-11-19 23:24:52 +01:00
_cleanup_free_ char * where = NULL , * source = NULL ;
nspawn: add support for executing OCI runtime bundles with nspawn
This is a pretty large patch, and adds support for OCI runtime bundles
to nspawn. A new switch --oci-bundle= is added that takes a path to an
OCI bundle. The JSON file included therein is read similar to a .nspawn
settings files, however with a different feature set.
Implementation-wise this mostly extends the pre-existing Settings object
to carry additional properties for OCI. However, OCI supports some
concepts .nspawn files did not support yet, which this patch also adds:
1. Support for "masking" files and directories. This functionatly is now
also available via the new --inaccesible= cmdline command, and
Inaccessible= in .nspawn files.
2. Support for mounting arbitrary file systems. (not exposed through
nspawn cmdline nor .nspawn files, because probably not a good idea)
3. Ability to configure the console settings for a container. This
functionality is now also available on the nspawn cmdline in the new
--console= switch (not added to .nspawn for now, as it is something
specific to the invocation really, not a property of the container)
4. Console width/height configuration. Not exposed through
.nspawn/cmdline, but this may be controlled through $COLUMNS and
$LINES like in most other UNIX tools.
5. UID/GID configuration by raw numbers. (not exposed in .nspawn and on
the cmdline, since containers likely have different user tables, and
the existing --user= switch appears to be the better option)
6. OCI hook commands (no exposed in .nspawn/cmdline, as very specific to
OCI)
7. Creation of additional devices nodes in /dev. Most likely not a good
idea, hence not exposed in .nspawn/cmdline. There's already --bind=
to achieve the same, which is the better alternative.
8. Explicit syscall filters. This is not a good idea, due to the skewed
arch support, hence not exposed through .nspawn/cmdline.
9. Configuration of some sysctls on a whitelist. Questionnable, not
supported in .nspawn/cmdline for now.
10. Configuration of all 5 types of capabilities. Not a useful concept,
since the kernel will reduce the caps on execve() anyway. Not
exposed through .nspawn/cmdline as this is not very useful hence.
Note that this only implements the OCI runtime logic itself. It does not
provide a runc-compatible command line tool. This is left for a later
PR. Only with that in place tools such as "buildah" can use the OCI
support in nspawn as drop-in replacement.
Currently still missing is OCI hook support, but it's already parsed and
everything, and should be easy to add. Other than that it's OCI is
implemented pretty comprehensively.
There's a list of incompatibilities in the nspawn-oci.c file. In a later
PR I'd like to convert this into proper markdown and add it to the
documentation directory.
2018-04-25 11:23:37 +02:00
struct stat st ;
int r ;
assert ( dest ) ;
assert ( m ) ;
2019-10-24 10:33:20 +02:00
r = chase_symlinks_and_stat ( m - > destination , dest , CHASE_PREFIX_ROOT , & where , & st , NULL ) ;
nspawn: add support for executing OCI runtime bundles with nspawn
This is a pretty large patch, and adds support for OCI runtime bundles
to nspawn. A new switch --oci-bundle= is added that takes a path to an
OCI bundle. The JSON file included therein is read similar to a .nspawn
settings files, however with a different feature set.
Implementation-wise this mostly extends the pre-existing Settings object
to carry additional properties for OCI. However, OCI supports some
concepts .nspawn files did not support yet, which this patch also adds:
1. Support for "masking" files and directories. This functionality is now
also available via the new --inaccessible= cmdline command, and
Inaccessible= in .nspawn files.
2. Support for mounting arbitrary file systems. (not exposed through
nspawn cmdline nor .nspawn files, because probably not a good idea)
3. Ability to configure the console settings for a container. This
functionality is now also available on the nspawn cmdline in the new
--console= switch (not added to .nspawn for now, as it is something
specific to the invocation really, not a property of the container)
4. Console width/height configuration. Not exposed through
.nspawn/cmdline, but this may be controlled through $COLUMNS and
$LINES like in most other UNIX tools.
5. UID/GID configuration by raw numbers. (not exposed in .nspawn and on
the cmdline, since containers likely have different user tables, and
the existing --user= switch appears to be the better option)
6. OCI hook commands (not exposed in .nspawn/cmdline, as very specific to
OCI)
7. Creation of additional devices nodes in /dev. Most likely not a good
idea, hence not exposed in .nspawn/cmdline. There's already --bind=
to achieve the same, which is the better alternative.
8. Explicit syscall filters. This is not a good idea, due to the skewed
arch support, hence not exposed through .nspawn/cmdline.
9. Configuration of some sysctls on a whitelist. Questionable, not
supported in .nspawn/cmdline for now.
10. Configuration of all 5 types of capabilities. Not a useful concept,
since the kernel will reduce the caps on execve() anyway. Not
exposed through .nspawn/cmdline as this is not very useful hence.
Note that this only implements the OCI runtime logic itself. It does not
provide a runc-compatible command line tool. This is left for a later
PR. Only with that in place tools such as "buildah" can use the OCI
support in nspawn as drop-in replacement.
Currently still missing is OCI hook support, but it's already parsed and
everything, and should be easy to add. Other than that, OCI is
implemented pretty comprehensively.
There's a list of incompatibilities in the nspawn-oci.c file. In a later
PR I'd like to convert this into proper markdown and add it to the
documentation directory.
2018-04-25 11:23:37 +02:00
if ( r < 0 ) {
log_full_errno ( m - > graceful ? LOG_DEBUG : LOG_ERR , r , " Failed to resolve %s/%s: %m " , dest , m - > destination ) ;
return m - > graceful ? 0 : r ;
}
2020-06-09 16:22:24 +02:00
r = mode_to_inaccessible_node ( NULL , st . st_mode , & source ) ;
2019-11-19 23:24:52 +01:00
if ( r < 0 )
return m - > graceful ? 0 : r ;
nspawn: add support for executing OCI runtime bundles with nspawn
This is a pretty large patch, and adds support for OCI runtime bundles
to nspawn. A new switch --oci-bundle= is added that takes a path to an
OCI bundle. The JSON file included therein is read similar to a .nspawn
settings files, however with a different feature set.
Implementation-wise this mostly extends the pre-existing Settings object
to carry additional properties for OCI. However, OCI supports some
concepts .nspawn files did not support yet, which this patch also adds:
1. Support for "masking" files and directories. This functionatly is now
also available via the new --inaccesible= cmdline command, and
Inaccessible= in .nspawn files.
2. Support for mounting arbitrary file systems. (not exposed through
nspawn cmdline nor .nspawn files, because probably not a good idea)
3. Ability to configure the console settings for a container. This
functionality is now also available on the nspawn cmdline in the new
--console= switch (not added to .nspawn for now, as it is something
specific to the invocation really, not a property of the container)
4. Console width/height configuration. Not exposed through
.nspawn/cmdline, but this may be controlled through $COLUMNS and
$LINES like in most other UNIX tools.
5. UID/GID configuration by raw numbers. (not exposed in .nspawn and on
the cmdline, since containers likely have different user tables, and
the existing --user= switch appears to be the better option)
6. OCI hook commands (no exposed in .nspawn/cmdline, as very specific to
OCI)
7. Creation of additional devices nodes in /dev. Most likely not a good
idea, hence not exposed in .nspawn/cmdline. There's already --bind=
to achieve the same, which is the better alternative.
8. Explicit syscall filters. This is not a good idea, due to the skewed
arch support, hence not exposed through .nspawn/cmdline.
9. Configuration of some sysctls on a whitelist. Questionnable, not
supported in .nspawn/cmdline for now.
10. Configuration of all 5 types of capabilities. Not a useful concept,
since the kernel will reduce the caps on execve() anyway. Not
exposed through .nspawn/cmdline as this is not very useful hence.
Note that this only implements the OCI runtime logic itself. It does not
provide a runc-compatible command line tool. This is left for a later
PR. Only with that in place tools such as "buildah" can use the OCI
support in nspawn as drop-in replacement.
Currently still missing is OCI hook support, but it's already parsed and
everything, and should be easy to add. Other than that it's OCI is
implemented pretty comprehensively.
There's a list of incompatibilities in the nspawn-oci.c file. In a later
PR I'd like to convert this into proper markdown and add it to the
documentation directory.
2018-04-25 11:23:37 +02:00
2020-09-22 15:51:17 +02:00
r = mount_nofollow_verbose ( m - > graceful ? LOG_DEBUG : LOG_ERR , source , where , NULL , MS_BIND , NULL ) ;
nspawn: add support for executing OCI runtime bundles with nspawn
This is a pretty large patch, and adds support for OCI runtime bundles
to nspawn. A new switch --oci-bundle= is added that takes a path to an
OCI bundle. The JSON file included therein is read similar to a .nspawn
settings files, however with a different feature set.
Implementation-wise this mostly extends the pre-existing Settings object
to carry additional properties for OCI. However, OCI supports some
concepts .nspawn files did not support yet, which this patch also adds:
1. Support for "masking" files and directories. This functionatly is now
also available via the new --inaccesible= cmdline command, and
Inaccessible= in .nspawn files.
2. Support for mounting arbitrary file systems. (not exposed through
nspawn cmdline nor .nspawn files, because probably not a good idea)
3. Ability to configure the console settings for a container. This
functionality is now also available on the nspawn cmdline in the new
--console= switch (not added to .nspawn for now, as it is something
specific to the invocation really, not a property of the container)
4. Console width/height configuration. Not exposed through
.nspawn/cmdline, but this may be controlled through $COLUMNS and
$LINES like in most other UNIX tools.
5. UID/GID configuration by raw numbers. (not exposed in .nspawn and on
the cmdline, since containers likely have different user tables, and
the existing --user= switch appears to be the better option)
6. OCI hook commands (no exposed in .nspawn/cmdline, as very specific to
OCI)
7. Creation of additional devices nodes in /dev. Most likely not a good
idea, hence not exposed in .nspawn/cmdline. There's already --bind=
to achieve the same, which is the better alternative.
8. Explicit syscall filters. This is not a good idea, due to the skewed
arch support, hence not exposed through .nspawn/cmdline.
9. Configuration of some sysctls on a whitelist. Questionnable, not
supported in .nspawn/cmdline for now.
10. Configuration of all 5 types of capabilities. Not a useful concept,
since the kernel will reduce the caps on execve() anyway. Not
exposed through .nspawn/cmdline as this is not very useful hence.
Note that this only implements the OCI runtime logic itself. It does not
provide a runc-compatible command line tool. This is left for a later
PR. Only with that in place tools such as "buildah" can use the OCI
support in nspawn as drop-in replacement.
Currently still missing is OCI hook support, but it's already parsed and
everything, and should be easy to add. Other than that it's OCI is
implemented pretty comprehensively.
There's a list of incompatibilities in the nspawn-oci.c file. In a later
PR I'd like to convert this into proper markdown and add it to the
documentation directory.
2018-04-25 11:23:37 +02:00
if ( r < 0 )
return m - > graceful ? 0 : r ;
2020-09-22 15:51:17 +02:00
r = mount_nofollow_verbose ( m - > graceful ? LOG_DEBUG : LOG_ERR , NULL , where , NULL , MS_BIND | MS_RDONLY | MS_REMOUNT , NULL ) ;
2019-03-21 12:41:02 +01:00
if ( r < 0 ) {
2020-09-22 16:32:07 +02:00
( void ) umount_verbose ( m - > graceful ? LOG_DEBUG : LOG_ERR , where , UMOUNT_NOFOLLOW ) ;
nspawn: add support for executing OCI runtime bundles with nspawn
This is a pretty large patch, and adds support for OCI runtime bundles
to nspawn. A new switch --oci-bundle= is added that takes a path to an
OCI bundle. The JSON file included therein is read similar to a .nspawn
settings files, however with a different feature set.
Implementation-wise this mostly extends the pre-existing Settings object
to carry additional properties for OCI. However, OCI supports some
concepts .nspawn files did not support yet, which this patch also adds:
1. Support for "masking" files and directories. This functionatly is now
also available via the new --inaccesible= cmdline command, and
Inaccessible= in .nspawn files.
2. Support for mounting arbitrary file systems. (not exposed through
nspawn cmdline nor .nspawn files, because probably not a good idea)
3. Ability to configure the console settings for a container. This
functionality is now also available on the nspawn cmdline in the new
--console= switch (not added to .nspawn for now, as it is something
specific to the invocation really, not a property of the container)
4. Console width/height configuration. Not exposed through
.nspawn/cmdline, but this may be controlled through $COLUMNS and
$LINES like in most other UNIX tools.
5. UID/GID configuration by raw numbers. (not exposed in .nspawn and on
the cmdline, since containers likely have different user tables, and
the existing --user= switch appears to be the better option)
6. OCI hook commands (no exposed in .nspawn/cmdline, as very specific to
OCI)
7. Creation of additional devices nodes in /dev. Most likely not a good
idea, hence not exposed in .nspawn/cmdline. There's already --bind=
to achieve the same, which is the better alternative.
8. Explicit syscall filters. This is not a good idea, due to the skewed
arch support, hence not exposed through .nspawn/cmdline.
9. Configuration of some sysctls on a whitelist. Questionnable, not
supported in .nspawn/cmdline for now.
10. Configuration of all 5 types of capabilities. Not a useful concept,
since the kernel will reduce the caps on execve() anyway. Not
exposed through .nspawn/cmdline as this is not very useful hence.
Note that this only implements the OCI runtime logic itself. It does not
provide a runc-compatible command line tool. This is left for a later
PR. Only with that in place tools such as "buildah" can use the OCI
support in nspawn as drop-in replacement.
Currently still missing is OCI hook support, but it's already parsed and
everything, and should be easy to add. Other than that it's OCI is
implemented pretty comprehensively.
There's a list of incompatibilities in the nspawn-oci.c file. In a later
PR I'd like to convert this into proper markdown and add it to the
documentation directory.
2018-04-25 11:23:37 +02:00
return m - > graceful ? 0 : r ;
2019-03-21 12:41:02 +01:00
}
nspawn: add support for executing OCI runtime bundles with nspawn
This is a pretty large patch, and adds support for OCI runtime bundles
to nspawn. A new switch --oci-bundle= is added that takes a path to an
OCI bundle. The JSON file included therein is read similar to a .nspawn
settings files, however with a different feature set.
Implementation-wise this mostly extends the pre-existing Settings object
to carry additional properties for OCI. However, OCI supports some
concepts .nspawn files did not support yet, which this patch also adds:
1. Support for "masking" files and directories. This functionatly is now
also available via the new --inaccesible= cmdline command, and
Inaccessible= in .nspawn files.
2. Support for mounting arbitrary file systems. (not exposed through
nspawn cmdline nor .nspawn files, because probably not a good idea)
3. Ability to configure the console settings for a container. This
functionality is now also available on the nspawn cmdline in the new
--console= switch (not added to .nspawn for now, as it is something
specific to the invocation really, not a property of the container)
4. Console width/height configuration. Not exposed through
.nspawn/cmdline, but this may be controlled through $COLUMNS and
$LINES like in most other UNIX tools.
5. UID/GID configuration by raw numbers. (not exposed in .nspawn and on
the cmdline, since containers likely have different user tables, and
the existing --user= switch appears to be the better option)
6. OCI hook commands (no exposed in .nspawn/cmdline, as very specific to
OCI)
7. Creation of additional devices nodes in /dev. Most likely not a good
idea, hence not exposed in .nspawn/cmdline. There's already --bind=
to achieve the same, which is the better alternative.
8. Explicit syscall filters. This is not a good idea, due to the skewed
arch support, hence not exposed through .nspawn/cmdline.
9. Configuration of some sysctls on a whitelist. Questionnable, not
supported in .nspawn/cmdline for now.
10. Configuration of all 5 types of capabilities. Not a useful concept,
since the kernel will reduce the caps on execve() anyway. Not
exposed through .nspawn/cmdline as this is not very useful hence.
Note that this only implements the OCI runtime logic itself. It does not
provide a runc-compatible command line tool. This is left for a later
PR. Only with that in place tools such as "buildah" can use the OCI
support in nspawn as drop-in replacement.
Currently still missing is OCI hook support, but it's already parsed and
everything, and should be easy to add. Other than that it's OCI is
implemented pretty comprehensively.
There's a list of incompatibilities in the nspawn-oci.c file. In a later
PR I'd like to convert this into proper markdown and add it to the
documentation directory.
2018-04-25 11:23:37 +02:00
return 0 ;
}
/* Mounts the file system described by a CUSTOM_MOUNT_ARBITRARY entry.
 *
 * dest: root directory the mount destination is resolved against.
 * m:    mount entry; its source, destination, type_argument and options fields are used.
 *
 * Returns the result of mount_nofollow_verbose() on the resolved path, or a negative value
 * (already logged at error level) if resolving the destination or creating the mount point
 * fails. */
static int mount_arbitrary(const char *dest, CustomMount *m) {
        _cleanup_free_ char *where = NULL;
        int r;

        assert(dest);
        assert(m);

        /* Resolve the destination below 'dest'. CHASE_NONEXISTENT is passed so that a path that is
         * not there yet is reported via a zero return instead of an error. */
        r = chase_symlinks(m->destination, dest, CHASE_PREFIX_ROOT|CHASE_NONEXISTENT, &where, NULL);
        if (r < 0)
                return log_error_errno(r, "Failed to resolve %s/%s: %m", dest, m->destination);
        if (r == 0) { /* Doesn't exist yet? */
                r = mkdir_p_label(where, 0755);
                if (r < 0)
                        return log_error_errno(r, "Creating mount point for mount %s failed: %m", where);
        }

        /* No mount flags are passed here; only the type and the option string from the entry. */
        return mount_nofollow_verbose(LOG_ERR, m->source, where, m->type_argument, 0, m->options);
}
2015-09-07 15:59:52 +02:00
int mount_custom (
const char * dest ,
2018-04-27 22:01:54 +02:00
CustomMount * mounts , size_t n ,
2019-12-07 11:59:59 +01:00
uid_t uid_shift ,
nspawn: add support for executing OCI runtime bundles with nspawn
This is a pretty large patch, and adds support for OCI runtime bundles
to nspawn. A new switch --oci-bundle= is added that takes a path to an
OCI bundle. The JSON file included therein is read similar to a .nspawn
settings files, however with a different feature set.
Implementation-wise this mostly extends the pre-existing Settings object
to carry additional properties for OCI. However, OCI supports some
concepts .nspawn files did not support yet, which this patch also adds:
1. Support for "masking" files and directories. This functionatly is now
also available via the new --inaccesible= cmdline command, and
Inaccessible= in .nspawn files.
2. Support for mounting arbitrary file systems. (not exposed through
nspawn cmdline nor .nspawn files, because probably not a good idea)
3. Ability to configure the console settings for a container. This
functionality is now also available on the nspawn cmdline in the new
--console= switch (not added to .nspawn for now, as it is something
specific to the invocation really, not a property of the container)
4. Console width/height configuration. Not exposed through
.nspawn/cmdline, but this may be controlled through $COLUMNS and
$LINES like in most other UNIX tools.
5. UID/GID configuration by raw numbers. (not exposed in .nspawn and on
the cmdline, since containers likely have different user tables, and
the existing --user= switch appears to be the better option)
6. OCI hook commands (no exposed in .nspawn/cmdline, as very specific to
OCI)
7. Creation of additional devices nodes in /dev. Most likely not a good
idea, hence not exposed in .nspawn/cmdline. There's already --bind=
to achieve the same, which is the better alternative.
8. Explicit syscall filters. This is not a good idea, due to the skewed
arch support, hence not exposed through .nspawn/cmdline.
9. Configuration of some sysctls on a whitelist. Questionnable, not
supported in .nspawn/cmdline for now.
10. Configuration of all 5 types of capabilities. Not a useful concept,
since the kernel will reduce the caps on execve() anyway. Not
exposed through .nspawn/cmdline as this is not very useful hence.
Note that this only implements the OCI runtime logic itself. It does not
provide a runc-compatible command line tool. This is left for a later
PR. Only with that in place tools such as "buildah" can use the OCI
support in nspawn as drop-in replacement.
Currently still missing is OCI hook support, but it's already parsed and
everything, and should be easy to add. Other than that it's OCI is
implemented pretty comprehensively.
There's a list of incompatibilities in the nspawn-oci.c file. In a later
PR I'd like to convert this into proper markdown and add it to the
documentation directory.
2018-04-25 11:23:37 +02:00
const char * selinux_apifs_context ,
2019-12-06 22:45:14 +01:00
MountSettingsMask mount_settings ) {
2015-09-07 15:59:52 +02:00
2018-04-27 22:01:54 +02:00
size_t i ;
2015-09-07 15:59:52 +02:00
int r ;
assert ( dest ) ;
for ( i = 0 ; i < n ; i + + ) {
CustomMount * m = mounts + i ;
2019-12-12 20:18:37 +01:00
if ( FLAGS_SET ( mount_settings , MOUNT_IN_USERNS ) ! = m - > in_userns )
2019-12-06 22:45:14 +01:00
continue ;
2019-12-12 20:18:37 +01:00
if ( FLAGS_SET ( mount_settings , MOUNT_ROOT_ONLY ) & & ! path_equal ( m - > destination , " / " ) )
2019-12-06 22:45:14 +01:00
continue ;
2019-12-12 20:18:37 +01:00
if ( FLAGS_SET ( mount_settings , MOUNT_NON_ROOT_ONLY ) & & path_equal ( m - > destination , " / " ) )
nspawn: add support for executing OCI runtime bundles with nspawn
This is a pretty large patch, and adds support for OCI runtime bundles
to nspawn. A new switch --oci-bundle= is added that takes a path to an
OCI bundle. The JSON file included therein is read similar to a .nspawn
settings files, however with a different feature set.
Implementation-wise this mostly extends the pre-existing Settings object
to carry additional properties for OCI. However, OCI supports some
concepts .nspawn files did not support yet, which this patch also adds:
1. Support for "masking" files and directories. This functionatly is now
also available via the new --inaccesible= cmdline command, and
Inaccessible= in .nspawn files.
2. Support for mounting arbitrary file systems. (not exposed through
nspawn cmdline nor .nspawn files, because probably not a good idea)
3. Ability to configure the console settings for a container. This
functionality is now also available on the nspawn cmdline in the new
--console= switch (not added to .nspawn for now, as it is something
specific to the invocation really, not a property of the container)
4. Console width/height configuration. Not exposed through
.nspawn/cmdline, but this may be controlled through $COLUMNS and
$LINES like in most other UNIX tools.
5. UID/GID configuration by raw numbers. (not exposed in .nspawn and on
the cmdline, since containers likely have different user tables, and
the existing --user= switch appears to be the better option)
6. OCI hook commands (no exposed in .nspawn/cmdline, as very specific to
OCI)
7. Creation of additional devices nodes in /dev. Most likely not a good
idea, hence not exposed in .nspawn/cmdline. There's already --bind=
to achieve the same, which is the better alternative.
8. Explicit syscall filters. This is not a good idea, due to the skewed
arch support, hence not exposed through .nspawn/cmdline.
9. Configuration of some sysctls on a whitelist. Questionnable, not
supported in .nspawn/cmdline for now.
10. Configuration of all 5 types of capabilities. Not a useful concept,
since the kernel will reduce the caps on execve() anyway. Not
exposed through .nspawn/cmdline as this is not very useful hence.
Note that this only implements the OCI runtime logic itself. It does not
provide a runc-compatible command line tool. This is left for a later
PR. Only with that in place tools such as "buildah" can use the OCI
support in nspawn as drop-in replacement.
Currently still missing is OCI hook support, but it's already parsed and
everything, and should be easy to add. Other than that it's OCI is
implemented pretty comprehensively.
There's a list of incompatibilities in the nspawn-oci.c file. In a later
PR I'd like to convert this into proper markdown and add it to the
documentation directory.
2018-04-25 11:23:37 +02:00
continue ;
2015-09-07 15:59:52 +02:00
switch ( m - > type ) {
case CUSTOM_MOUNT_BIND :
r = mount_bind ( dest , m ) ;
break ;
case CUSTOM_MOUNT_TMPFS :
2019-12-07 11:59:59 +01:00
r = mount_tmpfs ( dest , m , uid_shift , selinux_apifs_context ) ;
2015-09-07 15:59:52 +02:00
break ;
case CUSTOM_MOUNT_OVERLAY :
r = mount_overlay ( dest , m ) ;
break ;
nspawn: add support for executing OCI runtime bundles with nspawn
This is a pretty large patch, and adds support for OCI runtime bundles
to nspawn. A new switch --oci-bundle= is added that takes a path to an
OCI bundle. The JSON file included therein is read similar to a .nspawn
settings files, however with a different feature set.
Implementation-wise this mostly extends the pre-existing Settings object
to carry additional properties for OCI. However, OCI supports some
concepts .nspawn files did not support yet, which this patch also adds:
1. Support for "masking" files and directories. This functionatly is now
also available via the new --inaccesible= cmdline command, and
Inaccessible= in .nspawn files.
2. Support for mounting arbitrary file systems. (not exposed through
nspawn cmdline nor .nspawn files, because probably not a good idea)
3. Ability to configure the console settings for a container. This
functionality is now also available on the nspawn cmdline in the new
--console= switch (not added to .nspawn for now, as it is something
specific to the invocation really, not a property of the container)
4. Console width/height configuration. Not exposed through
.nspawn/cmdline, but this may be controlled through $COLUMNS and
$LINES like in most other UNIX tools.
5. UID/GID configuration by raw numbers. (not exposed in .nspawn and on
the cmdline, since containers likely have different user tables, and
the existing --user= switch appears to be the better option)
6. OCI hook commands (no exposed in .nspawn/cmdline, as very specific to
OCI)
7. Creation of additional devices nodes in /dev. Most likely not a good
idea, hence not exposed in .nspawn/cmdline. There's already --bind=
to achieve the same, which is the better alternative.
8. Explicit syscall filters. This is not a good idea, due to the skewed
arch support, hence not exposed through .nspawn/cmdline.
9. Configuration of some sysctls on a whitelist. Questionnable, not
supported in .nspawn/cmdline for now.
10. Configuration of all 5 types of capabilities. Not a useful concept,
since the kernel will reduce the caps on execve() anyway. Not
exposed through .nspawn/cmdline as this is not very useful hence.
Note that this only implements the OCI runtime logic itself. It does not
provide a runc-compatible command line tool. This is left for a later
PR. Only with that in place tools such as "buildah" can use the OCI
support in nspawn as drop-in replacement.
Currently still missing is OCI hook support, but it's already parsed and
everything, and should be easy to add. Other than that it's OCI is
implemented pretty comprehensively.
There's a list of incompatibilities in the nspawn-oci.c file. In a later
PR I'd like to convert this into proper markdown and add it to the
documentation directory.
2018-04-25 11:23:37 +02:00
case CUSTOM_MOUNT_INACCESSIBLE :
r = mount_inaccessible ( dest , m ) ;
break ;
case CUSTOM_MOUNT_ARBITRARY :
r = mount_arbitrary ( dest , m ) ;
break ;
2015-09-07 15:59:52 +02:00
default :
assert_not_reached ( " Unknown custom mount type " ) ;
}
if ( r < 0 )
return r ;
}
return 0 ;
}
2019-12-23 11:50:02 +01:00
bool has_custom_root_mount ( const CustomMount * mounts , size_t n ) {
size_t i ;
for ( i = 0 ; i < n ; i + + ) {
const CustomMount * m = mounts + i ;
if ( path_equal ( m - > destination , " / " ) )
return true ;
}
return false ;
}
2019-12-07 11:59:59 +01:00
static int setup_volatile_state ( const char * directory , uid_t uid_shift , const char * selinux_apifs_context ) {
2015-09-07 15:59:52 +02:00
_cleanup_free_ char * buf = NULL ;
const char * p , * options ;
int r ;
assert ( directory ) ;
2018-12-19 00:09:57 +01:00
/* --volatile=state means we simply overmount /var with a tmpfs, and the rest read-only. */
2015-09-07 15:59:52 +02:00
2019-03-25 17:04:38 +01:00
r = bind_remount_recursive ( directory , MS_RDONLY , MS_RDONLY , NULL ) ;
2015-09-07 15:59:52 +02:00
if ( r < 0 )
return log_error_errno ( r , " Failed to remount %s read-only: %m " , directory ) ;
p = prefix_roota ( directory , " /var " ) ;
r = mkdir ( p , 0755 ) ;
if ( r < 0 & & errno ! = EEXIST )
return log_error_errno ( errno , " Failed to create %s: %m " , directory ) ;
2020-04-14 15:39:36 +02:00
options = " mode=755 " TMPFS_LIMITS_VOLATILE_STATE ;
nspawn: Simplify tmpfs_patch_options() usage, and trickle that up
One of the things that tmpfs_patch_options does is take an (optional) UID,
and insert "uid=${UID},gid=${UID}" into the options string. So we need a
uid_t argument, and a way of telling if we should use it. Fortunately,
that is built in to the uid_t type by having UID_INVALID as a possible
value.
So this is really a feature that requires one argument. Yet, it is somehow
taking 4! That is absurd. Simplify it to only take one argument, and have
that trickle all the way up to mount_all()'s usage.
Now, in may of the uses, the argument becomes
uid_shift == 0 ? UID_INVALID : uid_shift
because it used to treat uid_shift=0 as invalid unless the patch_ids flag
was also set. This keeps the behavior the same. Note that in all cases
where it is invoked, if !use_userns (sometimes called !userns), then
uid_shift is 0; we don't have to add any checks for that.
That said, I'm pretty sure that "uid=0" and not setting "uid=" are the
same, but Christian Brauner seemed to not think so when implementing the
cgns support. https://github.com/systemd/systemd/pull/3589
2017-06-14 00:06:09 +02:00
r = tmpfs_patch_options ( options , uid_shift = = 0 ? UID_INVALID : uid_shift , selinux_apifs_context , & buf ) ;
2015-09-07 15:59:52 +02:00
if ( r < 0 )
return log_oom ( ) ;
if ( r > 0 )
options = buf ;
2020-09-22 15:51:17 +02:00
return mount_nofollow_verbose ( LOG_ERR , " tmpfs " , p , " tmpfs " , MS_STRICTATIME , options ) ;
2015-09-07 15:59:52 +02:00
}
2019-12-07 11:59:59 +01:00
static int setup_volatile_yes ( const char * directory , uid_t uid_shift , const char * selinux_apifs_context ) {
2015-09-07 15:59:52 +02:00
bool tmpfs_mounted = false , bind_mounted = false ;
char template [ ] = " /tmp/nspawn-volatile-XXXXXX " ;
2019-07-29 09:14:17 +02:00
_cleanup_free_ char * buf = NULL , * bindir = NULL ;
2015-09-07 15:59:52 +02:00
const char * f , * t , * options ;
2019-07-29 09:14:17 +02:00
struct stat st ;
2015-09-07 15:59:52 +02:00
int r ;
assert ( directory ) ;
2019-07-29 09:14:17 +02:00
/* --volatile=yes means we mount a tmpfs to the root dir, and the original /usr to use inside it, and
* that read - only . Before we start setting this up let ' s validate if the image has the / usr merge
* implemented , and let ' s output a friendly log message if it hasn ' t . */
bindir = path_join ( directory , " /bin " ) ;
if ( ! bindir )
return log_oom ( ) ;
if ( lstat ( bindir , & st ) < 0 ) {
if ( errno ! = ENOENT )
return log_error_errno ( errno , " Failed to stat /bin directory below image: %m " ) ;
/* ENOENT is fine, just means the image is probably just a naked /usr and we can create the
* rest . */
} else if ( S_ISDIR ( st . st_mode ) )
return log_error_errno ( SYNTHETIC_ERRNO ( EISDIR ) ,
" Sorry, --volatile=yes mode is not supported with OS images that have not merged /bin/, /sbin/, /lib/, /lib64/ into /usr/. "
" Please work with your distribution and help them adopt the merged /usr scheme. " ) ;
else if ( ! S_ISLNK ( st . st_mode ) )
return log_error_errno ( SYNTHETIC_ERRNO ( EINVAL ) ,
" Error starting image: if --volatile=yes is used /bin must be a symlink (for merged /usr support) or non-existent (in which case a symlink is created automatically). " ) ;
2015-09-07 15:59:52 +02:00
if ( ! mkdtemp ( template ) )
return log_error_errno ( errno , " Failed to create temporary directory: %m " ) ;
2020-04-14 15:39:36 +02:00
options = " mode=755 " TMPFS_LIMITS_ROOTFS ;
nspawn: Simplify tmpfs_patch_options() usage, and trickle that up
One of the things that tmpfs_patch_options does is take an (optional) UID,
and insert "uid=${UID},gid=${UID}" into the options string. So we need a
uid_t argument, and a way of telling if we should use it. Fortunately,
that is built in to the uid_t type by having UID_INVALID as a possible
value.
So this is really a feature that requires one argument. Yet, it is somehow
taking 4! That is absurd. Simplify it to only take one argument, and have
that trickle all the way up to mount_all()'s usage.
Now, in may of the uses, the argument becomes
uid_shift == 0 ? UID_INVALID : uid_shift
because it used to treat uid_shift=0 as invalid unless the patch_ids flag
was also set. This keeps the behavior the same. Note that in all cases
where it is invoked, if !use_userns (sometimes called !userns), then
uid_shift is 0; we don't have to add any checks for that.
That said, I'm pretty sure that "uid=0" and not setting "uid=" are the
same, but Christian Brauner seemed to not think so when implementing the
cgns support. https://github.com/systemd/systemd/pull/3589
2017-06-14 00:06:09 +02:00
r = tmpfs_patch_options ( options , uid_shift = = 0 ? UID_INVALID : uid_shift , selinux_apifs_context , & buf ) ;
2015-09-07 15:59:52 +02:00
if ( r < 0 )
2018-12-19 01:01:46 +01:00
goto fail ;
2015-09-07 15:59:52 +02:00
if ( r > 0 )
options = buf ;
2020-09-22 15:51:17 +02:00
r = mount_nofollow_verbose ( LOG_ERR , " tmpfs " , template , " tmpfs " , MS_STRICTATIME , options ) ;
nspawn,mount-util: add [u]mount_verbose and use it in nspawn
This makes it easier to debug failed nspawn invocations:
Mounting sysfs on /var/lib/machines/fedora-rawhide/sys (MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV "")...
Mounting tmpfs on /var/lib/machines/fedora-rawhide/dev (MS_NOSUID|MS_STRICTATIME "mode=755,uid=1450901504,gid=1450901504")...
Mounting tmpfs on /var/lib/machines/fedora-rawhide/dev/shm (MS_NOSUID|MS_NODEV|MS_STRICTATIME "mode=1777,uid=1450901504,gid=1450901504")...
Mounting tmpfs on /var/lib/machines/fedora-rawhide/run (MS_NOSUID|MS_NODEV|MS_STRICTATIME "mode=755,uid=1450901504,gid=1450901504")...
Bind-mounting /sys/fs/selinux on /var/lib/machines/fedora-rawhide/sys/fs/selinux (MS_BIND "")...
Remounting /var/lib/machines/fedora-rawhide/sys/fs/selinux (MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_BIND|MS_REMOUNT "")...
Mounting proc on /proc (MS_NOSUID|MS_NOEXEC|MS_NODEV "")...
Bind-mounting /proc/sys on /proc/sys (MS_BIND "")...
Remounting /proc/sys (MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_BIND|MS_REMOUNT "")...
Bind-mounting /proc/sysrq-trigger on /proc/sysrq-trigger (MS_BIND "")...
Remounting /proc/sysrq-trigger (MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_BIND|MS_REMOUNT "")...
Mounting tmpfs on /tmp (MS_STRICTATIME "mode=1777,uid=0,gid=0")...
Mounting tmpfs on /sys/fs/cgroup (MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME "mode=755,uid=0,gid=0")...
Mounting cgroup on /sys/fs/cgroup/systemd (MS_NOSUID|MS_NOEXEC|MS_NODEV "none,name=systemd,xattr")...
Failed to mount cgroup on /sys/fs/cgroup/systemd (MS_NOSUID|MS_NOEXEC|MS_NODEV "none,name=systemd,xattr"): No such file or directory
2016-10-10 21:55:20 +02:00
if ( r < 0 )
2015-09-07 15:59:52 +02:00
goto fail ;
tmpfs_mounted = true ;
f = prefix_roota ( directory , " /usr " ) ;
t = prefix_roota ( template , " /usr " ) ;
r = mkdir ( t , 0755 ) ;
if ( r < 0 & & errno ! = EEXIST ) {
r = log_error_errno ( errno , " Failed to create %s: %m " , t ) ;
goto fail ;
}
2020-09-22 15:51:17 +02:00
r = mount_nofollow_verbose ( LOG_ERR , f , t , NULL , MS_BIND | MS_REC , NULL ) ;
nspawn,mount-util: add [u]mount_verbose and use it in nspawn
This makes it easier to debug failed nspawn invocations:
Mounting sysfs on /var/lib/machines/fedora-rawhide/sys (MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV "")...
Mounting tmpfs on /var/lib/machines/fedora-rawhide/dev (MS_NOSUID|MS_STRICTATIME "mode=755,uid=1450901504,gid=1450901504")...
Mounting tmpfs on /var/lib/machines/fedora-rawhide/dev/shm (MS_NOSUID|MS_NODEV|MS_STRICTATIME "mode=1777,uid=1450901504,gid=1450901504")...
Mounting tmpfs on /var/lib/machines/fedora-rawhide/run (MS_NOSUID|MS_NODEV|MS_STRICTATIME "mode=755,uid=1450901504,gid=1450901504")...
Bind-mounting /sys/fs/selinux on /var/lib/machines/fedora-rawhide/sys/fs/selinux (MS_BIND "")...
Remounting /var/lib/machines/fedora-rawhide/sys/fs/selinux (MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_BIND|MS_REMOUNT "")...
Mounting proc on /proc (MS_NOSUID|MS_NOEXEC|MS_NODEV "")...
Bind-mounting /proc/sys on /proc/sys (MS_BIND "")...
Remounting /proc/sys (MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_BIND|MS_REMOUNT "")...
Bind-mounting /proc/sysrq-trigger on /proc/sysrq-trigger (MS_BIND "")...
Remounting /proc/sysrq-trigger (MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_BIND|MS_REMOUNT "")...
Mounting tmpfs on /tmp (MS_STRICTATIME "mode=1777,uid=0,gid=0")...
Mounting tmpfs on /sys/fs/cgroup (MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME "mode=755,uid=0,gid=0")...
Mounting cgroup on /sys/fs/cgroup/systemd (MS_NOSUID|MS_NOEXEC|MS_NODEV "none,name=systemd,xattr")...
Failed to mount cgroup on /sys/fs/cgroup/systemd (MS_NOSUID|MS_NOEXEC|MS_NODEV "none,name=systemd,xattr"): No such file or directory
2016-10-10 21:55:20 +02:00
if ( r < 0 )
2015-09-07 15:59:52 +02:00
goto fail ;
bind_mounted = true ;
2019-03-25 17:04:38 +01:00
r = bind_remount_recursive ( t , MS_RDONLY , MS_RDONLY , NULL ) ;
2015-09-07 15:59:52 +02:00
if ( r < 0 ) {
log_error_errno ( r , " Failed to remount %s read-only: %m " , t ) ;
goto fail ;
}
2020-09-22 15:51:17 +02:00
r = mount_nofollow_verbose ( LOG_ERR , template , directory , NULL , MS_MOVE , NULL ) ;
nspawn,mount-util: add [u]mount_verbose and use it in nspawn
This makes it easier to debug failed nspawn invocations:
Mounting sysfs on /var/lib/machines/fedora-rawhide/sys (MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV "")...
Mounting tmpfs on /var/lib/machines/fedora-rawhide/dev (MS_NOSUID|MS_STRICTATIME "mode=755,uid=1450901504,gid=1450901504")...
Mounting tmpfs on /var/lib/machines/fedora-rawhide/dev/shm (MS_NOSUID|MS_NODEV|MS_STRICTATIME "mode=1777,uid=1450901504,gid=1450901504")...
Mounting tmpfs on /var/lib/machines/fedora-rawhide/run (MS_NOSUID|MS_NODEV|MS_STRICTATIME "mode=755,uid=1450901504,gid=1450901504")...
Bind-mounting /sys/fs/selinux on /var/lib/machines/fedora-rawhide/sys/fs/selinux (MS_BIND "")...
Remounting /var/lib/machines/fedora-rawhide/sys/fs/selinux (MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_BIND|MS_REMOUNT "")...
Mounting proc on /proc (MS_NOSUID|MS_NOEXEC|MS_NODEV "")...
Bind-mounting /proc/sys on /proc/sys (MS_BIND "")...
Remounting /proc/sys (MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_BIND|MS_REMOUNT "")...
Bind-mounting /proc/sysrq-trigger on /proc/sysrq-trigger (MS_BIND "")...
Remounting /proc/sysrq-trigger (MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_BIND|MS_REMOUNT "")...
Mounting tmpfs on /tmp (MS_STRICTATIME "mode=1777,uid=0,gid=0")...
Mounting tmpfs on /sys/fs/cgroup (MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_STRICTATIME "mode=755,uid=0,gid=0")...
Mounting cgroup on /sys/fs/cgroup/systemd (MS_NOSUID|MS_NOEXEC|MS_NODEV "none,name=systemd,xattr")...
Failed to mount cgroup on /sys/fs/cgroup/systemd (MS_NOSUID|MS_NOEXEC|MS_NODEV "none,name=systemd,xattr"): No such file or directory
2016-10-10 21:55:20 +02:00
if ( r < 0 )
2015-09-07 15:59:52 +02:00
goto fail ;
( void ) rmdir ( template ) ;
return 0 ;
fail :
if ( bind_mounted )
2020-09-22 16:32:07 +02:00
( void ) umount_verbose ( LOG_ERR , t , UMOUNT_NOFOLLOW ) ;
2015-09-07 15:59:52 +02:00
if ( tmpfs_mounted )
2020-09-22 16:32:07 +02:00
( void ) umount_verbose ( LOG_ERR , template , UMOUNT_NOFOLLOW ) ;
2015-09-07 15:59:52 +02:00
( void ) rmdir ( template ) ;
return r ;
}
2017-02-08 16:54:31 +01:00
2019-12-07 11:59:59 +01:00
static int setup_volatile_overlay ( const char * directory , uid_t uid_shift , const char * selinux_apifs_context ) {
2018-12-19 01:02:06 +01:00
_cleanup_free_ char * buf = NULL , * escaped_directory = NULL , * escaped_upper = NULL , * escaped_work = NULL ;
char template [ ] = " /tmp/nspawn-volatile-XXXXXX " ;
const char * upper , * work , * options ;
bool tmpfs_mounted = false ;
int r ;
assert ( directory ) ;
/* --volatile=overlay means we mount an overlayfs to the root dir. */
if ( ! mkdtemp ( template ) )
return log_error_errno ( errno , " Failed to create temporary directory: %m " ) ;
2020-04-14 15:39:36 +02:00
options = " mode=755 " TMPFS_LIMITS_ROOTFS ;
2018-12-19 01:02:06 +01:00
r = tmpfs_patch_options ( options , uid_shift = = 0 ? UID_INVALID : uid_shift , selinux_apifs_context , & buf ) ;
if ( r < 0 )
goto finish ;
if ( r > 0 )
options = buf ;
2020-09-22 15:51:17 +02:00
r = mount_nofollow_verbose ( LOG_ERR , " tmpfs " , template , " tmpfs " , MS_STRICTATIME , options ) ;
2018-12-19 01:02:06 +01:00
if ( r < 0 )
goto finish ;
tmpfs_mounted = true ;
upper = strjoina ( template , " /upper " ) ;
work = strjoina ( template , " /work " ) ;
if ( mkdir ( upper , 0755 ) < 0 ) {
r = log_error_errno ( errno , " Failed to create %s: %m " , upper ) ;
goto finish ;
}
if ( mkdir ( work , 0755 ) < 0 ) {
r = log_error_errno ( errno , " Failed to create %s: %m " , work ) ;
goto finish ;
}
/* And now, let's overmount the root dir with an overlayfs that uses the root dir as lower dir. It's kinda nice
* that the kernel allows us to do that without going through some mount point rearrangements . */
escaped_directory = shell_escape ( directory , " ,: " ) ;
escaped_upper = shell_escape ( upper , " ,: " ) ;
escaped_work = shell_escape ( work , " ,: " ) ;
if ( ! escaped_directory | | ! escaped_upper | | ! escaped_work ) {
r = - ENOMEM ;
goto finish ;
}
options = strjoina ( " lowerdir= " , escaped_directory , " ,upperdir= " , escaped_upper , " ,workdir= " , escaped_work ) ;
2020-09-22 15:51:17 +02:00
r = mount_nofollow_verbose ( LOG_ERR , " overlay " , directory , " overlay " , 0 , options ) ;
2018-12-19 01:02:06 +01:00
finish :
if ( tmpfs_mounted )
2020-09-22 16:32:07 +02:00
( void ) umount_verbose ( LOG_ERR , template , UMOUNT_NOFOLLOW ) ;
2018-12-19 01:02:06 +01:00
( void ) rmdir ( template ) ;
return r ;
}
2018-12-19 00:09:57 +01:00
/* Dispatches to the setup routine matching the requested volatile mode. Modes other than
 * VOLATILE_YES, VOLATILE_STATE and VOLATILE_OVERLAY require no setup and yield 0; otherwise the
 * result of the selected routine is returned unchanged. */
int setup_volatile_mode(
                const char *directory,
                VolatileMode mode,
                uid_t uid_shift,
                const char *selinux_apifs_context) {

        if (mode == VOLATILE_YES)
                return setup_volatile_yes(directory, uid_shift, selinux_apifs_context);

        if (mode == VOLATILE_STATE)
                return setup_volatile_state(directory, uid_shift, selinux_apifs_context);

        if (mode == VOLATILE_OVERLAY)
                return setup_volatile_overlay(directory, uid_shift, selinux_apifs_context);

        /* Nothing to set up for any other mode. */
        return 0;
}
2017-02-08 16:54:31 +01:00
/* Parses a "NEW[:OLD]" pivot root specification from s.
 *
 * Expects *pivot_root_new and *pivot_root_old to be initialised to allocated memory or NULL. On success
 * both are replaced: *pivot_root_new receives the part before the first ":", *pivot_root_old the
 * remainder, or NULL if there is none.
 *
 * Returns 0 on success, -EINVAL if no first word could be extracted or if either path is not
 * absolute, -ENOMEM on allocation failure, or the negative error from extract_first_word(). On
 * failure the output parameters are left untouched. */
int pivot_root_parse(char **pivot_root_new, char **pivot_root_old, const char *s) {
        _cleanup_free_ char *root_new = NULL, *root_old = NULL;
        const char *p = s;
        int r;

        assert(pivot_root_new);
        assert(pivot_root_old);

        r = extract_first_word(&p, &root_new, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
        if (r < 0)
                return r;
        if (r == 0)
                return -EINVAL;

        /* Whatever follows the first ":" is the old root; root_old simply stays NULL if it is empty. */
        if (!isempty(p)) {
                root_old = strdup(p);
                if (!root_old)
                        return -ENOMEM;
        }

        if (!path_is_absolute(root_new))
                return -EINVAL;
        if (root_old && !path_is_absolute(root_old))
                return -EINVAL;

        free_and_replace(*pivot_root_new, root_new);
        free_and_replace(*pivot_root_old, root_old);

        return 0;
}
int setup_pivot_root ( const char * directory , const char * pivot_root_new , const char * pivot_root_old ) {
_cleanup_free_ char * directory_pivot_root_new = NULL ;
_cleanup_free_ char * pivot_tmp_pivot_root_old = NULL ;
char pivot_tmp [ ] = " /tmp/nspawn-pivot-XXXXXX " ;
bool remove_pivot_tmp = false ;
int r ;
assert ( directory ) ;
if ( ! pivot_root_new )
return 0 ;
/* Pivot pivot_root_new to / and the existing / to pivot_root_old.
* If pivot_root_old is NULL , the existing / disappears .
* This requires a temporary directory , pivot_tmp , which is
* not a child of either .
*
* This is typically used for OSTree - style containers , where
* the root partition contains several sysroots which could be
* run . Normally , one would be chosen by the bootloader and
* pivoted to / by initramfs .
*
* For example , for an OSTree deployment , pivot_root_new
* would be : / ostree / deploy / $ os / deploy / $ checksum . Note that this
* code doesn ’ t do the / var mount which OSTree expects : use
* - - bind + / sysroot / ostree / deploy / $ os / var : / var for that .
*
* So in the OSTree case , we ’ ll end up with something like :
* - directory = / tmp / nspawn - root - 123456
* - pivot_root_new = / ostree / deploy / os / deploy / 123 abc
* - pivot_root_old = / sysroot
* - directory_pivot_root_new =
* / tmp / nspawn - root - 123456 / ostree / deploy / os / deploy / 123 abc
* - pivot_tmp = / tmp / nspawn - pivot - 123456
* - pivot_tmp_pivot_root_old = / tmp / nspawn - pivot - 123456 / sysroot
*
* Requires all file systems at directory and below to be mounted
* MS_PRIVATE or MS_SLAVE so they can be moved .
*/
2019-06-19 15:20:13 +02:00
directory_pivot_root_new = path_join ( directory , pivot_root_new ) ;
if ( ! directory_pivot_root_new )
return log_oom ( ) ;
2017-02-08 16:54:31 +01:00
/* Remount directory_pivot_root_new to make it movable. */
2020-09-22 15:51:17 +02:00
r = mount_nofollow_verbose ( LOG_ERR , directory_pivot_root_new , directory_pivot_root_new , NULL , MS_BIND , NULL ) ;
2017-02-08 16:54:31 +01:00
if ( r < 0 )
goto done ;
if ( pivot_root_old ) {
if ( ! mkdtemp ( pivot_tmp ) ) {
r = log_error_errno ( errno , " Failed to create temporary directory: %m " ) ;
goto done ;
}
remove_pivot_tmp = true ;
2019-06-19 15:20:13 +02:00
pivot_tmp_pivot_root_old = path_join ( pivot_tmp , pivot_root_old ) ;
if ( ! pivot_tmp_pivot_root_old ) {
r = log_oom ( ) ;
goto done ;
}
2017-02-08 16:54:31 +01:00
2020-09-22 15:51:17 +02:00
r = mount_nofollow_verbose ( LOG_ERR , directory_pivot_root_new , pivot_tmp , NULL , MS_MOVE , NULL ) ;
2017-02-08 16:54:31 +01:00
if ( r < 0 )
goto done ;
2020-09-22 15:51:17 +02:00
r = mount_nofollow_verbose ( LOG_ERR , directory , pivot_tmp_pivot_root_old , NULL , MS_MOVE , NULL ) ;
2017-02-08 16:54:31 +01:00
if ( r < 0 )
goto done ;
2020-09-22 15:51:17 +02:00
r = mount_nofollow_verbose ( LOG_ERR , pivot_tmp , directory , NULL , MS_MOVE , NULL ) ;
2017-02-08 16:54:31 +01:00
if ( r < 0 )
goto done ;
} else {
2020-09-22 15:51:17 +02:00
r = mount_nofollow_verbose ( LOG_ERR , directory_pivot_root_new , directory , NULL , MS_MOVE , NULL ) ;
2017-02-08 16:54:31 +01:00
if ( r < 0 )
goto done ;
}
done :
if ( remove_pivot_tmp )
( void ) rmdir ( pivot_tmp ) ;
return r ;
}