fork: extend clone3() to support setting a PID
The main motivation to add set_tid to clone3() is CRIU.
To restore a process with the same PID/TID CRIU currently uses
/proc/sys/kernel/ns_last_pid. It writes the desired (PID - 1) to
ns_last_pid and then (quickly) does a clone(). This works most of the
time, but it is racy. It is also slow as it requires multiple syscalls.
Extending clone3() to support *set_tid makes it possible restore a
process using CRIU without accessing /proc/sys/kernel/ns_last_pid and
race free (as long as the desired PID/TID is available).
This clone3() extension places the same restrictions (CAP_SYS_ADMIN)
on clone3() with *set_tid as they are currently in place for ns_last_pid.
The original version of this change was using a single value for
set_tid. At the 2019 LPC, after presenting set_tid, it was, however,
decided to change set_tid to an array to enable setting the PID of a
process in multiple PID namespaces at the same time. If a process is
created in a PID namespace it is possible to influence the PID inside
and outside of the PID namespace. Details also in the corresponding
selftest.
To create a process with the following PIDs:
PID NS level Requested PID
0 (host) 31496
1 42
2 1
For that example the two newly introduced parameters to struct
clone_args (set_tid and set_tid_size) would need to be:
set_tid[0] = 1;
set_tid[1] = 42;
set_tid[2] = 31496;
set_tid_size = 3;
If only the PIDs of the two innermost nested PID namespaces should be
defined it would look like this:
set_tid[0] = 1;
set_tid[1] = 42;
set_tid_size = 2;
The PID of the newly created process would then be the next available
free PID in the PID namespace level 0 (host) and 42 in the PID namespace
at level 1 and the PID of the process in the innermost PID namespace
would be 1.
The set_tid array is used to specify the PID of a process starting
from the innermost nested PID namespaces up to set_tid_size PID namespaces.
set_tid_size cannot be larger then the current PID namespace level.
Signed-off-by: Adrian Reber <areber@redhat.com>
Reviewed-by: Christian Brauner <christian.brauner@ubuntu.com>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Dmitry Safonov <0x7f454c46@gmail.com>
Acked-by: Andrei Vagin <avagin@gmail.com>
Link: https://lore.kernel.org/r/20191115123621.142252-1-areber@redhat.com
Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
This commit is contained in:
parent
17a810699c
commit
49cb2fc42c
7 changed files with 121 additions and 36 deletions
|
|
@ -124,7 +124,8 @@ extern struct pid *find_vpid(int nr);
|
|||
extern struct pid *find_get_pid(int nr);
|
||||
extern struct pid *find_ge_pid(int nr, struct pid_namespace *);
|
||||
|
||||
extern struct pid *alloc_pid(struct pid_namespace *ns);
|
||||
extern struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
|
||||
size_t set_tid_size);
|
||||
extern void free_pid(struct pid *pid);
|
||||
extern void disable_pid_allocation(struct pid_namespace *ns);
|
||||
|
||||
|
|
|
|||
|
|
@ -12,6 +12,8 @@
|
|||
#include <linux/ns_common.h>
|
||||
#include <linux/idr.h>
|
||||
|
||||
/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */
|
||||
#define MAX_PID_NS_LEVEL 32
|
||||
|
||||
struct fs_pin;
|
||||
|
||||
|
|
|
|||
|
|
@ -26,6 +26,9 @@ struct kernel_clone_args {
|
|||
unsigned long stack;
|
||||
unsigned long stack_size;
|
||||
unsigned long tls;
|
||||
pid_t *set_tid;
|
||||
/* Number of elements in *set_tid */
|
||||
size_t set_tid_size;
|
||||
};
|
||||
|
||||
/*
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue