seccomp updates for v5.11-rc1
- Improve seccomp performance via constant-action bitmaps (YiFei Zhu & Kees Cook)
- Fix bogus __user annotations (Jann Horn)
- Add missed CONFIG for improved selftest coverage (Mickaël Salaün)
-----BEGIN PGP SIGNATURE-----
iQIzBAABCgAdFiEEpcP2jyKd1g9yPm4TiXL039xtwCYFAl/ZG5IACgkQiXL039xt
wCbhuw/+P77jwT/p1DRnKp5vG7TXTqqXrdhQZYNyBUxRaKSGCEMydvJn/h3KscyW
4eEy9vZKTAhIQg5oI5OXZ9jxzFdpxEg8lMPSKReNEga3d0//H9gOJHYc782D/bf1
+6x6I4qWv+LMM/52P60gznBH+3WFVtyM5Jw+LF5igOCEVSERoZ3ChsmdSZgkALG0
DJXKL+Dy1Wj9ESeBtuh1UsKoh4ADTAoPC+LvfGuxn2T+VtnxX/sOSDkkrpHfX+2J
UKkIgWJHeNmq74nwWjpNuDz24ARTiVWOVQX01nOHRohtu39TZcpU774Pdp4Dsj2W
oDDwOzIWp4/27aQxkOKv6NXMwd29XbrpH1gweyuvQh9cohSbzx6qZlXujqyd9izs
6Nh74mvC3cns6sQWSWz5ddU4dMQ4rNjpD2CK1P8A7ZVTfH+5baaPmF8CRp126E6f
/MAUk7Rfbe6YfYdfMwhXXhTvus0e5yenGFXr46gasJDfGnyy4cLS/MO7AZ+mR0CB
d9DnrsIJVggL5cZ2LZmivIng18JWnbkgnenmHSXahdLstmYVkdpo4ckBl1G/dXK0
lDmi9j9FoTxB6OrztEKA0RZB+C1e6q7X7euwsHjgF9XKgD5S+DdeYwqd2lypjyvb
d9VNLFdngD0CRY7wcJZKRma+yPemlPNurdMjF9LrqaAu232G1UA=
=jJwG
-----END PGP SIGNATURE-----
Merge tag 'seccomp-v5.11-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux
Pull seccomp updates from Kees Cook:
"The major change here is finally gaining seccomp constant-action
bitmaps, which internally reduces the seccomp overhead for many
real-world syscall filters to O(1), as discussed at Plumbers this
year.
- Improve seccomp performance via constant-action bitmaps (YiFei Zhu
& Kees Cook)
- Fix bogus __user annotations (Jann Horn)
- Add missed CONFIG for improved selftest coverage (Mickaël Salaün)"
* tag 'seccomp-v5.11-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux:
selftests/seccomp: Update kernel config
seccomp: Remove bogus __user annotations
seccomp/cache: Report cache data through /proc/pid/seccomp_cache
xtensa: Enable seccomp architecture tracking
sh: Enable seccomp architecture tracking
s390: Enable seccomp architecture tracking
riscv: Enable seccomp architecture tracking
powerpc: Enable seccomp architecture tracking
parisc: Enable seccomp architecture tracking
csky: Enable seccomp architecture tracking
arm: Enable seccomp architecture tracking
arm64: Enable seccomp architecture tracking
selftests/seccomp: Compare bitmap vs filter overhead
x86: Enable seccomp architecture tracking
seccomp/cache: Add "emulator" to check if filter is constant allow
seccomp/cache: Lookup syscall allowlist bitmap for fast path
This commit is contained in:
commit
e994cc240a
21 changed files with 588 additions and 28 deletions
296
kernel/seccomp.c
296
kernel/seccomp.c
|
|
@ -143,6 +143,38 @@ struct notification {
|
|||
struct list_head notifications;
|
||||
};
|
||||
|
||||
#ifdef SECCOMP_ARCH_NATIVE
|
||||
/**
|
||||
* struct action_cache - per-filter cache of seccomp actions per
|
||||
* arch/syscall pair
|
||||
*
|
||||
* @allow_native: A bitmap where each bit represents whether the
|
||||
* filter will always allow the syscall, for the
|
||||
* native architecture.
|
||||
* @allow_compat: A bitmap where each bit represents whether the
|
||||
* filter will always allow the syscall, for the
|
||||
* compat architecture.
|
||||
*/
|
||||
struct action_cache {
|
||||
DECLARE_BITMAP(allow_native, SECCOMP_ARCH_NATIVE_NR);
|
||||
#ifdef SECCOMP_ARCH_COMPAT
|
||||
DECLARE_BITMAP(allow_compat, SECCOMP_ARCH_COMPAT_NR);
|
||||
#endif
|
||||
};
|
||||
#else
|
||||
/* No SECCOMP_ARCH_NATIVE: the action cache carries no data. */
struct action_cache {
};
|
||||
|
||||
/*
 * Stub used when SECCOMP_ARCH_NATIVE is not defined: there is no cache to
 * consult, so a lookup never reports a cached allow.
 */
static inline bool
seccomp_cache_check_allow(const struct seccomp_filter *sfilter,
			  const struct seccomp_data *sd)
{
	return false;
}
|
||||
|
||||
/*
 * Stub used when SECCOMP_ARCH_NATIVE is not defined: there is no cache to
 * populate, so this does nothing.
 */
static inline void
seccomp_cache_prepare(struct seccomp_filter *sfilter)
{
}
|
||||
#endif /* SECCOMP_ARCH_NATIVE */
|
||||
|
||||
/**
|
||||
* struct seccomp_filter - container for seccomp BPF programs
|
||||
*
|
||||
|
|
@ -159,6 +191,7 @@ struct notification {
|
|||
* this filter after reaching 0. The @users count is always smaller
|
||||
* or equal to @refs. Hence, reaching 0 for @users does not mean
|
||||
* the filter can be freed.
|
||||
* @cache: cache of arch/syscall mappings to actions
|
||||
* @log: true if all actions except for SECCOMP_RET_ALLOW should be logged
|
||||
* @prev: points to a previously installed, or inherited, filter
|
||||
* @prog: the BPF program to evaluate
|
||||
|
|
@ -180,6 +213,7 @@ struct seccomp_filter {
|
|||
refcount_t refs;
|
||||
refcount_t users;
|
||||
bool log;
|
||||
struct action_cache cache;
|
||||
struct seccomp_filter *prev;
|
||||
struct bpf_prog *prog;
|
||||
struct notification *notif;
|
||||
|
|
@ -298,6 +332,52 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
|
|||
return 0;
|
||||
}
|
||||
|
||||
#ifdef SECCOMP_ARCH_NATIVE
|
||||
/*
 * Test a single bit of an allow bitmap.
 *
 * @bitmap:      bitmap to consult
 * @bitmap_size: number of valid bits in @bitmap
 * @syscall_nr:  bit to test
 *
 * Returns false when @syscall_nr is negative or not below @bitmap_size,
 * otherwise the value of bit @syscall_nr in @bitmap.
 */
static inline bool seccomp_cache_check_allow_bitmap(const void *bitmap,
						    size_t bitmap_size,
						    int syscall_nr)
{
	int idx;

	/* Negative numbers are rejected first so the unsigned compare is safe. */
	if (unlikely(syscall_nr < 0))
		return false;
	if (unlikely(syscall_nr >= bitmap_size))
		return false;

	/* Sanitize the already bounds-checked index before it is used. */
	idx = array_index_nospec(syscall_nr, bitmap_size);

	return test_bit(idx, bitmap);
}
|
||||
|
||||
/**
|
||||
* seccomp_cache_check_allow - lookup seccomp cache
|
||||
* @sfilter: The seccomp filter
|
||||
* @sd: The seccomp data to lookup the cache with
|
||||
*
|
||||
* Returns true if the seccomp_data is cached and allowed.
|
||||
*/
|
||||
static inline bool seccomp_cache_check_allow(const struct seccomp_filter *sfilter,
|
||||
const struct seccomp_data *sd)
|
||||
{
|
||||
int syscall_nr = sd->nr;
|
||||
const struct action_cache *cache = &sfilter->cache;
|
||||
|
||||
#ifndef SECCOMP_ARCH_COMPAT
|
||||
/* A native-only architecture doesn't need to check sd->arch. */
|
||||
return seccomp_cache_check_allow_bitmap(cache->allow_native,
|
||||
SECCOMP_ARCH_NATIVE_NR,
|
||||
syscall_nr);
|
||||
#else
|
||||
if (likely(sd->arch == SECCOMP_ARCH_NATIVE))
|
||||
return seccomp_cache_check_allow_bitmap(cache->allow_native,
|
||||
SECCOMP_ARCH_NATIVE_NR,
|
||||
syscall_nr);
|
||||
if (likely(sd->arch == SECCOMP_ARCH_COMPAT))
|
||||
return seccomp_cache_check_allow_bitmap(cache->allow_compat,
|
||||
SECCOMP_ARCH_COMPAT_NR,
|
||||
syscall_nr);
|
||||
#endif /* SECCOMP_ARCH_COMPAT */
|
||||
|
||||
WARN_ON_ONCE(true);
|
||||
return false;
|
||||
}
|
||||
#endif /* SECCOMP_ARCH_NATIVE */
|
||||
|
||||
/**
|
||||
* seccomp_run_filters - evaluates all seccomp filters against @sd
|
||||
* @sd: optional seccomp data to be passed to filters
|
||||
|
|
@ -320,6 +400,9 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd,
|
|||
if (WARN_ON(f == NULL))
|
||||
return SECCOMP_RET_KILL_PROCESS;
|
||||
|
||||
if (seccomp_cache_check_allow(f, sd))
|
||||
return SECCOMP_RET_ALLOW;
|
||||
|
||||
/*
|
||||
* All filters in the list are evaluated and the lowest BPF return
|
||||
* value always takes priority (ignoring the DATA).
|
||||
|
|
@ -470,6 +553,9 @@ void seccomp_filter_release(struct task_struct *tsk)
|
|||
{
|
||||
struct seccomp_filter *orig = tsk->seccomp.filter;
|
||||
|
||||
/* We are effectively holding the siglock by not having any sighand. */
|
||||
WARN_ON(tsk->sighand != NULL);
|
||||
|
||||
/* Detach task from its filter tree. */
|
||||
tsk->seccomp.filter = NULL;
|
||||
__seccomp_filter_release(orig);
|
||||
|
|
@ -544,7 +630,12 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
|
|||
{
|
||||
struct seccomp_filter *sfilter;
|
||||
int ret;
|
||||
const bool save_orig = IS_ENABLED(CONFIG_CHECKPOINT_RESTORE);
|
||||
const bool save_orig =
|
||||
#if defined(CONFIG_CHECKPOINT_RESTORE) || defined(SECCOMP_ARCH_NATIVE)
|
||||
true;
|
||||
#else
|
||||
false;
|
||||
#endif
|
||||
|
||||
if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
|
||||
return ERR_PTR(-EINVAL);
|
||||
|
|
@ -609,6 +700,148 @@ out:
|
|||
return filter;
|
||||
}
|
||||
|
||||
#ifdef SECCOMP_ARCH_NATIVE
|
||||
/**
|
||||
* seccomp_is_const_allow - check if filter is constant allow with given data
|
||||
* @fprog: The BPF programs
|
||||
* @sd: The seccomp data to check against, only syscall number and arch
|
||||
* number are considered constant.
|
||||
*/
|
||||
static bool seccomp_is_const_allow(struct sock_fprog_kern *fprog,
|
||||
struct seccomp_data *sd)
|
||||
{
|
||||
unsigned int reg_value = 0;
|
||||
unsigned int pc;
|
||||
bool op_res;
|
||||
|
||||
if (WARN_ON_ONCE(!fprog))
|
||||
return false;
|
||||
|
||||
for (pc = 0; pc < fprog->len; pc++) {
|
||||
struct sock_filter *insn = &fprog->filter[pc];
|
||||
u16 code = insn->code;
|
||||
u32 k = insn->k;
|
||||
|
||||
switch (code) {
|
||||
case BPF_LD | BPF_W | BPF_ABS:
|
||||
switch (k) {
|
||||
case offsetof(struct seccomp_data, nr):
|
||||
reg_value = sd->nr;
|
||||
break;
|
||||
case offsetof(struct seccomp_data, arch):
|
||||
reg_value = sd->arch;
|
||||
break;
|
||||
default:
|
||||
/* can't optimize (non-constant value load) */
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
case BPF_RET | BPF_K:
|
||||
/* reached return with constant values only, check allow */
|
||||
return k == SECCOMP_RET_ALLOW;
|
||||
case BPF_JMP | BPF_JA:
|
||||
pc += insn->k;
|
||||
break;
|
||||
case BPF_JMP | BPF_JEQ | BPF_K:
|
||||
case BPF_JMP | BPF_JGE | BPF_K:
|
||||
case BPF_JMP | BPF_JGT | BPF_K:
|
||||
case BPF_JMP | BPF_JSET | BPF_K:
|
||||
switch (BPF_OP(code)) {
|
||||
case BPF_JEQ:
|
||||
op_res = reg_value == k;
|
||||
break;
|
||||
case BPF_JGE:
|
||||
op_res = reg_value >= k;
|
||||
break;
|
||||
case BPF_JGT:
|
||||
op_res = reg_value > k;
|
||||
break;
|
||||
case BPF_JSET:
|
||||
op_res = !!(reg_value & k);
|
||||
break;
|
||||
default:
|
||||
/* can't optimize (unknown jump) */
|
||||
return false;
|
||||
}
|
||||
|
||||
pc += op_res ? insn->jt : insn->jf;
|
||||
break;
|
||||
case BPF_ALU | BPF_AND | BPF_K:
|
||||
reg_value &= k;
|
||||
break;
|
||||
default:
|
||||
/* can't optimize (unknown insn) */
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/* ran off the end of the filter?! */
|
||||
WARN_ON(1);
|
||||
return false;
|
||||
}
|
||||
|
||||
static void seccomp_cache_prepare_bitmap(struct seccomp_filter *sfilter,
|
||||
void *bitmap, const void *bitmap_prev,
|
||||
size_t bitmap_size, int arch)
|
||||
{
|
||||
struct sock_fprog_kern *fprog = sfilter->prog->orig_prog;
|
||||
struct seccomp_data sd;
|
||||
int nr;
|
||||
|
||||
if (bitmap_prev) {
|
||||
/* The new filter must be as restrictive as the last. */
|
||||
bitmap_copy(bitmap, bitmap_prev, bitmap_size);
|
||||
} else {
|
||||
/* Before any filters, all syscalls are always allowed. */
|
||||
bitmap_fill(bitmap, bitmap_size);
|
||||
}
|
||||
|
||||
for (nr = 0; nr < bitmap_size; nr++) {
|
||||
/* No bitmap change: not a cacheable action. */
|
||||
if (!test_bit(nr, bitmap))
|
||||
continue;
|
||||
|
||||
sd.nr = nr;
|
||||
sd.arch = arch;
|
||||
|
||||
/* No bitmap change: continue to always allow. */
|
||||
if (seccomp_is_const_allow(fprog, &sd))
|
||||
continue;
|
||||
|
||||
/*
|
||||
* Not a cacheable action: always run filters.
|
||||
* atomic clear_bit() not needed, filter not visible yet.
|
||||
*/
|
||||
__clear_bit(nr, bitmap);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* seccomp_cache_prepare - emulate the filter to find cachable syscalls
|
||||
* @sfilter: The seccomp filter
|
||||
*
|
||||
* Returns 0 if successful or -errno if error occurred.
|
||||
*/
|
||||
static void seccomp_cache_prepare(struct seccomp_filter *sfilter)
|
||||
{
|
||||
struct action_cache *cache = &sfilter->cache;
|
||||
const struct action_cache *cache_prev =
|
||||
sfilter->prev ? &sfilter->prev->cache : NULL;
|
||||
|
||||
seccomp_cache_prepare_bitmap(sfilter, cache->allow_native,
|
||||
cache_prev ? cache_prev->allow_native : NULL,
|
||||
SECCOMP_ARCH_NATIVE_NR,
|
||||
SECCOMP_ARCH_NATIVE);
|
||||
|
||||
#ifdef SECCOMP_ARCH_COMPAT
|
||||
seccomp_cache_prepare_bitmap(sfilter, cache->allow_compat,
|
||||
cache_prev ? cache_prev->allow_compat : NULL,
|
||||
SECCOMP_ARCH_COMPAT_NR,
|
||||
SECCOMP_ARCH_COMPAT);
|
||||
#endif /* SECCOMP_ARCH_COMPAT */
|
||||
}
|
||||
#endif /* SECCOMP_ARCH_NATIVE */
|
||||
|
||||
/**
|
||||
* seccomp_attach_filter: validate and attach filter
|
||||
* @flags: flags to change filter behavior
|
||||
|
|
@ -658,6 +891,7 @@ static long seccomp_attach_filter(unsigned int flags,
|
|||
* task reference.
|
||||
*/
|
||||
filter->prev = current->seccomp.filter;
|
||||
seccomp_cache_prepare(filter);
|
||||
current->seccomp.filter = filter;
|
||||
atomic_inc(¤t->seccomp.filter_count);
|
||||
|
||||
|
|
@ -1967,7 +2201,7 @@ static bool seccomp_actions_logged_from_names(u32 *actions_logged, char *names)
|
|||
return true;
|
||||
}
|
||||
|
||||
static int read_actions_logged(struct ctl_table *ro_table, void __user *buffer,
|
||||
static int read_actions_logged(struct ctl_table *ro_table, void *buffer,
|
||||
size_t *lenp, loff_t *ppos)
|
||||
{
|
||||
char names[sizeof(seccomp_actions_avail)];
|
||||
|
|
@ -1985,7 +2219,7 @@ static int read_actions_logged(struct ctl_table *ro_table, void __user *buffer,
|
|||
return proc_dostring(&table, 0, buffer, lenp, ppos);
|
||||
}
|
||||
|
||||
static int write_actions_logged(struct ctl_table *ro_table, void __user *buffer,
|
||||
static int write_actions_logged(struct ctl_table *ro_table, void *buffer,
|
||||
size_t *lenp, loff_t *ppos, u32 *actions_logged)
|
||||
{
|
||||
char names[sizeof(seccomp_actions_avail)];
|
||||
|
|
@ -2103,3 +2337,59 @@ static int __init seccomp_sysctl_init(void)
|
|||
device_initcall(seccomp_sysctl_init)
|
||||
|
||||
#endif /* CONFIG_SYSCTL */
|
||||
|
||||
#ifdef CONFIG_SECCOMP_CACHE_DEBUG
|
||||
/* Currently CONFIG_SECCOMP_CACHE_DEBUG implies SECCOMP_ARCH_NATIVE */
|
||||
/*
 * Print one line per bit of @bitmap to @m, in the form
 * "<name> <nr> ALLOW" when the bit is set and "<name> <nr> FILTER" when
 * it is clear.
 *
 * @m:           seq_file to print to
 * @name:        label printed at the start of every line
 * @bitmap:      bitmap to dump
 * @bitmap_size: number of bits in @bitmap
 */
static void proc_pid_seccomp_cache_arch(struct seq_file *m, const char *name,
					const void *bitmap, size_t bitmap_size)
{
	int nr;

	for (nr = 0; nr < bitmap_size; nr++) {
		bool cached = test_bit(nr, bitmap);
		/* Points at string literals, so it must be const-qualified. */
		const char *status = cached ? "ALLOW" : "FILTER";

		seq_printf(m, "%s %d %s\n", name, nr, status);
	}
}
|
||||
|
||||
int proc_pid_seccomp_cache(struct seq_file *m, struct pid_namespace *ns,
|
||||
struct pid *pid, struct task_struct *task)
|
||||
{
|
||||
struct seccomp_filter *f;
|
||||
unsigned long flags;
|
||||
|
||||
/*
|
||||
* We don't want some sandboxed process to know what their seccomp
|
||||
* filters consist of.
|
||||
*/
|
||||
if (!file_ns_capable(m->file, &init_user_ns, CAP_SYS_ADMIN))
|
||||
return -EACCES;
|
||||
|
||||
if (!lock_task_sighand(task, &flags))
|
||||
return -ESRCH;
|
||||
|
||||
f = READ_ONCE(task->seccomp.filter);
|
||||
if (!f) {
|
||||
unlock_task_sighand(task, &flags);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* prevent filter from being freed while we are printing it */
|
||||
__get_seccomp_filter(f);
|
||||
unlock_task_sighand(task, &flags);
|
||||
|
||||
proc_pid_seccomp_cache_arch(m, SECCOMP_ARCH_NATIVE_NAME,
|
||||
f->cache.allow_native,
|
||||
SECCOMP_ARCH_NATIVE_NR);
|
||||
|
||||
#ifdef SECCOMP_ARCH_COMPAT
|
||||
proc_pid_seccomp_cache_arch(m, SECCOMP_ARCH_COMPAT_NAME,
|
||||
f->cache.allow_compat,
|
||||
SECCOMP_ARCH_COMPAT_NR);
|
||||
#endif /* SECCOMP_ARCH_COMPAT */
|
||||
|
||||
__put_seccomp_filter(f);
|
||||
return 0;
|
||||
}
|
||||
#endif /* CONFIG_SECCOMP_CACHE_DEBUG */
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue