seccomp updates for v5.11-rc1

- Improve seccomp performance via constant-action bitmaps (YiFei Zhu & Kees Cook)
 
 - Fix bogus __user annotations (Jann Horn)
 
 - Add missed CONFIG for improved selftest coverage (Mickaël Salaün)
 -----BEGIN PGP SIGNATURE-----
 
 iQIzBAABCgAdFiEEpcP2jyKd1g9yPm4TiXL039xtwCYFAl/ZG5IACgkQiXL039xt
 wCbhuw/+P77jwT/p1DRnKp5vG7TXTqqXrdhQZYNyBUxRaKSGCEMydvJn/h3KscyW
 4eEy9vZKTAhIQg5oI5OXZ9jxzFdpxEg8lMPSKReNEga3d0//H9gOJHYc782D/bf1
 +6x6I4qWv+LMM/52P60gznBH+3WFVtyM5Jw+LF5igOCEVSERoZ3ChsmdSZgkALG0
 DJXKL+Dy1Wj9ESeBtuh1UsKoh4ADTAoPC+LvfGuxn2T+VtnxX/sOSDkkrpHfX+2J
 UKkIgWJHeNmq74nwWjpNuDz24ARTiVWOVQX01nOHRohtu39TZcpU774Pdp4Dsj2W
 oDDwOzIWp4/27aQxkOKv6NXMwd29XbrpH1gweyuvQh9cohSbzx6qZlXujqyd9izs
 6Nh74mvC3cns6sQWSWz5ddU4dMQ4rNjpD2CK1P8A7ZVTfH+5baaPmF8CRp126E6f
 /MAUk7Rfbe6YfYdfMwhXXhTvus0e5yenGFXr46gasJDfGnyy4cLS/MO7AZ+mR0CB
 d9DnrsIJVggL5cZ2LZmivIng18JWnbkgnenmHSXahdLstmYVkdpo4ckBl1G/dXK0
 lDmi9j9FoTxB6OrztEKA0RZB+C1e6q7X7euwsHjgF9XKgD5S+DdeYwqd2lypjyvb
 d9VNLFdngD0CRY7wcJZKRma+yPemlPNurdMjF9LrqaAu232G1UA=
 =jJwG
 -----END PGP SIGNATURE-----

Merge tag 'seccomp-v5.11-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux

Pull seccomp updates from Kees Cook:
 "The major change here is finally gaining seccomp constant-action
  bitmaps, which internally reduces the seccomp overhead for many
  real-world syscall filters to O(1), as discussed at Plumbers this
  year.

   - Improve seccomp performance via constant-action bitmaps (YiFei Zhu
     & Kees Cook)

   - Fix bogus __user annotations (Jann Horn)

   - Add missed CONFIG for improved selftest coverage (Mickaël Salaün)"

* tag 'seccomp-v5.11-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux:
  selftests/seccomp: Update kernel config
  seccomp: Remove bogus __user annotations
  seccomp/cache: Report cache data through /proc/pid/seccomp_cache
  xtensa: Enable seccomp architecture tracking
  sh: Enable seccomp architecture tracking
  s390: Enable seccomp architecture tracking
  riscv: Enable seccomp architecture tracking
  powerpc: Enable seccomp architecture tracking
  parisc: Enable seccomp architecture tracking
  csky: Enable seccomp architecture tracking
  arm: Enable seccomp architecture tracking
  arm64: Enable seccomp architecture tracking
  selftests/seccomp: Compare bitmap vs filter overhead
  x86: Enable seccomp architecture tracking
  seccomp/cache: Add "emulator" to check if filter is constant allow
  seccomp/cache: Lookup syscall allowlist bitmap for fast path
This commit is contained in:
Linus Torvalds 2020-12-16 11:30:10 -08:00
commit e994cc240a
21 changed files with 588 additions and 28 deletions

View file

@ -143,6 +143,38 @@ struct notification {
struct list_head notifications;
};
#ifdef SECCOMP_ARCH_NATIVE
/**
 * struct action_cache - per-filter cache of seccomp actions per
 *			 arch/syscall pair
 *
 * Each bitmap is indexed by syscall number: bit N describes syscall N.
 * A set bit lets seccomp_cache_check_allow() report the syscall as
 * allowed, in which case seccomp_run_filters() returns SECCOMP_RET_ALLOW
 * without evaluating the filter programs.
 *
 * @allow_native: A bitmap where each bit represents whether the
 *		  filter will always allow the syscall, for the
 *		  native architecture. Holds SECCOMP_ARCH_NATIVE_NR bits.
 * @allow_compat: A bitmap where each bit represents whether the
 *		  filter will always allow the syscall, for the
 *		  compat architecture. Holds SECCOMP_ARCH_COMPAT_NR bits
 *		  and only exists when SECCOMP_ARCH_COMPAT is defined.
 */
struct action_cache {
DECLARE_BITMAP(allow_native, SECCOMP_ARCH_NATIVE_NR);
#ifdef SECCOMP_ARCH_COMPAT
DECLARE_BITMAP(allow_compat, SECCOMP_ARCH_COMPAT_NR);
#endif
};
#else
/* Without SECCOMP_ARCH_NATIVE there are no bitmaps: the cache is empty. */
struct action_cache { };
/*
 * Fallback used when SECCOMP_ARCH_NATIVE is not defined: there is no
 * bitmap to consult, so no syscall is ever reported as cached-allowed
 * and callers always proceed to run the filters.
 */
static inline bool seccomp_cache_check_allow(const struct seccomp_filter *sfilter,
const struct seccomp_data *sd)
{
return false;
}
/*
 * Fallback used when SECCOMP_ARCH_NATIVE is not defined: the cache has
 * no members, so there is nothing to compute for @sfilter.
 */
static inline void seccomp_cache_prepare(struct seccomp_filter *sfilter)
{
}
#endif /* SECCOMP_ARCH_NATIVE */
/**
* struct seccomp_filter - container for seccomp BPF programs
*
@ -159,6 +191,7 @@ struct notification {
* this filter after reaching 0. The @users count is always smaller
* or equal to @refs. Hence, reaching 0 for @users does not mean
* the filter can be freed.
* @cache: cache of arch/syscall mappings to actions
* @log: true if all actions except for SECCOMP_RET_ALLOW should be logged
* @prev: points to a previously installed, or inherited, filter
* @prog: the BPF program to evaluate
@ -180,6 +213,7 @@ struct seccomp_filter {
refcount_t refs;
refcount_t users;
bool log;
struct action_cache cache;
struct seccomp_filter *prev;
struct bpf_prog *prog;
struct notification *notif;
@ -298,6 +332,52 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
return 0;
}
#ifdef SECCOMP_ARCH_NATIVE
/*
 * Look up one syscall number in an allow bitmap.
 *
 * @bitmap:      bitmap to test
 * @bitmap_size: number of bits in @bitmap
 * @syscall_nr:  syscall number used as the bit index
 *
 * Returns false when @syscall_nr is negative or not below @bitmap_size;
 * otherwise returns the state of bit @syscall_nr. The index is passed
 * through array_index_nospec() before the bitmap is read.
 */
static inline bool seccomp_cache_check_allow_bitmap(const void *bitmap,
						    size_t bitmap_size,
						    int syscall_nr)
{
	const bool out_of_range = syscall_nr < 0 || syscall_nr >= bitmap_size;
	int slot;

	if (unlikely(out_of_range))
		return false;

	slot = array_index_nospec(syscall_nr, bitmap_size);

	return test_bit(slot, bitmap);
}
/**
 * seccomp_cache_check_allow - lookup seccomp cache
 * @sfilter: The seccomp filter whose cache is consulted
 * @sd: The seccomp data (syscall number and arch) to look up
 *
 * Picks the bitmap matching @sd->arch and tests the bit for @sd->nr.
 * When SECCOMP_ARCH_COMPAT is defined and @sd->arch matches neither the
 * native nor the compat architecture, a one-time warning is emitted.
 *
 * Returns true if the seccomp_data is cached and allowed, false otherwise.
 */
static inline bool seccomp_cache_check_allow(const struct seccomp_filter *sfilter,
					     const struct seccomp_data *sd)
{
	const struct action_cache *cache = &sfilter->cache;
	int nr = sd->nr;

#ifdef SECCOMP_ARCH_COMPAT
	if (likely(sd->arch == SECCOMP_ARCH_NATIVE))
		return seccomp_cache_check_allow_bitmap(cache->allow_native,
							SECCOMP_ARCH_NATIVE_NR,
							nr);
	if (likely(sd->arch == SECCOMP_ARCH_COMPAT))
		return seccomp_cache_check_allow_bitmap(cache->allow_compat,
							SECCOMP_ARCH_COMPAT_NR,
							nr);
#else
	/* Only the native architecture exists; sd->arch need not be checked. */
	return seccomp_cache_check_allow_bitmap(cache->allow_native,
						SECCOMP_ARCH_NATIVE_NR,
						nr);
#endif /* SECCOMP_ARCH_COMPAT */

	/* Reached only when sd->arch matched no known architecture. */
	WARN_ON_ONCE(true);
	return false;
}
#endif /* SECCOMP_ARCH_NATIVE */
/**
* seccomp_run_filters - evaluates all seccomp filters against @sd
* @sd: optional seccomp data to be passed to filters
@ -320,6 +400,9 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd,
if (WARN_ON(f == NULL))
return SECCOMP_RET_KILL_PROCESS;
if (seccomp_cache_check_allow(f, sd))
return SECCOMP_RET_ALLOW;
/*
* All filters in the list are evaluated and the lowest BPF return
* value always takes priority (ignoring the DATA).
@ -470,6 +553,9 @@ void seccomp_filter_release(struct task_struct *tsk)
{
struct seccomp_filter *orig = tsk->seccomp.filter;
/* We are effectively holding the siglock by not having any sighand. */
WARN_ON(tsk->sighand != NULL);
/* Detach task from its filter tree. */
tsk->seccomp.filter = NULL;
__seccomp_filter_release(orig);
@ -544,7 +630,12 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
{
struct seccomp_filter *sfilter;
int ret;
const bool save_orig = IS_ENABLED(CONFIG_CHECKPOINT_RESTORE);
const bool save_orig =
#if defined(CONFIG_CHECKPOINT_RESTORE) || defined(SECCOMP_ARCH_NATIVE)
true;
#else
false;
#endif
if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
return ERR_PTR(-EINVAL);
@ -609,6 +700,148 @@ out:
return filter;
}
#ifdef SECCOMP_ARCH_NATIVE
/**
 * seccomp_is_const_allow - check if filter is constant allow with given data
 * @fprog: The BPF programs
 * @sd: The seccomp data to check against, only syscall number and arch
 *      number are considered constant.
 *
 * Steps through @fprog one instruction at a time, tracking a single
 * emulated value. Only loads of the nr/arch fields, constant returns,
 * unconditional jumps, the JEQ/JGE/JGT/JSET constant comparisons and
 * AND-with-constant are understood; any other instruction ends the
 * emulation with a negative answer.
 *
 * Returns true only if a constant return of SECCOMP_RET_ALLOW is reached.
 */
static bool seccomp_is_const_allow(struct sock_fprog_kern *fprog,
				   struct seccomp_data *sd)
{
	unsigned int acc = 0;	/* value loaded/modified by emulated insns */
	unsigned int idx;

	if (WARN_ON_ONCE(!fprog))
		return false;

	for (idx = 0; idx < fprog->len; idx++) {
		struct sock_filter *ins = &fprog->filter[idx];
		u32 imm = ins->k;
		bool taken;

		/*
		 * Non-branching instructions "continue" to the next slot;
		 * conditional jumps "break" out to the shared offset update
		 * below the switch.
		 */
		switch (ins->code) {
		case BPF_LD | BPF_W | BPF_ABS:
			if (imm == offsetof(struct seccomp_data, nr))
				acc = sd->nr;
			else if (imm == offsetof(struct seccomp_data, arch))
				acc = sd->arch;
			else
				/* can't optimize (non-constant value load) */
				return false;
			continue;
		case BPF_RET | BPF_K:
			/* reached return with constant values only, check allow */
			return imm == SECCOMP_RET_ALLOW;
		case BPF_JMP | BPF_JA:
			idx += imm;
			continue;
		case BPF_ALU | BPF_AND | BPF_K:
			acc &= imm;
			continue;
		case BPF_JMP | BPF_JEQ | BPF_K:
			taken = acc == imm;
			break;
		case BPF_JMP | BPF_JGE | BPF_K:
			taken = acc >= imm;
			break;
		case BPF_JMP | BPF_JGT | BPF_K:
			taken = acc > imm;
			break;
		case BPF_JMP | BPF_JSET | BPF_K:
			taken = (acc & imm) != 0;
			break;
		default:
			/* can't optimize (unknown insn) */
			return false;
		}

		/* Conditional jump: skip ahead by the matching branch offset. */
		idx += taken ? ins->jt : ins->jf;
	}

	/* ran off the end of the filter?! */
	WARN_ON(1);
	return false;
}
/*
 * Compute one architecture's allow bitmap for @sfilter.
 *
 * @sfilter:     filter whose original program (prog->orig_prog) is emulated
 * @bitmap:      bitmap to fill in
 * @bitmap_prev: bitmap of the previous filter, or NULL if there is none
 * @bitmap_size: number of bits (syscall numbers) in the bitmaps
 * @arch:        architecture value placed in the emulated seccomp_data
 *
 * The bitmap starts as a copy of @bitmap_prev (or all ones without a
 * previous filter); every bit still set is then cleared unless
 * seccomp_is_const_allow() reports the syscall as constant-allow.
 */
static void seccomp_cache_prepare_bitmap(struct seccomp_filter *sfilter,
					 void *bitmap, const void *bitmap_prev,
					 size_t bitmap_size, int arch)
{
	struct sock_fprog_kern *fprog = sfilter->prog->orig_prog;
	struct seccomp_data sd;
	int nr;

	if (!bitmap_prev) {
		/* Before any filters, all syscalls are always allowed. */
		bitmap_fill(bitmap, bitmap_size);
	} else {
		/* The new filter must be as restrictive as the last. */
		bitmap_copy(bitmap, bitmap_prev, bitmap_size);
	}

	for (nr = 0; nr < bitmap_size; nr++) {
		/* Already not cacheable: nothing this filter can change. */
		if (test_bit(nr, bitmap)) {
			sd.nr = nr;
			sd.arch = arch;

			/*
			 * Keep the bit only while the filter still always
			 * allows; otherwise the filters must always run.
			 * atomic clear_bit() not needed, filter not visible
			 * yet.
			 */
			if (!seccomp_is_const_allow(fprog, &sd))
				__clear_bit(nr, bitmap);
		}
	}
}
/**
 * seccomp_cache_prepare - emulate the filter to find cacheable syscalls
 * @sfilter: The seccomp filter
 *
 * Fills @sfilter->cache: the native bitmap always, and the compat bitmap
 * when SECCOMP_ARCH_COMPAT is defined. The bitmaps of @sfilter->prev, if
 * there is a previous filter, are used as the starting point, so
 * @sfilter->prev has to be set before this is called.
 *
 * This function does not return a value.
 */
static void seccomp_cache_prepare(struct seccomp_filter *sfilter)
{
	struct action_cache *cache = &sfilter->cache;
	const struct action_cache *cache_prev =
		sfilter->prev ? &sfilter->prev->cache : NULL;

	seccomp_cache_prepare_bitmap(sfilter, cache->allow_native,
				     cache_prev ? cache_prev->allow_native : NULL,
				     SECCOMP_ARCH_NATIVE_NR,
				     SECCOMP_ARCH_NATIVE);

#ifdef SECCOMP_ARCH_COMPAT
	seccomp_cache_prepare_bitmap(sfilter, cache->allow_compat,
				     cache_prev ? cache_prev->allow_compat : NULL,
				     SECCOMP_ARCH_COMPAT_NR,
				     SECCOMP_ARCH_COMPAT);
#endif /* SECCOMP_ARCH_COMPAT */
}
#endif /* SECCOMP_ARCH_NATIVE */
/**
* seccomp_attach_filter: validate and attach filter
* @flags: flags to change filter behavior
@ -658,6 +891,7 @@ static long seccomp_attach_filter(unsigned int flags,
* task reference.
*/
filter->prev = current->seccomp.filter;
seccomp_cache_prepare(filter);
current->seccomp.filter = filter;
atomic_inc(&current->seccomp.filter_count);
@ -1967,7 +2201,7 @@ static bool seccomp_actions_logged_from_names(u32 *actions_logged, char *names)
return true;
}
static int read_actions_logged(struct ctl_table *ro_table, void __user *buffer,
static int read_actions_logged(struct ctl_table *ro_table, void *buffer,
size_t *lenp, loff_t *ppos)
{
char names[sizeof(seccomp_actions_avail)];
@ -1985,7 +2219,7 @@ static int read_actions_logged(struct ctl_table *ro_table, void __user *buffer,
return proc_dostring(&table, 0, buffer, lenp, ppos);
}
static int write_actions_logged(struct ctl_table *ro_table, void __user *buffer,
static int write_actions_logged(struct ctl_table *ro_table, void *buffer,
size_t *lenp, loff_t *ppos, u32 *actions_logged)
{
char names[sizeof(seccomp_actions_avail)];
@ -2103,3 +2337,59 @@ static int __init seccomp_sysctl_init(void)
device_initcall(seccomp_sysctl_init)
#endif /* CONFIG_SYSCTL */
#ifdef CONFIG_SECCOMP_CACHE_DEBUG
/* Currently CONFIG_SECCOMP_CACHE_DEBUG implies SECCOMP_ARCH_NATIVE */
/*
 * Print one line per syscall number of @bitmap to @m, in the form
 * "<name> <nr> ALLOW" when the bit is set and "<name> <nr> FILTER"
 * when it is clear.
 */
static void proc_pid_seccomp_cache_arch(struct seq_file *m, const char *name,
					const void *bitmap, size_t bitmap_size)
{
	int nr = 0;

	while (nr < bitmap_size) {
		const char *status = test_bit(nr, bitmap) ? "ALLOW" : "FILTER";

		seq_printf(m, "%s %d %s\n", name, nr, status);
		nr++;
	}
}
/*
 * Dump the action cache of @task's current seccomp filter to @m.
 *
 * Returns -EACCES if the opener of @m lacks CAP_SYS_ADMIN in the initial
 * user namespace, -ESRCH if the task's sighand lock cannot be taken, and
 * 0 otherwise (including when the task has no filter, in which case
 * nothing is printed).
 */
int proc_pid_seccomp_cache(struct seq_file *m, struct pid_namespace *ns,
			   struct pid *pid, struct task_struct *task)
{
	struct seccomp_filter *filter;
	unsigned long lock_flags;

	/*
	 * We don't want some sandboxed process to know what their seccomp
	 * filters consist of.
	 */
	if (!file_ns_capable(m->file, &init_user_ns, CAP_SYS_ADMIN))
		return -EACCES;

	if (!lock_task_sighand(task, &lock_flags))
		return -ESRCH;

	filter = READ_ONCE(task->seccomp.filter);
	/* prevent filter from being freed while we are printing it */
	if (filter)
		__get_seccomp_filter(filter);
	unlock_task_sighand(task, &lock_flags);

	if (!filter)
		return 0;

	proc_pid_seccomp_cache_arch(m, SECCOMP_ARCH_NATIVE_NAME,
				    filter->cache.allow_native,
				    SECCOMP_ARCH_NATIVE_NR);

#ifdef SECCOMP_ARCH_COMPAT
	proc_pid_seccomp_cache_arch(m, SECCOMP_ARCH_COMPAT_NAME,
				    filter->cache.allow_compat,
				    SECCOMP_ARCH_COMPAT_NR);
#endif /* SECCOMP_ARCH_COMPAT */

	__put_seccomp_filter(filter);
	return 0;
}
#endif /* CONFIG_SECCOMP_CACHE_DEBUG */