Merge branch 'for-5.14' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup

Pull cgroup updates from Tejun Heo:

 - cgroup.kill is added which implements atomic killing of the whole
   subtree.

   Down the line, this should be able to replace the multiple userland
   implementations of "keep killing till empty".

 - PSI can now be turned off at boot time to avoid overhead for
   configurations which don't care about PSI.

* 'for-5.14' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup:
  cgroup: make per-cgroup pressure stall tracking configurable
  cgroup: Fix kernel-doc
  cgroup: inline cgroup_task_freeze()
  tests/cgroup: test cgroup.kill
  tests/cgroup: move cg_wait_for(), cg_prepare_for_wait()
  tests/cgroup: use cgroup.kill in cg_killall()
  docs/cgroup: add entry for cgroup.kill
  cgroup: introduce cgroup.kill
This commit is contained in:
Linus Torvalds 2021-07-01 17:22:14 -07:00
commit 3dbdb38e28
13 changed files with 569 additions and 108 deletions

View file

@ -209,6 +209,22 @@ struct cgroup_namespace init_cgroup_ns = {
static struct file_system_type cgroup2_fs_type;
static struct cftype cgroup_base_files[];
/* cgroup optional features */
enum cgroup_opt_features {
#ifdef CONFIG_PSI
OPT_FEATURE_PRESSURE,
#endif
OPT_FEATURE_COUNT
};
static const char *cgroup_opt_feature_names[OPT_FEATURE_COUNT] = {
#ifdef CONFIG_PSI
"pressure",
#endif
};
static u16 cgroup_feature_disable_mask __read_mostly;
static int cgroup_apply_control(struct cgroup *cgrp);
static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
static void css_task_iter_skip(struct css_task_iter *it,
@ -2390,7 +2406,7 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
}
/**
* cgroup_taskset_migrate - migrate a taskset
* cgroup_migrate_execute - migrate a taskset
* @mgctx: migration context
*
* Migrate tasks in @mgctx as setup by migration preparation functions.
@ -3632,6 +3648,18 @@ static void cgroup_pressure_release(struct kernfs_open_file *of)
{
psi_trigger_replace(&of->priv, NULL);
}
bool cgroup_psi_enabled(void)
{
return (cgroup_feature_disable_mask & (1 << OPT_FEATURE_PRESSURE)) == 0;
}
#else /* CONFIG_PSI */
bool cgroup_psi_enabled(void)
{
return false;
}
#endif /* CONFIG_PSI */
static int cgroup_freeze_show(struct seq_file *seq, void *v)
@ -3668,6 +3696,80 @@ static ssize_t cgroup_freeze_write(struct kernfs_open_file *of,
return nbytes;
}
static void __cgroup_kill(struct cgroup *cgrp)
{
struct css_task_iter it;
struct task_struct *task;
lockdep_assert_held(&cgroup_mutex);
spin_lock_irq(&css_set_lock);
set_bit(CGRP_KILL, &cgrp->flags);
spin_unlock_irq(&css_set_lock);
css_task_iter_start(&cgrp->self, CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED, &it);
while ((task = css_task_iter_next(&it))) {
/* Ignore kernel threads here. */
if (task->flags & PF_KTHREAD)
continue;
/* Skip tasks that are already dying. */
if (__fatal_signal_pending(task))
continue;
send_sig(SIGKILL, task, 0);
}
css_task_iter_end(&it);
spin_lock_irq(&css_set_lock);
clear_bit(CGRP_KILL, &cgrp->flags);
spin_unlock_irq(&css_set_lock);
}
static void cgroup_kill(struct cgroup *cgrp)
{
struct cgroup_subsys_state *css;
struct cgroup *dsct;
lockdep_assert_held(&cgroup_mutex);
cgroup_for_each_live_descendant_pre(dsct, css, cgrp)
__cgroup_kill(dsct);
}
static ssize_t cgroup_kill_write(struct kernfs_open_file *of, char *buf,
size_t nbytes, loff_t off)
{
ssize_t ret = 0;
int kill;
struct cgroup *cgrp;
ret = kstrtoint(strstrip(buf), 0, &kill);
if (ret)
return ret;
if (kill != 1)
return -ERANGE;
cgrp = cgroup_kn_lock_live(of->kn, false);
if (!cgrp)
return -ENOENT;
/*
* Killing is a process directed operation, i.e. the whole thread-group
* is taken down so act like we do for cgroup.procs and only make this
* writable in non-threaded cgroups.
*/
if (cgroup_is_threaded(cgrp))
ret = -EOPNOTSUPP;
else
cgroup_kill(cgrp);
cgroup_kn_unlock(of->kn);
return ret ?: nbytes;
}
static int cgroup_file_open(struct kernfs_open_file *of)
{
struct cftype *cft = of_cft(of);
@ -3882,6 +3984,8 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css,
restart:
for (cft = cfts; cft != cft_end && cft->name[0] != '\0'; cft++) {
/* does cft->flags tell us to skip this file on @cgrp? */
if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled())
continue;
if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
continue;
if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp))
@ -3959,6 +4063,9 @@ static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
WARN_ON(cft->ss || cft->kf_ops);
if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled())
continue;
if (cft->seq_start)
kf_ops = &cgroup_kf_ops;
else
@ -4860,6 +4967,11 @@ static struct cftype cgroup_base_files[] = {
.seq_show = cgroup_freeze_show,
.write = cgroup_freeze_write,
},
{
.name = "cgroup.kill",
.flags = CFTYPE_NOT_ON_ROOT,
.write = cgroup_kill_write,
},
{
.name = "cpu.stat",
.seq_show = cpu_stat_show,
@ -4867,6 +4979,7 @@ static struct cftype cgroup_base_files[] = {
#ifdef CONFIG_PSI
{
.name = "io.pressure",
.flags = CFTYPE_PRESSURE,
.seq_show = cgroup_io_pressure_show,
.write = cgroup_io_pressure_write,
.poll = cgroup_pressure_poll,
@ -4874,6 +4987,7 @@ static struct cftype cgroup_base_files[] = {
},
{
.name = "memory.pressure",
.flags = CFTYPE_PRESSURE,
.seq_show = cgroup_memory_pressure_show,
.write = cgroup_memory_pressure_write,
.poll = cgroup_pressure_poll,
@ -4881,6 +4995,7 @@ static struct cftype cgroup_base_files[] = {
},
{
.name = "cpu.pressure",
.flags = CFTYPE_PRESSURE,
.seq_show = cgroup_cpu_pressure_show,
.write = cgroup_cpu_pressure_write,
.poll = cgroup_pressure_poll,
@ -6080,6 +6195,8 @@ void cgroup_post_fork(struct task_struct *child,
struct kernel_clone_args *kargs)
__releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
{
unsigned long cgrp_flags = 0;
bool kill = false;
struct cgroup_subsys *ss;
struct css_set *cset;
int i;
@ -6091,6 +6208,11 @@ void cgroup_post_fork(struct task_struct *child,
/* init tasks are special, only link regular threads */
if (likely(child->pid)) {
if (kargs->cgrp)
cgrp_flags = kargs->cgrp->flags;
else
cgrp_flags = cset->dfl_cgrp->flags;
WARN_ON_ONCE(!list_empty(&child->cg_list));
cset->nr_tasks++;
css_set_move_task(child, NULL, cset, false);
@ -6099,23 +6221,32 @@ void cgroup_post_fork(struct task_struct *child,
cset = NULL;
}
/*
* If the cgroup has to be frozen, the new task has too. Let's set
* the JOBCTL_TRAP_FREEZE jobctl bit to get the task into the
* frozen state.
*/
if (unlikely(cgroup_task_freeze(child))) {
spin_lock(&child->sighand->siglock);
WARN_ON_ONCE(child->frozen);
child->jobctl |= JOBCTL_TRAP_FREEZE;
spin_unlock(&child->sighand->siglock);
if (!(child->flags & PF_KTHREAD)) {
if (unlikely(test_bit(CGRP_FREEZE, &cgrp_flags))) {
/*
* If the cgroup has to be frozen, the new task has
* too. Let's set the JOBCTL_TRAP_FREEZE jobctl bit to
* get the task into the frozen state.
*/
spin_lock(&child->sighand->siglock);
WARN_ON_ONCE(child->frozen);
child->jobctl |= JOBCTL_TRAP_FREEZE;
spin_unlock(&child->sighand->siglock);
/*
* Calling cgroup_update_frozen() isn't required here,
* because it will be called anyway a bit later from
* do_freezer_trap(). So we avoid cgroup's transient
* switch from the frozen state and back.
*/
}
/*
* Calling cgroup_update_frozen() isn't required here,
* because it will be called anyway a bit later from
* do_freezer_trap(). So we avoid cgroup's transient switch
* from the frozen state and back.
* If the cgroup is to be killed notice it now and take the
* child down right after we finished preparing it for
* userspace.
*/
kill = test_bit(CGRP_KILL, &cgrp_flags);
}
spin_unlock_irq(&css_set_lock);
@ -6138,6 +6269,10 @@ void cgroup_post_fork(struct task_struct *child,
put_css_set(rcset);
}
/* Cgroup has to be killed so take down child immediately. */
if (unlikely(kill))
do_send_sig_info(SIGKILL, SEND_SIG_NOINFO, child, PIDTYPE_TGID);
cgroup_css_set_put_fork(kargs);
}
@ -6163,7 +6298,8 @@ void cgroup_exit(struct task_struct *tsk)
cset->nr_tasks--;
WARN_ON_ONCE(cgroup_task_frozen(tsk));
if (unlikely(cgroup_task_freeze(tsk)))
if (unlikely(!(tsk->flags & PF_KTHREAD) &&
test_bit(CGRP_FREEZE, &task_dfl_cgroup(tsk)->flags)))
cgroup_update_frozen(task_dfl_cgroup(tsk));
spin_unlock_irq(&css_set_lock);
@ -6214,6 +6350,15 @@ static int __init cgroup_disable(char *str)
pr_info("Disabling %s control group subsystem\n",
ss->name);
}
for (i = 0; i < OPT_FEATURE_COUNT; i++) {
if (strcmp(token, cgroup_opt_feature_names[i]))
continue;
cgroup_feature_disable_mask |= 1 << i;
pr_info("Disabling %s control group feature\n",
cgroup_opt_feature_names[i]);
break;
}
}
return 1;
}
@ -6512,6 +6657,9 @@ static ssize_t show_delegatable_files(struct cftype *files, char *buf,
if (!(cft->flags & CFTYPE_NS_DELEGATABLE))
continue;
if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled())
continue;
if (prefix)
ret += snprintf(buf + ret, size - ret, "%s.", prefix);

View file

@ -220,7 +220,7 @@ void cgroup_rstat_flush_irqsafe(struct cgroup *cgrp)
}
/**
* cgroup_rstat_flush_begin - flush stats in @cgrp's subtree and hold
* cgroup_rstat_flush_hold - flush stats in @cgrp's subtree and hold
* @cgrp: target cgroup
*
* Flush stats in @cgrp's subtree and prevent further flushes. Must be