Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mingo/linux-2.6-sched-devel

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mingo/linux-2.6-sched-devel: (62 commits) sched: build fix sched: better rt-group documentation sched: features fix sched: /debug/sched_features sched: add SCHED_FEAT_DEADLINE sched: debug: show a weight tree sched: fair: weight calculations sched: fair-group: de-couple load-balancing from the rb-trees sched: fair-group scheduling vs latency sched: rt-group: optimize dequeue_rt_stack sched: debug: add some debug code to handle the full hierarchy sched: fair-group: SMP-nice for group scheduling sched, cpuset: customize sched domains, core sched, cpuset: customize sched domains, docs sched: prepatory code movement sched: rt: multi level group constraints sched: task_group hierarchy sched: fix the task_group hierarchy for UID grouping sched: allow the group scheduler to have multiple levels sched: mix tasks and groups ...
2008-04-21 15:40:24 -07:00 · 2008-04-21 15:40:24 -07:00 · ec965350bb
commit ec965350bb
parent 5f033bb9bc 486fdae214
68 changed files with 3174 additions and 1014 deletions
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@ -108,6 +108,7 @@ extern int __bitmap_weight(const unsigned long *bitmap, int bits);

 extern int bitmap_scnprintf(char *buf, unsigned int len,
 			const unsigned long *src, int nbits);
+extern int bitmap_scnprintf_len(unsigned int len);
 extern int __bitmap_parse(const char *buf, unsigned int buflen, int is_user,
 			unsigned long *dst, int nbits);
 extern int bitmap_parse_user(const char __user *ubuf, unsigned int ulen,
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@ -222,8 +222,13 @@ int __next_cpu(int n, const cpumask_t *srcp);
 #define next_cpu(n, src)	({ (void)(src); 1; })
 #endif

+#ifdef CONFIG_HAVE_CPUMASK_OF_CPU_MAP
+extern cpumask_t *cpumask_of_cpu_map;
+#define cpumask_of_cpu(cpu)    (cpumask_of_cpu_map[cpu])
+
+#else
 #define cpumask_of_cpu(cpu)						\
-({									\
+(*({									\
 	typeof(_unused_cpumask_arg_) m;					\
 	if (sizeof(m) == sizeof(unsigned long)) {			\
 		m.bits[0] = 1UL<<(cpu);					\
@ -231,8 +236,9 @@ int __next_cpu(int n, const cpumask_t *srcp);
 		cpus_clear(m);						\
 		cpu_set((cpu), m);					\
 	}								\
-	m;								\
-})
+	&m;								\
+}))
+#endif

 #define CPU_MASK_LAST_WORD BITMAP_LAST_WORD_MASK(NR_CPUS)

@ -243,6 +249,8 @@ int __next_cpu(int n, const cpumask_t *srcp);
 	[BITS_TO_LONGS(NR_CPUS)-1] = CPU_MASK_LAST_WORD			\
 } }

+#define CPU_MASK_ALL_PTR	(&CPU_MASK_ALL)
+
 #else

 #define CPU_MASK_ALL							\
@ -251,6 +259,10 @@ int __next_cpu(int n, const cpumask_t *srcp);
 	[BITS_TO_LONGS(NR_CPUS)-1] = CPU_MASK_LAST_WORD			\
 } }

+/* cpu_mask_all is in init/main.c */
+extern cpumask_t cpu_mask_all;
+#define CPU_MASK_ALL_PTR	(&cpu_mask_all)
+
 #endif

 #define CPU_MASK_NONE							\
@ -273,6 +285,13 @@ static inline int __cpumask_scnprintf(char *buf, int len,
 	return bitmap_scnprintf(buf, len, srcp->bits, nbits);
 }

+#define cpumask_scnprintf_len(len) \
+			__cpumask_scnprintf_len((len))
+static inline int __cpumask_scnprintf_len(int len)
+{
+	return bitmap_scnprintf_len(len);
+}
+
 #define cpumask_parse_user(ubuf, ulen, dst) \
 			__cpumask_parse_user((ubuf), (ulen), &(dst), NR_CPUS)
 static inline int __cpumask_parse_user(const char __user *buf, int len,
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@ -20,8 +20,8 @@ extern int number_of_cpusets;	/* How many cpusets are defined in system? */
 extern int cpuset_init_early(void);
 extern int cpuset_init(void);
 extern void cpuset_init_smp(void);
-extern cpumask_t cpuset_cpus_allowed(struct task_struct *p);
-extern cpumask_t cpuset_cpus_allowed_locked(struct task_struct *p);
+extern void cpuset_cpus_allowed(struct task_struct *p, cpumask_t *mask);
+extern void cpuset_cpus_allowed_locked(struct task_struct *p, cpumask_t *mask);
 extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
 #define cpuset_current_mems_allowed (current->mems_allowed)
 void cpuset_init_current_mems_allowed(void);
@ -84,13 +84,14 @@ static inline int cpuset_init_early(void) { return 0; }
 static inline int cpuset_init(void) { return 0; }
 static inline void cpuset_init_smp(void) {}

-static inline cpumask_t cpuset_cpus_allowed(struct task_struct *p)
+static inline void cpuset_cpus_allowed(struct task_struct *p, cpumask_t *mask)
 {
-	return cpu_possible_map;
+	*mask = cpu_possible_map;
 }
-static inline cpumask_t cpuset_cpus_allowed_locked(struct task_struct *p)
+static inline void cpuset_cpus_allowed_locked(struct task_struct *p,
+								cpumask_t *mask)
 {
-	return cpu_possible_map;
+	*mask = cpu_possible_map;
 }

 static inline nodemask_t cpuset_mems_allowed(struct task_struct *p)
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@ -151,6 +151,9 @@ extern struct group_info init_groups;
 	.cpus_allowed	= CPU_MASK_ALL,					\
 	.mm		= NULL,						\
 	.active_mm	= &init_mm,					\
+	.se		= {						\
+		.group_node 	= LIST_HEAD_INIT(tsk.se.group_node),	\
+	},								\
 	.rt		= {						\
 		.run_list	= LIST_HEAD_INIT(tsk.rt.run_list),	\
 		.time_slice	= HZ, 					\
--- a/include/linux/ktime.h
+++ b/include/linux/ktime.h
@ -327,4 +327,10 @@ extern void ktime_get_ts(struct timespec *ts);
 /* Get the real (wall-) time in timespec format: */
 #define ktime_get_real_ts(ts)	getnstimeofday(ts)

+static inline ktime_t ns_to_ktime(u64 ns)
+{
+	static const ktime_t ktime_zero = { .tv64 = 0 };
+	return ktime_add_ns(ktime_zero, ns);
+}
+
 #endif
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@ -704,6 +704,7 @@ enum cpu_idle_type {
 #define SD_POWERSAVINGS_BALANCE	256	/* Balance for power savings */
 #define SD_SHARE_PKG_RESOURCES	512	/* Domain members share cpu pkg resources */
 #define SD_SERIALIZE		1024	/* Only a single load balancing instance */
+#define SD_WAKE_IDLE_FAR	2048	/* Gain latency sacrificing cache hit */

 #define BALANCE_FOR_MC_POWER	\
 	(sched_smt_power_savings ? SD_POWERSAVINGS_BALANCE : 0)
@ -733,12 +734,31 @@ struct sched_group {
 	u32 reciprocal_cpu_power;
 };

+enum sched_domain_level {
+	SD_LV_NONE = 0,
+	SD_LV_SIBLING,
+	SD_LV_MC,
+	SD_LV_CPU,
+	SD_LV_NODE,
+	SD_LV_ALLNODES,
+	SD_LV_MAX
+};
+
+struct sched_domain_attr {
+	int relax_domain_level;
+};
+
+#define SD_ATTR_INIT	(struct sched_domain_attr) {	\
+	.relax_domain_level = -1,			\
+}
+
 struct sched_domain {
 	/* These fields must be setup */
 	struct sched_domain *parent;	/* top domain must be null terminated */
 	struct sched_domain *child;	/* bottom domain must be null terminated */
 	struct sched_group *groups;	/* the balancing groups of the domain */
 	cpumask_t span;			/* span of all CPUs in this domain */
+	int first_cpu;			/* cache of the first cpu in this domain */
 	unsigned long min_interval;	/* Minimum balance interval ms */
 	unsigned long max_interval;	/* Maximum balance interval ms */
 	unsigned int busy_factor;	/* less balancing by factor if busy */
@ -750,6 +770,7 @@ struct sched_domain {
 	unsigned int wake_idx;
 	unsigned int forkexec_idx;
 	int flags;			/* See SD_* */
+	enum sched_domain_level level;

 	/* Runtime fields. */
 	unsigned long last_balance;	/* init to jiffies. units in jiffies */
@ -789,7 +810,8 @@ struct sched_domain {
 #endif
 };

-extern void partition_sched_domains(int ndoms_new, cpumask_t *doms_new);
+extern void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
+				    struct sched_domain_attr *dattr_new);
 extern int arch_reinit_sched_domains(void);

 #endif	/* CONFIG_SMP */
@ -889,7 +911,8 @@ struct sched_class {
 	void (*set_curr_task) (struct rq *rq);
 	void (*task_tick) (struct rq *rq, struct task_struct *p, int queued);
 	void (*task_new) (struct rq *rq, struct task_struct *p);
-	void (*set_cpus_allowed)(struct task_struct *p, cpumask_t *newmask);
+	void (*set_cpus_allowed)(struct task_struct *p,
+				 const cpumask_t *newmask);

 	void (*join_domain)(struct rq *rq);
 	void (*leave_domain)(struct rq *rq);
@ -923,6 +946,7 @@ struct load_weight {
 struct sched_entity {
 	struct load_weight	load;		/* for load-balancing */
 	struct rb_node		run_node;
+	struct list_head	group_node;
 	unsigned int		on_rq;

 	u64			exec_start;
@ -982,6 +1006,7 @@ struct sched_rt_entity {
 	unsigned long timeout;
 	int nr_cpus_allowed;

+	struct sched_rt_entity *back;
 #ifdef CONFIG_RT_GROUP_SCHED
 	struct sched_rt_entity	*parent;
 	/* rq on which this entity is (to be) queued: */
@ -1502,15 +1527,21 @@ static inline void put_task_struct(struct task_struct *t)
 #define used_math() tsk_used_math(current)

 #ifdef CONFIG_SMP
-extern int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask);
+extern int set_cpus_allowed_ptr(struct task_struct *p,
+				const cpumask_t *new_mask);
 #else
-static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
+static inline int set_cpus_allowed_ptr(struct task_struct *p,
+				       const cpumask_t *new_mask)
 {
-	if (!cpu_isset(0, new_mask))
+	if (!cpu_isset(0, *new_mask))
 		return -EINVAL;
 	return 0;
 }
 #endif
+static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
+{
+	return set_cpus_allowed_ptr(p, &new_mask);
+}

 extern unsigned long long sched_clock(void);

@ -1551,7 +1582,6 @@ static inline void wake_up_idle_cpu(int cpu) { }
 extern unsigned int sysctl_sched_latency;
 extern unsigned int sysctl_sched_min_granularity;
 extern unsigned int sysctl_sched_wakeup_granularity;
-extern unsigned int sysctl_sched_batch_wakeup_granularity;
 extern unsigned int sysctl_sched_child_runs_first;
 extern unsigned int sysctl_sched_features;
 extern unsigned int sysctl_sched_migration_cost;
@ -1564,6 +1594,10 @@ int sched_nr_latency_handler(struct ctl_table *table, int write,
 extern unsigned int sysctl_sched_rt_period;
 extern int sysctl_sched_rt_runtime;

+int sched_rt_handler(struct ctl_table *table, int write,
+		struct file *filp, void __user *buffer, size_t *lenp,
+		loff_t *ppos);
+
 extern unsigned int sysctl_sched_compat_yield;

 #ifdef CONFIG_RT_MUTEXES
@ -2031,7 +2065,7 @@ static inline void arch_pick_mmap_layout(struct mm_struct *mm)
 }
 #endif

-extern long sched_setaffinity(pid_t pid, cpumask_t new_mask);
+extern long sched_setaffinity(pid_t pid, const cpumask_t *new_mask);
 extern long sched_getaffinity(pid_t pid, cpumask_t *mask);

 extern int sched_mc_power_savings, sched_smt_power_savings;
@ -2041,8 +2075,11 @@ extern void normalize_rt_tasks(void);
 #ifdef CONFIG_GROUP_SCHED

 extern struct task_group init_task_group;
+#ifdef CONFIG_USER_SCHED
+extern struct task_group root_task_group;
+#endif

-extern struct task_group *sched_create_group(void);
+extern struct task_group *sched_create_group(struct task_group *parent);
 extern void sched_destroy_group(struct task_group *tg);
 extern void sched_move_task(struct task_struct *tsk);
 #ifdef CONFIG_FAIR_GROUP_SCHED
@ -2053,6 +2090,9 @@ extern unsigned long sched_group_shares(struct task_group *tg);
 extern int sched_group_set_rt_runtime(struct task_group *tg,
 				      long rt_runtime_us);
 extern long sched_group_rt_runtime(struct task_group *tg);
+extern int sched_group_set_rt_period(struct task_group *tg,
+				      long rt_period_us);
+extern long sched_group_rt_period(struct task_group *tg);
 #endif
 #endif

--- a/include/linux/sysdev.h
+++ b/include/linux/sysdev.h
@ -45,12 +45,16 @@ struct sysdev_class_attribute {
 	ssize_t (*store)(struct sysdev_class *, const char *, size_t);
 };

-#define SYSDEV_CLASS_ATTR(_name,_mode,_show,_store) 		\
-struct sysdev_class_attribute attr_##_name = { 			\
+#define _SYSDEV_CLASS_ATTR(_name,_mode,_show,_store) 		\
+{					 			\
 	.attr = {.name = __stringify(_name), .mode = _mode },	\
 	.show	= _show,					\
 	.store	= _store,					\
-};
+}
+
+#define SYSDEV_CLASS_ATTR(_name,_mode,_show,_store) 		\
+	struct sysdev_class_attribute attr_##_name = 		\
+		_SYSDEV_CLASS_ATTR(_name,_mode,_show,_store)


 extern int sysdev_class_register(struct sysdev_class *);
@ -100,15 +104,16 @@ struct sysdev_attribute {
 };


-#define _SYSDEV_ATTR(_name,_mode,_show,_store)			\
+#define _SYSDEV_ATTR(_name, _mode, _show, _store)		\
 {								\
 	.attr = { .name = __stringify(_name), .mode = _mode },	\
 	.show	= _show,					\
 	.store	= _store,					\
 }

-#define SYSDEV_ATTR(_name,_mode,_show,_store)		\
-struct sysdev_attribute attr_##_name = _SYSDEV_ATTR(_name,_mode,_show,_store);
+#define SYSDEV_ATTR(_name, _mode, _show, _store)		\
+	struct sysdev_attribute attr_##_name =			\
+		_SYSDEV_ATTR(_name, _mode, _show, _store);

 extern int sysdev_create_file(struct sys_device *, struct sysdev_attribute *);
 extern void sysdev_remove_file(struct sys_device *, struct sysdev_attribute *);
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@ -38,16 +38,15 @@
 #endif

 #ifndef nr_cpus_node
-#define nr_cpus_node(node)							\
-	({									\
-		cpumask_t __tmp__;						\
-		__tmp__ = node_to_cpumask(node);				\
-		cpus_weight(__tmp__);						\
+#define nr_cpus_node(node)				\
+	({						\
+		node_to_cpumask_ptr(__tmp__, node);	\
+		cpus_weight(*__tmp__);			\
 	})
 #endif

-#define for_each_node_with_cpus(node)						\
-	for_each_online_node(node)						\
+#define for_each_node_with_cpus(node)			\
+	for_each_online_node(node)			\
 		if (nr_cpus_node(node))

 void arch_update_cpu_topology(void);
@ -80,7 +79,9 @@ void arch_update_cpu_topology(void);
 * by defining their own arch-specific initializer in include/asm/topology.h.
 * A definition there will automagically override these default initializers
 * and allow arch-specific performance tuning of sched_domains.
+ * (Only non-zero and non-null fields need be specified.)
 */
+
 #ifdef CONFIG_SCHED_SMT
 /* MCD - Do we really need this?  It is always on if CONFIG_SCHED_SMT is,
 * so can't we drop this in favor of CONFIG_SCHED_SMT?
@ -89,20 +90,10 @@ void arch_update_cpu_topology(void);
 /* Common values for SMT siblings */
 #ifndef SD_SIBLING_INIT
 #define SD_SIBLING_INIT (struct sched_domain) {		\
-	.span			= CPU_MASK_NONE,	\
-	.parent			= NULL,			\
-	.child			= NULL,			\
-	.groups			= NULL,			\
 	.min_interval		= 1,			\
 	.max_interval		= 2,			\
 	.busy_factor		= 64,			\
 	.imbalance_pct		= 110,			\
-	.cache_nice_tries	= 0,			\
-	.busy_idx		= 0,			\
-	.idle_idx		= 0,			\
-	.newidle_idx		= 0,			\
-	.wake_idx		= 0,			\
-	.forkexec_idx		= 0,			\
 	.flags			= SD_LOAD_BALANCE	\
 				| SD_BALANCE_NEWIDLE	\
 				| SD_BALANCE_FORK	\
@ -112,7 +103,6 @@ void arch_update_cpu_topology(void);
 				| SD_SHARE_CPUPOWER,	\
 	.last_balance		= jiffies,		\
 	.balance_interval	= 1,			\
-	.nr_balance_failed	= 0,			\
 }
 #endif
 #endif /* CONFIG_SCHED_SMT */
@ -121,18 +111,12 @@ void arch_update_cpu_topology(void);
 /* Common values for MC siblings. for now mostly derived from SD_CPU_INIT */
 #ifndef SD_MC_INIT
 #define SD_MC_INIT (struct sched_domain) {		\
-	.span			= CPU_MASK_NONE,	\
-	.parent			= NULL,			\
-	.child			= NULL,			\
-	.groups			= NULL,			\
 	.min_interval		= 1,			\
 	.max_interval		= 4,			\
 	.busy_factor		= 64,			\
 	.imbalance_pct		= 125,			\
 	.cache_nice_tries	= 1,			\
 	.busy_idx		= 2,			\
-	.idle_idx		= 0,			\
-	.newidle_idx		= 0,			\
 	.wake_idx		= 1,			\
 	.forkexec_idx		= 1,			\
 	.flags			= SD_LOAD_BALANCE	\
@ -144,7 +128,6 @@ void arch_update_cpu_topology(void);
 				| BALANCE_FOR_MC_POWER,	\
 	.last_balance		= jiffies,		\
 	.balance_interval	= 1,			\
-	.nr_balance_failed	= 0,			\
 }
 #endif
 #endif /* CONFIG_SCHED_MC */
@ -152,10 +135,6 @@ void arch_update_cpu_topology(void);
 /* Common values for CPUs */
 #ifndef SD_CPU_INIT
 #define SD_CPU_INIT (struct sched_domain) {		\
-	.span			= CPU_MASK_NONE,	\
-	.parent			= NULL,			\
-	.child			= NULL,			\
-	.groups			= NULL,			\
 	.min_interval		= 1,			\
 	.max_interval		= 4,			\
 	.busy_factor		= 64,			\
@ -174,16 +153,11 @@ void arch_update_cpu_topology(void);
 				| BALANCE_FOR_PKG_POWER,\
 	.last_balance		= jiffies,		\
 	.balance_interval	= 1,			\
-	.nr_balance_failed	= 0,			\
 }
 #endif

 /* sched_domains SD_ALLNODES_INIT for NUMA machines */
 #define SD_ALLNODES_INIT (struct sched_domain) {	\
-	.span			= CPU_MASK_NONE,	\
-	.parent			= NULL,			\
-	.child			= NULL,			\
-	.groups			= NULL,			\
 	.min_interval		= 64,			\
 	.max_interval		= 64*num_online_cpus(),	\
 	.busy_factor		= 128,			\
@ -191,14 +165,10 @@ void arch_update_cpu_topology(void);
 	.cache_nice_tries	= 1,			\
 	.busy_idx		= 3,			\
 	.idle_idx		= 3,			\
-	.newidle_idx		= 0, /* unused */	\
-	.wake_idx		= 0, /* unused */	\
-	.forkexec_idx		= 0, /* unused */	\
 	.flags			= SD_LOAD_BALANCE	\
 				| SD_SERIALIZE,	\
 	.last_balance		= jiffies,		\
 	.balance_interval	= 64,			\
-	.nr_balance_failed	= 0,			\
 }

 #ifdef CONFIG_NUMA