Merge branch 'linus' into irq/threaded

Conflicts:
	include/linux/irq.h
	kernel/irq/handle.c

commit 9efe21cb82
Ingo Molnar, 2009-04-06 01:41:22 +02:00
7424 changed files with 841787 additions and 291611 deletions


@@ -93,6 +93,7 @@ obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o
obj-$(CONFIG_FUNCTION_TRACER) += trace/
obj-$(CONFIG_TRACING) += trace/
obj-$(CONFIG_SMP) += sched_cpupri.o
obj-$(CONFIG_SLOW_WORK) += slow-work.o
ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is


@@ -49,6 +49,7 @@ asynchronous and synchronous parts of the kernel.
*/
#include <linux/async.h>
#include <linux/bug.h>
#include <linux/module.h>
#include <linux/wait.h>
#include <linux/sched.h>
@@ -387,20 +388,11 @@ static int async_manager_thread(void *unused)
static int __init async_init(void)
{
if (async_enabled)
if (IS_ERR(kthread_run(async_manager_thread, NULL,
"async/mgr")))
async_enabled = 0;
async_enabled =
!IS_ERR(kthread_run(async_manager_thread, NULL, "async/mgr"));
WARN_ON(!async_enabled);
return 0;
}
static int __init setup_async(char *str)
{
async_enabled = 1;
return 1;
}
__setup("fastboot", setup_async);
core_initcall(async_init);


@@ -766,6 +766,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
audit_log_format(ab, " msg=");
size = nlmsg_len(nlh);
if (size > 0 &&
((unsigned char *)data)[size - 1] == '\0')
size--;
audit_log_n_untrustedstring(ab, data, size);
}
audit_set_pid(ab, pid);
@@ -1382,7 +1385,7 @@ void audit_log_n_string(struct audit_buffer *ab, const char *string,
int audit_string_contains_control(const char *string, size_t len)
{
const unsigned char *p;
for (p = string; p < (const unsigned char *)string + len && *p; p++) {
for (p = string; p < (const unsigned char *)string + len; p++) {
if (*p == '"' || *p < 0x21 || *p > 0x7e)
return 1;
}
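
Dropping the "*p" termination test widens the scan to all len bytes, so an
embedded NUL (0x00 < 0x21) is now reported as a control character instead of
silently ending the loop early. A minimal userspace sketch of the new check
(a standalone analogue, not the kernel function itself):

#include <stdio.h>

/* every byte in [0, len) is examined; '\0' counts as a control char */
static int string_contains_control(const char *string, size_t len)
{
	const unsigned char *p;

	for (p = (const unsigned char *)string;
	     p < (const unsigned char *)string + len; p++) {
		if (*p == '"' || *p < 0x21 || *p > 0x7e)
			return 1;
	}
	return 0;
}

int main(void)
{
	const char buf[] = { 'a', 'b', '\0', 'c' };

	/* the old loop stopped at the NUL; the new one reports it */
	printf("%d\n", string_contains_control(buf, sizeof(buf)));
	return 0;
}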
@@ -1437,13 +1440,13 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
/* We will allow 11 spaces for ' (deleted)' to be appended */
pathname = kmalloc(PATH_MAX+11, ab->gfp_mask);
if (!pathname) {
audit_log_format(ab, "<no memory>");
audit_log_string(ab, "<no_memory>");
return;
}
p = d_path(path, pathname, PATH_MAX+11);
if (IS_ERR(p)) { /* Should never happen since we send PATH_MAX */
/* FIXME: can we save some information here? */
audit_log_format(ab, "<too long>");
audit_log_string(ab, "<too_long>");
} else
audit_log_untrustedstring(ab, p);
kfree(pathname);


@@ -385,6 +385,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
mutex_lock(&inode->inotify_mutex);
if (inotify_clone_watch(&old->watch, &chunk->watch) < 0) {
mutex_unlock(&inode->inotify_mutex);
put_inotify_watch(&old->watch);
free_chunk(chunk);
return -ENOSPC;
}
@@ -394,6 +395,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
chunk->dead = 1;
inotify_evict_watch(&chunk->watch);
mutex_unlock(&inode->inotify_mutex);
put_inotify_watch(&old->watch);
put_inotify_watch(&chunk->watch);
return 0;
}


@@ -135,18 +135,18 @@ static void audit_remove_watch(struct audit_watch *watch)
static inline void audit_free_rule(struct audit_entry *e)
{
int i;
struct audit_krule *erule = &e->rule;
/* some rules don't have associated watches */
if (e->rule.watch)
audit_put_watch(e->rule.watch);
if (e->rule.fields)
for (i = 0; i < e->rule.field_count; i++) {
struct audit_field *f = &e->rule.fields[i];
if (erule->watch)
audit_put_watch(erule->watch);
if (erule->fields)
for (i = 0; i < erule->field_count; i++) {
struct audit_field *f = &erule->fields[i];
kfree(f->lsm_str);
security_audit_rule_free(f->lsm_rule);
}
kfree(e->rule.fields);
kfree(e->rule.filterkey);
kfree(erule->fields);
kfree(erule->filterkey);
kfree(e);
}


@@ -66,6 +66,7 @@
#include <linux/syscalls.h>
#include <linux/inotify.h>
#include <linux/capability.h>
#include <linux/fs_struct.h>
#include "audit.h"
@@ -328,6 +329,14 @@ static int audit_match_filetype(struct audit_context *ctx, int which)
*/
#ifdef CONFIG_AUDIT_TREE
static void audit_set_auditable(struct audit_context *ctx)
{
if (!ctx->prio) {
ctx->prio = 1;
ctx->current_state = AUDIT_RECORD_CONTEXT;
}
}
static int put_tree_ref(struct audit_context *ctx, struct audit_chunk *chunk)
{
struct audit_tree_refs *p = ctx->trees;
@@ -741,17 +750,9 @@ void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx)
rcu_read_unlock();
}
static void audit_set_auditable(struct audit_context *ctx)
{
if (!ctx->prio) {
ctx->prio = 1;
ctx->current_state = AUDIT_RECORD_CONTEXT;
}
}
static inline struct audit_context *audit_get_context(struct task_struct *tsk,
int return_valid,
int return_code)
long return_code)
{
struct audit_context *context = tsk->audit_context;
@@ -1023,7 +1024,7 @@ static int audit_log_single_execve_arg(struct audit_context *context,
{
char arg_num_len_buf[12];
const char __user *tmp_p = p;
/* how many digits are in arg_num? 3 is the length of a=\n */
/* how many digits are in arg_num? 3 is the length of " a=" */
size_t arg_num_len = snprintf(arg_num_len_buf, 12, "%d", arg_num) + 3;
size_t len, len_left, to_send;
size_t max_execve_audit_len = MAX_EXECVE_AUDIT_LEN;
@@ -1109,7 +1110,7 @@ static int audit_log_single_execve_arg(struct audit_context *context,
* so we can be sure nothing was lost.
*/
if ((i == 0) && (too_long))
audit_log_format(*ab, "a%d_len=%zu ", arg_num,
audit_log_format(*ab, " a%d_len=%zu", arg_num,
has_cntl ? 2*len : len);
/*
@@ -1129,7 +1130,7 @@ static int audit_log_single_execve_arg(struct audit_context *context,
buf[to_send] = '\0';
/* actually log it */
audit_log_format(*ab, "a%d", arg_num);
audit_log_format(*ab, " a%d", arg_num);
if (too_long)
audit_log_format(*ab, "[%d]", i);
audit_log_format(*ab, "=");
@@ -1137,7 +1138,6 @@ static int audit_log_single_execve_arg(struct audit_context *context,
audit_log_n_hex(*ab, buf, to_send);
else
audit_log_format(*ab, "\"%s\"", buf);
audit_log_format(*ab, "\n");
p += to_send;
len_left -= to_send;
@@ -1165,7 +1165,7 @@ static void audit_log_execve_info(struct audit_context *context,
p = (const char __user *)axi->mm->arg_start;
audit_log_format(*ab, "argc=%d ", axi->argc);
audit_log_format(*ab, "argc=%d", axi->argc);
/*
* we need some kernel buffer to hold the userspace args. Just
@@ -1478,7 +1478,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
case 0:
/* name was specified as a relative path and the
* directory component is the cwd */
audit_log_d_path(ab, " name=", &context->pwd);
audit_log_d_path(ab, "name=", &context->pwd);
break;
default:
/* log the name's directory component */
@@ -2149,7 +2149,7 @@ int audit_set_loginuid(struct task_struct *task, uid_t loginuid)
* __audit_mq_open - record audit data for a POSIX MQ open
* @oflag: open flag
* @mode: mode bits
* @u_attr: queue attributes
* @attr: queue attributes
*
*/
void __audit_mq_open(int oflag, mode_t mode, struct mq_attr *attr)
@@ -2196,7 +2196,7 @@ void __audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio,
/**
* __audit_mq_notify - record audit data for a POSIX MQ notify
* @mqdes: MQ descriptor
* @u_notification: Notification event
* @notification: Notification event
*
*/


@@ -94,7 +94,6 @@ struct cgroupfs_root {
char release_agent_path[PATH_MAX];
};
/*
* The "rootnode" hierarchy is the "dummy hierarchy", reserved for the
* subsystems that are otherwise unattached - it never has more than a
@@ -102,6 +101,39 @@ struct cgroupfs_root {
*/
static struct cgroupfs_root rootnode;
/*
* CSS ID -- an ID per subsystem's Cgroup Subsys State (CSS). Used only when
* cgroup_subsys->use_id != 0.
*/
#define CSS_ID_MAX (65535)
struct css_id {
/*
* The css to which this ID points. This pointer is set to a valid value
* after the cgroup is populated. If the cgroup is removed, this will be
* NULL. This pointer is expected to be RCU-safe because destroy()
* is called after synchronize_rcu(). But for safe use, css_is_removed()
* or css_tryget() should be used to avoid races.
*/
struct cgroup_subsys_state *css;
/*
* ID of this css.
*/
unsigned short id;
/*
* Depth in hierarchy which this ID belongs to.
*/
unsigned short depth;
/*
* The ID is freed by RCU (and the lookup routine is RCU-safe).
*/
struct rcu_head rcu_head;
/*
* The hierarchy this CSS ID belongs to.
*/
unsigned short stack[0]; /* Array of Length (depth+1) */
};
/* The list of hierarchy roots */
static LIST_HEAD(roots);
@@ -185,6 +217,8 @@ struct cg_cgroup_link {
static struct css_set init_css_set;
static struct cg_cgroup_link init_css_set_link;
static int cgroup_subsys_init_idr(struct cgroup_subsys *ss);
/* css_set_lock protects the list of css_set objects, and the
* chain of tasks off each css_set. Nests outside task->alloc_lock
* due to cgroup_iter_start() */
@@ -567,6 +601,9 @@ static struct backing_dev_info cgroup_backing_dev_info = {
.capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
};
static int alloc_css_id(struct cgroup_subsys *ss,
struct cgroup *parent, struct cgroup *child);
static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
{
struct inode *inode = new_inode(sb);
@@ -585,13 +622,18 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
* Call subsys's pre_destroy handler.
* This is called before css refcnt check.
*/
static void cgroup_call_pre_destroy(struct cgroup *cgrp)
static int cgroup_call_pre_destroy(struct cgroup *cgrp)
{
struct cgroup_subsys *ss;
int ret = 0;
for_each_subsys(cgrp->root, ss)
if (ss->pre_destroy)
ss->pre_destroy(ss, cgrp);
return;
if (ss->pre_destroy) {
ret = ss->pre_destroy(ss, cgrp);
if (ret)
break;
}
return ret;
}
static void free_cgroup_rcu(struct rcu_head *obj)
@@ -685,6 +727,22 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
remove_dir(dentry);
}
/*
* A queue for waiters doing rmdir() on a cgroup. A task will sleep on it when
* cgroup->count == 0 && list_empty(&cgroup->children) && a subsys still holds
* a reference to css->refcnt. In general, this refcnt is expected to go down
* to zero soon.
*
* CGRP_WAIT_ON_RMDIR flag is modified under cgroup's inode->i_mutex;
*/
DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
static void cgroup_wakeup_rmdir_waiters(const struct cgroup *cgrp)
{
if (unlikely(test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
wake_up_all(&cgroup_rmdir_waitq);
}
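
For readers who don't know kernel waitqueues: the pattern above (set a flag,
sleep, wake on the last put) maps onto the condition-variable idiom below.
This is a userspace analogue with made-up names, not the cgroup code:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t rmdir_waitq = PTHREAD_COND_INITIALIZER;
static int refcnt = 1;
static bool wait_on_rmdir;	/* analogue of CGRP_WAIT_ON_RMDIR */

/* analogue of css_put(): the last reference wakes any rmdir() waiter */
static void put_ref(void)
{
	pthread_mutex_lock(&lock);
	if (--refcnt == 0 && wait_on_rmdir)
		pthread_cond_broadcast(&rmdir_waitq);	/* wake_up_all() */
	pthread_mutex_unlock(&lock);
}

/* analogue of the rmdir() side: set the flag, then sleep until unused */
static void wait_until_unused(void)
{
	pthread_mutex_lock(&lock);
	wait_on_rmdir = true;
	while (refcnt > 0)
		pthread_cond_wait(&rmdir_waitq, &lock);
	wait_on_rmdir = false;
	pthread_mutex_unlock(&lock);
}

static void *putter(void *arg)
{
	(void)arg;
	put_ref();
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, putter, NULL);
	wait_until_unused();	/* returns once refcnt reaches 0 */
	pthread_join(t, NULL);
	puts("cgroup unused, rmdir may proceed");
	return 0;
}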
static int rebind_subsystems(struct cgroupfs_root *root,
unsigned long final_bits)
{
@@ -857,16 +915,16 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
}
ret = rebind_subsystems(root, opts.subsys_bits);
if (ret)
goto out_unlock;
/* (re)populate subsystem files */
if (!ret)
cgroup_populate_dir(cgrp);
cgroup_populate_dir(cgrp);
if (opts.release_agent)
strcpy(root->release_agent_path, opts.release_agent);
out_unlock:
if (opts.release_agent)
kfree(opts.release_agent);
kfree(opts.release_agent);
mutex_unlock(&cgroup_mutex);
mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
return ret;
@@ -969,15 +1027,13 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
/* First find the desired set of subsystems */
ret = parse_cgroupfs_options(data, &opts);
if (ret) {
if (opts.release_agent)
kfree(opts.release_agent);
kfree(opts.release_agent);
return ret;
}
root = kzalloc(sizeof(*root), GFP_KERNEL);
if (!root) {
if (opts.release_agent)
kfree(opts.release_agent);
kfree(opts.release_agent);
return -ENOMEM;
}
@@ -1071,7 +1127,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
mutex_unlock(&cgroup_mutex);
}
return simple_set_mnt(mnt, sb);
simple_set_mnt(mnt, sb);
return 0;
free_cg_links:
free_cg_links(&tmp_cg_links);
@@ -1279,6 +1336,12 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
synchronize_rcu();
put_css_set(cg);
/*
* wake up rmdir() waiter. the rmdir should fail since the cgroup
* is no longer empty.
*/
cgroup_wakeup_rmdir_waiters(cgrp);
return 0;
}
@@ -1624,10 +1687,10 @@ static struct inode_operations cgroup_dir_inode_operations = {
.rename = cgroup_rename,
};
static int cgroup_create_file(struct dentry *dentry, int mode,
static int cgroup_create_file(struct dentry *dentry, mode_t mode,
struct super_block *sb)
{
static struct dentry_operations cgroup_dops = {
static const struct dentry_operations cgroup_dops = {
.d_iput = cgroup_diput,
};
@@ -1670,7 +1733,7 @@ static int cgroup_create_file(struct dentry *dentry, int mode,
* @mode: mode to set on new directory.
*/
static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
int mode)
mode_t mode)
{
struct dentry *parent;
int error = 0;
@@ -1688,6 +1751,33 @@ static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
return error;
}
/**
* cgroup_file_mode - deduce file mode of a control file
* @cft: the control file in question
*
* returns cft->mode if ->mode is not 0
* returns S_IRUGO|S_IWUSR if it has both a read and a write handler
* returns S_IRUGO if it has only a read handler
* returns S_IWUSR if it has only a write handler
*/
static mode_t cgroup_file_mode(const struct cftype *cft)
{
mode_t mode = 0;
if (cft->mode)
return cft->mode;
if (cft->read || cft->read_u64 || cft->read_s64 ||
cft->read_map || cft->read_seq_string)
mode |= S_IRUGO;
if (cft->write || cft->write_u64 || cft->write_s64 ||
cft->write_string || cft->trigger)
mode |= S_IWUSR;
return mode;
}
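
The deduction is self-contained enough to test in isolation. A sketch with a
pared-down stand-in for struct cftype (the has_read/has_write fields are
hypothetical, collapsing the kernel's several handler pointers into two bits):

#include <stdio.h>
#include <sys/stat.h>

#define S_IRUGO (S_IRUSR | S_IRGRP | S_IROTH)	/* kernel macro, 0444 */

/* pared-down stand-in for struct cftype */
struct cft_stub {
	mode_t mode;	/* explicit mode, 0 means "deduce" */
	int has_read;	/* any of the read handlers present */
	int has_write;	/* any of the write handlers present */
};

static mode_t file_mode(const struct cft_stub *cft)
{
	mode_t mode = 0;

	if (cft->mode)
		return cft->mode;
	if (cft->has_read)
		mode |= S_IRUGO;
	if (cft->has_write)
		mode |= S_IWUSR;
	return mode;
}

int main(void)
{
	struct cft_stub rw = { 0, 1, 1 }, ro = { 0, 1, 0 };

	printf("%o %o\n", file_mode(&rw), file_mode(&ro));	/* 644 444 */
	return 0;
}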
int cgroup_add_file(struct cgroup *cgrp,
struct cgroup_subsys *subsys,
const struct cftype *cft)
@@ -1695,6 +1785,7 @@ int cgroup_add_file(struct cgroup *cgrp,
struct dentry *dir = cgrp->dentry;
struct dentry *dentry;
int error;
mode_t mode;
char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
@@ -1705,7 +1796,8 @@ int cgroup_add_file(struct cgroup *cgrp,
BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex));
dentry = lookup_one_len(name, dir, strlen(name));
if (!IS_ERR(dentry)) {
error = cgroup_create_file(dentry, 0644 | S_IFREG,
mode = cgroup_file_mode(cft);
error = cgroup_create_file(dentry, mode | S_IFREG,
cgrp->root->sb);
if (!error)
dentry->d_fsdata = (void *)cft;
@@ -2287,6 +2379,7 @@ static struct cftype files[] = {
.write_u64 = cgroup_tasks_write,
.release = cgroup_tasks_release,
.private = FILE_TASKLIST,
.mode = S_IRUGO | S_IWUSR,
},
{
@@ -2326,6 +2419,17 @@ static int cgroup_populate_dir(struct cgroup *cgrp)
if (ss->populate && (err = ss->populate(ss, cgrp)) < 0)
return err;
}
/* This cgroup is ready now */
for_each_subsys(cgrp->root, ss) {
struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
/*
* Update id->css pointer and make this css visible from
* CSS ID functions. This pointer will be dereferenced
* from RCU-read-side without locks.
*/
if (css->id)
rcu_assign_pointer(css->id->css, css);
}
return 0;
}
@@ -2337,6 +2441,7 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
css->cgroup = cgrp;
atomic_set(&css->refcnt, 1);
css->flags = 0;
css->id = NULL;
if (cgrp == dummytop)
set_bit(CSS_ROOT, &css->flags);
BUG_ON(cgrp->subsys[ss->subsys_id]);
@@ -2375,7 +2480,7 @@ static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)
* Must be called with the mutex on the parent inode held
*/
static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
int mode)
mode_t mode)
{
struct cgroup *cgrp;
struct cgroupfs_root *root = parent->root;
@@ -2412,6 +2517,10 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
goto err_destroy;
}
init_cgroup_css(css, ss, cgrp);
if (ss->use_id)
if (alloc_css_id(ss, parent, cgrp))
goto err_destroy;
/* On error, the ->destroy() callback has to free the assigned ID. */
}
cgroup_lock_hierarchy(root);
@@ -2554,9 +2663,11 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
struct cgroup *cgrp = dentry->d_fsdata;
struct dentry *d;
struct cgroup *parent;
DEFINE_WAIT(wait);
int ret;
/* the vfs holds both inode->i_mutex already */
again:
mutex_lock(&cgroup_mutex);
if (atomic_read(&cgrp->count) != 0) {
mutex_unlock(&cgroup_mutex);
@@ -2572,17 +2683,39 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
* Call pre_destroy handlers of subsys. Notify subsystems
* that rmdir() request comes.
*/
cgroup_call_pre_destroy(cgrp);
ret = cgroup_call_pre_destroy(cgrp);
if (ret)
return ret;
mutex_lock(&cgroup_mutex);
parent = cgrp->parent;
if (atomic_read(&cgrp->count)
|| !list_empty(&cgrp->children)
|| !cgroup_clear_css_refs(cgrp)) {
if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
mutex_unlock(&cgroup_mutex);
return -EBUSY;
}
/*
* css_get/css_put are provided for subsystems to grab a refcnt on a css. In
* the typical case, a subsystem holds no reference after pre_destroy(). But
* under hierarchy management, some *temporary* refcnts can still be held.
* To avoid returning -EBUSY to the user, a waitqueue is used. If the subsys
* is really busy, it should return -EBUSY from pre_destroy(). wake_up is
* called when css_put() drops the refcnt to 0.
*/
set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
if (!cgroup_clear_css_refs(cgrp)) {
mutex_unlock(&cgroup_mutex);
schedule();
finish_wait(&cgroup_rmdir_waitq, &wait);
clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
if (signal_pending(current))
return -EINTR;
goto again;
}
/* No css_tryget() can succeed after this point. */
finish_wait(&cgroup_rmdir_waitq, &wait);
clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
spin_lock(&release_list_lock);
set_bit(CGRP_REMOVED, &cgrp->flags);
@@ -2707,6 +2840,8 @@ int __init cgroup_init(void)
struct cgroup_subsys *ss = subsys[i];
if (!ss->early_init)
cgroup_init_subsys(ss);
if (ss->use_id)
cgroup_subsys_init_idr(ss);
}
/* Add init_css_set to the hash table */
@@ -3083,18 +3218,19 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
}
/**
* cgroup_is_descendant - see if @cgrp is a descendant of current task's cgrp
* cgroup_is_descendant - see if @cgrp is a descendant of @task's cgrp
* @cgrp: the cgroup in question
* @task: the task in question
*
* See if @cgrp is a descendant of the current task's cgroup in
* the appropriate hierarchy.
* See if @cgrp is a descendant of @task's cgroup in the appropriate
* hierarchy.
*
* If we are sending in dummytop, then presumably we are creating
* the top cgroup in the subsystem.
*
* Called only by the ns (nsproxy) cgroup.
*/
int cgroup_is_descendant(const struct cgroup *cgrp)
int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task)
{
int ret;
struct cgroup *target;
@@ -3104,7 +3240,7 @@ int cgroup_is_descendant(const struct cgroup *cgrp)
return 1;
get_first_subsys(cgrp, NULL, &subsys_id);
target = task_cgroup(current, subsys_id);
target = task_cgroup(task, subsys_id);
while (cgrp != target && cgrp != cgrp->top_cgroup)
cgrp = cgrp->parent;
ret = (cgrp == target);
@@ -3137,10 +3273,12 @@ void __css_put(struct cgroup_subsys_state *css)
{
struct cgroup *cgrp = css->cgroup;
rcu_read_lock();
if ((atomic_dec_return(&css->refcnt) == 1) &&
notify_on_release(cgrp)) {
set_bit(CGRP_RELEASABLE, &cgrp->flags);
check_for_release(cgrp);
if (atomic_dec_return(&css->refcnt) == 1) {
if (notify_on_release(cgrp)) {
set_bit(CGRP_RELEASABLE, &cgrp->flags);
check_for_release(cgrp);
}
cgroup_wakeup_rmdir_waiters(cgrp);
}
rcu_read_unlock();
}
@@ -3240,3 +3378,232 @@ static int __init cgroup_disable(char *str)
return 1;
}
__setup("cgroup_disable=", cgroup_disable);
/*
* Functions for CSS ID.
*/
/*
* To get an ID other than 0, this should be called when !cgroup_is_removed().
*/
unsigned short css_id(struct cgroup_subsys_state *css)
{
struct css_id *cssid = rcu_dereference(css->id);
if (cssid)
return cssid->id;
return 0;
}
unsigned short css_depth(struct cgroup_subsys_state *css)
{
struct css_id *cssid = rcu_dereference(css->id);
if (cssid)
return cssid->depth;
return 0;
}
bool css_is_ancestor(struct cgroup_subsys_state *child,
const struct cgroup_subsys_state *root)
{
struct css_id *child_id = rcu_dereference(child->id);
struct css_id *root_id = rcu_dereference(root->id);
if (!child_id || !root_id || (child_id->depth < root_id->depth))
return false;
return child_id->stack[root_id->depth] == root_id->id;
}
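
The test works because each css_id records its whole ancestor path in
stack[] (stack[0..depth], with stack[depth] being its own ID, as
alloc_css_id() below fills in). A self-contained sketch of that invariant
(made-up data, not kernel code):

#include <stdbool.h>
#include <stdio.h>

#define MAX_DEPTH 8

/* stand-in for struct css_id: an ID plus the path of ancestor IDs */
struct id_node {
	unsigned short id;
	unsigned short depth;
	unsigned short stack[MAX_DEPTH];	/* stack[0..depth] = root..self */
};

/* same test as css_is_ancestor(): root's own ID must appear in the
 * candidate descendant's path at root's depth */
static bool is_ancestor(const struct id_node *child, const struct id_node *root)
{
	if (child->depth < root->depth)
		return false;
	return child->stack[root->depth] == root->id;
}

int main(void)
{
	struct id_node root  = { 1, 0, { 1 } };
	struct id_node child = { 5, 2, { 1, 3, 5 } };	/* path 1 -> 3 -> 5 */
	struct id_node other = { 7, 1, { 2, 7 } };	/* different subtree */

	printf("%d %d\n", is_ancestor(&child, &root),
	       is_ancestor(&other, &root));	/* prints: 1 0 */
	return 0;
}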
static void __free_css_id_cb(struct rcu_head *head)
{
struct css_id *id;
id = container_of(head, struct css_id, rcu_head);
kfree(id);
}
void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
{
struct css_id *id = css->id;
/* When this is called before css_id initialization, id can be NULL */
if (!id)
return;
BUG_ON(!ss->use_id);
rcu_assign_pointer(id->css, NULL);
rcu_assign_pointer(css->id, NULL);
spin_lock(&ss->id_lock);
idr_remove(&ss->idr, id->id);
spin_unlock(&ss->id_lock);
call_rcu(&id->rcu_head, __free_css_id_cb);
}
/*
* This is called by init or create(). Calls to this function are
* always serialized (by cgroup_mutex at create()).
*/
static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
{
struct css_id *newid;
int myid, error, size;
BUG_ON(!ss->use_id);
size = sizeof(*newid) + sizeof(unsigned short) * (depth + 1);
newid = kzalloc(size, GFP_KERNEL);
if (!newid)
return ERR_PTR(-ENOMEM);
/* get id */
if (unlikely(!idr_pre_get(&ss->idr, GFP_KERNEL))) {
error = -ENOMEM;
goto err_out;
}
spin_lock(&ss->id_lock);
/* Don't use 0; this allocates an ID in the range 1-65535 */
error = idr_get_new_above(&ss->idr, newid, 1, &myid);
spin_unlock(&ss->id_lock);
/* Returns an error when there is no free space for a new ID. */
if (error) {
error = -ENOSPC;
goto err_out;
}
if (myid > CSS_ID_MAX)
goto remove_idr;
newid->id = myid;
newid->depth = depth;
return newid;
remove_idr:
error = -ENOSPC;
spin_lock(&ss->id_lock);
idr_remove(&ss->idr, myid);
spin_unlock(&ss->id_lock);
err_out:
kfree(newid);
return ERR_PTR(error);
}
static int __init cgroup_subsys_init_idr(struct cgroup_subsys *ss)
{
struct css_id *newid;
struct cgroup_subsys_state *rootcss;
spin_lock_init(&ss->id_lock);
idr_init(&ss->idr);
rootcss = init_css_set.subsys[ss->subsys_id];
newid = get_new_cssid(ss, 0);
if (IS_ERR(newid))
return PTR_ERR(newid);
newid->stack[0] = newid->id;
newid->css = rootcss;
rootcss->id = newid;
return 0;
}
static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent,
struct cgroup *child)
{
int subsys_id, i, depth = 0;
struct cgroup_subsys_state *parent_css, *child_css;
struct css_id *child_id, *parent_id = NULL;
subsys_id = ss->subsys_id;
parent_css = parent->subsys[subsys_id];
child_css = child->subsys[subsys_id];
depth = css_depth(parent_css) + 1;
parent_id = parent_css->id;
child_id = get_new_cssid(ss, depth);
if (IS_ERR(child_id))
return PTR_ERR(child_id);
for (i = 0; i < depth; i++)
child_id->stack[i] = parent_id->stack[i];
child_id->stack[depth] = child_id->id;
/*
* child_id->css pointer will be set after this cgroup is available
* see cgroup_populate_dir()
*/
rcu_assign_pointer(child_css->id, child_id);
return 0;
}
/**
* css_lookup - lookup css by id
* @ss: cgroup subsys to be looked into.
* @id: the id
*
* Returns a pointer to the cgroup_subsys_state if a valid one exists for
* @id, NULL if not. Should be called under rcu_read_lock().
*/
struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id)
{
struct css_id *cssid = NULL;
BUG_ON(!ss->use_id);
cssid = idr_find(&ss->idr, id);
if (unlikely(!cssid))
return NULL;
return rcu_dereference(cssid->css);
}
/**
* css_get_next - lookup next cgroup under specified hierarchy.
* @ss: pointer to subsystem
* @id: current position of iteration.
* @root: pointer to the root css; the tree under it is searched.
* @foundid: position of found object.
*
* Search for the next css under the hierarchy rooted at @root. Must be
* called under rcu_read_lock(). Returns NULL when the end is reached.
*/
struct cgroup_subsys_state *
css_get_next(struct cgroup_subsys *ss, int id,
struct cgroup_subsys_state *root, int *foundid)
{
struct cgroup_subsys_state *ret = NULL;
struct css_id *tmp;
int tmpid;
int rootid = css_id(root);
int depth = css_depth(root);
if (!rootid)
return NULL;
BUG_ON(!ss->use_id);
/* fill start point for scan */
tmpid = id;
while (1) {
/*
* scan the next entry from the idr tree; tmpid is updated to the
* ID found by idr_get_next().
*/
spin_lock(&ss->id_lock);
tmp = idr_get_next(&ss->idr, &tmpid);
spin_unlock(&ss->id_lock);
if (!tmp)
break;
if (tmp->depth >= depth && tmp->stack[depth] == rootid) {
ret = rcu_dereference(tmp->css);
if (ret) {
*foundid = tmpid;
break;
}
}
/* continue to scan from next id */
tmpid = tmpid + 1;
}
return ret;
}
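
css_get_next() leans on the same stack[] invariant: it walks allocated IDs
upward from @id and keeps only entries whose path passes through the root.
A userspace analogue over a sorted table (the table and get_next() helper
are assumptions standing in for the kernel idr, not its API):

#include <stdio.h>

#define MAX_DEPTH 8

struct id_node {
	unsigned short id, depth;
	unsigned short stack[MAX_DEPTH];
};

/* analogue of idr_get_next(): smallest entry with id >= *idp */
static const struct id_node *get_next(const struct id_node *tbl, int n,
				      int *idp)
{
	for (int i = 0; i < n; i++) {
		if (tbl[i].id >= *idp) {
			*idp = tbl[i].id;
			return &tbl[i];
		}
	}
	return NULL;
}

int main(void)
{
	/* sorted by id; paths: 1, 1->2, 1->3, and a separate root 4 */
	const struct id_node tbl[] = {
		{ 1, 0, { 1 } }, { 2, 1, { 1, 2 } },
		{ 3, 1, { 1, 3 } }, { 4, 0, { 4 } },
	};
	const struct id_node *root = &tbl[0], *e;
	int id = 1;	/* resume point, like @id in css_get_next() */

	while ((e = get_next(tbl, 4, &id)) != NULL) {
		if (e->depth >= root->depth &&
		    e->stack[root->depth] == root->id)
			printf("visit id %u\n", e->id);	/* visits 1, 2, 3 */
		id++;	/* continue the scan from the next id */
	}
	return 0;
}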


@@ -40,9 +40,7 @@ static u64 taskcount_read(struct cgroup *cont, struct cftype *cft)
{
u64 count;
cgroup_lock();
count = cgroup_task_count(cont);
cgroup_unlock();
return count;
}


@@ -281,7 +281,7 @@ int __ref cpu_down(unsigned int cpu)
goto out;
}
cpu_clear(cpu, cpu_active_map);
set_cpu_active(cpu, false);
/*
* Make sure the all cpus did the reschedule and are not
@@ -296,7 +296,7 @@ int __ref cpu_down(unsigned int cpu)
err = _cpu_down(cpu, 0);
if (cpu_online(cpu))
cpu_set(cpu, cpu_active_map);
set_cpu_active(cpu, true);
out:
cpu_maps_update_done();
@@ -333,7 +333,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
goto out_notify;
BUG_ON(!cpu_online(cpu));
cpu_set(cpu, cpu_active_map);
set_cpu_active(cpu, true);
/* Now call notifier in preparation. */
raw_notifier_call_chain(&cpu_chain, CPU_ONLINE | mod, hcpu);


@@ -128,10 +128,6 @@ static inline struct cpuset *task_cs(struct task_struct *task)
return container_of(task_subsys_state(task, cpuset_subsys_id),
struct cpuset, css);
}
struct cpuset_hotplug_scanner {
struct cgroup_scanner scan;
struct cgroup *to;
};
/* bits in struct cpuset flags field */
typedef enum {
@@ -521,6 +517,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
return 0;
}
#ifdef CONFIG_SMP
/*
* Helper routine for generate_sched_domains().
* Do cpusets a, b have overlapping cpus_allowed masks?
@@ -815,6 +812,18 @@ static void do_rebuild_sched_domains(struct work_struct *unused)
put_online_cpus();
}
#else /* !CONFIG_SMP */
static void do_rebuild_sched_domains(struct work_struct *unused)
{
}
static int generate_sched_domains(struct cpumask **domains,
struct sched_domain_attr **attributes)
{
*domains = NULL;
return 1;
}
#endif /* CONFIG_SMP */
static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains);
@@ -1026,101 +1035,70 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
mutex_unlock(&callback_mutex);
}
/*
* Rebind task's vmas to cpuset's new mems_allowed, and migrate pages to new
* nodes if memory_migrate flag is set. Called with cgroup_mutex held.
*/
static void cpuset_change_nodemask(struct task_struct *p,
struct cgroup_scanner *scan)
{
struct mm_struct *mm;
struct cpuset *cs;
int migrate;
const nodemask_t *oldmem = scan->data;
mm = get_task_mm(p);
if (!mm)
return;
cs = cgroup_cs(scan->cg);
migrate = is_memory_migrate(cs);
mpol_rebind_mm(mm, &cs->mems_allowed);
if (migrate)
cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed);
mmput(mm);
}
static void *cpuset_being_rebound;
/**
* update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
* @cs: the cpuset in which each task's mems_allowed mask needs to be changed
* @oldmem: old mems_allowed of cpuset cs
* @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
*
* Called with cgroup_mutex held
* Return 0 if successful, -errno if not.
* No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
* if @heap != NULL.
*/
static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem)
static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
struct ptr_heap *heap)
{
struct task_struct *p;
struct mm_struct **mmarray;
int i, n, ntasks;
int migrate;
int fudge;
struct cgroup_iter it;
int retval;
struct cgroup_scanner scan;
cpuset_being_rebound = cs; /* causes mpol_dup() rebind */
fudge = 10; /* spare mmarray[] slots */
fudge += cpumask_weight(cs->cpus_allowed);/* imagine 1 fork-bomb/cpu */
retval = -ENOMEM;
scan.cg = cs->css.cgroup;
scan.test_task = NULL;
scan.process_task = cpuset_change_nodemask;
scan.heap = heap;
scan.data = (nodemask_t *)oldmem;
/*
* Allocate mmarray[] to hold mm reference for each task
* in cpuset cs. Can't kmalloc GFP_KERNEL while holding
* tasklist_lock. We could use GFP_ATOMIC, but with a
* few more lines of code, we can retry until we get a big
* enough mmarray[] w/o using GFP_ATOMIC.
*/
while (1) {
ntasks = cgroup_task_count(cs->css.cgroup); /* guess */
ntasks += fudge;
mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL);
if (!mmarray)
goto done;
read_lock(&tasklist_lock); /* block fork */
if (cgroup_task_count(cs->css.cgroup) <= ntasks)
break; /* got enough */
read_unlock(&tasklist_lock); /* try again */
kfree(mmarray);
}
n = 0;
/* Load up mmarray[] with mm reference for each task in cpuset. */
cgroup_iter_start(cs->css.cgroup, &it);
while ((p = cgroup_iter_next(cs->css.cgroup, &it))) {
struct mm_struct *mm;
if (n >= ntasks) {
printk(KERN_WARNING
"Cpuset mempolicy rebind incomplete.\n");
break;
}
mm = get_task_mm(p);
if (!mm)
continue;
mmarray[n++] = mm;
}
cgroup_iter_end(cs->css.cgroup, &it);
read_unlock(&tasklist_lock);
/*
* Now that we've dropped the tasklist spinlock, we can
* rebind the vma mempolicies of each mm in mmarray[] to their
* new cpuset, and release that mm. The mpol_rebind_mm()
* call takes mmap_sem, which we couldn't take while holding
* tasklist_lock. Forks can happen again now - the mpol_dup()
* cpuset_being_rebound check will catch such forks, and rebind
* their vma mempolicies too. Because we still hold the global
* cgroup_mutex, we know that no other rebind effort will
* be contending for the global variable cpuset_being_rebound.
* The mpol_rebind_mm() call takes mmap_sem, which we couldn't
* take while holding tasklist_lock. Forks can happen - the
* mpol_dup() cpuset_being_rebound check will catch such forks,
* and rebind their vma mempolicies too. Because we still hold
* the global cgroup_mutex, we know that no other rebind effort
* will be contending for the global variable cpuset_being_rebound.
* It's ok if we rebind the same mm twice; mpol_rebind_mm()
* is idempotent. Also migrate pages in each mm to new nodes.
*/
migrate = is_memory_migrate(cs);
for (i = 0; i < n; i++) {
struct mm_struct *mm = mmarray[i];
mpol_rebind_mm(mm, &cs->mems_allowed);
if (migrate)
cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed);
mmput(mm);
}
cgroup_scan_tasks(&scan);
/* We're done rebinding vmas to this cpuset's new mems_allowed. */
kfree(mmarray);
cpuset_being_rebound = NULL;
retval = 0;
done:
return retval;
}
/*
@@ -1141,6 +1119,7 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
{
nodemask_t oldmem;
int retval;
struct ptr_heap heap;
/*
* top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY];
@@ -1175,12 +1154,18 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
if (retval < 0)
goto done;
retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
if (retval < 0)
goto done;
mutex_lock(&callback_mutex);
cs->mems_allowed = trialcs->mems_allowed;
cs->mems_generation = cpuset_mems_generation++;
mutex_unlock(&callback_mutex);
retval = update_tasks_nodemask(cs, &oldmem);
update_tasks_nodemask(cs, &oldmem, &heap);
heap_free(&heap);
done:
return retval;
}
@@ -1192,8 +1177,10 @@ int current_cpuset_is_being_rebound(void)
static int update_relax_domain_level(struct cpuset *cs, s64 val)
{
#ifdef CONFIG_SMP
if (val < -1 || val >= SD_LV_MAX)
return -EINVAL;
#endif
if (val != cs->relax_domain_level) {
cs->relax_domain_level = val;
@@ -1355,19 +1342,22 @@ static int cpuset_can_attach(struct cgroup_subsys *ss,
struct cgroup *cont, struct task_struct *tsk)
{
struct cpuset *cs = cgroup_cs(cont);
int ret = 0;
if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
return -ENOSPC;
if (tsk->flags & PF_THREAD_BOUND) {
mutex_lock(&callback_mutex);
if (!cpumask_equal(&tsk->cpus_allowed, cs->cpus_allowed))
ret = -EINVAL;
mutex_unlock(&callback_mutex);
}
/*
* Kthreads bound to specific cpus cannot be moved to a new cpuset; we
* cannot change their cpu affinity and isolating such threads by their
* set of allowed nodes is unnecessary. Thus, cpusets are not
* applicable for such threads. This prevents checking for success of
* set_cpus_allowed_ptr() on all attached tasks before cpus_allowed may
* be changed.
*/
if (tsk->flags & PF_THREAD_BOUND)
return -EINVAL;
return ret < 0 ? ret : security_task_setscheduler(tsk, 0, NULL);
return security_task_setscheduler(tsk, 0, NULL);
}
static void cpuset_attach(struct cgroup_subsys *ss,
@@ -1706,6 +1696,7 @@ static struct cftype files[] = {
.read_u64 = cpuset_read_u64,
.write_u64 = cpuset_write_u64,
.private = FILE_MEMORY_PRESSURE,
.mode = S_IRUGO,
},
{
@@ -1913,10 +1904,9 @@ int __init cpuset_init(void)
static void cpuset_do_move_task(struct task_struct *tsk,
struct cgroup_scanner *scan)
{
struct cpuset_hotplug_scanner *chsp;
struct cgroup *new_cgroup = scan->data;
chsp = container_of(scan, struct cpuset_hotplug_scanner, scan);
cgroup_attach_task(chsp->to, tsk);
cgroup_attach_task(new_cgroup, tsk);
}
/**
@@ -1932,15 +1922,15 @@ static void cpuset_do_move_task(struct task_struct *tsk,
*/
static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
{
struct cpuset_hotplug_scanner scan;
struct cgroup_scanner scan;
scan.scan.cg = from->css.cgroup;
scan.scan.test_task = NULL; /* select all tasks in cgroup */
scan.scan.process_task = cpuset_do_move_task;
scan.scan.heap = NULL;
scan.to = to->css.cgroup;
scan.cg = from->css.cgroup;
scan.test_task = NULL; /* select all tasks in cgroup */
scan.process_task = cpuset_do_move_task;
scan.heap = NULL;
scan.data = to->css.cgroup;
if (cgroup_scan_tasks(&scan.scan))
if (cgroup_scan_tasks(&scan))
printk(KERN_ERR "move_member_tasks_to_cpuset: "
"cgroup_scan_tasks failed\n");
}
@@ -2033,7 +2023,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
remove_tasks_in_empty_cpuset(cp);
else {
update_tasks_cpumask(cp, NULL);
update_tasks_nodemask(cp, &oldmems);
update_tasks_nodemask(cp, &oldmems, NULL);
}
}
}
@@ -2069,7 +2059,9 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
}
cgroup_lock();
mutex_lock(&callback_mutex);
cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask);
mutex_unlock(&callback_mutex);
scan_for_empty_cpusets(&top_cpuset);
ndoms = generate_sched_domains(&doms, &attr);
cgroup_unlock();
@@ -2092,11 +2084,12 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
cgroup_lock();
switch (action) {
case MEM_ONLINE:
top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
break;
case MEM_OFFLINE:
mutex_lock(&callback_mutex);
top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
scan_for_empty_cpusets(&top_cpuset);
mutex_unlock(&callback_mutex);
if (action == MEM_OFFLINE)
scan_for_empty_cpusets(&top_cpuset);
break;
default:
break;
@@ -2206,26 +2199,24 @@ static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
}
/**
* cpuset_zone_allowed_softwall - Can we allocate on zone z's memory node?
* @z: is this zone on an allowed node?
* cpuset_node_allowed_softwall - Can we allocate on a memory node?
* @node: is this an allowed node?
* @gfp_mask: memory allocation flags
*
* If we're in interrupt, yes, we can always allocate. If
* __GFP_THISNODE is set, yes, we can always allocate. If zone
* z's node is in our tasks mems_allowed, yes. If it's not a
* __GFP_HARDWALL request and this zone's nodes is in the nearest
* hardwalled cpuset ancestor to this tasks cpuset, yes.
* If the task has been OOM killed and has access to memory reserves
* as specified by the TIF_MEMDIE flag, yes.
* If we're in interrupt, yes, we can always allocate. If __GFP_THISNODE is
* set, yes, we can always allocate. If node is in our task's mems_allowed,
* yes. If it's not a __GFP_HARDWALL request and this node is in the nearest
* hardwalled cpuset ancestor to this task's cpuset, yes. If the task has been
* OOM killed and has access to memory reserves as specified by the TIF_MEMDIE
* flag, yes.
* Otherwise, no.
*
* If __GFP_HARDWALL is set, cpuset_zone_allowed_softwall()
* reduces to cpuset_zone_allowed_hardwall(). Otherwise,
* cpuset_zone_allowed_softwall() might sleep, and might allow a zone
* from an enclosing cpuset.
* If __GFP_HARDWALL is set, cpuset_node_allowed_softwall() reduces to
* cpuset_node_allowed_hardwall(). Otherwise, cpuset_node_allowed_softwall()
* might sleep, and might allow a node from an enclosing cpuset.
*
* cpuset_zone_allowed_hardwall() only handles the simpler case of
* hardwall cpusets, and never sleeps.
* cpuset_node_allowed_hardwall() only handles the simpler case of hardwall
* cpusets, and never sleeps.
*
* The __GFP_THISNODE placement logic is really handled elsewhere,
* by forcibly using a zonelist starting at a specified node, and by
@@ -2264,20 +2255,17 @@ static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
* GFP_USER - only nodes in current tasks mems allowed ok.
*
* Rule:
* Don't call cpuset_zone_allowed_softwall if you can't sleep, unless you
* Don't call cpuset_node_allowed_softwall if you can't sleep, unless you
* pass in the __GFP_HARDWALL flag set in gfp_flag, which disables
* the code that might scan up ancestor cpusets and sleep.
*/
int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
{
int node; /* node that zone z is on */
const struct cpuset *cs; /* current cpuset ancestors */
int allowed; /* is allocation in zone z allowed? */
if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
return 1;
node = zone_to_nid(z);
might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
if (node_isset(node, current->mems_allowed))
return 1;
@@ -2306,15 +2294,15 @@ int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
}
/*
* cpuset_zone_allowed_hardwall - Can we allocate on zone z's memory node?
* @z: is this zone on an allowed node?
* cpuset_node_allowed_hardwall - Can we allocate on a memory node?
* @node: is this an allowed node?
* @gfp_mask: memory allocation flags
*
* If we're in interrupt, yes, we can always allocate.
* If __GFP_THISNODE is set, yes, we can always allocate. If zone
* z's node is in our tasks mems_allowed, yes. If the task has been
* OOM killed and has access to memory reserves as specified by the
* TIF_MEMDIE flag, yes. Otherwise, no.
* If we're in interrupt, yes, we can always allocate. If __GFP_THISNODE is
* set, yes, we can always allocate. If node is in our task's mems_allowed,
* yes. If the task has been OOM killed and has access to memory reserves as
* specified by the TIF_MEMDIE flag, yes.
* Otherwise, no.
*
* The __GFP_THISNODE placement logic is really handled elsewhere,
* by forcibly using a zonelist starting at a specified node, and by
@@ -2322,20 +2310,16 @@ int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
* any node on the zonelist except the first. By the time any such
* calls get to this routine, we should just shut up and say 'yes'.
*
* Unlike the cpuset_zone_allowed_softwall() variant, above,
* this variant requires that the zone be in the current tasks
* Unlike the cpuset_node_allowed_softwall() variant, above,
* this variant requires that the node be in the current task's
* mems_allowed or that we're in interrupt. It does not scan up the
* cpuset hierarchy for the nearest enclosing mem_exclusive cpuset.
* It never sleeps.
*/
int __cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask)
int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
{
int node; /* node that zone z is on */
if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
return 1;
node = zone_to_nid(z);
if (node_isset(node, current->mems_allowed))
return 1;
/*


@@ -18,6 +18,7 @@
#include <linux/syscalls.h>
#include <linux/sysctl.h>
#include <linux/types.h>
#include <linux/fs_struct.h>
static void default_handler(int, struct pt_regs *);
@@ -145,28 +146,6 @@ __set_personality(u_long personality)
return 0;
}
if (atomic_read(&current->fs->count) != 1) {
struct fs_struct *fsp, *ofsp;
fsp = copy_fs_struct(current->fs);
if (fsp == NULL) {
module_put(ep->module);
return -ENOMEM;
}
task_lock(current);
ofsp = current->fs;
current->fs = fsp;
task_unlock(current);
put_fs_struct(ofsp);
}
/*
* At that point we are guaranteed to be the sole owner of
* current->fs.
*/
current->personality = personality;
oep = current_thread_info()->exec_domain;
current_thread_info()->exec_domain = ep;


@@ -46,6 +46,7 @@
#include <linux/blkdev.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/tracehook.h>
#include <linux/fs_struct.h>
#include <linux/init_task.h>
#include <trace/sched.h>
@@ -61,11 +62,6 @@ DEFINE_TRACE(sched_process_wait);
static void exit_mm(struct task_struct * tsk);
static inline int task_detached(struct task_struct *p)
{
return p->exit_signal == -1;
}
static void __unhash_process(struct task_struct *p)
{
nr_threads--;
@@ -362,16 +358,12 @@ static void reparent_to_kthreadd(void)
void __set_special_pids(struct pid *pid)
{
struct task_struct *curr = current->group_leader;
pid_t nr = pid_nr(pid);
if (task_session(curr) != pid) {
if (task_session(curr) != pid)
change_pid(curr, PIDTYPE_SID, pid);
set_task_session(curr, nr);
}
if (task_pgrp(curr) != pid) {
if (task_pgrp(curr) != pid)
change_pid(curr, PIDTYPE_PGID, pid);
set_task_pgrp(curr, nr);
}
}
static void set_special_pids(struct pid *pid)
@@ -429,7 +421,6 @@ EXPORT_SYMBOL(disallow_signal);
void daemonize(const char *name, ...)
{
va_list args;
struct fs_struct *fs;
sigset_t blocked;
va_start(args, name);
@@ -462,11 +453,7 @@ void daemonize(const char *name, ...)
/* Become as one with the init task */
exit_fs(current); /* current->fs->count--; */
fs = init_task.fs;
current->fs = fs;
atomic_inc(&fs->count);
daemonize_fs_struct();
exit_files(current);
current->files = init_task.files;
atomic_inc(&current->files->count);
@@ -565,30 +552,6 @@ void exit_files(struct task_struct *tsk)
}
}
void put_fs_struct(struct fs_struct *fs)
{
/* No need to hold fs->lock if we are killing it */
if (atomic_dec_and_test(&fs->count)) {
path_put(&fs->root);
path_put(&fs->pwd);
kmem_cache_free(fs_cachep, fs);
}
}
void exit_fs(struct task_struct *tsk)
{
struct fs_struct * fs = tsk->fs;
if (fs) {
task_lock(tsk);
tsk->fs = NULL;
task_unlock(tsk);
put_fs_struct(fs);
}
}
EXPORT_SYMBOL_GPL(exit_fs);
#ifdef CONFIG_MM_OWNER
/*
* Task p is exiting and it owned mm, lets find a new owner for it
@@ -731,119 +694,6 @@ static void exit_mm(struct task_struct * tsk)
mmput(mm);
}
/*
* Return nonzero if @parent's children should reap themselves.
*
* Called with write_lock_irq(&tasklist_lock) held.
*/
static int ignoring_children(struct task_struct *parent)
{
int ret;
struct sighand_struct *psig = parent->sighand;
unsigned long flags;
spin_lock_irqsave(&psig->siglock, flags);
ret = (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN ||
(psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT));
spin_unlock_irqrestore(&psig->siglock, flags);
return ret;
}
/*
* Detach all tasks we were using ptrace on.
* Any that need to be release_task'd are put on the @dead list.
*
* Called with write_lock(&tasklist_lock) held.
*/
static void ptrace_exit(struct task_struct *parent, struct list_head *dead)
{
struct task_struct *p, *n;
int ign = -1;
list_for_each_entry_safe(p, n, &parent->ptraced, ptrace_entry) {
__ptrace_unlink(p);
if (p->exit_state != EXIT_ZOMBIE)
continue;
/*
* If it's a zombie, our attachedness prevented normal
* parent notification or self-reaping. Do notification
* now if it would have happened earlier. If it should
* reap itself, add it to the @dead list. We can't call
* release_task() here because we already hold tasklist_lock.
*
* If it's our own child, there is no notification to do.
* But if our normal children self-reap, then this child
* was prevented by ptrace and we must reap it now.
*/
if (!task_detached(p) && thread_group_empty(p)) {
if (!same_thread_group(p->real_parent, parent))
do_notify_parent(p, p->exit_signal);
else {
if (ign < 0)
ign = ignoring_children(parent);
if (ign)
p->exit_signal = -1;
}
}
if (task_detached(p)) {
/*
* Mark it as in the process of being reaped.
*/
p->exit_state = EXIT_DEAD;
list_add(&p->ptrace_entry, dead);
}
}
}
/*
* Finish up exit-time ptrace cleanup.
*
* Called without locks.
*/
static void ptrace_exit_finish(struct task_struct *parent,
struct list_head *dead)
{
struct task_struct *p, *n;
BUG_ON(!list_empty(&parent->ptraced));
list_for_each_entry_safe(p, n, dead, ptrace_entry) {
list_del_init(&p->ptrace_entry);
release_task(p);
}
}
static void reparent_thread(struct task_struct *p, struct task_struct *father)
{
if (p->pdeath_signal)
/* We already hold the tasklist_lock here. */
group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p);
list_move_tail(&p->sibling, &p->real_parent->children);
/* If this is a threaded reparent there is no need to
* notify anyone anything has happened.
*/
if (same_thread_group(p->real_parent, father))
return;
/* We don't want people slaying init. */
if (!task_detached(p))
p->exit_signal = SIGCHLD;
/* If we'd notified the old parent about this child's death,
* also notify the new parent.
*/
if (!ptrace_reparented(p) &&
p->exit_state == EXIT_ZOMBIE &&
!task_detached(p) && thread_group_empty(p))
do_notify_parent(p, p->exit_signal);
kill_orphaned_pgrp(p, father);
}
/*
* When we die, we re-parent all our children.
* Try to give them to another thread in our thread
@@ -883,17 +733,51 @@ static struct task_struct *find_new_reaper(struct task_struct *father)
return pid_ns->child_reaper;
}
/*
* Any that need to be release_task'd are put on the @dead list.
*/
static void reparent_thread(struct task_struct *father, struct task_struct *p,
struct list_head *dead)
{
if (p->pdeath_signal)
group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p);
list_move_tail(&p->sibling, &p->real_parent->children);
if (task_detached(p))
return;
/*
* If this is a threaded reparent there is no need to
* notify anyone anything has happened.
*/
if (same_thread_group(p->real_parent, father))
return;
/* We don't want people slaying init. */
p->exit_signal = SIGCHLD;
/* If it has exited notify the new parent about this child's death. */
if (!p->ptrace &&
p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
do_notify_parent(p, p->exit_signal);
if (task_detached(p)) {
p->exit_state = EXIT_DEAD;
list_move_tail(&p->sibling, dead);
}
}
kill_orphaned_pgrp(p, father);
}
static void forget_original_parent(struct task_struct *father)
{
struct task_struct *p, *n, *reaper;
LIST_HEAD(ptrace_dead);
LIST_HEAD(dead_children);
exit_ptrace(father);
write_lock_irq(&tasklist_lock);
reaper = find_new_reaper(father);
/*
* First clean up ptrace if we were using it.
*/
ptrace_exit(father, &ptrace_dead);
list_for_each_entry_safe(p, n, &father->children, sibling) {
p->real_parent = reaper;
@@ -901,13 +785,16 @@ static void forget_original_parent(struct task_struct *father)
BUG_ON(p->ptrace);
p->parent = p->real_parent;
}
reparent_thread(p, father);
reparent_thread(father, p, &dead_children);
}
write_unlock_irq(&tasklist_lock);
BUG_ON(!list_empty(&father->children));
ptrace_exit_finish(father, &ptrace_dead);
list_for_each_entry_safe(p, n, &dead_children, sibling) {
list_del_init(&p->sibling);
release_task(p);
}
}
/*
@@ -1419,6 +1306,18 @@ static int wait_task_zombie(struct task_struct *p, int options,
return retval;
}
static int *task_stopped_code(struct task_struct *p, bool ptrace)
{
if (ptrace) {
if (task_is_stopped_or_traced(p))
return &p->exit_code;
} else {
if (p->signal->flags & SIGNAL_STOP_STOPPED)
return &p->signal->group_exit_code;
}
return NULL;
}
/*
* Handle sys_wait4 work for one task in state TASK_STOPPED. We hold
* read_lock(&tasklist_lock) on entry. If we return zero, we still hold
@@ -1429,7 +1328,7 @@ static int wait_task_stopped(int ptrace, struct task_struct *p,
int options, struct siginfo __user *infop,
int __user *stat_addr, struct rusage __user *ru)
{
int retval, exit_code, why;
int retval, exit_code, *p_code, why;
uid_t uid = 0; /* unneeded, required by compiler */
pid_t pid;
@@ -1439,22 +1338,16 @@ static int wait_task_stopped(int ptrace, struct task_struct *p,
exit_code = 0;
spin_lock_irq(&p->sighand->siglock);
if (unlikely(!task_is_stopped_or_traced(p)))
p_code = task_stopped_code(p, ptrace);
if (unlikely(!p_code))
goto unlock_sig;
if (!ptrace && p->signal->group_stop_count > 0)
/*
* A group stop is in progress and this is the group leader.
* We won't report until all threads have stopped.
*/
goto unlock_sig;
exit_code = p->exit_code;
exit_code = *p_code;
if (!exit_code)
goto unlock_sig;
if (!unlikely(options & WNOWAIT))
p->exit_code = 0;
*p_code = 0;
/* don't need the RCU readlock here as we're holding a spinlock */
uid = __task_cred(p)->uid;
@@ -1610,7 +1503,7 @@ static int wait_consider_task(struct task_struct *parent, int ptrace,
*/
*notask_error = 0;
if (task_is_stopped_or_traced(p))
if (task_stopped_code(p, ptrace))
return wait_task_stopped(ptrace, p, options,
infop, stat_addr, ru);
@@ -1814,7 +1707,7 @@ SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
pid = find_get_pid(-upid);
} else if (upid == 0) {
type = PIDTYPE_PGID;
pid = get_pid(task_pgrp(current));
pid = get_task_pid(current, PIDTYPE_PGID);
} else /* upid > 0 */ {
type = PIDTYPE_PID;
pid = find_get_pid(upid);


@@ -15,11 +15,22 @@
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <linux/module.h>
#include <linux/init.h>
#include <linux/ftrace.h>
#include <asm/uaccess.h>
#include <linux/memory.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/init.h>
#include <asm/sections.h>
#include <asm/uaccess.h>
/*
* mutex protecting text section modification (dynamic code patching).
* some users need to sleep (allocating memory...) while they hold this lock.
*
* NOT exported to modules - patching kernel text is a really delicate matter.
*/
DEFINE_MUTEX(text_mutex);
extern struct exception_table_entry __start___ex_table[];
extern struct exception_table_entry __stop___ex_table[];
@@ -41,31 +52,50 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr)
return e;
}
__notrace_funcgraph int core_kernel_text(unsigned long addr)
static inline int init_kernel_text(unsigned long addr)
{
if (addr >= (unsigned long)_sinittext &&
addr <= (unsigned long)_einittext)
return 1;
return 0;
}
int core_kernel_text(unsigned long addr)
{
if (addr >= (unsigned long)_stext &&
addr <= (unsigned long)_etext)
return 1;
if (system_state == SYSTEM_BOOTING &&
addr >= (unsigned long)_sinittext &&
addr <= (unsigned long)_einittext)
init_kernel_text(addr))
return 1;
return 0;
}
__notrace_funcgraph int __kernel_text_address(unsigned long addr)
int __kernel_text_address(unsigned long addr)
{
if (core_kernel_text(addr))
return 1;
return __module_text_address(addr) != NULL;
if (is_module_text_address(addr))
return 1;
/*
* There might be init symbols in saved stacktraces.
* Give those symbols a chance to be printed in
* backtraces (such as lockdep traces).
*
* Since we are after the module-symbols check, there's
* no danger of address overlap:
*/
if (init_kernel_text(addr))
return 1;
return 0;
}
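
The classification is plain range checking against linker-provided bounds.
A sketch with made-up addresses (in the kernel, _stext/_etext and
_sinittext/_einittext come from the linker script, and the module check in
the middle is omitted here):

#include <stdio.h>

/* hypothetical section bounds standing in for the linker symbols */
static const unsigned long stext = 0x1000, etext = 0x8000;
static const unsigned long sinittext = 0x9000, einittext = 0xa000;

static int core_text(unsigned long addr)
{
	return addr >= stext && addr <= etext;
}

static int init_text(unsigned long addr)
{
	return addr >= sinittext && addr <= einittext;
}

/* mirrors __kernel_text_address(): core text first, then (omitted)
 * module text, then init text as a last resort for saved stacktraces */
static int text_address(unsigned long addr)
{
	return core_text(addr) || init_text(addr);
}

int main(void)
{
	printf("%d %d %d\n", text_address(0x2000),
	       text_address(0x9800), text_address(0xb000));	/* 1 1 0 */
	return 0;
}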
int kernel_text_address(unsigned long addr)
{
if (core_kernel_text(addr))
return 1;
return module_text_address(addr) != NULL;
return is_module_text_address(addr);
}
/*
@@ -81,5 +111,5 @@ int func_ptr_is_kernel_text(void *ptr)
addr = (unsigned long) dereference_function_descriptor(ptr);
if (core_kernel_text(addr))
return 1;
return module_text_address(addr) != NULL;
return is_module_text_address(addr);
}


@@ -60,6 +60,7 @@
#include <linux/tty.h>
#include <linux/proc_fs.h>
#include <linux/blkdev.h>
#include <linux/fs_struct.h>
#include <trace/sched.h>
#include <linux/magic.h>
@@ -284,7 +285,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
mm->free_area_cache = oldmm->mmap_base;
mm->cached_hole_size = ~0UL;
mm->map_count = 0;
cpus_clear(mm->cpu_vm_mask);
cpumask_clear(mm_cpumask(mm));
mm->mm_rb = RB_ROOT;
rb_link = &mm->mm_rb.rb_node;
rb_parent = NULL;
@@ -681,38 +682,21 @@ fail_nomem:
return retval;
}
static struct fs_struct *__copy_fs_struct(struct fs_struct *old)
{
struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL);
/* We don't need to lock fs - think why ;-) */
if (fs) {
atomic_set(&fs->count, 1);
rwlock_init(&fs->lock);
fs->umask = old->umask;
read_lock(&old->lock);
fs->root = old->root;
path_get(&old->root);
fs->pwd = old->pwd;
path_get(&old->pwd);
read_unlock(&old->lock);
}
return fs;
}
struct fs_struct *copy_fs_struct(struct fs_struct *old)
{
return __copy_fs_struct(old);
}
EXPORT_SYMBOL_GPL(copy_fs_struct);
static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
{
struct fs_struct *fs = current->fs;
if (clone_flags & CLONE_FS) {
atomic_inc(&current->fs->count);
/* tsk->fs is already what we want */
write_lock(&fs->lock);
if (fs->in_exec) {
write_unlock(&fs->lock);
return -EAGAIN;
}
fs->users++;
write_unlock(&fs->lock);
return 0;
}
tsk->fs = __copy_fs_struct(current->fs);
tsk->fs = copy_fs_struct(fs);
if (!tsk->fs)
return -ENOMEM;
return 0;
@@ -841,6 +825,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
atomic_set(&sig->live, 1);
init_waitqueue_head(&sig->wait_chldexit);
sig->flags = 0;
if (clone_flags & CLONE_NEWPID)
sig->flags |= SIGNAL_UNKILLABLE;
sig->group_exit_code = 0;
sig->group_exit_task = NULL;
sig->group_stop_count = 0;
@@ -1125,7 +1111,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
goto bad_fork_cleanup_mm;
if ((retval = copy_io(clone_flags, p)))
goto bad_fork_cleanup_namespaces;
retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
retval = copy_thread(clone_flags, stack_start, stack_size, p, regs);
if (retval)
goto bad_fork_cleanup_io;
@@ -1263,8 +1249,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->signal->leader_pid = pid;
tty_kref_put(p->signal->tty);
p->signal->tty = tty_kref_get(current->signal->tty);
set_task_pgrp(p, task_pgrp_nr(current));
set_task_session(p, task_session_nr(current));
attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
attach_pid(p, PIDTYPE_SID, task_session(current));
list_add_tail_rcu(&p->tasks, &init_task.tasks);
@@ -1488,6 +1472,7 @@ void __init proc_caches_init(void)
mm_cachep = kmem_cache_create("mm_struct",
sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC);
mmap_init();
}
@@ -1543,12 +1528,16 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
{
struct fs_struct *fs = current->fs;
if ((unshare_flags & CLONE_FS) &&
(fs && atomic_read(&fs->count) > 1)) {
*new_fsp = __copy_fs_struct(current->fs);
if (!*new_fsp)
return -ENOMEM;
}
if (!(unshare_flags & CLONE_FS) || !fs)
return 0;
/* don't need lock here; in the worst case we'll do useless copy */
if (fs->users == 1)
return 0;
*new_fsp = copy_fs_struct(fs);
if (!*new_fsp)
return -ENOMEM;
return 0;
}
@@ -1664,8 +1653,13 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
if (new_fs) {
fs = current->fs;
write_lock(&fs->lock);
current->fs = new_fs;
new_fs = fs;
if (--fs->users)
new_fs = NULL;
else
new_fs = fs;
write_unlock(&fs->lock);
}
if (new_mm) {
@@ -1704,7 +1698,7 @@ bad_unshare_cleanup_sigh:
bad_unshare_cleanup_fs:
if (new_fs)
put_fs_struct(new_fs);
free_fs_struct(new_fs);
bad_unshare_cleanup_thread:
bad_unshare_out:


@@ -114,7 +114,9 @@ struct futex_q {
};
/*
* Split the global futex_lock into every hash list lock.
* Hash buckets are shared by all the futex_keys that hash to the same
* location. Each key may have multiple futex_q structures, one for each task
* waiting on a futex.
*/
struct futex_hash_bucket {
spinlock_t lock;
@@ -189,8 +191,7 @@ static void drop_futex_key_refs(union futex_key *key)
/**
* get_futex_key - Get parameters which are the keys for a futex.
* @uaddr: virtual address of the futex
* @shared: NULL for a PROCESS_PRIVATE futex,
* &current->mm->mmap_sem for a PROCESS_SHARED futex
* @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
* @key: address where result is stored.
*
* Returns a negative error code or 0
@@ -200,9 +201,7 @@ static void drop_futex_key_refs(union futex_key *key)
* offset_within_page). For private mappings, it's (uaddr, current->mm).
* We can usually work out the index without swapping in the page.
*
* fshared is NULL for PROCESS_PRIVATE futexes
* For other futexes, it points to &current->mm->mmap_sem and
* caller must have taken the reader lock. but NOT any spinlocks.
* lock_page() might sleep, so the caller must not hold a spinlock.
*/
static int get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
{
@@ -299,41 +298,6 @@ static int get_futex_value_locked(u32 *dest, u32 __user *from)
return ret ? -EFAULT : 0;
}
/*
* Fault handling.
*/
static int futex_handle_fault(unsigned long address, int attempt)
{
struct vm_area_struct * vma;
struct mm_struct *mm = current->mm;
int ret = -EFAULT;
if (attempt > 2)
return ret;
down_read(&mm->mmap_sem);
vma = find_vma(mm, address);
if (vma && address >= vma->vm_start &&
(vma->vm_flags & VM_WRITE)) {
int fault;
fault = handle_mm_fault(mm, vma, address, 1);
if (unlikely((fault & VM_FAULT_ERROR))) {
#if 0
/* XXX: let's do this when we verify it is OK */
if (ret & VM_FAULT_OOM)
ret = -ENOMEM;
#endif
} else {
ret = 0;
if (fault & VM_FAULT_MAJOR)
current->maj_flt++;
else
current->min_flt++;
}
}
up_read(&mm->mmap_sem);
return ret;
}
/*
* PI code:
@@ -589,10 +553,9 @@ static void wake_futex(struct futex_q *q)
* The waiting task can free the futex_q as soon as this is written,
* without taking any locks. This must come last.
*
* A memory barrier is required here to prevent the following store
* to lock_ptr from getting ahead of the wakeup. Clearing the lock
* at the end of wake_up_all() does not prevent this store from
* moving.
* A memory barrier is required here to prevent the following store to
* lock_ptr from getting ahead of the wakeup. Clearing the lock at the
* end of wake_up() does not prevent this store from moving.
*/
smp_wmb();
q->lock_ptr = NULL;
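A rough two-CPU picture of the ordering this smp_wmb() enforces; this is an illustrative sketch, not patch content, and the waiter column is a simplified unqueue_me():

/*
 * Waker (wake_futex)              Waiter (unqueue_me, simplified)
 * ------------------              -------------------------------
 * plist_del(&q->list, ...);       lock_ptr = q->lock_ptr;
 * smp_wmb();                      if (!lock_ptr)
 * q->lock_ptr = NULL;                     return;   (may now free q)
 *
 * Without the barrier, the NULL store could become visible before the
 * wakeup bookkeeping, letting the waiter free the futex_q while the
 * waker is still touching it.
 */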
@@ -692,9 +655,16 @@ double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
}
}
static inline void
double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
{
spin_unlock(&hb1->lock);
if (hb1 != hb2)
spin_unlock(&hb2->lock);
}
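double_unlock_hb() pairs with double_lock_hb(), which avoids ABBA deadlock by always taking the two bucket locks in pointer order; a sketch of that counterpart, along the lines of the existing futex.c helper:

static inline void
double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
{
	if (hb1 <= hb2) {
		spin_lock(&hb1->lock);
		if (hb1 < hb2)	/* don't lock the same bucket twice */
			spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING);
	} else {
		spin_lock(&hb2->lock);
		spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING);
	}
}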
/*
* Wake up all waiters hashed on the physical page that is mapped
* to this virtual address:
* Wake up waiters matching bitset queued on this futex (uaddr).
*/
static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
{
@@ -750,9 +720,9 @@ futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
struct futex_hash_bucket *hb1, *hb2;
struct plist_head *head;
struct futex_q *this, *next;
int ret, op_ret, attempt = 0;
int ret, op_ret;
retryfull:
retry:
ret = get_futex_key(uaddr1, fshared, &key1);
if (unlikely(ret != 0))
goto out;
@@ -763,16 +733,13 @@ retryfull:
hb1 = hash_futex(&key1);
hb2 = hash_futex(&key2);
retry:
double_lock_hb(hb1, hb2);
retry_private:
op_ret = futex_atomic_op_inuser(op, uaddr2);
if (unlikely(op_ret < 0)) {
u32 dummy;
spin_unlock(&hb1->lock);
if (hb1 != hb2)
spin_unlock(&hb2->lock);
double_unlock_hb(hb1, hb2);
#ifndef CONFIG_MMU
/*
@@ -788,26 +755,16 @@ retry:
goto out_put_keys;
}
/*
* futex_atomic_op_inuser needs to both read and write
* *(int __user *)uaddr2, but we can't modify it
* non-atomically. Therefore, if get_user below is not
* enough, we need to handle the fault ourselves, while
* still holding the mmap_sem.
*/
if (attempt++) {
ret = futex_handle_fault((unsigned long)uaddr2,
attempt);
if (ret)
goto out_put_keys;
goto retry;
}
ret = get_user(dummy, uaddr2);
if (ret)
return ret;
goto out_put_keys;
goto retryfull;
if (!fshared)
goto retry_private;
put_futex_key(fshared, &key2);
put_futex_key(fshared, &key1);
goto retry;
}
head = &hb1->chain;
@@ -834,9 +791,7 @@ retry:
ret += op_ret;
}
spin_unlock(&hb1->lock);
if (hb1 != hb2)
spin_unlock(&hb2->lock);
double_unlock_hb(hb1, hb2);
out_put_keys:
put_futex_key(fshared, &key2);
out_put_key1:
@@ -869,6 +824,7 @@ retry:
hb1 = hash_futex(&key1);
hb2 = hash_futex(&key2);
retry_private:
double_lock_hb(hb1, hb2);
if (likely(cmpval != NULL)) {
@@ -877,16 +833,18 @@ retry:
ret = get_futex_value_locked(&curval, uaddr1);
if (unlikely(ret)) {
spin_unlock(&hb1->lock);
if (hb1 != hb2)
spin_unlock(&hb2->lock);
double_unlock_hb(hb1, hb2);
ret = get_user(curval, uaddr1);
if (ret)
goto out_put_keys;
if (!ret)
goto retry;
if (!fshared)
goto retry_private;
goto out_put_keys;
put_futex_key(fshared, &key2);
put_futex_key(fshared, &key1);
goto retry;
}
if (curval != *cmpval) {
ret = -EAGAIN;
@@ -923,9 +881,7 @@ retry:
}
out_unlock:
spin_unlock(&hb1->lock);
if (hb1 != hb2)
spin_unlock(&hb2->lock);
double_unlock_hb(hb1, hb2);
/* drop_futex_key_refs() must be called outside the spinlocks. */
while (--drop_count >= 0)
@@ -1063,7 +1019,7 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
struct futex_pi_state *pi_state = q->pi_state;
struct task_struct *oldowner = pi_state->owner;
u32 uval, curval, newval;
int ret, attempt = 0;
int ret;
/* Owner died? */
if (!pi_state->owner)
@@ -1076,11 +1032,9 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
* in the user space variable. This must be atomic as we have
* to preserve the owner died bit here.
*
* Note: We write the user space value _before_ changing the
* pi_state because we can fault here. Imagine swapped out
* pages or a fork, which was running right before we acquired
* mmap_sem, that marked all the anonymous memory readonly for
* cow.
* Note: We write the user space value _before_ changing the pi_state
* because we can fault here. Imagine swapped out pages or a fork
* that marked all the anonymous memory readonly for cow.
*
* Modifying pi_state _before_ the user space value would
* leave the pi_state in an inconsistent state when we fault
@@ -1136,7 +1090,7 @@ retry:
handle_fault:
spin_unlock(q->lock_ptr);
ret = futex_handle_fault((unsigned long)uaddr, attempt++);
ret = get_user(uval, uaddr);
spin_lock(q->lock_ptr);
@@ -1185,10 +1139,11 @@ retry:
if (unlikely(ret != 0))
goto out;
retry_private:
hb = queue_lock(&q);
/*
* Access the page AFTER the futex is queued.
* Access the page AFTER the hash-bucket is locked.
* Order is important:
*
* Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val);
@@ -1204,20 +1159,23 @@ retry:
* a wakeup when *uaddr != val on entry to the syscall. This is
* rare, but normal.
*
* for shared futexes, we hold the mmap semaphore, so the mapping
* For shared futexes, we hold the mmap semaphore, so the mapping
* cannot have changed since we looked it up in get_futex_key.
*/
ret = get_futex_value_locked(&uval, uaddr);
if (unlikely(ret)) {
queue_unlock(&q, hb);
put_futex_key(fshared, &q.key);
ret = get_user(uval, uaddr);
if (ret)
goto out_put_key;
if (!ret)
goto retry;
goto out;
if (!fshared)
goto retry_private;
put_futex_key(fshared, &q.key);
goto retry;
}
ret = -EWOULDBLOCK;
if (unlikely(uval != val)) {
@@ -1248,16 +1206,13 @@ retry:
if (!abs_time)
schedule();
else {
unsigned long slack;
slack = current->timer_slack_ns;
if (rt_task(current))
slack = 0;
hrtimer_init_on_stack(&t.timer,
clockrt ? CLOCK_REALTIME :
CLOCK_MONOTONIC,
HRTIMER_MODE_ABS);
hrtimer_init_sleeper(&t, current);
hrtimer_set_expires_range_ns(&t.timer, *abs_time, slack);
hrtimer_set_expires_range_ns(&t.timer, *abs_time,
current->timer_slack_ns);
hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS);
if (!hrtimer_active(&t.timer))
@@ -1354,7 +1309,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
struct futex_hash_bucket *hb;
u32 uval, newval, curval;
struct futex_q q;
int ret, lock_taken, ownerdied = 0, attempt = 0;
int ret, lock_taken, ownerdied = 0;
if (refill_pi_state_cache())
return -ENOMEM;
@@ -1374,7 +1329,7 @@ retry:
if (unlikely(ret != 0))
goto out;
retry_unlocked:
retry_private:
hb = queue_lock(&q);
retry_locked:
@@ -1458,6 +1413,7 @@ retry_locked:
* exit to complete.
*/
queue_unlock(&q, hb);
put_futex_key(fshared, &q.key);
cond_resched();
goto retry;
@@ -1564,6 +1520,13 @@ retry_locked:
}
}
/*
* If fixup_pi_state_owner() faulted and was unable to handle the
* fault, unlock it and return the fault to userspace.
*/
if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current))
rt_mutex_unlock(&q.pi_state->pi_mutex);
/* Unqueue and drop the lock */
unqueue_me_pi(&q);
@@ -1591,22 +1554,18 @@ uaddr_faulted:
*/
queue_unlock(&q, hb);
if (attempt++) {
ret = futex_handle_fault((unsigned long)uaddr, attempt);
if (ret)
goto out_put_key;
goto retry_unlocked;
}
ret = get_user(uval, uaddr);
if (!ret)
goto retry;
if (ret)
goto out_put_key;
if (to)
destroy_hrtimer_on_stack(&to->timer);
return ret;
if (!fshared)
goto retry_private;
put_futex_key(fshared, &q.key);
goto retry;
}
/*
* Userspace attempted a TID -> 0 atomic transition, and failed.
* This is the in-kernel slowpath: we look up the PI state (if any),
@@ -1619,7 +1578,7 @@ static int futex_unlock_pi(u32 __user *uaddr, int fshared)
u32 uval;
struct plist_head *head;
union futex_key key = FUTEX_KEY_INIT;
int ret, attempt = 0;
int ret;
retry:
if (get_user(uval, uaddr))
@@ -1635,7 +1594,6 @@ retry:
goto out;
hb = hash_futex(&key);
retry_unlocked:
spin_lock(&hb->lock);
/*
@@ -1700,14 +1658,7 @@ pi_faulted:
* we have to drop the mmap_sem in order to call get_user().
*/
spin_unlock(&hb->lock);
if (attempt++) {
ret = futex_handle_fault((unsigned long)uaddr, attempt);
if (ret)
goto out;
uval = 0;
goto retry_unlocked;
}
put_futex_key(fshared, &key);
ret = get_user(uval, uaddr);
if (!ret)

View file

@@ -4,3 +4,4 @@ obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
obj-$(CONFIG_PROC_FS) += proc.o
obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
obj-$(CONFIG_NUMA_MIGRATE_IRQ_DESC) += numa_migrate.o
obj-$(CONFIG_PM_SLEEP) += pm.o

View file

@@ -17,6 +17,7 @@
#include <linux/kernel_stat.h>
#include <linux/rculist.h>
#include <linux/hash.h>
#include <trace/irq.h>
#include <linux/bootmem.h>
#include "internals.h"
@@ -347,6 +348,9 @@ static void warn_no_thread(unsigned int irq, struct irqaction *action)
"but no thread function available.", irq, action->name);
}
DEFINE_TRACE(irq_handler_entry);
DEFINE_TRACE(irq_handler_exit);
/**
* handle_IRQ_event - irq action chain handler
* @irq: the interrupt number
@@ -365,7 +369,9 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
local_irq_enable_in_hardirq();
do {
trace_irq_handler_entry(irq, action);
ret = action->handler(irq, action->dev_id);
trace_irq_handler_exit(irq, action, ret);
switch (ret) {
case IRQ_WAKE_THREAD:

View file

@@ -12,6 +12,8 @@ extern void compat_irq_chip_set_default_handler(struct irq_desc *desc);
extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
unsigned long flags);
extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp);
extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume);
extern struct lock_class_key irq_desc_lock_class;
extern void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr);

View file

@@ -185,6 +185,20 @@ static inline int setup_affinity(unsigned int irq, struct irq_desc *desc)
}
#endif
void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend)
{
if (suspend) {
if (!desc->action || (desc->action->flags & IRQF_TIMER))
return;
desc->status |= IRQ_SUSPENDED;
}
if (!desc->depth++) {
desc->status |= IRQ_DISABLED;
desc->chip->disable(irq);
}
}
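The depth counter makes disable/enable nest, so only the outermost pair touches the irq chip. An illustrative call sequence (not part of the patch):

	disable_irq_nosync(irq);	/* depth 0 -> 1: IRQ_DISABLED set, chip->disable() runs */
	disable_irq_nosync(irq);	/* depth 1 -> 2: no chip access */
	enable_irq(irq);		/* depth 2 -> 1: line stays masked */
	enable_irq(irq);		/* depth 1 -> 0: line unmasked again */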
/**
* disable_irq_nosync - disable an irq without waiting
* @irq: Interrupt to disable
@@ -205,10 +219,7 @@
return;
spin_lock_irqsave(&desc->lock, flags);
if (!desc->depth++) {
desc->status |= IRQ_DISABLED;
desc->chip->disable(irq);
}
__disable_irq(desc, irq, false);
spin_unlock_irqrestore(&desc->lock, flags);
}
EXPORT_SYMBOL(disable_irq_nosync);
@@ -238,15 +249,21 @@ void disable_irq(unsigned int irq)
}
EXPORT_SYMBOL(disable_irq);
static void __enable_irq(struct irq_desc *desc, unsigned int irq)
void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
{
if (resume)
desc->status &= ~IRQ_SUSPENDED;
switch (desc->depth) {
case 0:
err_out:
WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq);
break;
case 1: {
unsigned int status = desc->status & ~IRQ_DISABLED;
if (desc->status & IRQ_SUSPENDED)
goto err_out;
/* Prevent probing on this irq: */
desc->status = status | IRQ_NOPROBE;
check_irq_resend(desc, irq);
@@ -276,7 +293,7 @@ void enable_irq(unsigned int irq)
return;
spin_lock_irqsave(&desc->lock, flags);
__enable_irq(desc, irq);
__enable_irq(desc, irq, false);
spin_unlock_irqrestore(&desc->lock, flags);
}
EXPORT_SYMBOL(enable_irq);
@@ -638,7 +655,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
*/
if (shared && (desc->status & IRQ_SPURIOUS_DISABLED)) {
desc->status &= ~IRQ_SPURIOUS_DISABLED;
__enable_irq(desc, irq);
__enable_irq(desc, irq, false);
}
spin_unlock_irqrestore(&desc->lock, flags);

kernel/irq/pm.c Normal file
View file

@@ -0,0 +1,79 @@
/*
* linux/kernel/irq/pm.c
*
* Copyright (C) 2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
*
* This file contains power management functions related to interrupts.
*/
#include <linux/irq.h>
#include <linux/module.h>
#include <linux/interrupt.h>
#include "internals.h"
/**
* suspend_device_irqs - disable all currently enabled interrupt lines
*
* During system-wide suspend or hibernation, device interrupts need to be
* disabled at the chip level; this function is provided for that purpose.
* It disables all interrupt lines that are enabled at the moment and sets the
* IRQ_SUSPENDED flag for them.
*/
void suspend_device_irqs(void)
{
struct irq_desc *desc;
int irq;
for_each_irq_desc(irq, desc) {
unsigned long flags;
spin_lock_irqsave(&desc->lock, flags);
__disable_irq(desc, irq, true);
spin_unlock_irqrestore(&desc->lock, flags);
}
for_each_irq_desc(irq, desc)
if (desc->status & IRQ_SUSPENDED)
synchronize_irq(irq);
}
EXPORT_SYMBOL_GPL(suspend_device_irqs);
/**
* resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs()
*
* Enable all interrupt lines previously disabled by suspend_device_irqs() that
* have the IRQ_SUSPENDED flag set.
*/
void resume_device_irqs(void)
{
struct irq_desc *desc;
int irq;
for_each_irq_desc(irq, desc) {
unsigned long flags;
if (!(desc->status & IRQ_SUSPENDED))
continue;
spin_lock_irqsave(&desc->lock, flags);
__enable_irq(desc, irq, true);
spin_unlock_irqrestore(&desc->lock, flags);
}
}
EXPORT_SYMBOL_GPL(resume_device_irqs);
/**
* check_wakeup_irqs - check if any wake-up interrupts are pending
*/
int check_wakeup_irqs(void)
{
struct irq_desc *desc;
int irq;
for_each_irq_desc(irq, desc)
if ((desc->status & IRQ_WAKEUP) && (desc->status & IRQ_PENDING))
return -EBUSY;
return 0;
}
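A hedged sketch of how a system sleep path could string these together; example_enter_sleep() and the platform step are illustrative names, not part of this file:

static int example_enter_sleep(void)
{
	int error = 0;

	suspend_device_irqs();		/* mask lines + mark IRQ_SUSPENDED */
	if (check_wakeup_irqs())
		error = -EBUSY;		/* a wakeup irq is already pending, abort */
	/* else: platform low-power entry would happen here */
	resume_device_irqs();		/* unmask only the suspended lines */
	return error;
}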

View file

@@ -161,6 +161,25 @@ unsigned long kallsyms_lookup_name(const char *name)
return module_kallsyms_lookup_name(name);
}
int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
unsigned long),
void *data)
{
char namebuf[KSYM_NAME_LEN];
unsigned long i;
unsigned int off;
int ret;
for (i = 0, off = 0; i < kallsyms_num_syms; i++) {
off = kallsyms_expand_symbol(off, namebuf);
ret = fn(data, namebuf, NULL, kallsyms_addresses[i]);
if (ret != 0)
return ret;
}
return module_kallsyms_on_each_symbol(fn, data);
}
EXPORT_SYMBOL_GPL(kallsyms_on_each_symbol);
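A hypothetical caller, to show the callback contract (a non-zero return stops the walk and is passed back to the caller); count_irq_syms() is made up for illustration:

static int count_irq_syms(void *data, const char *name,
			  struct module *mod, unsigned long addr)
{
	if (strncmp(name, "irq_", 4) == 0)
		(*(unsigned long *)data)++;
	return 0;	/* keep walking */
}

	unsigned long n = 0;

	kallsyms_on_each_symbol(count_irq_syms, &n);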
static unsigned long get_symbol_pos(unsigned long addr,
unsigned long *symbolsize,
unsigned long *offset)

View file

@@ -42,7 +42,7 @@
note_buf_t* crash_notes;
/* vmcoreinfo stuff */
unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
size_t vmcoreinfo_size;
size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
@@ -1409,6 +1409,7 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_OFFSET(list_head, prev);
VMCOREINFO_OFFSET(vm_struct, addr);
VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
log_buf_kexec_setup();
VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
VMCOREINFO_NUMBER(NR_FREE_PAGES);
VMCOREINFO_NUMBER(PG_lru);
@@ -1450,11 +1451,7 @@ int kernel_kexec(void)
error = device_suspend(PMSG_FREEZE);
if (error)
goto Resume_console;
error = disable_nonboot_cpus();
if (error)
goto Resume_devices;
device_pm_lock();
local_irq_disable();
/* At this point, device_suspend() has been called,
* but *not* device_power_down(). We *must*
* device_power_down() now. Otherwise, drivers for
@@ -1464,12 +1461,15 @@
*/
error = device_power_down(PMSG_FREEZE);
if (error)
goto Enable_irqs;
goto Resume_devices;
error = disable_nonboot_cpus();
if (error)
goto Enable_cpus;
local_irq_disable();
/* Suspend system devices */
error = sysdev_suspend(PMSG_FREEZE);
if (error)
goto Power_up_devices;
goto Enable_irqs;
} else
#endif
{
@@ -1483,13 +1483,13 @@
#ifdef CONFIG_KEXEC_JUMP
if (kexec_image->preserve_context) {
sysdev_resume();
Power_up_devices:
device_power_up(PMSG_RESTORE);
Enable_irqs:
local_irq_enable();
device_pm_unlock();
Enable_cpus:
enable_nonboot_cpus();
device_power_up(PMSG_RESTORE);
Resume_devices:
device_pm_unlock();
device_resume(PMSG_RESTORE);
Resume_console:
resume_console();

View file

@@ -50,7 +50,8 @@ static struct workqueue_struct *khelper_wq;
char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe";
/**
* request_module - try to load a kernel module
* __request_module - try to load a kernel module
* @wait: wait (or not) for the operation to complete
* @fmt: printf style format string for the name of the module
* @...: arguments as specified in the format string
*
@@ -63,7 +64,7 @@ char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe";
* If module auto-loading support is disabled then this function
* becomes a no-operation.
*/
int request_module(const char *fmt, ...)
int __request_module(bool wait, const char *fmt, ...)
{
va_list args;
char module_name[MODULE_NAME_LEN];
@@ -108,11 +109,12 @@ int request_module(const char *fmt, ...)
return -ENOMEM;
}
ret = call_usermodehelper(modprobe_path, argv, envp, 1);
ret = call_usermodehelper(modprobe_path, argv, envp,
wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC);
atomic_dec(&kmod_concurrent);
return ret;
}
EXPORT_SYMBOL(request_module);
EXPORT_SYMBOL(__request_module);
#endif /* CONFIG_MODULES */
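The old request_module() name survives as a wrapper in the matching header change (include/linux/kmod.h, not shown in this hunk); roughly:

#define request_module(mod...) __request_module(true, mod)
#define request_module_nowait(mod...) __request_module(false, mod)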
struct subprocess_info {
@@ -167,7 +169,7 @@ static int ____call_usermodehelper(void *data)
}
/* We can run anywhere, unlike our parent keventd(). */
set_cpus_allowed_ptr(current, CPU_MASK_ALL_PTR);
set_cpus_allowed_ptr(current, cpu_all_mask);
/*
* Our parent is keventd, which runs with elevated scheduling priority.

View file

@@ -43,6 +43,7 @@
#include <linux/seq_file.h>
#include <linux/debugfs.h>
#include <linux/kdebug.h>
#include <linux/memory.h>
#include <asm-generic/sections.h>
#include <asm/cacheflush.h>
@@ -699,9 +700,10 @@ int __kprobes register_kprobe(struct kprobe *p)
goto out;
}
mutex_lock(&text_mutex);
ret = arch_prepare_kprobe(p);
if (ret)
goto out;
goto out_unlock_text;
INIT_HLIST_NODE(&p->hlist);
hlist_add_head_rcu(&p->hlist,
@@ -710,6 +712,8 @@ int __kprobes register_kprobe(struct kprobe *p)
if (kprobe_enabled)
arch_arm_kprobe(p);
out_unlock_text:
mutex_unlock(&text_mutex);
out:
mutex_unlock(&kprobe_mutex);
@@ -746,8 +750,11 @@ valid_p:
* enabled and not gone - otherwise, the breakpoint would
* already have been removed. We save on flushing icache.
*/
if (kprobe_enabled && !kprobe_gone(old_p))
if (kprobe_enabled && !kprobe_gone(old_p)) {
mutex_lock(&text_mutex);
arch_disarm_kprobe(p);
mutex_unlock(&text_mutex);
}
hlist_del_rcu(&old_p->hlist);
} else {
if (p->break_handler && !kprobe_gone(p))
@@ -912,10 +919,8 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
ri->rp = rp;
ri->task = current;
if (rp->entry_handler && rp->entry_handler(ri, regs)) {
spin_unlock_irqrestore(&rp->lock, flags);
if (rp->entry_handler && rp->entry_handler(ri, regs))
return 0;
}
arch_prepare_kretprobe(ri, regs);
@@ -1280,12 +1285,14 @@ static void __kprobes enable_all_kprobes(void)
if (kprobe_enabled)
goto already_enabled;
mutex_lock(&text_mutex);
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
head = &kprobe_table[i];
hlist_for_each_entry_rcu(p, node, head, hlist)
if (!kprobe_gone(p))
arch_arm_kprobe(p);
}
mutex_unlock(&text_mutex);
kprobe_enabled = true;
printk(KERN_INFO "Kprobes globally enabled\n");
@@ -1310,6 +1317,7 @@ static void __kprobes disable_all_kprobes(void)
kprobe_enabled = false;
printk(KERN_INFO "Kprobes globally disabled\n");
mutex_lock(&text_mutex);
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
head = &kprobe_table[i];
hlist_for_each_entry_rcu(p, node, head, hlist) {
@@ -1318,6 +1326,7 @@ static void __kprobes disable_all_kprobes(void)
}
}
mutex_unlock(&text_mutex);
mutex_unlock(&kprobe_mutex);
/* Allow all currently running kprobes to complete */
synchronize_sched();

View file

@@ -110,7 +110,7 @@ static void create_kthread(struct kthread_create_info *create)
*/
sched_setscheduler(create->result, SCHED_NORMAL, &param);
set_user_nice(create->result, KTHREAD_NICE_LEVEL);
set_cpus_allowed_ptr(create->result, CPU_MASK_ALL_PTR);
set_cpus_allowed_ptr(create->result, cpu_all_mask);
}
complete(&create->done);
}
@@ -240,7 +240,7 @@ int kthreadd(void *unused)
set_task_comm(tsk, "kthreadd");
ignore_signals(tsk);
set_user_nice(tsk, KTHREAD_NICE_LEVEL);
set_cpus_allowed_ptr(tsk, CPU_MASK_ALL_PTR);
set_cpus_allowed_ptr(tsk, cpu_all_mask);
current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG;

View file

@@ -9,6 +9,44 @@
* as published by the Free Software Foundation; version 2
* of the License.
*/
/*
* CONFIG_LATENCYTOP enables a kernel latency tracking infrastructure that is
* used by the "latencytop" userspace tool. The latency that is tracked is not
* the 'traditional' interrupt latency (which is primarily caused by something
* else consuming CPU), but instead, it is the latency an application encounters
* because the kernel sleeps on its behalf for various reasons.
*
* This code tracks 2 levels of statistics:
* 1) System level latency
* 2) Per process latency
*
* The latency is stored in fixed sized data structures in an accumulated form;
* if the "same" latency cause is hit twice, this will be tracked as one entry
* in the data structure. The count, the total accumulated latency and the
* maximum latency are all tracked in this data structure. When the fixed size structure is
* full, no new causes are tracked until the buffer is flushed by writing to
* the /proc file; the userspace tool does this on a regular basis.
*
* A latency cause is identified by a stringified backtrace at the point that
* the scheduler gets invoked. The userland tool will use this string to
* identify the cause of the latency in human readable form.
*
* The information is exported via /proc/latency_stats and /proc/<pid>/latency.
* These files look like this:
*
* Latency Top version : v0.1
* 70 59433 4897 i915_irq_wait drm_ioctl vfs_ioctl do_vfs_ioctl sys_ioctl
* | | | |
* | | | +----> the stringified backtrace
* | | +---------> The maximum latency for this entry in microseconds
* | +--------------> The accumulated latency for this entry (microseconds)
* +-------------------> The number of times this entry is hit
*
* (note: the average latency is the accumulated latency divided by the number
* of times)
*/
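The accumulation described above boils down to three updates per hit; a minimal sketch, with account_hit() as a hypothetical name (the real record also carries the backtrace used for matching entries):

static void account_hit(struct latency_record *rec, unsigned long usecs)
{
	rec->count++;			/* number of times this entry is hit */
	rec->time += usecs;		/* accumulated latency */
	if (usecs > rec->max)		/* worst case seen so far */
		rec->max = usecs;
}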
#include <linux/latencytop.h>
#include <linux/kallsyms.h>
#include <linux/seq_file.h>
@@ -72,7 +110,7 @@ account_global_scheduler_latency(struct task_struct *tsk, struct latency_record
firstnonnull = i;
continue;
}
for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) {
for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
unsigned long record = lat->backtrace[q];
if (latency_record[i].backtrace[q] != record) {
@@ -101,31 +139,52 @@ account_global_scheduler_latency(struct task_struct *tsk, struct latency_record
memcpy(&latency_record[i], lat, sizeof(struct latency_record));
}
static inline void store_stacktrace(struct task_struct *tsk, struct latency_record *lat)
/*
* Iterator to store a backtrace into a latency record entry
*/
static inline void store_stacktrace(struct task_struct *tsk,
struct latency_record *lat)
{
struct stack_trace trace;
memset(&trace, 0, sizeof(trace));
trace.max_entries = LT_BACKTRACEDEPTH;
trace.entries = &lat->backtrace[0];
trace.skip = 0;
save_stack_trace_tsk(tsk, &trace);
}
/**
* __account_scheduler_latency - record a latency that has occurred
* @tsk: the task struct of the task hitting the latency
* @usecs: the duration of the latency in microseconds
* @inter: 1 if the sleep was interruptible, 0 if uninterruptible
*
* This function is the main entry point for recording latency entries
* as called by the scheduler.
*
* This function has a few special cases to deal with normal 'non-latency'
* sleeps: specifically, interruptible sleep longer than 5 msec is skipped
* since this usually is caused by waiting for events via select() and co.
*
* Negative latencies (caused by time going backwards) are also explicitly
* skipped.
*/
void __sched
account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
__account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
{
unsigned long flags;
int i, q;
struct latency_record lat;
if (!latencytop_enabled)
return;
/* Long interruptible waits are generally user requested... */
if (inter && usecs > 5000)
return;
/* Negative sleeps are time going backwards */
/* Zero-time sleeps are non-interesting */
if (usecs <= 0)
return;
memset(&lat, 0, sizeof(lat));
lat.count = 1;
lat.time = usecs;
@@ -143,12 +202,12 @@ account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
if (tsk->latency_record_count >= LT_SAVECOUNT)
goto out_unlock;
for (i = 0; i < LT_SAVECOUNT ; i++) {
for (i = 0; i < LT_SAVECOUNT; i++) {
struct latency_record *mylat;
int same = 1;
mylat = &tsk->latency_record[i];
for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) {
for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
unsigned long record = lat.backtrace[q];
if (mylat->backtrace[q] != record) {
@@ -186,7 +245,7 @@ static int lstats_show(struct seq_file *m, void *v)
for (i = 0; i < MAXLR; i++) {
if (latency_record[i].backtrace[0]) {
int q;
seq_printf(m, "%i %li %li ",
seq_printf(m, "%i %lu %lu ",
latency_record[i].count,
latency_record[i].time,
latency_record[i].max);
@@ -223,7 +282,7 @@ static int lstats_open(struct inode *inode, struct file *filp)
return single_open(filp, lstats_show, NULL);
}
static struct file_operations lstats_fops = {
static const struct file_operations lstats_fops = {
.open = lstats_open,
.read = seq_read,
.write = lstats_write,
@@ -236,4 +295,4 @@ static int __init init_lstats_procfs(void)
proc_create("latency_stats", 0644, NULL, &lstats_fops);
return 0;
}
__initcall(init_lstats_procfs);
device_initcall(init_lstats_procfs);

View file

@@ -41,6 +41,8 @@
#include <linux/utsname.h>
#include <linux/hash.h>
#include <linux/ftrace.h>
#include <linux/stringify.h>
#include <trace/lockdep.h>
#include <asm/sections.h>
@@ -310,12 +312,14 @@ EXPORT_SYMBOL(lockdep_on);
#if VERBOSE
# define HARDIRQ_VERBOSE 1
# define SOFTIRQ_VERBOSE 1
# define RECLAIM_VERBOSE 1
#else
# define HARDIRQ_VERBOSE 0
# define SOFTIRQ_VERBOSE 0
# define RECLAIM_VERBOSE 0
#endif
#if VERBOSE || HARDIRQ_VERBOSE || SOFTIRQ_VERBOSE
#if VERBOSE || HARDIRQ_VERBOSE || SOFTIRQ_VERBOSE || RECLAIM_VERBOSE
/*
* Quick filtering for interesting events:
*/
@@ -430,30 +434,24 @@ atomic_t nr_find_usage_forwards_checks;
atomic_t nr_find_usage_forwards_recursions;
atomic_t nr_find_usage_backwards_checks;
atomic_t nr_find_usage_backwards_recursions;
# define debug_atomic_inc(ptr) atomic_inc(ptr)
# define debug_atomic_dec(ptr) atomic_dec(ptr)
# define debug_atomic_read(ptr) atomic_read(ptr)
#else
# define debug_atomic_inc(ptr) do { } while (0)
# define debug_atomic_dec(ptr) do { } while (0)
# define debug_atomic_read(ptr) 0
#endif
/*
* Locking printouts:
*/
#define __USAGE(__STATE) \
[LOCK_USED_IN_##__STATE] = "IN-"__stringify(__STATE)"-W", \
[LOCK_ENABLED_##__STATE] = __stringify(__STATE)"-ON-W", \
[LOCK_USED_IN_##__STATE##_READ] = "IN-"__stringify(__STATE)"-R",\
[LOCK_ENABLED_##__STATE##_READ] = __stringify(__STATE)"-ON-R",
static const char *usage_str[] =
{
[LOCK_USED] = "initial-use ",
[LOCK_USED_IN_HARDIRQ] = "in-hardirq-W",
[LOCK_USED_IN_SOFTIRQ] = "in-softirq-W",
[LOCK_ENABLED_SOFTIRQS] = "softirq-on-W",
[LOCK_ENABLED_HARDIRQS] = "hardirq-on-W",
[LOCK_USED_IN_HARDIRQ_READ] = "in-hardirq-R",
[LOCK_USED_IN_SOFTIRQ_READ] = "in-softirq-R",
[LOCK_ENABLED_SOFTIRQS_READ] = "softirq-on-R",
[LOCK_ENABLED_HARDIRQS_READ] = "hardirq-on-R",
#define LOCKDEP_STATE(__STATE) __USAGE(__STATE)
#include "lockdep_states.h"
#undef LOCKDEP_STATE
[LOCK_USED] = "INITIAL USE",
};
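Worked expansion of the macro for one state, derived mechanically from the __USAGE() definition above:

/*
 * __USAGE(HARDIRQ) expands (via __stringify) to:
 *
 *	[LOCK_USED_IN_HARDIRQ]		= "IN-HARDIRQ-W",
 *	[LOCK_ENABLED_HARDIRQ]		= "HARDIRQ-ON-W",
 *	[LOCK_USED_IN_HARDIRQ_READ]	= "IN-HARDIRQ-R",
 *	[LOCK_ENABLED_HARDIRQ_READ]	= "HARDIRQ-ON-R",
 *
 * and lockdep_states.h repeats the pattern for SOFTIRQ and RECLAIM_FS.
 */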
const char * __get_key_name(struct lockdep_subclass_key *key, char *str)
@@ -461,46 +459,45 @@ const char * __get_key_name(struct lockdep_subclass_key *key, char *str)
return kallsyms_lookup((unsigned long)key, NULL, NULL, NULL, str);
}
void
get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4)
static inline unsigned long lock_flag(enum lock_usage_bit bit)
{
*c1 = '.', *c2 = '.', *c3 = '.', *c4 = '.';
return 1UL << bit;
}
if (class->usage_mask & LOCKF_USED_IN_HARDIRQ)
*c1 = '+';
else
if (class->usage_mask & LOCKF_ENABLED_HARDIRQS)
*c1 = '-';
static char get_usage_char(struct lock_class *class, enum lock_usage_bit bit)
{
char c = '.';
if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ)
*c2 = '+';
else
if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS)
*c2 = '-';
if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ)
*c3 = '-';
if (class->usage_mask & LOCKF_USED_IN_HARDIRQ_READ) {
*c3 = '+';
if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ)
*c3 = '?';
if (class->usage_mask & lock_flag(bit + 2))
c = '+';
if (class->usage_mask & lock_flag(bit)) {
c = '-';
if (class->usage_mask & lock_flag(bit + 2))
c = '?';
}
if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS_READ)
*c4 = '-';
if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ_READ) {
*c4 = '+';
if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS_READ)
*c4 = '?';
}
return c;
}
void get_usage_chars(struct lock_class *class, char usage[LOCK_USAGE_CHARS])
{
int i = 0;
#define LOCKDEP_STATE(__STATE) \
usage[i++] = get_usage_char(class, LOCK_USED_IN_##__STATE); \
usage[i++] = get_usage_char(class, LOCK_USED_IN_##__STATE##_READ);
#include "lockdep_states.h"
#undef LOCKDEP_STATE
usage[i] = '\0';
}
static void print_lock_name(struct lock_class *class)
{
char str[KSYM_NAME_LEN], c1, c2, c3, c4;
char str[KSYM_NAME_LEN], usage[LOCK_USAGE_CHARS];
const char *name;
get_usage_chars(class, &c1, &c2, &c3, &c4);
get_usage_chars(class, usage);
name = class->name;
if (!name) {
@@ -513,7 +510,7 @@ static void print_lock_name(struct lock_class *class)
if (class->subclass)
printk("/%d", class->subclass);
}
printk("){%c%c%c%c}", c1, c2, c3, c4);
printk("){%s}", usage);
}
static void print_lockdep_cache(struct lockdep_map *lock)
@@ -1263,9 +1260,49 @@ check_usage(struct task_struct *curr, struct held_lock *prev,
bit_backwards, bit_forwards, irqclass);
}
static int
check_prev_add_irq(struct task_struct *curr, struct held_lock *prev,
struct held_lock *next)
static const char *state_names[] = {
#define LOCKDEP_STATE(__STATE) \
__stringify(__STATE),
#include "lockdep_states.h"
#undef LOCKDEP_STATE
};
static const char *state_rnames[] = {
#define LOCKDEP_STATE(__STATE) \
__stringify(__STATE)"-READ",
#include "lockdep_states.h"
#undef LOCKDEP_STATE
};
static inline const char *state_name(enum lock_usage_bit bit)
{
return (bit & 1) ? state_rnames[bit >> 2] : state_names[bit >> 2];
}
static int exclusive_bit(int new_bit)
{
/*
* USED_IN
* USED_IN_READ
* ENABLED
* ENABLED_READ
*
* bit 0 - write/read
* bit 1 - used_in/enabled
* bit 2+ state
*/
int state = new_bit & ~3;
int dir = new_bit & 2;
/*
* keep state, bit flip the direction and strip read.
*/
return state | (dir ^ 2);
}
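Worked through with the enum layout from lockdep_internals.h (within each state's block of four bits: USED_IN = +0, USED_IN_READ = +1, ENABLED = +2, ENABLED_READ = +3):

/*
 * exclusive_bit(LOCK_USED_IN_HARDIRQ = 0) -> 0 | (0 ^ 2) = 2 = LOCK_ENABLED_HARDIRQ
 * exclusive_bit(LOCK_ENABLED_SOFTIRQ = 6) -> 4 | (2 ^ 2) = 4 = LOCK_USED_IN_SOFTIRQ
 *
 * The state block is preserved, USED_IN/ENABLED is flipped, and the
 * READ bit is always cleared.
 */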
static int check_irq_usage(struct task_struct *curr, struct held_lock *prev,
struct held_lock *next, enum lock_usage_bit bit)
{
/*
* Prove that the new dependency does not connect a hardirq-safe
@@ -1273,38 +1310,34 @@ check_prev_add_irq(struct task_struct *curr, struct held_lock *prev,
* the backwards-subgraph starting at <prev>, and the
* forwards-subgraph starting at <next>:
*/
if (!check_usage(curr, prev, next, LOCK_USED_IN_HARDIRQ,
LOCK_ENABLED_HARDIRQS, "hard"))
if (!check_usage(curr, prev, next, bit,
exclusive_bit(bit), state_name(bit)))
return 0;
bit++; /* _READ */
/*
* Prove that the new dependency does not connect a hardirq-safe-read
* lock with a hardirq-unsafe lock - to achieve this we search
* the backwards-subgraph starting at <prev>, and the
* forwards-subgraph starting at <next>:
*/
if (!check_usage(curr, prev, next, LOCK_USED_IN_HARDIRQ_READ,
LOCK_ENABLED_HARDIRQS, "hard-read"))
if (!check_usage(curr, prev, next, bit,
exclusive_bit(bit), state_name(bit)))
return 0;
/*
* Prove that the new dependency does not connect a softirq-safe
* lock with a softirq-unsafe lock - to achieve this we search
* the backwards-subgraph starting at <prev>, and the
* forwards-subgraph starting at <next>:
*/
if (!check_usage(curr, prev, next, LOCK_USED_IN_SOFTIRQ,
LOCK_ENABLED_SOFTIRQS, "soft"))
return 0;
/*
* Prove that the new dependency does not connect a softirq-safe-read
* lock with a softirq-unsafe lock - to achieve this we search
* the backwards-subgraph starting at <prev>, and the
* forwards-subgraph starting at <next>:
*/
if (!check_usage(curr, prev, next, LOCK_USED_IN_SOFTIRQ_READ,
LOCK_ENABLED_SOFTIRQS, "soft"))
return 1;
}
static int
check_prev_add_irq(struct task_struct *curr, struct held_lock *prev,
struct held_lock *next)
{
#define LOCKDEP_STATE(__STATE) \
if (!check_irq_usage(curr, prev, next, LOCK_USED_IN_##__STATE)) \
return 0;
#include "lockdep_states.h"
#undef LOCKDEP_STATE
return 1;
}
@@ -1861,9 +1894,9 @@ print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other,
curr->comm, task_pid_nr(curr));
print_lock(this);
if (forwards)
printk("but this lock took another, %s-irq-unsafe lock in the past:\n", irqclass);
printk("but this lock took another, %s-unsafe lock in the past:\n", irqclass);
else
printk("but this lock was taken by another, %s-irq-safe lock in the past:\n", irqclass);
printk("but this lock was taken by another, %s-safe lock in the past:\n", irqclass);
print_lock_name(other);
printk("\n\nand interrupts could create inverse lock ordering between them.\n\n");
@@ -1933,7 +1966,7 @@ void print_irqtrace_events(struct task_struct *curr)
print_ip_sym(curr->softirq_disable_ip);
}
static int hardirq_verbose(struct lock_class *class)
static int HARDIRQ_verbose(struct lock_class *class)
{
#if HARDIRQ_VERBOSE
return class_filter(class);
@@ -1941,7 +1974,7 @@ static int hardirq_verbose(struct lock_class *class)
return 0;
}
static int softirq_verbose(struct lock_class *class)
static int SOFTIRQ_verbose(struct lock_class *class)
{
#if SOFTIRQ_VERBOSE
return class_filter(class);
@@ -1949,185 +1982,95 @@ static int softirq_verbose(struct lock_class *class)
return 0;
}
static int RECLAIM_FS_verbose(struct lock_class *class)
{
#if RECLAIM_VERBOSE
return class_filter(class);
#endif
return 0;
}
#define STRICT_READ_CHECKS 1
static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
static int (*state_verbose_f[])(struct lock_class *class) = {
#define LOCKDEP_STATE(__STATE) \
__STATE##_verbose,
#include "lockdep_states.h"
#undef LOCKDEP_STATE
};
static inline int state_verbose(enum lock_usage_bit bit,
struct lock_class *class)
{
return state_verbose_f[bit >> 2](class);
}
typedef int (*check_usage_f)(struct task_struct *, struct held_lock *,
enum lock_usage_bit bit, const char *name);
static int
mark_lock_irq(struct task_struct *curr, struct held_lock *this,
enum lock_usage_bit new_bit)
{
int ret = 1;
int excl_bit = exclusive_bit(new_bit);
int read = new_bit & 1;
int dir = new_bit & 2;
switch(new_bit) {
case LOCK_USED_IN_HARDIRQ:
if (!valid_state(curr, this, new_bit, LOCK_ENABLED_HARDIRQS))
/*
* mark USED_IN has to look forwards -- to ensure no dependency
* has ENABLED state, which would allow recursion deadlocks.
*
* mark ENABLED has to look backwards -- to ensure no dependee
* has USED_IN state, which, again, would allow recursion deadlocks.
*/
check_usage_f usage = dir ?
check_usage_backwards : check_usage_forwards;
/*
* Validate that this particular lock does not have conflicting
* usage states.
*/
if (!valid_state(curr, this, new_bit, excl_bit))
return 0;
/*
* Validate that the lock dependencies don't have conflicting usage
* states.
*/
if ((!read || !dir || STRICT_READ_CHECKS) &&
!usage(curr, this, excl_bit, state_name(new_bit & ~1)))
return 0;
/*
* Check for read in write conflicts
*/
if (!read) {
if (!valid_state(curr, this, new_bit, excl_bit + 1))
return 0;
if (!valid_state(curr, this, new_bit,
LOCK_ENABLED_HARDIRQS_READ))
if (STRICT_READ_CHECKS &&
!usage(curr, this, excl_bit + 1,
state_name(new_bit + 1)))
return 0;
/*
* just marked it hardirq-safe, check that this lock
* took no hardirq-unsafe lock in the past:
*/
if (!check_usage_forwards(curr, this,
LOCK_ENABLED_HARDIRQS, "hard"))
return 0;
#if STRICT_READ_CHECKS
/*
* just marked it hardirq-safe, check that this lock
* took no hardirq-unsafe-read lock in the past:
*/
if (!check_usage_forwards(curr, this,
LOCK_ENABLED_HARDIRQS_READ, "hard-read"))
return 0;
#endif
if (hardirq_verbose(hlock_class(this)))
ret = 2;
break;
case LOCK_USED_IN_SOFTIRQ:
if (!valid_state(curr, this, new_bit, LOCK_ENABLED_SOFTIRQS))
return 0;
if (!valid_state(curr, this, new_bit,
LOCK_ENABLED_SOFTIRQS_READ))
return 0;
/*
* just marked it softirq-safe, check that this lock
* took no softirq-unsafe lock in the past:
*/
if (!check_usage_forwards(curr, this,
LOCK_ENABLED_SOFTIRQS, "soft"))
return 0;
#if STRICT_READ_CHECKS
/*
* just marked it softirq-safe, check that this lock
* took no softirq-unsafe-read lock in the past:
*/
if (!check_usage_forwards(curr, this,
LOCK_ENABLED_SOFTIRQS_READ, "soft-read"))
return 0;
#endif
if (softirq_verbose(hlock_class(this)))
ret = 2;
break;
case LOCK_USED_IN_HARDIRQ_READ:
if (!valid_state(curr, this, new_bit, LOCK_ENABLED_HARDIRQS))
return 0;
/*
* just marked it hardirq-read-safe, check that this lock
* took no hardirq-unsafe lock in the past:
*/
if (!check_usage_forwards(curr, this,
LOCK_ENABLED_HARDIRQS, "hard"))
return 0;
if (hardirq_verbose(hlock_class(this)))
ret = 2;
break;
case LOCK_USED_IN_SOFTIRQ_READ:
if (!valid_state(curr, this, new_bit, LOCK_ENABLED_SOFTIRQS))
return 0;
/*
* just marked it softirq-read-safe, check that this lock
* took no softirq-unsafe lock in the past:
*/
if (!check_usage_forwards(curr, this,
LOCK_ENABLED_SOFTIRQS, "soft"))
return 0;
if (softirq_verbose(hlock_class(this)))
ret = 2;
break;
case LOCK_ENABLED_HARDIRQS:
if (!valid_state(curr, this, new_bit, LOCK_USED_IN_HARDIRQ))
return 0;
if (!valid_state(curr, this, new_bit,
LOCK_USED_IN_HARDIRQ_READ))
return 0;
/*
* just marked it hardirq-unsafe, check that no hardirq-safe
* lock in the system ever took it in the past:
*/
if (!check_usage_backwards(curr, this,
LOCK_USED_IN_HARDIRQ, "hard"))
return 0;
#if STRICT_READ_CHECKS
/*
* just marked it hardirq-unsafe, check that no
* hardirq-safe-read lock in the system ever took
* it in the past:
*/
if (!check_usage_backwards(curr, this,
LOCK_USED_IN_HARDIRQ_READ, "hard-read"))
return 0;
#endif
if (hardirq_verbose(hlock_class(this)))
ret = 2;
break;
case LOCK_ENABLED_SOFTIRQS:
if (!valid_state(curr, this, new_bit, LOCK_USED_IN_SOFTIRQ))
return 0;
if (!valid_state(curr, this, new_bit,
LOCK_USED_IN_SOFTIRQ_READ))
return 0;
/*
* just marked it softirq-unsafe, check that no softirq-safe
* lock in the system ever took it in the past:
*/
if (!check_usage_backwards(curr, this,
LOCK_USED_IN_SOFTIRQ, "soft"))
return 0;
#if STRICT_READ_CHECKS
/*
* just marked it softirq-unsafe, check that no
* softirq-safe-read lock in the system ever took
* it in the past:
*/
if (!check_usage_backwards(curr, this,
LOCK_USED_IN_SOFTIRQ_READ, "soft-read"))
return 0;
#endif
if (softirq_verbose(hlock_class(this)))
ret = 2;
break;
case LOCK_ENABLED_HARDIRQS_READ:
if (!valid_state(curr, this, new_bit, LOCK_USED_IN_HARDIRQ))
return 0;
#if STRICT_READ_CHECKS
/*
* just marked it hardirq-read-unsafe, check that no
* hardirq-safe lock in the system ever took it in the past:
*/
if (!check_usage_backwards(curr, this,
LOCK_USED_IN_HARDIRQ, "hard"))
return 0;
#endif
if (hardirq_verbose(hlock_class(this)))
ret = 2;
break;
case LOCK_ENABLED_SOFTIRQS_READ:
if (!valid_state(curr, this, new_bit, LOCK_USED_IN_SOFTIRQ))
return 0;
#if STRICT_READ_CHECKS
/*
* just marked it softirq-read-unsafe, check that no
* softirq-safe lock in the system ever took it in the past:
*/
if (!check_usage_backwards(curr, this,
LOCK_USED_IN_SOFTIRQ, "soft"))
return 0;
#endif
if (softirq_verbose(hlock_class(this)))
ret = 2;
break;
default:
WARN_ON(1);
break;
}
return ret;
if (state_verbose(new_bit, hlock_class(this)))
return 2;
return 1;
}
enum mark_type {
#define LOCKDEP_STATE(__STATE) __STATE,
#include "lockdep_states.h"
#undef LOCKDEP_STATE
};
/*
* Mark all held locks with a usage bit:
*/
static int
mark_held_locks(struct task_struct *curr, int hardirq)
mark_held_locks(struct task_struct *curr, enum mark_type mark)
{
enum lock_usage_bit usage_bit;
struct held_lock *hlock;
@@ -2136,17 +2079,12 @@ mark_held_locks(struct task_struct *curr, int hardirq)
for (i = 0; i < curr->lockdep_depth; i++) {
hlock = curr->held_locks + i;
if (hardirq) {
if (hlock->read)
usage_bit = LOCK_ENABLED_HARDIRQS_READ;
else
usage_bit = LOCK_ENABLED_HARDIRQS;
} else {
if (hlock->read)
usage_bit = LOCK_ENABLED_SOFTIRQS_READ;
else
usage_bit = LOCK_ENABLED_SOFTIRQS;
}
usage_bit = 2 + (mark << 2); /* ENABLED */
if (hlock->read)
usage_bit += 1; /* READ */
BUG_ON(usage_bit >= LOCK_USAGE_STATES);
if (!mark_lock(curr, hlock, usage_bit))
return 0;
}
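The shift above just re-derives the enum layout; a quick check of the arithmetic:

/*
 * usage_bit = 2 + (mark << 2), +1 when the lock is held for read:
 *   mark = HARDIRQ (0):    2  = LOCK_ENABLED_HARDIRQ
 *   mark = SOFTIRQ (1):    6  = LOCK_ENABLED_SOFTIRQ, 7 for _READ
 *   mark = RECLAIM_FS (2): 10 = LOCK_ENABLED_RECLAIM_FS
 */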
@@ -2200,7 +2138,7 @@ void trace_hardirqs_on_caller(unsigned long ip)
* We are going to turn hardirqs on, so set the
* usage bit for all held locks:
*/
if (!mark_held_locks(curr, 1))
if (!mark_held_locks(curr, HARDIRQ))
return;
/*
* If we have softirqs enabled, then set the usage
@@ -2208,7 +2146,7 @@ void trace_hardirqs_on_caller(unsigned long ip)
* this bit from being set before)
*/
if (curr->softirqs_enabled)
if (!mark_held_locks(curr, 0))
if (!mark_held_locks(curr, SOFTIRQ))
return;
curr->hardirq_enable_ip = ip;
@@ -2288,7 +2226,7 @@ void trace_softirqs_on(unsigned long ip)
* enabled too:
*/
if (curr->hardirqs_enabled)
mark_held_locks(curr, 0);
mark_held_locks(curr, SOFTIRQ);
}
/*
@@ -2317,6 +2255,48 @@ void trace_softirqs_off(unsigned long ip)
debug_atomic_inc(&redundant_softirqs_off);
}
static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags)
{
struct task_struct *curr = current;
if (unlikely(!debug_locks))
return;
/* no reclaim without waiting on it */
if (!(gfp_mask & __GFP_WAIT))
return;
/* this guy won't enter reclaim */
if ((curr->flags & PF_MEMALLOC) && !(gfp_mask & __GFP_NOMEMALLOC))
return;
/* We're only interested __GFP_FS allocations for now */
if (!(gfp_mask & __GFP_FS))
return;
if (DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags)))
return;
mark_held_locks(curr, RECLAIM_FS);
}
static void check_flags(unsigned long flags);
void lockdep_trace_alloc(gfp_t gfp_mask)
{
unsigned long flags;
if (unlikely(current->lockdep_recursion))
return;
raw_local_irq_save(flags);
check_flags(flags);
current->lockdep_recursion = 1;
__lockdep_trace_alloc(gfp_mask, flags);
current->lockdep_recursion = 0;
raw_local_irq_restore(flags);
}
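A hedged usage sketch: a reclaim context brackets its work with the helpers added further down in this file, so every lock acquired inside is checked against __GFP_FS recursion (kswapd is the intended kind of caller; the middle line is illustrative):

	lockdep_set_current_reclaim_state(GFP_KERNEL);
	/* ... reclaim pages, possibly taking filesystem locks ... */
	lockdep_clear_current_reclaim_state();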
static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock)
{
/*
@@ -2345,19 +2325,35 @@ static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock)
if (!hlock->hardirqs_off) {
if (hlock->read) {
if (!mark_lock(curr, hlock,
LOCK_ENABLED_HARDIRQS_READ))
LOCK_ENABLED_HARDIRQ_READ))
return 0;
if (curr->softirqs_enabled)
if (!mark_lock(curr, hlock,
LOCK_ENABLED_SOFTIRQS_READ))
LOCK_ENABLED_SOFTIRQ_READ))
return 0;
} else {
if (!mark_lock(curr, hlock,
LOCK_ENABLED_HARDIRQS))
LOCK_ENABLED_HARDIRQ))
return 0;
if (curr->softirqs_enabled)
if (!mark_lock(curr, hlock,
LOCK_ENABLED_SOFTIRQS))
LOCK_ENABLED_SOFTIRQ))
return 0;
}
}
/*
* We reuse the irq context infrastructure more broadly as a general
* context checking code. This tests GFP_FS recursion (a lock taken
* during reclaim for a GFP_FS allocation is held over a GFP_FS
* allocation).
*/
if (!hlock->trylock && (curr->lockdep_reclaim_gfp & __GFP_FS)) {
if (hlock->read) {
if (!mark_lock(curr, hlock, LOCK_USED_IN_RECLAIM_FS_READ))
return 0;
} else {
if (!mark_lock(curr, hlock, LOCK_USED_IN_RECLAIM_FS))
return 0;
}
}
@@ -2412,6 +2408,10 @@ static inline int separate_irq_context(struct task_struct *curr,
return 0;
}
void lockdep_trace_alloc(gfp_t gfp_mask)
{
}
#endif
/*
@@ -2445,14 +2445,13 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
return 0;
switch (new_bit) {
case LOCK_USED_IN_HARDIRQ:
case LOCK_USED_IN_SOFTIRQ:
case LOCK_USED_IN_HARDIRQ_READ:
case LOCK_USED_IN_SOFTIRQ_READ:
case LOCK_ENABLED_HARDIRQS:
case LOCK_ENABLED_SOFTIRQS:
case LOCK_ENABLED_HARDIRQS_READ:
case LOCK_ENABLED_SOFTIRQS_READ:
#define LOCKDEP_STATE(__STATE) \
case LOCK_USED_IN_##__STATE: \
case LOCK_USED_IN_##__STATE##_READ: \
case LOCK_ENABLED_##__STATE: \
case LOCK_ENABLED_##__STATE##_READ:
#include "lockdep_states.h"
#undef LOCKDEP_STATE
ret = mark_lock_irq(curr, this, new_bit);
if (!ret)
return 0;
@@ -2925,6 +2924,8 @@ void lock_set_class(struct lockdep_map *lock, const char *name,
}
EXPORT_SYMBOL_GPL(lock_set_class);
DEFINE_TRACE(lock_acquire);
/*
* We are not always called with irqs disabled - do that here,
* and also avoid lockdep recursion:
@@ -2935,6 +2936,8 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
{
unsigned long flags;
trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip);
if (unlikely(current->lockdep_recursion))
return;
@@ -2949,11 +2952,15 @@ }
}
EXPORT_SYMBOL_GPL(lock_acquire);
DEFINE_TRACE(lock_release);
void lock_release(struct lockdep_map *lock, int nested,
unsigned long ip)
{
unsigned long flags;
trace_lock_release(lock, nested, ip);
if (unlikely(current->lockdep_recursion))
return;
@@ -2966,6 +2973,16 @@ void lock_release(struct lockdep_map *lock, int nested,
}
EXPORT_SYMBOL_GPL(lock_release);
void lockdep_set_current_reclaim_state(gfp_t gfp_mask)
{
current->lockdep_reclaim_gfp = gfp_mask;
}
void lockdep_clear_current_reclaim_state(void)
{
current->lockdep_reclaim_gfp = 0;
}
#ifdef CONFIG_LOCK_STAT
static int
print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock,
@@ -3092,10 +3109,14 @@ found_it:
lock->ip = ip;
}
DEFINE_TRACE(lock_contended);
void lock_contended(struct lockdep_map *lock, unsigned long ip)
{
unsigned long flags;
trace_lock_contended(lock, ip);
if (unlikely(!lock_stat))
return;
@@ -3111,10 +3132,14 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip)
}
EXPORT_SYMBOL_GPL(lock_contended);
DEFINE_TRACE(lock_acquired);
void lock_acquired(struct lockdep_map *lock, unsigned long ip)
{
unsigned long flags;
trace_lock_acquired(lock, ip);
if (unlikely(!lock_stat))
return;

View file

@@ -6,6 +6,45 @@
* lockdep subsystem internal functions and variables.
*/
/*
* Lock-class usage-state bits:
*/
enum lock_usage_bit {
#define LOCKDEP_STATE(__STATE) \
LOCK_USED_IN_##__STATE, \
LOCK_USED_IN_##__STATE##_READ, \
LOCK_ENABLED_##__STATE, \
LOCK_ENABLED_##__STATE##_READ,
#include "lockdep_states.h"
#undef LOCKDEP_STATE
LOCK_USED,
LOCK_USAGE_STATES
};
/*
* Usage-state bitmasks:
*/
#define __LOCKF(__STATE) LOCKF_##__STATE = (1 << LOCK_##__STATE),
enum {
#define LOCKDEP_STATE(__STATE) \
__LOCKF(USED_IN_##__STATE) \
__LOCKF(USED_IN_##__STATE##_READ) \
__LOCKF(ENABLED_##__STATE) \
__LOCKF(ENABLED_##__STATE##_READ)
#include "lockdep_states.h"
#undef LOCKDEP_STATE
__LOCKF(USED)
};
#define LOCKF_ENABLED_IRQ (LOCKF_ENABLED_HARDIRQ | LOCKF_ENABLED_SOFTIRQ)
#define LOCKF_USED_IN_IRQ (LOCKF_USED_IN_HARDIRQ | LOCKF_USED_IN_SOFTIRQ)
#define LOCKF_ENABLED_IRQ_READ \
(LOCKF_ENABLED_HARDIRQ_READ | LOCKF_ENABLED_SOFTIRQ_READ)
#define LOCKF_USED_IN_IRQ_READ \
(LOCKF_USED_IN_HARDIRQ_READ | LOCKF_USED_IN_SOFTIRQ_READ)
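With the three states currently listed in lockdep_states.h, the numbering above works out as follows, which is useful when reading the bit arithmetic in lockdep.c:

/*
 * HARDIRQ    -> bits 0..3  (USED_IN, USED_IN_READ, ENABLED, ENABLED_READ)
 * SOFTIRQ    -> bits 4..7
 * RECLAIM_FS -> bits 8..11
 * LOCK_USED  = 12, LOCK_USAGE_STATES = 13
 *
 * so LOCK_USAGE_CHARS = 1 + 13/2 = 7: two usage characters per state
 * plus a terminating NUL.
 */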
/*
* MAX_LOCKDEP_ENTRIES is the maximum number of lock dependencies
* we track.
@@ -31,8 +70,10 @@
extern struct list_head all_lock_classes;
extern struct lock_chain lock_chains[];
extern void
get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4);
#define LOCK_USAGE_CHARS (1+LOCK_USAGE_STATES/2)
extern void get_usage_chars(struct lock_class *class,
char usage[LOCK_USAGE_CHARS]);
extern const char * __get_key_name(struct lockdep_subclass_key *key, char *str);

View file

@@ -84,7 +84,7 @@ static int l_show(struct seq_file *m, void *v)
{
struct lock_class *class = v;
struct lock_list *entry;
char c1, c2, c3, c4;
char usage[LOCK_USAGE_CHARS];
if (v == SEQ_START_TOKEN) {
seq_printf(m, "all lock classes:\n");
@@ -100,8 +100,8 @@ static int l_show(struct seq_file *m, void *v)
seq_printf(m, " BD:%5ld", lockdep_count_backward_deps(class));
#endif
get_usage_chars(class, &c1, &c2, &c3, &c4);
seq_printf(m, " %c%c%c%c", c1, c2, c3, c4);
get_usage_chars(class, usage);
seq_printf(m, " %s", usage);
seq_printf(m, ": ");
print_name(m, class);
@@ -300,27 +300,27 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
nr_uncategorized++;
if (class->usage_mask & LOCKF_USED_IN_IRQ)
nr_irq_safe++;
if (class->usage_mask & LOCKF_ENABLED_IRQS)
if (class->usage_mask & LOCKF_ENABLED_IRQ)
nr_irq_unsafe++;
if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ)
nr_softirq_safe++;
if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS)
if (class->usage_mask & LOCKF_ENABLED_SOFTIRQ)
nr_softirq_unsafe++;
if (class->usage_mask & LOCKF_USED_IN_HARDIRQ)
nr_hardirq_safe++;
if (class->usage_mask & LOCKF_ENABLED_HARDIRQS)
if (class->usage_mask & LOCKF_ENABLED_HARDIRQ)
nr_hardirq_unsafe++;
if (class->usage_mask & LOCKF_USED_IN_IRQ_READ)
nr_irq_read_safe++;
if (class->usage_mask & LOCKF_ENABLED_IRQS_READ)
if (class->usage_mask & LOCKF_ENABLED_IRQ_READ)
nr_irq_read_unsafe++;
if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ_READ)
nr_softirq_read_safe++;
if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS_READ)
if (class->usage_mask & LOCKF_ENABLED_SOFTIRQ_READ)
nr_softirq_read_unsafe++;
if (class->usage_mask & LOCKF_USED_IN_HARDIRQ_READ)
nr_hardirq_read_safe++;
if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ)
if (class->usage_mask & LOCKF_ENABLED_HARDIRQ_READ)
nr_hardirq_read_unsafe++;
#ifdef CONFIG_PROVE_LOCKING
@@ -601,6 +601,10 @@ static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
static void seq_header(struct seq_file *m)
{
seq_printf(m, "lock_stat version 0.3\n");
if (unlikely(!debug_locks))
seq_printf(m, "*WARNING* lock debugging disabled!! - possibly due to a lockdep warning\n");
seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1));
seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s "
"%14s %14s\n",

kernel/lockdep_states.h Normal file
View file

@@ -0,0 +1,9 @@
/*
* Lockdep states,
*
* please update XXX_LOCK_USAGE_STATES in include/linux/lockdep.h whenever
* you add one, or come up with a nice dynamic solution.
*/
LOCKDEP_STATE(HARDIRQ)
LOCKDEP_STATE(SOFTIRQ)
LOCKDEP_STATE(RECLAIM_FS)

View file

@@ -68,7 +68,8 @@
/* List of modules, protected by module_mutex or preempt_disable
* (delete uses stop_machine/add uses RCU list operations). */
static DEFINE_MUTEX(module_mutex);
DEFINE_MUTEX(module_mutex);
EXPORT_SYMBOL_GPL(module_mutex);
static LIST_HEAD(modules);
/* Waiting for a module to finish initializing? */
@@ -76,7 +77,7 @@ static DECLARE_WAIT_QUEUE_HEAD(module_wq);
static BLOCKING_NOTIFIER_HEAD(module_notify_list);
/* Bounds of module allocation, for speeding __module_text_address */
/* Bounds of module allocation, for speeding __module_address */
static unsigned long module_addr_min = -1UL, module_addr_max = 0;
int register_module_notifier(struct notifier_block * nb)
@@ -186,17 +187,6 @@ extern const unsigned long __start___kcrctab_unused_gpl[];
#define symversion(base, idx) ((base != NULL) ? ((base) + (idx)) : NULL)
#endif
struct symsearch {
const struct kernel_symbol *start, *stop;
const unsigned long *crcs;
enum {
NOT_GPL_ONLY,
GPL_ONLY,
WILL_BE_GPL_ONLY,
} licence;
bool unused;
};
static bool each_symbol_in_section(const struct symsearch *arr,
unsigned int arrsize,
struct module *owner,
@@ -217,10 +207,8 @@ static bool each_symbol_in_section(const struct symsearch *arr,
}
/* Returns true as soon as fn returns true, otherwise false. */
static bool each_symbol(bool (*fn)(const struct symsearch *arr,
struct module *owner,
unsigned int symnum, void *data),
void *data)
bool each_symbol(bool (*fn)(const struct symsearch *arr, struct module *owner,
unsigned int symnum, void *data), void *data)
{
struct module *mod;
const struct symsearch arr[] = {
@@ -273,6 +261,7 @@ static bool each_symbol(bool (*fn)(const struct symsearch *arr,
}
return false;
}
EXPORT_SYMBOL_GPL(each_symbol);
struct find_symbol_arg {
/* Input */
@@ -283,7 +272,7 @@ struct find_symbol_arg {
/* Output */
struct module *owner;
const unsigned long *crc;
unsigned long value;
const struct kernel_symbol *sym;
};
static bool find_symbol_in_section(const struct symsearch *syms,
@@ -324,17 +313,17 @@ static bool find_symbol_in_section(const struct symsearch *syms,
fsa->owner = owner;
fsa->crc = symversion(syms->crcs, symnum);
fsa->value = syms->start[symnum].value;
fsa->sym = &syms->start[symnum];
return true;
}
/* Find a symbol, return value, (optional) crc and (optional) module
* which owns it */
static unsigned long find_symbol(const char *name,
struct module **owner,
const unsigned long **crc,
bool gplok,
bool warn)
/* Find a symbol and return it, along with, (optional) crc and
* (optional) module which owns it */
const struct kernel_symbol *find_symbol(const char *name,
struct module **owner,
const unsigned long **crc,
bool gplok,
bool warn)
{
struct find_symbol_arg fsa;
@@ -347,15 +336,16 @@ static unsigned long find_symbol(const char *name,
*owner = fsa.owner;
if (crc)
*crc = fsa.crc;
return fsa.value;
return fsa.sym;
}
DEBUGP("Failed to find symbol %s\n", name);
return -ENOENT;
return NULL;
}
EXPORT_SYMBOL_GPL(find_symbol);
/* Search for module by name: must hold module_mutex. */
static struct module *find_module(const char *name)
struct module *find_module(const char *name)
{
struct module *mod;
@@ -365,6 +355,7 @@ static struct module *find_module(const char *name)
}
return NULL;
}
EXPORT_SYMBOL_GPL(find_module);
#ifdef CONFIG_SMP
@@ -641,7 +632,7 @@ static int already_uses(struct module *a, struct module *b)
}
/* Module a uses b */
static int use_module(struct module *a, struct module *b)
int use_module(struct module *a, struct module *b)
{
struct module_use *use;
int no_warn, err;
@@ -674,6 +665,7 @@ static int use_module(struct module *a, struct module *b)
no_warn = sysfs_create_link(b->holders_dir, &a->mkobj.kobj, a->name);
return 1;
}
EXPORT_SYMBOL_GPL(use_module);
/* Clear the unload stuff of the module. */
static void module_unload_free(struct module *mod)
@@ -856,7 +848,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
mutex_lock(&module_mutex);
/* Store the name of the last unloaded module for diagnostic purposes */
strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module));
unregister_dynamic_debug_module(mod->name);
ddebug_remove_module(mod->name);
free_module(mod);
out:
@@ -894,7 +886,7 @@ void __symbol_put(const char *symbol)
struct module *owner;
preempt_disable();
if (IS_ERR_VALUE(find_symbol(symbol, &owner, NULL, true, false)))
if (!find_symbol(symbol, &owner, NULL, true, false))
BUG();
module_put(owner);
preempt_enable();
@@ -908,8 +900,10 @@ void symbol_put_addr(void *addr)
if (core_kernel_text((unsigned long)addr))
return;
if (!(modaddr = module_text_address((unsigned long)addr)))
BUG();
/* __module_text_address() is safe here: we're supposed to hold a reference
* to the module from symbol_get(), so it can't go away. */
modaddr = __module_text_address((unsigned long)addr);
BUG_ON(!modaddr);
module_put(modaddr);
}
EXPORT_SYMBOL_GPL(symbol_put_addr);
@@ -949,10 +943,11 @@ static inline void module_unload_free(struct module *mod)
{
}
static inline int use_module(struct module *a, struct module *b)
int use_module(struct module *a, struct module *b)
{
return strong_try_module_get(b) == 0;
}
EXPORT_SYMBOL_GPL(use_module);
static inline void module_unload_init(struct module *mod)
{
@@ -995,12 +990,12 @@ static struct module_attribute *modinfo_attrs[] = {
static const char vermagic[] = VERMAGIC_STRING;
static int try_to_force_load(struct module *mod, const char *symname)
static int try_to_force_load(struct module *mod, const char *reason)
{
#ifdef CONFIG_MODULE_FORCE_LOAD
if (!test_taint(TAINT_FORCED_MODULE))
printk("%s: no version for \"%s\" found: kernel tainted.\n",
mod->name, symname);
printk(KERN_WARNING "%s: %s: kernel tainted.\n",
mod->name, reason);
add_taint_module(mod, TAINT_FORCED_MODULE);
return 0;
#else
@@ -1057,9 +1052,9 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs,
{
const unsigned long *crc;
if (IS_ERR_VALUE(find_symbol("struct_module", NULL, &crc, true, false)))
if (!find_symbol("module_layout", NULL, &crc, true, false))
BUG();
return check_version(sechdrs, versindex, "struct_module", mod, crc);
return check_version(sechdrs, versindex, "module_layout", mod, crc);
}
/* First part is kernel version, which we ignore if module has crcs. */
@@ -1098,25 +1093,25 @@ static inline int same_magic(const char *amagic, const char *bmagic,
/* Resolve a symbol for this module. I.e. if we find one, record usage.
Must be holding module_mutex. */
static unsigned long resolve_symbol(Elf_Shdr *sechdrs,
unsigned int versindex,
const char *name,
struct module *mod)
static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs,
unsigned int versindex,
const char *name,
struct module *mod)
{
struct module *owner;
unsigned long ret;
const struct kernel_symbol *sym;
const unsigned long *crc;
ret = find_symbol(name, &owner, &crc,
sym = find_symbol(name, &owner, &crc,
!(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true);
if (!IS_ERR_VALUE(ret)) {
/* use_module can fail due to OOM,
or module initialization or unloading */
/* use_module can fail due to OOM,
or module initialization or unloading */
if (sym) {
if (!check_version(sechdrs, versindex, name, mod, crc) ||
!use_module(mod, owner))
ret = -EINVAL;
sym = NULL;
}
return ret;
return sym;
}
/*
@ -1491,6 +1486,9 @@ static void free_module(struct module *mod)
/* Module unload stuff */
module_unload_free(mod);
/* Free any allocated parameters. */
destroy_params(mod->kp, mod->num_kp);
/* release any pointers to mcount in this module */
ftrace_release(mod->module_core, mod->core_size);
@ -1513,17 +1511,15 @@ static void free_module(struct module *mod)
void *__symbol_get(const char *symbol)
{
struct module *owner;
unsigned long value;
const struct kernel_symbol *sym;
preempt_disable();
value = find_symbol(symbol, &owner, NULL, true, true);
if (IS_ERR_VALUE(value))
value = 0;
else if (strong_try_module_get(owner))
value = 0;
sym = find_symbol(symbol, &owner, NULL, true, true);
if (sym && strong_try_module_get(owner))
sym = NULL;
preempt_enable();
return (void *)value;
return sym ? (void *)sym->value : NULL;
}
EXPORT_SYMBOL_GPL(__symbol_get);
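A hedged usage sketch for the pointer-returning __symbol_get() above, reached through the symbol_get() macro that wraps it; "foo_helper" is a hypothetical export from some other module:

/* Illustrative only: symbol_get() expands to a __symbol_get() call and,
 * on success, has already taken a reference on the exporting module.
 * Note the new convention: NULL on failure, not an errno encoded in the
 * value. */
#include <linux/module.h>

extern int foo_helper(int arg);		/* hypothetical optional export */

static int use_optional_helper(void)
{
	int (*fn)(int) = symbol_get(foo_helper);

	if (!fn)
		return -ENOENT;		/* provider not loaded */
	fn(42);
	symbol_put(foo_helper);		/* drop the module reference */
	return 0;
}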
@ -1551,8 +1547,7 @@ static int verify_export_symbols(struct module *mod)
for (i = 0; i < ARRAY_SIZE(arr); i++) {
for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) {
if (!IS_ERR_VALUE(find_symbol(s->name, &owner,
NULL, true, false))) {
if (find_symbol(s->name, &owner, NULL, true, false)) {
printk(KERN_ERR
"%s: exports duplicate symbol %s"
" (owned by %s)\n",
@ -1576,6 +1571,7 @@ static int simplify_symbols(Elf_Shdr *sechdrs,
unsigned long secbase;
unsigned int i, n = sechdrs[symindex].sh_size / sizeof(Elf_Sym);
int ret = 0;
const struct kernel_symbol *ksym;
for (i = 1; i < n; i++) {
switch (sym[i].st_shndx) {
@ -1595,13 +1591,14 @@ static int simplify_symbols(Elf_Shdr *sechdrs,
break;
case SHN_UNDEF:
sym[i].st_value
= resolve_symbol(sechdrs, versindex,
strtab + sym[i].st_name, mod);
ksym = resolve_symbol(sechdrs, versindex,
strtab + sym[i].st_name, mod);
/* Ok if resolved. */
if (!IS_ERR_VALUE(sym[i].st_value))
if (ksym) {
sym[i].st_value = ksym->value;
break;
}
/* Ok if weak. */
if (ELF_ST_BIND(sym[i].st_info) == STB_WEAK)
break;
@ -1676,8 +1673,7 @@ static void layout_sections(struct module *mod,
if ((s->sh_flags & masks[m][0]) != masks[m][0]
|| (s->sh_flags & masks[m][1])
|| s->sh_entsize != ~0UL
|| strncmp(secstrings + s->sh_name,
".init", 5) == 0)
|| strstarts(secstrings + s->sh_name, ".init"))
continue;
s->sh_entsize = get_offset(mod, &mod->core_size, s, i);
DEBUGP("\t%s\n", secstrings + s->sh_name);
@ -1694,8 +1690,7 @@ static void layout_sections(struct module *mod,
if ((s->sh_flags & masks[m][0]) != masks[m][0]
|| (s->sh_flags & masks[m][1])
|| s->sh_entsize != ~0UL
|| strncmp(secstrings + s->sh_name,
".init", 5) != 0)
|| !strstarts(secstrings + s->sh_name, ".init"))
continue;
s->sh_entsize = (get_offset(mod, &mod->init_size, s, i)
| INIT_OFFSET_MASK);
@ -1828,8 +1823,7 @@ static char elf_type(const Elf_Sym *sym,
else
return 'b';
}
if (strncmp(secstrings + sechdrs[sym->st_shndx].sh_name,
".debug", strlen(".debug")) == 0)
if (strstarts(secstrings + sechdrs[sym->st_shndx].sh_name, ".debug"))
return 'n';
return '?';
}
@ -1861,19 +1855,13 @@ static inline void add_kallsyms(struct module *mod,
}
#endif /* CONFIG_KALLSYMS */
static void dynamic_printk_setup(struct mod_debug *debug, unsigned int num)
static void dynamic_debug_setup(struct _ddebug *debug, unsigned int num)
{
#ifdef CONFIG_DYNAMIC_PRINTK_DEBUG
unsigned int i;
for (i = 0; i < num; i++) {
register_dynamic_debug_module(debug[i].modname,
debug[i].type,
debug[i].logical_modname,
debug[i].flag_names,
debug[i].hash, debug[i].hash2);
}
#endif /* CONFIG_DYNAMIC_PRINTK_DEBUG */
#ifdef CONFIG_DYNAMIC_DEBUG
if (ddebug_add_module(debug, num, debug->modname))
printk(KERN_ERR "dynamic debug error adding module: %s\n",
debug->modname);
#endif
}
static void *module_alloc_update_bounds(unsigned long size)
@ -1904,8 +1892,7 @@ static noinline struct module *load_module(void __user *umod,
unsigned int symindex = 0;
unsigned int strindex = 0;
unsigned int modindex, versindex, infoindex, pcpuindex;
unsigned int num_kp, num_mcount;
struct kernel_param *kp;
unsigned int num_mcount;
struct module *mod;
long err = 0;
void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
@ -1922,12 +1909,6 @@ static noinline struct module *load_module(void __user *umod,
if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL)
return ERR_PTR(-ENOMEM);
/* Create stop_machine threads since the error path relies on
* a non-failing stop_machine call. */
err = stop_machine_create();
if (err)
goto free_hdr;
if (copy_from_user(hdr, umod, len) != 0) {
err = -EFAULT;
goto free_hdr;
@ -1968,9 +1949,12 @@ static noinline struct module *load_module(void __user *umod,
}
#ifndef CONFIG_MODULE_UNLOAD
/* Don't load .exit sections */
if (strncmp(secstrings+sechdrs[i].sh_name, ".exit", 5) == 0)
if (strstarts(secstrings+sechdrs[i].sh_name, ".exit"))
sechdrs[i].sh_flags &= ~(unsigned long)SHF_ALLOC;
#endif
/* Don't keep __versions around; it's just for loading. */
if (strcmp(secstrings + sechdrs[i].sh_name, "__versions") == 0)
sechdrs[i].sh_flags &= ~(unsigned long)SHF_ALLOC;
}
modindex = find_sec(hdr, sechdrs, secstrings,
@ -2012,7 +1996,7 @@ static noinline struct module *load_module(void __user *umod,
modmagic = get_modinfo(sechdrs, infoindex, "vermagic");
/* This is allowed: modprobe --force will invalidate it. */
if (!modmagic) {
err = try_to_force_load(mod, "magic");
err = try_to_force_load(mod, "bad vermagic");
if (err)
goto free_hdr;
} else if (!same_magic(modmagic, vermagic, versindex)) {
@ -2150,8 +2134,8 @@ static noinline struct module *load_module(void __user *umod,
/* Now we've got everything in the final locations, we can
* find optional sections. */
kp = section_objs(hdr, sechdrs, secstrings, "__param", sizeof(*kp),
&num_kp);
mod->kp = section_objs(hdr, sechdrs, secstrings, "__param",
sizeof(*mod->kp), &mod->num_kp);
mod->syms = section_objs(hdr, sechdrs, secstrings, "__ksymtab",
sizeof(*mod->syms), &mod->num_syms);
mod->crcs = section_addr(hdr, sechdrs, secstrings, "__kcrctab");
@ -2201,8 +2185,8 @@ static noinline struct module *load_module(void __user *umod,
|| (mod->num_unused_gpl_syms && !mod->unused_gpl_crcs)
#endif
) {
printk(KERN_WARNING "%s: No versions for exported symbols.\n", mod->name);
err = try_to_force_load(mod, "nocrc");
err = try_to_force_load(mod,
"no versions for exported symbols");
if (err)
goto cleanup;
}
@ -2247,12 +2231,13 @@ static noinline struct module *load_module(void __user *umod,
add_kallsyms(mod, sechdrs, symindex, strindex, secstrings);
if (!mod->taints) {
struct mod_debug *debug;
struct _ddebug *debug;
unsigned int num_debug;
debug = section_objs(hdr, sechdrs, secstrings, "__verbose",
sizeof(*debug), &num_debug);
dynamic_printk_setup(debug, num_debug);
if (debug)
dynamic_debug_setup(debug, num_debug);
}
/* sechdrs[0].sh_size is always zero */
@ -2296,11 +2281,11 @@ static noinline struct module *load_module(void __user *umod,
*/
list_add_rcu(&mod->list, &modules);
err = parse_args(mod->name, mod->args, kp, num_kp, NULL);
err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, NULL);
if (err < 0)
goto unlink;
err = mod_sysfs_setup(mod, kp, num_kp);
err = mod_sysfs_setup(mod, mod->kp, mod->num_kp);
if (err < 0)
goto unlink;
add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
@ -2309,12 +2294,13 @@ static noinline struct module *load_module(void __user *umod,
/* Get rid of temporary copy */
vfree(hdr);
stop_machine_destroy();
/* Done! */
return mod;
unlink:
stop_machine(__unlink_module, mod, NULL);
/* Unlink carefully: kallsyms could be walking list. */
list_del_rcu(&mod->list);
synchronize_sched();
module_arch_cleanup(mod);
cleanup:
kobject_del(&mod->mkobj.kobj);
@ -2322,8 +2308,8 @@ static noinline struct module *load_module(void __user *umod,
ftrace_release(mod->module_core, mod->core_size);
free_unload:
module_unload_free(mod);
free_init:
#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
free_init:
percpu_modfree(mod->refptr);
#endif
module_free(mod, mod->module_init);
@ -2337,7 +2323,6 @@ static noinline struct module *load_module(void __user *umod,
kfree(args);
free_hdr:
vfree(hdr);
stop_machine_destroy();
return ERR_PTR(err);
truncated:
@ -2614,6 +2599,25 @@ unsigned long module_kallsyms_lookup_name(const char *name)
preempt_enable();
return ret;
}
int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
struct module *, unsigned long),
void *data)
{
struct module *mod;
unsigned int i;
int ret;
list_for_each_entry(mod, &modules, list) {
for (i = 0; i < mod->num_symtab; i++) {
ret = fn(data, mod->strtab + mod->symtab[i].st_name,
mod, mod->symtab[i].st_value);
if (ret != 0)
return ret;
}
}
return 0;
}
#endif /* CONFIG_KALLSYMS */
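A hedged sketch of a consumer for the new module_kallsyms_on_each_symbol() iterator; the callback name and target symbol are illustrative, and since the walk itself takes no lock, a real caller must keep modules from unloading in the meantime (the in-kernel users run under module_mutex):

#include <linux/module.h>
#include <linux/string.h>

/* Returning non-zero stops the walk; that value is passed back to the
 * caller of module_kallsyms_on_each_symbol(). */
static int find_symbol_addr(void *data, const char *name,
			    struct module *mod, unsigned long addr)
{
	if (strcmp(name, (const char *)data) == 0) {
		printk(KERN_INFO "%s found in %s at %#lx\n",
		       name, mod->name, addr);
		return 1;
	}
	return 0;
}

static void demo_walk(void)
{
	module_kallsyms_on_each_symbol(find_symbol_addr, (void *)"my_symbol");
}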
static char *module_flags(struct module *mod, char *buf)
@ -2749,29 +2753,31 @@ const struct exception_table_entry *search_module_extables(unsigned long addr)
}
/*
* Is this a valid module address?
* is_module_address - is this address inside a module?
* @addr: the address to check.
*
* See is_module_text_address() if you simply want to see if the address
* is code (not data).
*/
int is_module_address(unsigned long addr)
bool is_module_address(unsigned long addr)
{
struct module *mod;
bool ret;
preempt_disable();
list_for_each_entry_rcu(mod, &modules, list) {
if (within_module_core(addr, mod)) {
preempt_enable();
return 1;
}
}
ret = __module_address(addr) != NULL;
preempt_enable();
return 0;
return ret;
}
/* Is this a valid kernel address? */
__notrace_funcgraph struct module *__module_text_address(unsigned long addr)
/*
* __module_address - get the module which contains an address.
* @addr: the address.
*
* Must be called with preempt disabled or module mutex held so that
* module doesn't get freed during this.
*/
struct module *__module_address(unsigned long addr)
{
struct module *mod;
@ -2779,22 +2785,51 @@ __notrace_funcgraph struct module *__module_text_address(unsigned long addr)
return NULL;
list_for_each_entry_rcu(mod, &modules, list)
if (within(addr, mod->module_init, mod->init_text_size)
|| within(addr, mod->module_core, mod->core_text_size))
if (within_module_core(addr, mod)
|| within_module_init(addr, mod))
return mod;
return NULL;
}
EXPORT_SYMBOL_GPL(__module_address);
struct module *module_text_address(unsigned long addr)
/*
* is_module_text_address - is this address inside module code?
* @addr: the address to check.
*
* See is_module_address() if you simply want to see if the address is
* anywhere in a module. See kernel_text_address() for testing if an
* address corresponds to kernel or module code.
*/
bool is_module_text_address(unsigned long addr)
{
struct module *mod;
bool ret;
preempt_disable();
mod = __module_text_address(addr);
ret = __module_text_address(addr) != NULL;
preempt_enable();
return ret;
}
/*
* __module_text_address - get the module whose code contains an address.
* @addr: the address.
*
* Must be called with preempt disabled or module mutex held so that
* module doesn't get freed during this.
*/
struct module *__module_text_address(unsigned long addr)
{
struct module *mod = __module_address(addr);
if (mod) {
/* Make sure it's within the text section. */
if (!within(addr, mod->module_init, mod->init_text_size)
&& !within(addr, mod->module_core, mod->core_text_size))
mod = NULL;
}
return mod;
}
EXPORT_SYMBOL_GPL(__module_text_address);
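Taken together, a hedged sketch of the calling convention these helpers now follow: the bool wrappers handle preemption themselves, while the __-prefixed variants leave pinning the module list to the caller:

#include <linux/module.h>
#include <linux/preempt.h>

static void describe_address(unsigned long addr)
{
	struct module *mod;

	if (!is_module_address(addr))	/* cheap yes/no, locks internally */
		return;

	preempt_disable();		/* keep the module from vanishing */
	mod = __module_address(addr);
	if (mod)
		printk(KERN_INFO "%#lx belongs to %s\n", addr, mod->name);
	preempt_enable();
}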
/* Don't grab lock, we're oopsing. */
void print_modules(void)
@ -2814,9 +2849,17 @@ void print_modules(void)
}
#ifdef CONFIG_MODVERSIONS
/* Generate the signature for struct module here, too, for modversions. */
void struct_module(struct module *mod) { return; }
EXPORT_SYMBOL(struct_module);
/* Generate the signature for all relevant module structures here.
* If these change, we don't want to try to parse the module. */
void module_layout(struct module *mod,
struct modversion_info *ver,
struct kernel_param *kp,
struct kernel_symbol *ks,
struct marker *marker,
struct tracepoint *tp)
{
}
EXPORT_SYMBOL(module_layout);
#endif
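The idea behind module_layout(): modpost makes every versioned module record a CRC for this one symbol, and the CRC is derived from the function's signature, so a layout change to struct module, modversion_info, kernel_param, kernel_symbol, marker or tracepoint alters the CRC and causes check_modstruct_version() above to refuse the stale module at load time instead of letting it crash later.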
#ifdef CONFIG_MARKERS

View file

@ -26,11 +26,6 @@
/*
* Must be called with lock->wait_lock held.
*/
void debug_mutex_set_owner(struct mutex *lock, struct thread_info *new_owner)
{
lock->owner = new_owner;
}
void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter)
{
memset(waiter, MUTEX_DEBUG_INIT, sizeof(*waiter));
@ -59,7 +54,6 @@ void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
/* Mark the current thread as blocked on the lock: */
ti->task->blocked_on = waiter;
waiter->lock = lock;
}
void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
@ -82,7 +76,7 @@ void debug_mutex_unlock(struct mutex *lock)
DEBUG_LOCKS_WARN_ON(lock->magic != lock);
DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info());
DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info());
mutex_clear_owner(lock);
}
void debug_mutex_init(struct mutex *lock, const char *name,
@ -95,7 +89,6 @@ void debug_mutex_init(struct mutex *lock, const char *name,
debug_check_no_locks_freed((void *)lock, sizeof(*lock));
lockdep_init_map(&lock->dep_map, name, key, 0);
#endif
lock->owner = NULL;
lock->magic = lock;
}

View file

@ -13,14 +13,6 @@
/*
* This must be called with lock->wait_lock held.
*/
extern void
debug_mutex_set_owner(struct mutex *lock, struct thread_info *new_owner);
static inline void debug_mutex_clear_owner(struct mutex *lock)
{
lock->owner = NULL;
}
extern void debug_mutex_lock_common(struct mutex *lock,
struct mutex_waiter *waiter);
extern void debug_mutex_wake_waiter(struct mutex *lock,
@ -35,6 +27,16 @@ extern void debug_mutex_unlock(struct mutex *lock);
extern void debug_mutex_init(struct mutex *lock, const char *name,
struct lock_class_key *key);
static inline void mutex_set_owner(struct mutex *lock)
{
lock->owner = current_thread_info();
}
static inline void mutex_clear_owner(struct mutex *lock)
{
lock->owner = NULL;
}
#define spin_lock_mutex(lock, flags) \
do { \
struct mutex *l = container_of(lock, struct mutex, wait_lock); \

View file

@ -10,6 +10,11 @@
* Many thanks to Arjan van de Ven, Thomas Gleixner, Steven Rostedt and
* David Howells for suggestions and improvements.
*
* - Adaptive spinning for mutexes by Peter Zijlstra. (Ported to mainline
* from the -rt tree, where it was originally implemented for rtmutexes
* by Steven Rostedt, based on work by Gregory Haskins, Peter Morreale
* and Sven Dietrich.
*
* Also see Documentation/mutex-design.txt.
*/
#include <linux/mutex.h>
@ -46,6 +51,7 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
atomic_set(&lock->count, 1);
spin_lock_init(&lock->wait_lock);
INIT_LIST_HEAD(&lock->wait_list);
mutex_clear_owner(lock);
debug_mutex_init(lock, name, key);
}
@ -91,6 +97,7 @@ void inline __sched mutex_lock(struct mutex *lock)
* 'unlocked' into 'locked' state.
*/
__mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath);
mutex_set_owner(lock);
}
EXPORT_SYMBOL(mutex_lock);
@ -115,6 +122,14 @@ void __sched mutex_unlock(struct mutex *lock)
* The unlocking fastpath is the 0->1 transition from 'locked'
* into 'unlocked' state:
*/
#ifndef CONFIG_DEBUG_MUTEXES
/*
* When debugging is enabled we must not clear the owner before time,
* the slow path will always be taken, and that clears the owner field
* after verifying that it was indeed current.
*/
mutex_clear_owner(lock);
#endif
__mutex_fastpath_unlock(&lock->count, __mutex_unlock_slowpath);
}
@ -129,21 +144,75 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
{
struct task_struct *task = current;
struct mutex_waiter waiter;
unsigned int old_val;
unsigned long flags;
preempt_disable();
mutex_acquire(&lock->dep_map, subclass, 0, ip);
#if defined(CONFIG_SMP) && !defined(CONFIG_DEBUG_MUTEXES)
/*
* Optimistic spinning.
*
* We try to spin for acquisition when we find that there are no
* pending waiters and the lock owner is currently running on a
* (different) CPU.
*
* The rationale is that if the lock owner is running, it is likely to
* release the lock soon.
*
* Since this needs the lock owner, and this mutex implementation
* doesn't track the owner atomically in the lock field, we need to
* track it non-atomically.
*
* We can't do this for DEBUG_MUTEXES because that relies on wait_lock
* to serialize everything.
*/
for (;;) {
struct thread_info *owner;
/*
* If there's an owner, wait for it to either
* release the lock or go to sleep.
*/
owner = ACCESS_ONCE(lock->owner);
if (owner && !mutex_spin_on_owner(lock, owner))
break;
if (atomic_cmpxchg(&lock->count, 1, 0) == 1) {
lock_acquired(&lock->dep_map, ip);
mutex_set_owner(lock);
preempt_enable();
return 0;
}
/*
* When there's no owner, we might have preempted between the
* owner acquiring the lock and setting the owner field. If
* we're an RT task, that will live-lock because we won't let
* the owner complete.
*/
if (!owner && (need_resched() || rt_task(task)))
break;
/*
* The cpu_relax() call is a compiler barrier which forces
* everything in this loop to be re-loaded. We don't need
* memory barriers as we'll eventually observe the right
* values at the cost of a few extra spins.
*/
cpu_relax();
}
#endif
spin_lock_mutex(&lock->wait_lock, flags);
debug_mutex_lock_common(lock, &waiter);
mutex_acquire(&lock->dep_map, subclass, 0, ip);
debug_mutex_add_waiter(lock, &waiter, task_thread_info(task));
/* add waiting tasks to the end of the waitqueue (FIFO): */
list_add_tail(&waiter.list, &lock->wait_list);
waiter.task = task;
old_val = atomic_xchg(&lock->count, -1);
if (old_val == 1)
if (atomic_xchg(&lock->count, -1) == 1)
goto done;
lock_contended(&lock->dep_map, ip);
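The mutex_spin_on_owner() used by the loop above is added to sched.c, whose diff is suppressed later in this commit as too large; below is a hedged reconstruction of its shape, where owner_still_running() stands in for the scheduler's check that the owner thread is still current on its CPU:

/* Hedged reconstruction, not the actual sched.c body; the real helper
 * is also gated on the OWNER_SPIN feature added in sched_features.h
 * further down. */
static int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
{
	while (ACCESS_ONCE(lock->owner) == owner) {
		if (need_resched())
			return 0;	/* we must yield: stop spinning */
		if (!owner_still_running(owner))
			return 0;	/* owner preempted or asleep */
		cpu_relax();
	}
	return 1;			/* owner changed: retry the acquisition */
}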
@ -158,8 +227,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
* that when we release the lock, we properly wake up the
* other waiters:
*/
old_val = atomic_xchg(&lock->count, -1);
if (old_val == 1)
if (atomic_xchg(&lock->count, -1) == 1)
break;
/*
@ -173,21 +241,22 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
spin_unlock_mutex(&lock->wait_lock, flags);
debug_mutex_free_waiter(&waiter);
preempt_enable();
return -EINTR;
}
__set_task_state(task, state);
/* didn't get the lock, go to sleep: */
spin_unlock_mutex(&lock->wait_lock, flags);
schedule();
__schedule();
spin_lock_mutex(&lock->wait_lock, flags);
}
done:
lock_acquired(&lock->dep_map, ip);
/* got the lock - rejoice! */
mutex_remove_waiter(lock, &waiter, task_thread_info(task));
debug_mutex_set_owner(lock, task_thread_info(task));
mutex_remove_waiter(lock, &waiter, current_thread_info());
mutex_set_owner(lock);
/* set it to 0 if there are no waiters left: */
if (likely(list_empty(&lock->wait_list)))
@ -196,6 +265,7 @@ done:
spin_unlock_mutex(&lock->wait_lock, flags);
debug_mutex_free_waiter(&waiter);
preempt_enable();
return 0;
}
@ -222,7 +292,8 @@ int __sched
mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass)
{
might_sleep();
return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, subclass, _RET_IP_);
return __mutex_lock_common(lock, TASK_INTERRUPTIBLE,
subclass, _RET_IP_);
}
EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested);
@ -260,8 +331,6 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested)
wake_up_process(waiter->task);
}
debug_mutex_clear_owner(lock);
spin_unlock_mutex(&lock->wait_lock, flags);
}
@ -298,18 +367,30 @@ __mutex_lock_interruptible_slowpath(atomic_t *lock_count);
*/
int __sched mutex_lock_interruptible(struct mutex *lock)
{
int ret;
might_sleep();
return __mutex_fastpath_lock_retval
ret = __mutex_fastpath_lock_retval
(&lock->count, __mutex_lock_interruptible_slowpath);
if (!ret)
mutex_set_owner(lock);
return ret;
}
EXPORT_SYMBOL(mutex_lock_interruptible);
int __sched mutex_lock_killable(struct mutex *lock)
{
int ret;
might_sleep();
return __mutex_fastpath_lock_retval
ret = __mutex_fastpath_lock_retval
(&lock->count, __mutex_lock_killable_slowpath);
if (!ret)
mutex_set_owner(lock);
return ret;
}
EXPORT_SYMBOL(mutex_lock_killable);
@ -352,9 +433,10 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count)
prev = atomic_xchg(&lock->count, -1);
if (likely(prev == 1)) {
debug_mutex_set_owner(lock, current_thread_info());
mutex_set_owner(lock);
mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
}
/* Set it back to 0 if there are no waiters: */
if (likely(list_empty(&lock->wait_list)))
atomic_set(&lock->count, 0);
@ -380,8 +462,13 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count)
*/
int __sched mutex_trylock(struct mutex *lock)
{
return __mutex_fastpath_trylock(&lock->count,
__mutex_trylock_slowpath);
int ret;
ret = __mutex_fastpath_trylock(&lock->count, __mutex_trylock_slowpath);
if (ret)
mutex_set_owner(lock);
return ret;
}
EXPORT_SYMBOL(mutex_trylock);

View file

@ -16,8 +16,26 @@
#define mutex_remove_waiter(lock, waiter, ti) \
__list_del((waiter)->list.prev, (waiter)->list.next)
#define debug_mutex_set_owner(lock, new_owner) do { } while (0)
#define debug_mutex_clear_owner(lock) do { } while (0)
#ifdef CONFIG_SMP
static inline void mutex_set_owner(struct mutex *lock)
{
lock->owner = current_thread_info();
}
static inline void mutex_clear_owner(struct mutex *lock)
{
lock->owner = NULL;
}
#else
static inline void mutex_set_owner(struct mutex *lock)
{
}
static inline void mutex_clear_owner(struct mutex *lock)
{
}
#endif
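Worth noting: in this non-debug header the owner field is maintained only when CONFIG_SMP is set, because it exists solely to feed the optimistic-spinning loop added in mutex.c above, and on UP there is never another CPU whose lock owner could usefully be spun on.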
#define debug_mutex_wake_waiter(lock, waiter) do { } while (0)
#define debug_mutex_free_waiter(waiter) do { } while (0)
#define debug_mutex_add_waiter(lock, waiter, ti) do { } while (0)

View file

@ -34,7 +34,7 @@ int ns_cgroup_clone(struct task_struct *task, struct pid *pid)
/*
* Rules:
* 1. you can only enter a cgroup which is a child of your current
* 1. you can only enter a cgroup which is a descendant of your current
* cgroup
* 2. you can only place another process into a cgroup if
* a. you have CAP_SYS_ADMIN
@ -45,21 +45,15 @@ int ns_cgroup_clone(struct task_struct *task, struct pid *pid)
static int ns_can_attach(struct cgroup_subsys *ss,
struct cgroup *new_cgroup, struct task_struct *task)
{
struct cgroup *orig;
if (current != task) {
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
if (!cgroup_is_descendant(new_cgroup))
if (!cgroup_is_descendant(new_cgroup, current))
return -EPERM;
}
if (atomic_read(&new_cgroup->count) != 0)
return -EPERM;
orig = task_cgroup(task, ns_subsys_id);
if (orig && orig != new_cgroup->parent)
if (!cgroup_is_descendant(new_cgroup, task))
return -EPERM;
return 0;
@ -77,7 +71,7 @@ static struct cgroup_subsys_state *ns_create(struct cgroup_subsys *ss,
if (!capable(CAP_SYS_ADMIN))
return ERR_PTR(-EPERM);
if (!cgroup_is_descendant(cgroup))
if (!cgroup_is_descendant(cgroup, current))
return ERR_PTR(-EPERM);
ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL);

View file

@ -8,19 +8,19 @@
* This function is used through-out the kernel (including mm and fs)
* to indicate a major problem.
*/
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/delay.h>
#include <linux/reboot.h>
#include <linux/notifier.h>
#include <linux/init.h>
#include <linux/sysrq.h>
#include <linux/interrupt.h>
#include <linux/nmi.h>
#include <linux/kexec.h>
#include <linux/debug_locks.h>
#include <linux/random.h>
#include <linux/interrupt.h>
#include <linux/kallsyms.h>
#include <linux/notifier.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/reboot.h>
#include <linux/delay.h>
#include <linux/kexec.h>
#include <linux/sched.h>
#include <linux/sysrq.h>
#include <linux/init.h>
#include <linux/nmi.h>
#include <linux/dmi.h>
int panic_on_oops;
@ -52,19 +52,15 @@ EXPORT_SYMBOL(panic_blink);
*
* This function never returns.
*/
NORET_TYPE void panic(const char * fmt, ...)
{
long i;
static char buf[1024];
va_list args;
#if defined(CONFIG_S390)
unsigned long caller = (unsigned long) __builtin_return_address(0);
#endif
long i;
/*
* It's possible to come here directly from a panic-assertion and not
* have preempt disabled. Some functions called from here want
* It's possible to come here directly from a panic-assertion and
* not have preempt disabled. Some functions called from here want
* preempt to be disabled. No point enabling it later though...
*/
preempt_disable();
@ -77,7 +73,6 @@ NORET_TYPE void panic(const char * fmt, ...)
#ifdef CONFIG_DEBUG_BUGVERBOSE
dump_stack();
#endif
bust_spinlocks(0);
/*
* If we have crashed and we have a crash kernel loaded let it handle
@ -86,14 +81,12 @@ NORET_TYPE void panic(const char * fmt, ...)
*/
crash_kexec(NULL);
#ifdef CONFIG_SMP
/*
* Note smp_send_stop is the usual smp shutdown function, which
* unfortunately means it may not be hardened to work in a panic
* situation.
*/
smp_send_stop();
#endif
atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
@ -102,19 +95,21 @@ NORET_TYPE void panic(const char * fmt, ...)
if (panic_timeout > 0) {
/*
* Delay timeout seconds before rebooting the machine.
* We can't use the "normal" timers since we just panicked..
*/
printk(KERN_EMERG "Rebooting in %d seconds..",panic_timeout);
* Delay timeout seconds before rebooting the machine.
* We can't use the "normal" timers since we just panicked.
*/
printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout);
for (i = 0; i < panic_timeout*1000; ) {
touch_nmi_watchdog();
i += panic_blink(i);
mdelay(1);
i++;
}
/* This will not be a clean reboot, with everything
* shutting down. But if there is a chance of
* rebooting the system it will be rebooted.
/*
* This will not be a clean reboot, with everything
* shutting down. But if there is a chance of
* rebooting the system it will be rebooted.
*/
emergency_restart();
}
@ -127,38 +122,44 @@ NORET_TYPE void panic(const char * fmt, ...)
}
#endif
#if defined(CONFIG_S390)
disabled_wait(caller);
{
unsigned long caller;
caller = (unsigned long)__builtin_return_address(0);
disabled_wait(caller);
}
#endif
local_irq_enable();
for (i = 0;;) {
for (i = 0; ; ) {
touch_softlockup_watchdog();
i += panic_blink(i);
mdelay(1);
i++;
}
bust_spinlocks(0);
}
EXPORT_SYMBOL(panic);
struct tnt {
u8 bit;
char true;
char false;
u8 bit;
char true;
char false;
};
static const struct tnt tnts[] = {
{ TAINT_PROPRIETARY_MODULE, 'P', 'G' },
{ TAINT_FORCED_MODULE, 'F', ' ' },
{ TAINT_UNSAFE_SMP, 'S', ' ' },
{ TAINT_FORCED_RMMOD, 'R', ' ' },
{ TAINT_MACHINE_CHECK, 'M', ' ' },
{ TAINT_BAD_PAGE, 'B', ' ' },
{ TAINT_USER, 'U', ' ' },
{ TAINT_DIE, 'D', ' ' },
{ TAINT_OVERRIDDEN_ACPI_TABLE, 'A', ' ' },
{ TAINT_WARN, 'W', ' ' },
{ TAINT_CRAP, 'C', ' ' },
{ TAINT_PROPRIETARY_MODULE, 'P', 'G' },
{ TAINT_FORCED_MODULE, 'F', ' ' },
{ TAINT_UNSAFE_SMP, 'S', ' ' },
{ TAINT_FORCED_RMMOD, 'R', ' ' },
{ TAINT_MACHINE_CHECK, 'M', ' ' },
{ TAINT_BAD_PAGE, 'B', ' ' },
{ TAINT_USER, 'U', ' ' },
{ TAINT_DIE, 'D', ' ' },
{ TAINT_OVERRIDDEN_ACPI_TABLE, 'A', ' ' },
{ TAINT_WARN, 'W', ' ' },
{ TAINT_CRAP, 'C', ' ' },
};
/**
@ -195,7 +196,8 @@ const char *print_tainted(void)
*s = 0;
} else
snprintf(buf, sizeof(buf), "Not tainted");
return(buf);
return buf;
}
int test_taint(unsigned flag)
@ -211,7 +213,8 @@ unsigned long get_taint(void)
void add_taint(unsigned flag)
{
debug_locks = 0; /* can't trust the integrity of the kernel anymore */
/* can't trust the integrity of the kernel anymore: */
debug_locks = 0;
set_bit(flag, &tainted_mask);
}
EXPORT_SYMBOL(add_taint);
@ -266,8 +269,8 @@ static void do_oops_enter_exit(void)
}
/*
* Return true if the calling CPU is allowed to print oops-related info. This
* is a bit racy..
* Return true if the calling CPU is allowed to print oops-related info.
* This is a bit racy..
*/
int oops_may_print(void)
{
@ -276,20 +279,22 @@ int oops_may_print(void)
/*
* Called when the architecture enters its oops handler, before it prints
* anything. If this is the first CPU to oops, and it's oopsing the first time
* then let it proceed.
* anything. If this is the first CPU to oops, and it's oopsing the first
* time then let it proceed.
*
* This is all enabled by the pause_on_oops kernel boot option. We do all this
* to ensure that oopses don't scroll off the screen. It has the side-effect
* of preventing later-oopsing CPUs from mucking up the display, too.
* This is all enabled by the pause_on_oops kernel boot option. We do all
* this to ensure that oopses don't scroll off the screen. It has the
* side-effect of preventing later-oopsing CPUs from mucking up the display,
* too.
*
* It turns out that the CPU which is allowed to print ends up pausing for the
* right duration, whereas all the other CPUs pause for twice as long: once in
* oops_enter(), once in oops_exit().
* It turns out that the CPU which is allowed to print ends up pausing for
* the right duration, whereas all the other CPUs pause for twice as long:
* once in oops_enter(), once in oops_exit().
*/
void oops_enter(void)
{
debug_locks_off(); /* can't trust the integrity of the kernel anymore */
/* can't trust the integrity of the kernel anymore: */
debug_locks_off();
do_oops_enter_exit();
}

View file

@ -24,6 +24,9 @@
#include <linux/err.h>
#include <linux/slab.h>
/* We abuse the high bits of "perm" to record whether we kmalloc'ed. */
#define KPARAM_KMALLOCED 0x80000000
#if 0
#define DEBUGP printk
#else
@ -217,7 +220,19 @@ int param_set_charp(const char *val, struct kernel_param *kp)
return -ENOSPC;
}
*(char **)kp->arg = (char *)val;
if (kp->perm & KPARAM_KMALLOCED)
kfree(*(char **)kp->arg);
/* This is a hack. We can't kstrdup in early boot, and we
* don't need to; this mangled commandline is preserved. */
if (slab_is_available()) {
kp->perm |= KPARAM_KMALLOCED;
*(char **)kp->arg = kstrdup(val, GFP_KERNEL);
if (!*(char **)kp->arg)		/* check the kstrdup() result */
return -ENOMEM;
} else
*(const char **)kp->arg = val;
return 0;
}
@ -571,6 +586,15 @@ void module_param_sysfs_remove(struct module *mod)
}
#endif
void destroy_params(const struct kernel_param *params, unsigned num)
{
unsigned int i;
for (i = 0; i < num; i++)
if (params[i].perm & KPARAM_KMALLOCED)
kfree(*(char **)params[i].arg);
}
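For context, a hedged sketch of the lifecycle these hunks cover: a charp parameter set on the boot command line keeps pointing into the preserved command line, while one written later through sysfs is kstrdup'ed and flagged KPARAM_KMALLOCED so that destroy_params(), called from free_module() earlier in this commit, can free it. The names are illustrative:

#include <linux/module.h>
#include <linux/moduleparam.h>

static char *label = "default";
module_param(label, charp, 0644);	/* writable via sysfs: may end up kmalloc'ed */
MODULE_PARM_DESC(label, "demo string parameter");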
static void __init kernel_add_sysfs_param(const char *name,
struct kernel_param *kparam,
unsigned int name_skip)

View file

@ -403,6 +403,8 @@ struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
{
struct pid *pid;
rcu_read_lock();
if (type != PIDTYPE_PID)
task = task->group_leader;
pid = get_pid(task->pids[type].pid);
rcu_read_unlock();
return pid;
@ -450,11 +452,24 @@ pid_t pid_vnr(struct pid *pid)
}
EXPORT_SYMBOL_GPL(pid_vnr);
pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
struct pid_namespace *ns)
{
return pid_nr_ns(task_pid(tsk), ns);
pid_t nr = 0;
rcu_read_lock();
if (!ns)
ns = current->nsproxy->pid_ns;
if (likely(pid_alive(task))) {
if (type != PIDTYPE_PID)
task = task->group_leader;
nr = pid_nr_ns(task->pids[type].pid, ns);
}
rcu_read_unlock();
return nr;
}
EXPORT_SYMBOL(task_pid_nr_ns);
EXPORT_SYMBOL(__task_pid_nr_ns);
pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
{
@ -462,18 +477,6 @@ pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
}
EXPORT_SYMBOL(task_tgid_nr_ns);
pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
{
return pid_nr_ns(task_pgrp(tsk), ns);
}
EXPORT_SYMBOL(task_pgrp_nr_ns);
pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
{
return pid_nr_ns(task_session(tsk), ns);
}
EXPORT_SYMBOL(task_session_nr_ns);
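The two wrappers deleted here do not disappear; per the matching <linux/sched.h> change outside this excerpt, they become inlines over the new __task_pid_nr_ns(). A hedged sketch of their replacement shape:

static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk,
				    struct pid_namespace *ns)
{
	return __task_pid_nr_ns(tsk, PIDTYPE_PGID, ns);
}

static inline pid_t task_session_nr_ns(struct task_struct *tsk,
				       struct pid_namespace *ns)
{
	return __task_pid_nr_ns(tsk, PIDTYPE_SID, ns);
}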
struct pid_namespace *task_active_pid_ns(struct task_struct *tsk)
{
return ns_of_pid(task_pid(tsk));

View file

@ -152,6 +152,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
{
int nr;
int rc;
struct task_struct *task;
/*
* The last thread in the cgroup-init thread group is terminating.
@ -169,7 +170,19 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
read_lock(&tasklist_lock);
nr = next_pidmap(pid_ns, 1);
while (nr > 0) {
kill_proc_info(SIGKILL, SEND_SIG_PRIV, nr);
rcu_read_lock();
/*
* Use force_sig() since it clears SIGNAL_UNKILLABLE, ensuring
* any nested-container's init processes don't ignore the
* signal.
*/
task = pid_task(find_vpid(nr), PIDTYPE_PID);
if (task)
force_sig(SIGKILL, task);
rcu_read_unlock();
nr = next_pidmap(pid_ns, nr);
}
read_unlock(&tasklist_lock);

View file

@ -1370,7 +1370,8 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
if (task_cputime_expired(&group_sample, &sig->cputime_expires))
return 1;
}
return 0;
return sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY;
}
/*

View file

@ -22,6 +22,7 @@
#include <linux/console.h>
#include <linux/cpu.h>
#include <linux/freezer.h>
#include <asm/suspend.h>
#include "power.h"
@ -214,7 +215,7 @@ static int create_image(int platform_mode)
return error;
device_pm_lock();
local_irq_disable();
/* At this point, device_suspend() has been called, but *not*
* device_power_down(). We *must* call device_power_down() now.
* Otherwise, drivers for some devices (e.g. interrupt controllers)
@ -225,13 +226,25 @@ static int create_image(int platform_mode)
if (error) {
printk(KERN_ERR "PM: Some devices failed to power down, "
"aborting hibernation\n");
goto Enable_irqs;
goto Unlock;
}
error = platform_pre_snapshot(platform_mode);
if (error || hibernation_test(TEST_PLATFORM))
goto Platform_finish;
error = disable_nonboot_cpus();
if (error || hibernation_test(TEST_CPUS)
|| hibernation_testmode(HIBERNATION_TEST))
goto Enable_cpus;
local_irq_disable();
sysdev_suspend(PMSG_FREEZE);
if (error) {
printk(KERN_ERR "PM: Some devices failed to power down, "
"aborting hibernation\n");
goto Power_up_devices;
goto Enable_irqs;
}
if (hibernation_test(TEST_CORE))
@ -247,17 +260,28 @@ static int create_image(int platform_mode)
restore_processor_state();
if (!in_suspend)
platform_leave(platform_mode);
Power_up:
sysdev_resume();
/* NOTE: device_power_up() is just a resume() for devices
* that suspended with irqs off ... no overall powerup.
*/
Power_up_devices:
device_power_up(in_suspend ?
(error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
Enable_irqs:
local_irq_enable();
Enable_cpus:
enable_nonboot_cpus();
Platform_finish:
platform_finish(platform_mode);
device_power_up(in_suspend ?
(error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
Unlock:
device_pm_unlock();
return error;
}
@ -265,7 +289,7 @@ static int create_image(int platform_mode)
* hibernation_snapshot - quiesce devices and create the hibernation
* snapshot image.
* @platform_mode - if set, use the platform driver, if available, to
* prepare the platform frimware for the power transition.
* prepare the platform firmware for the power transition.
*
* Must be called with pm_mutex held
*/
@ -291,25 +315,9 @@ int hibernation_snapshot(int platform_mode)
if (hibernation_test(TEST_DEVICES))
goto Recover_platform;
error = platform_pre_snapshot(platform_mode);
if (error || hibernation_test(TEST_PLATFORM))
goto Finish;
error = create_image(platform_mode);
/* Control returns here after successful restore */
error = disable_nonboot_cpus();
if (!error) {
if (hibernation_test(TEST_CPUS))
goto Enable_cpus;
if (hibernation_testmode(HIBERNATION_TEST))
goto Enable_cpus;
error = create_image(platform_mode);
/* Control returns here after successful restore */
}
Enable_cpus:
enable_nonboot_cpus();
Finish:
platform_finish(platform_mode);
Resume_devices:
device_resume(in_suspend ?
(error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
@ -331,19 +339,33 @@ int hibernation_snapshot(int platform_mode)
* kernel.
*/
static int resume_target_kernel(void)
static int resume_target_kernel(bool platform_mode)
{
int error;
device_pm_lock();
local_irq_disable();
error = device_power_down(PMSG_QUIESCE);
if (error) {
printk(KERN_ERR "PM: Some devices failed to power down, "
"aborting resume\n");
goto Enable_irqs;
goto Unlock;
}
sysdev_suspend(PMSG_QUIESCE);
error = platform_pre_restore(platform_mode);
if (error)
goto Cleanup;
error = disable_nonboot_cpus();
if (error)
goto Enable_cpus;
local_irq_disable();
error = sysdev_suspend(PMSG_QUIESCE);
if (error)
goto Enable_irqs;
/* We'll ignore saved state, but this gets preempt count (etc) right */
save_processor_state();
error = restore_highmem();
@ -366,11 +388,23 @@ static int resume_target_kernel(void)
swsusp_free();
restore_processor_state();
touch_softlockup_watchdog();
sysdev_resume();
device_power_up(PMSG_RECOVER);
Enable_irqs:
local_irq_enable();
Enable_cpus:
enable_nonboot_cpus();
Cleanup:
platform_restore_cleanup(platform_mode);
device_power_up(PMSG_RECOVER);
Unlock:
device_pm_unlock();
return error;
}
@ -378,7 +412,7 @@ static int resume_target_kernel(void)
* hibernation_restore - quiesce devices and restore the hibernation
* snapshot image. If successful, control returns in hibernation_snaphot()
* @platform_mode - if set, use the platform driver, if available, to
* prepare the platform frimware for the transition.
* prepare the platform firmware for the transition.
*
* Must be called with pm_mutex held
*/
@ -390,19 +424,10 @@ int hibernation_restore(int platform_mode)
pm_prepare_console();
suspend_console();
error = device_suspend(PMSG_QUIESCE);
if (error)
goto Finish;
error = platform_pre_restore(platform_mode);
if (!error) {
error = disable_nonboot_cpus();
if (!error)
error = resume_target_kernel();
enable_nonboot_cpus();
error = resume_target_kernel(platform_mode);
device_resume(PMSG_RECOVER);
}
platform_restore_cleanup(platform_mode);
device_resume(PMSG_RECOVER);
Finish:
resume_console();
pm_restore_console();
return error;
@ -438,38 +463,46 @@ int hibernation_platform_enter(void)
goto Resume_devices;
}
device_pm_lock();
error = device_power_down(PMSG_HIBERNATE);
if (error)
goto Unlock;
error = hibernation_ops->prepare();
if (error)
goto Resume_devices;
goto Platform_finish;
error = disable_nonboot_cpus();
if (error)
goto Finish;
goto Platform_finish;
device_pm_lock();
local_irq_disable();
error = device_power_down(PMSG_HIBERNATE);
if (!error) {
sysdev_suspend(PMSG_HIBERNATE);
hibernation_ops->enter();
/* We should never get here */
while (1);
}
local_irq_enable();
device_pm_unlock();
sysdev_suspend(PMSG_HIBERNATE);
hibernation_ops->enter();
/* We should never get here */
while (1);
/*
* We don't need to reenable the nonboot CPUs or resume consoles, since
* the system is going to be halted anyway.
*/
Finish:
Platform_finish:
hibernation_ops->finish();
device_power_up(PMSG_RESTORE);
Unlock:
device_pm_unlock();
Resume_devices:
entering_platform_hibernation = false;
device_resume(PMSG_RESTORE);
resume_console();
Close:
hibernation_ops->end();
return error;
}

View file

@ -287,17 +287,32 @@ void __attribute__ ((weak)) arch_suspend_enable_irqs(void)
*/
static int suspend_enter(suspend_state_t state)
{
int error = 0;
int error;
device_pm_lock();
arch_suspend_disable_irqs();
BUG_ON(!irqs_disabled());
if ((error = device_power_down(PMSG_SUSPEND))) {
error = device_power_down(PMSG_SUSPEND);
if (error) {
printk(KERN_ERR "PM: Some devices failed to power down\n");
goto Done;
}
if (suspend_ops->prepare) {
error = suspend_ops->prepare();
if (error)
goto Power_up_devices;
}
if (suspend_test(TEST_PLATFORM))
goto Platform_finish;
error = disable_nonboot_cpus();
if (error || suspend_test(TEST_CPUS))
goto Enable_cpus;
arch_suspend_disable_irqs();
BUG_ON(!irqs_disabled());
error = sysdev_suspend(PMSG_SUSPEND);
if (!error) {
if (!suspend_test(TEST_CORE))
@ -305,11 +320,22 @@ static int suspend_enter(suspend_state_t state)
sysdev_resume();
}
device_power_up(PMSG_RESUME);
Done:
arch_suspend_enable_irqs();
BUG_ON(irqs_disabled());
Enable_cpus:
enable_nonboot_cpus();
Platform_finish:
if (suspend_ops->finish)
suspend_ops->finish();
Power_up_devices:
device_power_up(PMSG_RESUME);
Done:
device_pm_unlock();
return error;
}
@ -341,23 +367,8 @@ int suspend_devices_and_enter(suspend_state_t state)
if (suspend_test(TEST_DEVICES))
goto Recover_platform;
if (suspend_ops->prepare) {
error = suspend_ops->prepare();
if (error)
goto Resume_devices;
}
suspend_enter(state);
if (suspend_test(TEST_PLATFORM))
goto Finish;
error = disable_nonboot_cpus();
if (!error && !suspend_test(TEST_CPUS))
suspend_enter(state);
enable_nonboot_cpus();
Finish:
if (suspend_ops->finish)
suspend_ops->finish();
Resume_devices:
suspend_test_start();
device_resume(PMSG_RESUME);

View file

@ -321,13 +321,10 @@ static int create_mem_extents(struct list_head *list, gfp_t gfp_mask)
INIT_LIST_HEAD(list);
for_each_zone(zone) {
for_each_populated_zone(zone) {
unsigned long zone_start, zone_end;
struct mem_extent *ext, *cur, *aux;
if (!populated_zone(zone))
continue;
zone_start = zone->zone_start_pfn;
zone_end = zone->zone_start_pfn + zone->spanned_pages;
@ -804,8 +801,8 @@ static unsigned int count_free_highmem_pages(void)
struct zone *zone;
unsigned int cnt = 0;
for_each_zone(zone)
if (populated_zone(zone) && is_highmem(zone))
for_each_populated_zone(zone)
if (is_highmem(zone))
cnt += zone_page_state(zone, NR_FREE_PAGES);
return cnt;

View file

@ -51,6 +51,7 @@
#include <linux/highmem.h>
#include <linux/time.h>
#include <linux/rbtree.h>
#include <linux/io.h>
#include "power.h"
@ -229,17 +230,16 @@ int swsusp_shrink_memory(void)
size = count_data_pages() + PAGES_FOR_IO + SPARE_PAGES;
tmp = size;
size += highmem_size;
for_each_zone (zone)
if (populated_zone(zone)) {
tmp += snapshot_additional_pages(zone);
if (is_highmem(zone)) {
highmem_size -=
for_each_populated_zone(zone) {
tmp += snapshot_additional_pages(zone);
if (is_highmem(zone)) {
highmem_size -=
zone_page_state(zone, NR_FREE_PAGES);
} else {
tmp -= zone_page_state(zone, NR_FREE_PAGES);
tmp += zone->lowmem_reserve[ZONE_NORMAL];
}
} else {
tmp -= zone_page_state(zone, NR_FREE_PAGES);
tmp += zone->lowmem_reserve[ZONE_NORMAL];
}
}
if (highmem_size < 0)
highmem_size = 0;

View file

@ -32,6 +32,7 @@
#include <linux/security.h>
#include <linux/bootmem.h>
#include <linux/syscalls.h>
#include <linux/kexec.h>
#include <asm/uaccess.h>
@ -135,6 +136,24 @@ static char *log_buf = __log_buf;
static int log_buf_len = __LOG_BUF_LEN;
static unsigned logged_chars; /* Number of chars produced since last read+clear operation */
#ifdef CONFIG_KEXEC
/*
* This appends the listed symbols to /proc/vmcoreinfo
*
* /proc/vmcoreinfo is used by various utilities, like crash and makedumpfile, to
* obtain access to symbols that are otherwise very difficult to locate. These
* symbols are specifically used so that utilities can access and extract the
* dmesg log from a vmcore file after a crash.
*/
void log_buf_kexec_setup(void)
{
VMCOREINFO_SYMBOL(log_buf);
VMCOREINFO_SYMBOL(log_end);
VMCOREINFO_SYMBOL(log_buf_len);
VMCOREINFO_SYMBOL(logged_chars);
}
#endif
static int __init log_buf_len_setup(char *str)
{
unsigned size = memparse(str, &str);
@ -1292,8 +1311,11 @@ EXPORT_SYMBOL(printk_ratelimit);
bool printk_timed_ratelimit(unsigned long *caller_jiffies,
unsigned int interval_msecs)
{
if (*caller_jiffies == 0 || time_after(jiffies, *caller_jiffies)) {
*caller_jiffies = jiffies + msecs_to_jiffies(interval_msecs);
if (*caller_jiffies == 0
|| !time_in_range(jiffies, *caller_jiffies,
*caller_jiffies
+ msecs_to_jiffies(interval_msecs))) {
*caller_jiffies = jiffies;
return true;
}
return false;
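A hedged usage sketch for the repaired printk_timed_ratelimit(): one static jiffies cookie per call site, and with the new time_in_range() test the throttling now also behaves sanely across a jiffies wrap. The names are illustrative:

static void report_overrun(void)
{
	static unsigned long last_warned;	/* 0 initially, so the first call prints */

	if (printk_timed_ratelimit(&last_warned, 5000))	/* at most once per 5 s */
		printk(KERN_WARNING "demo: ring buffer overrun\n");
}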

View file

@ -60,11 +60,15 @@ static void ptrace_untrace(struct task_struct *child)
{
spin_lock(&child->sighand->siglock);
if (task_is_traced(child)) {
if (child->signal->flags & SIGNAL_STOP_STOPPED) {
/*
* If the group stop is completed or in progress,
* this thread was already counted as stopped.
*/
if (child->signal->flags & SIGNAL_STOP_STOPPED ||
child->signal->group_stop_count)
__set_task_state(child, TASK_STOPPED);
} else {
else
signal_wake_up(child, 1);
}
}
spin_unlock(&child->sighand->siglock);
}
@ -235,18 +239,58 @@ out:
return retval;
}
static inline void __ptrace_detach(struct task_struct *child, unsigned int data)
/*
* Called with irqs disabled; returns true if the children should reap themselves.
*/
static int ignoring_children(struct sighand_struct *sigh)
{
child->exit_code = data;
/* .. re-parent .. */
__ptrace_unlink(child);
/* .. and wake it up. */
if (child->exit_state != EXIT_ZOMBIE)
wake_up_process(child);
int ret;
spin_lock(&sigh->siglock);
ret = (sigh->action[SIGCHLD-1].sa.sa_handler == SIG_IGN) ||
(sigh->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT);
spin_unlock(&sigh->siglock);
return ret;
}
/*
* Called with tasklist_lock held for writing.
* Unlink a traced task, and clean it up if it was a traced zombie.
* Return true if it needs to be reaped with release_task().
* (We can't call release_task() here because we already hold tasklist_lock.)
*
* If it's a zombie, our attachedness prevented normal parent notification
* or self-reaping. Do notification now if it would have happened earlier.
* If it should reap itself, return true.
*
* If it's our own child, there is no notification to do.
* But if our normal children self-reap, then this child
* was prevented by ptrace and we must reap it now.
*/
static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p)
{
__ptrace_unlink(p);
if (p->exit_state == EXIT_ZOMBIE) {
if (!task_detached(p) && thread_group_empty(p)) {
if (!same_thread_group(p->real_parent, tracer))
do_notify_parent(p, p->exit_signal);
else if (ignoring_children(tracer->sighand))
p->exit_signal = -1;
}
if (task_detached(p)) {
/* Mark it as in the process of being reaped. */
p->exit_state = EXIT_DEAD;
return true;
}
}
return false;
}
int ptrace_detach(struct task_struct *child, unsigned int data)
{
bool dead = false;
if (!valid_signal(data))
return -EIO;
@ -255,14 +299,45 @@ int ptrace_detach(struct task_struct *child, unsigned int data)
clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
write_lock_irq(&tasklist_lock);
/* protect against de_thread()->release_task() */
if (child->ptrace)
__ptrace_detach(child, data);
/*
* This child can be already killed. Make sure de_thread() or
* our sub-thread doing do_wait() didn't do release_task() yet.
*/
if (child->ptrace) {
child->exit_code = data;
dead = __ptrace_detach(current, child);
}
write_unlock_irq(&tasklist_lock);
if (unlikely(dead))
release_task(child);
return 0;
}
/*
* Detach all tasks we were using ptrace on.
*/
void exit_ptrace(struct task_struct *tracer)
{
struct task_struct *p, *n;
LIST_HEAD(ptrace_dead);
write_lock_irq(&tasklist_lock);
list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) {
if (__ptrace_detach(tracer, p))
list_add(&p->ptrace_entry, &ptrace_dead);
}
write_unlock_irq(&tasklist_lock);
BUG_ON(!list_empty(&tracer->ptraced));
list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_entry) {
list_del_init(&p->ptrace_entry);
release_task(p);
}
}
int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len)
{
int copied = 0;
@ -612,8 +687,6 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data)
goto out_put_task_struct;
ret = arch_ptrace(child, request, addr, data);
if (ret < 0)
goto out_put_task_struct;
out_put_task_struct:
put_task_struct(child);

View file

@ -122,6 +122,8 @@ static void rcu_barrier_func(void *type)
}
}
static inline void wait_migrated_callbacks(void);
/*
* Orchestrate the specified type of RCU barrier, waiting for all
* RCU callbacks of the specified type to complete.
@ -147,6 +149,7 @@ static void _rcu_barrier(enum rcu_barrier type)
complete(&rcu_barrier_completion);
wait_for_completion(&rcu_barrier_completion);
mutex_unlock(&rcu_barrier_mutex);
wait_migrated_callbacks();
}
/**
@ -176,9 +179,50 @@ void rcu_barrier_sched(void)
}
EXPORT_SYMBOL_GPL(rcu_barrier_sched);
static atomic_t rcu_migrate_type_count = ATOMIC_INIT(0);
static struct rcu_head rcu_migrate_head[3];
static DECLARE_WAIT_QUEUE_HEAD(rcu_migrate_wq);
static void rcu_migrate_callback(struct rcu_head *notused)
{
if (atomic_dec_and_test(&rcu_migrate_type_count))
wake_up(&rcu_migrate_wq);
}
static inline void wait_migrated_callbacks(void)
{
wait_event(rcu_migrate_wq, !atomic_read(&rcu_migrate_type_count));
}
static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self,
unsigned long action, void *hcpu)
{
if (action == CPU_DYING) {
/*
* preempt_disable() in on_each_cpu() prevents stop_machine(),
* so when "on_each_cpu(rcu_barrier_func, (void *)type, 1);"
* returns, all online cpus have queued rcu_barrier_func(),
* and the dead cpu (if it exists) queues rcu_migrate_callback()s.
*
* These callbacks ensure _rcu_barrier() waits for all
* RCU callbacks of the specified type to complete.
*/
atomic_set(&rcu_migrate_type_count, 3);
call_rcu_bh(rcu_migrate_head, rcu_migrate_callback);
call_rcu_sched(rcu_migrate_head + 1, rcu_migrate_callback);
call_rcu(rcu_migrate_head + 2, rcu_migrate_callback);
} else if (action == CPU_POST_DEAD) {
/* rcu_migrate_head is protected by cpu_add_remove_lock */
wait_migrated_callbacks();
}
return NOTIFY_OK;
}
void __init rcu_init(void)
{
__rcu_init();
hotcpu_notifier(rcu_barrier_cpu_hotplug, 0);
}
void rcu_scheduler_starting(void)

View file

@ -126,6 +126,7 @@ static atomic_t n_rcu_torture_mberror;
static atomic_t n_rcu_torture_error;
static long n_rcu_torture_timers = 0;
static struct list_head rcu_torture_removed;
static cpumask_var_t shuffle_tmp_mask;
static int stutter_pause_test = 0;
@ -889,10 +890,9 @@ static int rcu_idle_cpu; /* Force all torture tasks off this CPU */
*/
static void rcu_torture_shuffle_tasks(void)
{
cpumask_t tmp_mask;
int i;
cpus_setall(tmp_mask);
cpumask_setall(shuffle_tmp_mask);
get_online_cpus();
/* No point in shuffling if there is only one online CPU (ex: UP) */
@ -902,29 +902,29 @@ static void rcu_torture_shuffle_tasks(void)
}
if (rcu_idle_cpu != -1)
cpu_clear(rcu_idle_cpu, tmp_mask);
cpumask_clear_cpu(rcu_idle_cpu, shuffle_tmp_mask);
set_cpus_allowed_ptr(current, &tmp_mask);
set_cpus_allowed_ptr(current, shuffle_tmp_mask);
if (reader_tasks) {
for (i = 0; i < nrealreaders; i++)
if (reader_tasks[i])
set_cpus_allowed_ptr(reader_tasks[i],
&tmp_mask);
shuffle_tmp_mask);
}
if (fakewriter_tasks) {
for (i = 0; i < nfakewriters; i++)
if (fakewriter_tasks[i])
set_cpus_allowed_ptr(fakewriter_tasks[i],
&tmp_mask);
shuffle_tmp_mask);
}
if (writer_task)
set_cpus_allowed_ptr(writer_task, &tmp_mask);
set_cpus_allowed_ptr(writer_task, shuffle_tmp_mask);
if (stats_task)
set_cpus_allowed_ptr(stats_task, &tmp_mask);
set_cpus_allowed_ptr(stats_task, shuffle_tmp_mask);
if (rcu_idle_cpu == -1)
rcu_idle_cpu = num_online_cpus() - 1;
@ -1012,6 +1012,7 @@ rcu_torture_cleanup(void)
if (shuffler_task) {
VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task");
kthread_stop(shuffler_task);
free_cpumask_var(shuffle_tmp_mask);
}
shuffler_task = NULL;
@ -1190,10 +1191,18 @@ rcu_torture_init(void)
}
if (test_no_idle_hz) {
rcu_idle_cpu = num_online_cpus() - 1;
if (!alloc_cpumask_var(&shuffle_tmp_mask, GFP_KERNEL)) {
firsterr = -ENOMEM;
VERBOSE_PRINTK_ERRSTRING("Failed to alloc mask");
goto unwind;
}
/* Create the shuffler thread */
shuffler_task = kthread_run(rcu_torture_shuffle, NULL,
"rcu_torture_shuffle");
if (IS_ERR(shuffler_task)) {
free_cpumask_var(shuffle_tmp_mask);
firsterr = PTR_ERR(shuffler_task);
VERBOSE_PRINTK_ERRSTRING("Failed to create shuffler");
shuffler_task = NULL;
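A hedged distillation of the cpumask_var_t idiom this file is converted to, allocating the mask once instead of keeping a full cpumask_t on the stack; the function names are illustrative:

#include <linux/cpumask.h>
#include <linux/slab.h>

static cpumask_var_t demo_mask;

static int demo_init(void)
{
	if (!alloc_cpumask_var(&demo_mask, GFP_KERNEL))
		return -ENOMEM;		/* with CONFIG_CPUMASK_OFFSTACK this really allocates */
	cpumask_setall(demo_mask);
	cpumask_clear_cpu(0, demo_mask);	/* e.g. keep CPU 0 out of the set */
	return 0;
}

static void demo_exit(void)
{
	free_cpumask_var(demo_mask);
}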

View file

@ -677,9 +677,7 @@ int relay_late_setup_files(struct rchan *chan,
*/
for_each_online_cpu(i) {
if (unlikely(!chan->buf[i])) {
printk(KERN_ERR "relay_late_setup_files: CPU %u "
"has no buffer, it must have!\n", i);
BUG();
WARN_ONCE(1, KERN_ERR "CPU has no buffer!\n");
err = -EINVAL;
break;
}
@ -750,7 +748,7 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
* from the scheduler (trying to re-grab
* rq->lock), so defer it.
*/
__mod_timer(&buf->timer, jiffies + 1);
mod_timer(&buf->timer, jiffies + 1);
}
old = buf->data;
@ -797,13 +795,15 @@ void relay_subbufs_consumed(struct rchan *chan,
if (!chan)
return;
if (cpu >= NR_CPUS || !chan->buf[cpu])
if (cpu >= NR_CPUS || !chan->buf[cpu] ||
subbufs_consumed > chan->n_subbufs)
return;
buf = chan->buf[cpu];
buf->subbufs_consumed += subbufs_consumed;
if (buf->subbufs_consumed > buf->subbufs_produced)
if (subbufs_consumed > buf->subbufs_produced - buf->subbufs_consumed)
buf->subbufs_consumed = buf->subbufs_produced;
else
buf->subbufs_consumed += subbufs_consumed;
}
EXPORT_SYMBOL_GPL(relay_subbufs_consumed);

File diff suppressed because it is too large.

View file

@ -24,11 +24,12 @@
* The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat
* consistent between cpus (never more than 2 jiffies difference).
*/
#include <linux/sched.h>
#include <linux/percpu.h>
#include <linux/spinlock.h>
#include <linux/ktime.h>
#include <linux/hardirq.h>
#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/ktime.h>
#include <linux/sched.h>
/*
* Scheduler clock - returns current time in nanosec units.
@ -43,6 +44,7 @@ unsigned long long __attribute__((weak)) sched_clock(void)
static __read_mostly int sched_clock_running;
#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
__read_mostly int sched_clock_stable;
struct sched_clock_data {
/*
@ -87,7 +89,7 @@ void sched_clock_init(void)
}
/*
* min,max except they take wrapping into account
* min, max except they take wrapping into account
*/
static inline u64 wrap_min(u64 x, u64 y)
@ -111,15 +113,13 @@ static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now)
s64 delta = now - scd->tick_raw;
u64 clock, min_clock, max_clock;
WARN_ON_ONCE(!irqs_disabled());
if (unlikely(delta < 0))
delta = 0;
/*
* scd->clock = clamp(scd->tick_gtod + delta,
* max(scd->tick_gtod, scd->clock),
* scd->tick_gtod + TICK_NSEC);
* max(scd->tick_gtod, scd->clock),
* scd->tick_gtod + TICK_NSEC);
*/
clock = scd->tick_gtod + delta;
@ -148,8 +148,20 @@ static void lock_double_clock(struct sched_clock_data *data1,
u64 sched_clock_cpu(int cpu)
{
struct sched_clock_data *scd = cpu_sdc(cpu);
u64 now, clock, this_clock, remote_clock;
struct sched_clock_data *scd;
if (sched_clock_stable)
return sched_clock();
scd = cpu_sdc(cpu);
/*
* Normally this is not called in NMI context - but if it is,
* trying to do any locking here is totally lethal.
*/
if (unlikely(in_nmi()))
return scd->clock;
if (unlikely(!sched_clock_running))
return 0ull;
@ -195,14 +207,18 @@ u64 sched_clock_cpu(int cpu)
void sched_clock_tick(void)
{
struct sched_clock_data *scd = this_scd();
struct sched_clock_data *scd;
u64 now, now_gtod;
if (sched_clock_stable)
return;
if (unlikely(!sched_clock_running))
return;
WARN_ON_ONCE(!irqs_disabled());
scd = this_scd();
now_gtod = ktime_to_ns(ktime_get());
now = sched_clock();
@ -250,7 +266,7 @@ u64 sched_clock_cpu(int cpu)
return sched_clock();
}
#endif
#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
unsigned long long cpu_clock(int cpu)
{

View file

@ -25,7 +25,7 @@ struct cpupri {
#ifdef CONFIG_SMP
int cpupri_find(struct cpupri *cp,
struct task_struct *p, cpumask_t *lowest_mask);
struct task_struct *p, struct cpumask *lowest_mask);
void cpupri_set(struct cpupri *cp, int cpu, int pri);
int cpupri_init(struct cpupri *cp, bool bootmem);
void cpupri_cleanup(struct cpupri *cp);

View file

@ -272,7 +272,6 @@ static void print_cpu(struct seq_file *m, int cpu)
P(nr_switches);
P(nr_load_updates);
P(nr_uninterruptible);
SEQ_printf(m, " .%-30s: %lu\n", "jiffies", jiffies);
PN(next_balance);
P(curr->pid);
PN(clock);
@ -287,9 +286,6 @@ static void print_cpu(struct seq_file *m, int cpu)
#ifdef CONFIG_SCHEDSTATS
#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n);
P(yld_exp_empty);
P(yld_act_empty);
P(yld_both_empty);
P(yld_count);
P(sched_switch);
@ -314,7 +310,7 @@ static int sched_debug_show(struct seq_file *m, void *v)
u64 now = ktime_to_ns(ktime_get());
int cpu;
SEQ_printf(m, "Sched Debug Version: v0.08, %s %.*s\n",
SEQ_printf(m, "Sched Debug Version: v0.09, %s %.*s\n",
init_utsname()->release,
(int)strcspn(init_utsname()->version, " "),
init_utsname()->version);
@ -325,6 +321,7 @@ static int sched_debug_show(struct seq_file *m, void *v)
SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x))
#define PN(x) \
SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
P(jiffies);
PN(sysctl_sched_latency);
PN(sysctl_sched_min_granularity);
PN(sysctl_sched_wakeup_granularity);
@ -397,6 +394,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
PN(se.vruntime);
PN(se.sum_exec_runtime);
PN(se.avg_overlap);
PN(se.avg_wakeup);
nr_switches = p->nvcsw + p->nivcsw;

View file

@ -1314,16 +1314,63 @@ out:
}
#endif /* CONFIG_SMP */
static unsigned long wakeup_gran(struct sched_entity *se)
/*
* Adaptive granularity
*
* se->avg_wakeup gives the average time a task runs until it does a wakeup,
* with the limit of wakeup_gran -- when it never does a wakeup.
*
* So the smaller avg_wakeup is, the faster we want this task to preempt;
* but we don't want to treat the preemptee unfairly, so we still allow it
* to run for at least the amount of time we'd like to run ourselves.
*
* NOTE: we use 2*avg_wakeup to increase the probability of the wakeup
* actually happening within that window.
*
* NOTE: we use *nr_running to scale with load; this nicely matches the
* degrading latency under load.
*/
static unsigned long
adaptive_gran(struct sched_entity *curr, struct sched_entity *se)
{
u64 this_run = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
u64 expected_wakeup = 2*se->avg_wakeup * cfs_rq_of(se)->nr_running;
u64 gran = 0;
if (this_run < expected_wakeup)
gran = expected_wakeup - this_run;
return min_t(s64, gran, sysctl_sched_wakeup_granularity);
}
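/*
 * Worked example (illustrative numbers only): suppose the wakee's
 * avg_wakeup is 1ms, the cfs_rq holds 2 runnable tasks and curr has run
 * for 3ms since it was last switched in.  Then:
 *
 *	expected_wakeup = 2 * 1ms * 2 = 4ms
 *	this_run        = 3ms
 *	gran            = 4ms - 3ms  = 1ms
 *
 * clamped to sysctl_sched_wakeup_granularity, so a frequently waking wakee
 * earns a smaller preemption granularity than the default.
 */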
static unsigned long
wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
{
unsigned long gran = sysctl_sched_wakeup_granularity;
if (cfs_rq_of(curr)->curr && sched_feat(ADAPTIVE_GRAN))
gran = adaptive_gran(curr, se);
/*
* More easily preempt - nice tasks, while not making it harder for
* + nice tasks.
* Since it's curr that is running now, convert the gran from real-time
* to virtual-time in its units.
*/
if (!sched_feat(ASYM_GRAN) || se->load.weight > NICE_0_LOAD)
gran = calc_delta_fair(sysctl_sched_wakeup_granularity, se);
if (sched_feat(ASYM_GRAN)) {
/*
* By using 'se' instead of 'curr' we penalize light tasks, so
* they get preempted more easily. That is, if 'se' < 'curr' then
* the resulting gran will be larger, therefore penalizing the
* lighter task; if, on the other hand, 'se' > 'curr' then the
* resulting gran will be smaller, again penalizing the lighter task.
*
* This is especially important for buddies when the leftmost
* task is higher priority than the buddy.
*/
if (unlikely(se->load.weight != NICE_0_LOAD))
gran = calc_delta_fair(gran, se);
} else {
if (unlikely(curr->load.weight != NICE_0_LOAD))
gran = calc_delta_fair(gran, curr);
}
return gran;
}
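/*
 * Worked example (illustrative; assumes NICE_0_LOAD == 1024): with
 * ASYM_GRAN, calc_delta_fair() scales gran by NICE_0_LOAD / se->load.weight,
 * so a wakee at half weight (load.weight == 512) sees gran doubled and must
 * build up twice the vruntime lead before it may preempt.
 */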
@ -1350,7 +1397,7 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
if (vdiff <= 0)
return -1;
gran = wakeup_gran(curr);
gran = wakeup_gran(curr, se);
if (vdiff > gran)
return 1;

View file

@ -1,5 +1,6 @@
SCHED_FEAT(NEW_FAIR_SLEEPERS, 1)
SCHED_FEAT(NORMALIZED_SLEEPER, 1)
SCHED_FEAT(NORMALIZED_SLEEPER, 0)
SCHED_FEAT(ADAPTIVE_GRAN, 1)
SCHED_FEAT(WAKEUP_PREEMPT, 1)
SCHED_FEAT(START_DEBIT, 1)
SCHED_FEAT(AFFINE_WAKEUPS, 1)
@ -13,3 +14,4 @@ SCHED_FEAT(LB_WAKEUP_UPDATE, 1)
SCHED_FEAT(ASYM_EFF_LOAD, 1)
SCHED_FEAT(WAKEUP_OVERLAP, 0)
SCHED_FEAT(LAST_BUDDY, 1)
SCHED_FEAT(OWNER_SPIN, 1)

View file

@ -3,6 +3,40 @@
* policies)
*/
static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
{
return container_of(rt_se, struct task_struct, rt);
}
#ifdef CONFIG_RT_GROUP_SCHED
static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
{
return rt_rq->rq;
}
static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
{
return rt_se->rt_rq;
}
#else /* CONFIG_RT_GROUP_SCHED */
static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
{
return container_of(rt_rq, struct rq, rt);
}
static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
{
struct task_struct *p = rt_task_of(rt_se);
struct rq *rq = task_rq(p);
return &rq->rt;
}
#endif /* CONFIG_RT_GROUP_SCHED */
#ifdef CONFIG_SMP
static inline int rt_overloaded(struct rq *rq)
@ -37,25 +71,69 @@ static inline void rt_clear_overload(struct rq *rq)
cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
}
static void update_rt_migration(struct rq *rq)
static void update_rt_migration(struct rt_rq *rt_rq)
{
if (rq->rt.rt_nr_migratory && (rq->rt.rt_nr_running > 1)) {
if (!rq->rt.overloaded) {
rt_set_overload(rq);
rq->rt.overloaded = 1;
if (rt_rq->rt_nr_migratory && (rt_rq->rt_nr_running > 1)) {
if (!rt_rq->overloaded) {
rt_set_overload(rq_of_rt_rq(rt_rq));
rt_rq->overloaded = 1;
}
} else if (rq->rt.overloaded) {
rt_clear_overload(rq);
rq->rt.overloaded = 0;
} else if (rt_rq->overloaded) {
rt_clear_overload(rq_of_rt_rq(rt_rq));
rt_rq->overloaded = 0;
}
}
#endif /* CONFIG_SMP */
static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
return container_of(rt_se, struct task_struct, rt);
if (rt_se->nr_cpus_allowed > 1)
rt_rq->rt_nr_migratory++;
update_rt_migration(rt_rq);
}
static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
if (rt_se->nr_cpus_allowed > 1)
rt_rq->rt_nr_migratory--;
update_rt_migration(rt_rq);
}
static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
{
plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
plist_node_init(&p->pushable_tasks, p->prio);
plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);
}
static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
{
plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
}
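/*
 * Sketch (hypothetical helper, illustrative only): pushable_tasks is a
 * prio-sorted plist, so the highest-priority pushable task always sits at
 * the head and can be peeked in O(1):
 */
#if 0
static struct task_struct *peek_pushable_task(struct rq *rq)
{
	if (plist_head_empty(&rq->rt.pushable_tasks))
		return NULL;
	return plist_first_entry(&rq->rt.pushable_tasks,
				 struct task_struct, pushable_tasks);
}
#endif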
#else
static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
{
}
static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
{
}
static inline
void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
}
static inline
void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
}
#endif /* CONFIG_SMP */
static inline int on_rt_rq(struct sched_rt_entity *rt_se)
{
return !list_empty(&rt_se->run_list);
@ -79,16 +157,6 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
#define for_each_leaf_rt_rq(rt_rq, rq) \
list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
{
return rt_rq->rq;
}
static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
{
return rt_se->rt_rq;
}
#define for_each_sched_rt_entity(rt_se) \
for (; rt_se; rt_se = rt_se->parent)
@ -108,7 +176,7 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
if (rt_rq->rt_nr_running) {
if (rt_se && !on_rt_rq(rt_se))
enqueue_rt_entity(rt_se);
if (rt_rq->highest_prio < curr->prio)
if (rt_rq->highest_prio.curr < curr->prio)
resched_task(curr);
}
}
@ -176,19 +244,6 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
#define for_each_leaf_rt_rq(rt_rq, rq) \
for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
{
return container_of(rt_rq, struct rq, rt);
}
static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
{
struct task_struct *p = rt_task_of(rt_se);
struct rq *rq = task_rq(p);
return &rq->rt;
}
#define for_each_sched_rt_entity(rt_se) \
for (; rt_se; rt_se = NULL)
@ -473,7 +528,7 @@ static inline int rt_se_prio(struct sched_rt_entity *rt_se)
struct rt_rq *rt_rq = group_rt_rq(rt_se);
if (rt_rq)
return rt_rq->highest_prio;
return rt_rq->highest_prio.curr;
#endif
return rt_task_of(rt_se)->prio;
@ -547,91 +602,174 @@ static void update_curr_rt(struct rq *rq)
}
}
static inline
void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
#if defined CONFIG_SMP
static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu);
static inline int next_prio(struct rq *rq)
{
WARN_ON(!rt_prio(rt_se_prio(rt_se)));
rt_rq->rt_nr_running++;
#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
if (rt_se_prio(rt_se) < rt_rq->highest_prio) {
#ifdef CONFIG_SMP
struct rq *rq = rq_of_rt_rq(rt_rq);
#endif
struct task_struct *next = pick_next_highest_task_rt(rq, rq->cpu);
if (next && rt_prio(next->prio))
return next->prio;
else
return MAX_RT_PRIO;
}
static void
inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
{
struct rq *rq = rq_of_rt_rq(rt_rq);
if (prio < prev_prio) {
/*
* If the new task is higher in priority than anything on the
* run-queue, we know that the previous high becomes our
* next-highest.
*/
rt_rq->highest_prio.next = prev_prio;
rt_rq->highest_prio = rt_se_prio(rt_se);
#ifdef CONFIG_SMP
if (rq->online)
cpupri_set(&rq->rd->cpupri, rq->cpu,
rt_se_prio(rt_se));
#endif
}
#endif
#ifdef CONFIG_SMP
if (rt_se->nr_cpus_allowed > 1) {
struct rq *rq = rq_of_rt_rq(rt_rq);
cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
rq->rt.rt_nr_migratory++;
}
} else if (prio == rt_rq->highest_prio.curr)
/*
* If the next task is equal in priority to the highest on
* the run-queue, then we implicitly know that the next highest
* task cannot be any lower than current
*/
rt_rq->highest_prio.next = prio;
else if (prio < rt_rq->highest_prio.next)
/*
* Otherwise, we need to recompute next-highest
*/
rt_rq->highest_prio.next = next_prio(rq);
}
static void
dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
{
struct rq *rq = rq_of_rt_rq(rt_rq);
if (rt_rq->rt_nr_running && (prio <= rt_rq->highest_prio.next))
rt_rq->highest_prio.next = next_prio(rq);
if (rq->online && rt_rq->highest_prio.curr != prev_prio)
cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
}
#else /* CONFIG_SMP */
static inline
void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
static inline
void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
#endif /* CONFIG_SMP */
#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
static void
inc_rt_prio(struct rt_rq *rt_rq, int prio)
{
int prev_prio = rt_rq->highest_prio.curr;
if (prio < prev_prio)
rt_rq->highest_prio.curr = prio;
inc_rt_prio_smp(rt_rq, prio, prev_prio);
}
static void
dec_rt_prio(struct rt_rq *rt_rq, int prio)
{
int prev_prio = rt_rq->highest_prio.curr;
if (rt_rq->rt_nr_running) {
WARN_ON(prio < prev_prio);
/*
* This may have been our highest task, and therefore
* we may have some recomputation to do
*/
if (prio == prev_prio) {
struct rt_prio_array *array = &rt_rq->active;
rt_rq->highest_prio.curr =
sched_find_first_bit(array->bitmap);
}
} else
rt_rq->highest_prio.curr = MAX_RT_PRIO;
dec_rt_prio_smp(rt_rq, prio, prev_prio);
}
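/*
 * Worked example (illustrative): with queued RT priorities {10, 20, 30},
 * highest_prio.curr == 10 and highest_prio.next == 20.  Enqueueing a
 * prio-5 task makes the old curr the new next (curr == 5, next == 10);
 * dequeueing the prio-10 task from {10, 20, 30} makes dec_rt_prio()
 * recompute curr from the bitmap (-> 20) and dec_rt_prio_smp() recompute
 * next via next_prio() (-> 30).
 */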
#else
static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {}
static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {}
#endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */
update_rt_migration(rq_of_rt_rq(rt_rq));
#endif
#ifdef CONFIG_RT_GROUP_SCHED
static void
inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
if (rt_se_boosted(rt_se))
rt_rq->rt_nr_boosted++;
if (rt_rq->tg)
start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
#else
}
static void
dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
if (rt_se_boosted(rt_se))
rt_rq->rt_nr_boosted--;
WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
}
#else /* CONFIG_RT_GROUP_SCHED */
static void
inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
start_rt_bandwidth(&def_rt_bandwidth);
#endif
}
static inline
void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
#endif /* CONFIG_RT_GROUP_SCHED */
static inline
void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
int prio = rt_se_prio(rt_se);
WARN_ON(!rt_prio(prio));
rt_rq->rt_nr_running++;
inc_rt_prio(rt_rq, prio);
inc_rt_migration(rt_se, rt_rq);
inc_rt_group(rt_se, rt_rq);
}
static inline
void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
#ifdef CONFIG_SMP
int highest_prio = rt_rq->highest_prio;
#endif
WARN_ON(!rt_prio(rt_se_prio(rt_se)));
WARN_ON(!rt_rq->rt_nr_running);
rt_rq->rt_nr_running--;
#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
if (rt_rq->rt_nr_running) {
struct rt_prio_array *array;
WARN_ON(rt_se_prio(rt_se) < rt_rq->highest_prio);
if (rt_se_prio(rt_se) == rt_rq->highest_prio) {
/* recalculate */
array = &rt_rq->active;
rt_rq->highest_prio =
sched_find_first_bit(array->bitmap);
} /* otherwise leave rt_rq->highest_prio alone */
} else
rt_rq->highest_prio = MAX_RT_PRIO;
#endif
#ifdef CONFIG_SMP
if (rt_se->nr_cpus_allowed > 1) {
struct rq *rq = rq_of_rt_rq(rt_rq);
rq->rt.rt_nr_migratory--;
}
if (rt_rq->highest_prio != highest_prio) {
struct rq *rq = rq_of_rt_rq(rt_rq);
if (rq->online)
cpupri_set(&rq->rd->cpupri, rq->cpu,
rt_rq->highest_prio);
}
update_rt_migration(rq_of_rt_rq(rt_rq));
#endif /* CONFIG_SMP */
#ifdef CONFIG_RT_GROUP_SCHED
if (rt_se_boosted(rt_se))
rt_rq->rt_nr_boosted--;
WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
#endif
dec_rt_prio(rt_rq, rt_se_prio(rt_se));
dec_rt_migration(rt_se, rt_rq);
dec_rt_group(rt_se, rt_rq);
}
static void __enqueue_rt_entity(struct sched_rt_entity *rt_se)
@ -718,6 +856,9 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
enqueue_rt_entity(rt_se);
if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
enqueue_pushable_task(rq, p);
inc_cpu_load(rq, p->se.load.weight);
}
@ -728,6 +869,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
update_curr_rt(rq);
dequeue_rt_entity(rt_se);
dequeue_pushable_task(rq, p);
dec_cpu_load(rq, p->se.load.weight);
}
@ -878,7 +1021,7 @@ static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
return next;
}
static struct task_struct *pick_next_task_rt(struct rq *rq)
static struct task_struct *_pick_next_task_rt(struct rq *rq)
{
struct sched_rt_entity *rt_se;
struct task_struct *p;
@ -900,6 +1043,18 @@ static struct task_struct *pick_next_task_rt(struct rq *rq)
p = rt_task_of(rt_se);
p->se.exec_start = rq->clock;
return p;
}
static struct task_struct *pick_next_task_rt(struct rq *rq)
{
struct task_struct *p = _pick_next_task_rt(rq);
/* The running task is never eligible for pushing */
if (p)
dequeue_pushable_task(rq, p);
return p;
}
@ -907,6 +1062,13 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
{
update_curr_rt(rq);
p->se.exec_start = 0;
/*
* The previous task needs to be made eligible for pushing
* if it is still active
*/
if (p->se.on_rq && p->rt.nr_cpus_allowed > 1)
enqueue_pushable_task(rq, p);
}
#ifdef CONFIG_SMP
@ -1080,7 +1242,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
}
/* If this rq is still suitable use it. */
if (lowest_rq->rt.highest_prio > task->prio)
if (lowest_rq->rt.highest_prio.curr > task->prio)
break;
/* try again */
@ -1091,6 +1253,31 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
return lowest_rq;
}
static inline int has_pushable_tasks(struct rq *rq)
{
return !plist_head_empty(&rq->rt.pushable_tasks);
}
static struct task_struct *pick_next_pushable_task(struct rq *rq)
{
struct task_struct *p;
if (!has_pushable_tasks(rq))
return NULL;
p = plist_first_entry(&rq->rt.pushable_tasks,
struct task_struct, pushable_tasks);
BUG_ON(rq->cpu != task_cpu(p));
BUG_ON(task_current(rq, p));
BUG_ON(p->rt.nr_cpus_allowed <= 1);
BUG_ON(!p->se.on_rq);
BUG_ON(!rt_task(p));
return p;
}
/*
* If the current CPU has more than one RT task, see if the non
* running task can migrate over to a CPU that is running a task
@ -1100,13 +1287,11 @@ static int push_rt_task(struct rq *rq)
{
struct task_struct *next_task;
struct rq *lowest_rq;
int ret = 0;
int paranoid = RT_MAX_TRIES;
if (!rq->rt.overloaded)
return 0;
next_task = pick_next_highest_task_rt(rq, -1);
next_task = pick_next_pushable_task(rq);
if (!next_task)
return 0;
@ -1135,16 +1320,34 @@ static int push_rt_task(struct rq *rq)
struct task_struct *task;
/*
* find_lock_lowest_rq releases rq->lock
* so it is possible that next_task has changed.
* If it has, then try again.
* so it is possible that next_task has migrated.
*
* We need to make sure that the task is still on the same
* run-queue and is also still the next task eligible for
* pushing.
*/
task = pick_next_highest_task_rt(rq, -1);
if (unlikely(task != next_task) && task && paranoid--) {
put_task_struct(next_task);
next_task = task;
goto retry;
task = pick_next_pushable_task(rq);
if (task_cpu(next_task) == rq->cpu && task == next_task) {
/*
* If we get here, the task hasn't moved at all, but
* it has failed to push. We will not try again,
* since the other cpus will pull from us when they
* are ready.
*/
dequeue_pushable_task(rq, next_task);
goto out;
}
goto out;
if (!task)
/* No more tasks, just exit */
goto out;
/*
* Something has shifted, try again.
*/
put_task_struct(next_task);
next_task = task;
goto retry;
}
deactivate_task(rq, next_task, 0);
@ -1155,23 +1358,12 @@ static int push_rt_task(struct rq *rq)
double_unlock_balance(rq, lowest_rq);
ret = 1;
out:
put_task_struct(next_task);
return ret;
return 1;
}
/*
* TODO: Currently we just use the second highest prio task on
* the queue, and stop when it can't migrate (or there's
* no more RT tasks). There may be a case where a lower
* priority RT task has a different affinity than the
* higher RT task. In this case the lower RT task could
* possibly be able to migrate where as the higher priority
* RT task could not. We currently ignore this issue.
* Enhancements are welcome!
*/
static void push_rt_tasks(struct rq *rq)
{
/* push_rt_task will return true if it moved an RT */
@ -1182,33 +1374,35 @@ static void push_rt_tasks(struct rq *rq)
static int pull_rt_task(struct rq *this_rq)
{
int this_cpu = this_rq->cpu, ret = 0, cpu;
struct task_struct *p, *next;
struct task_struct *p;
struct rq *src_rq;
if (likely(!rt_overloaded(this_rq)))
return 0;
next = pick_next_task_rt(this_rq);
for_each_cpu(cpu, this_rq->rd->rto_mask) {
if (this_cpu == cpu)
continue;
src_rq = cpu_rq(cpu);
/*
* Don't bother taking the src_rq->lock if the next highest
* task is known to be lower-priority than our current task.
* This may look racy, but if this value is about to go
* logically higher, the src_rq will push this task away.
* And if it's going logically lower, we do not care
*/
if (src_rq->rt.highest_prio.next >=
this_rq->rt.highest_prio.curr)
continue;
/*
* We can potentially drop this_rq's lock in
* double_lock_balance, and another CPU could
* steal our next task - hence we must cause
* the caller to recalculate the next task
* in that case:
* alter this_rq
*/
if (double_lock_balance(this_rq, src_rq)) {
struct task_struct *old_next = next;
next = pick_next_task_rt(this_rq);
if (next != old_next)
ret = 1;
}
double_lock_balance(this_rq, src_rq);
/*
* Are there still pullable RT tasks?
@ -1222,7 +1416,7 @@ static int pull_rt_task(struct rq *this_rq)
* Do we have an RT task that preempts
* the to-be-scheduled task?
*/
if (p && (!next || (p->prio < next->prio))) {
if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
WARN_ON(p == src_rq->curr);
WARN_ON(!p->se.on_rq);
@ -1232,12 +1426,9 @@ static int pull_rt_task(struct rq *this_rq)
* This is just that p is waking up and hasn't
* had a chance to schedule. We only pull
* p if it is lower in priority than the
* current task on the run queue or
* this_rq next task is lower in prio than
* the current task on that rq.
* current task on the run queue
*/
if (p->prio < src_rq->curr->prio ||
(next && next->prio < src_rq->curr->prio))
if (p->prio < src_rq->curr->prio)
goto skip;
ret = 1;
@ -1250,13 +1441,7 @@ static int pull_rt_task(struct rq *this_rq)
* case there's an even higher prio task
* in another runqueue. (low likelihood
* but possible)
*
* Update next so that we won't pick a task
* on another cpu with a priority lower (or equal)
* than the one we just picked.
*/
next = p;
}
skip:
double_unlock_balance(this_rq, src_rq);
@ -1268,24 +1453,27 @@ static int pull_rt_task(struct rq *this_rq)
static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
{
/* Try to pull RT tasks here if we lower this rq's prio */
if (unlikely(rt_task(prev)) && rq->rt.highest_prio > prev->prio)
if (unlikely(rt_task(prev)) && rq->rt.highest_prio.curr > prev->prio)
pull_rt_task(rq);
}
/*
* assumes rq->lock is held
*/
static int needs_post_schedule_rt(struct rq *rq)
{
return has_pushable_tasks(rq);
}
static void post_schedule_rt(struct rq *rq)
{
/*
* If we have more than one rt_task queued, then
* see if we can push the other rt_tasks off to other CPUS.
* Note we may release the rq lock, and since
* the lock was owned by prev, we need to release it
* first via finish_lock_switch and then reacquire it here.
* This is only called if needs_post_schedule_rt() indicates that
* we need to push tasks away
*/
if (unlikely(rq->rt.overloaded)) {
spin_lock_irq(&rq->lock);
push_rt_tasks(rq);
spin_unlock_irq(&rq->lock);
}
spin_lock_irq(&rq->lock);
push_rt_tasks(rq);
spin_unlock_irq(&rq->lock);
}
/*
@ -1296,7 +1484,8 @@ static void task_wake_up_rt(struct rq *rq, struct task_struct *p)
{
if (!task_running(rq, p) &&
!test_tsk_need_resched(rq->curr) &&
rq->rt.overloaded)
has_pushable_tasks(rq) &&
p->rt.nr_cpus_allowed > 1)
push_rt_tasks(rq);
}
@ -1332,6 +1521,24 @@ static void set_cpus_allowed_rt(struct task_struct *p,
if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) {
struct rq *rq = task_rq(p);
if (!task_current(rq, p)) {
/*
* Make sure we dequeue this task from the pushable list
* before going further. It will either remain off of
* the list because we are no longer pushable, or it
* will be requeued.
*/
if (p->rt.nr_cpus_allowed > 1)
dequeue_pushable_task(rq, p);
/*
* Requeue if our weight is changing and still > 1
*/
if (weight > 1)
enqueue_pushable_task(rq, p);
}
if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) {
rq->rt.rt_nr_migratory++;
} else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) {
@ -1339,7 +1546,7 @@ static void set_cpus_allowed_rt(struct task_struct *p,
rq->rt.rt_nr_migratory--;
}
update_rt_migration(rq);
update_rt_migration(&rq->rt);
}
cpumask_copy(&p->cpus_allowed, new_mask);
@ -1354,7 +1561,7 @@ static void rq_online_rt(struct rq *rq)
__enable_runtime(rq);
cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio);
cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
}
/* Assumes rq->lock is held */
@ -1446,7 +1653,7 @@ static void prio_changed_rt(struct rq *rq, struct task_struct *p,
* can release the rq lock and p could migrate.
* Only reschedule if p is still on the same runqueue.
*/
if (p->prio > rq->rt.highest_prio && rq->curr == p)
if (p->prio > rq->rt.highest_prio.curr && rq->curr == p)
resched_task(p);
#else
/* For UP simply resched on drop of prio */
@ -1517,6 +1724,9 @@ static void set_curr_task_rt(struct rq *rq)
struct task_struct *p = rq->curr;
p->se.exec_start = rq->clock;
/* The running task is never eligible for pushing */
dequeue_pushable_task(rq, p);
}
static const struct sched_class rt_sched_class = {
@ -1539,6 +1749,7 @@ static const struct sched_class rt_sched_class = {
.rq_online = rq_online_rt,
.rq_offline = rq_offline_rt,
.pre_schedule = pre_schedule_rt,
.needs_post_schedule = needs_post_schedule_rt,
.post_schedule = post_schedule_rt,
.task_wake_up = task_wake_up_rt,
.switched_from = switched_from_rt,

View file

@ -4,7 +4,7 @@
* bump this up when changing the output format or the meaning of an existing
* format, so that tools can adapt (or abort)
*/
#define SCHEDSTAT_VERSION 14
#define SCHEDSTAT_VERSION 15
static int show_schedstat(struct seq_file *seq, void *v)
{
@ -26,9 +26,8 @@ static int show_schedstat(struct seq_file *seq, void *v)
/* runqueue-specific stats */
seq_printf(seq,
"cpu%d %u %u %u %u %u %u %u %u %u %llu %llu %lu",
cpu, rq->yld_both_empty,
rq->yld_act_empty, rq->yld_exp_empty, rq->yld_count,
"cpu%d %u %u %u %u %u %u %llu %llu %lu",
cpu, rq->yld_count,
rq->sched_switch, rq->sched_count, rq->sched_goidle,
rq->ttwu_count, rq->ttwu_local,
rq->rq_cpu_time,

View file

@ -55,10 +55,22 @@ static int sig_handler_ignored(void __user *handler, int sig)
(handler == SIG_DFL && sig_kernel_ignore(sig));
}
static int sig_ignored(struct task_struct *t, int sig)
static int sig_task_ignored(struct task_struct *t, int sig,
int from_ancestor_ns)
{
void __user *handler;
handler = sig_handler(t, sig);
if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) &&
handler == SIG_DFL && !from_ancestor_ns)
return 1;
return sig_handler_ignored(handler, sig);
}
static int sig_ignored(struct task_struct *t, int sig, int from_ancestor_ns)
{
/*
* Blocked signals are never ignored, since the
* signal handler may change by the time it is
@ -67,14 +79,13 @@ static int sig_ignored(struct task_struct *t, int sig)
if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig))
return 0;
handler = sig_handler(t, sig);
if (!sig_handler_ignored(handler, sig))
if (!sig_task_ignored(t, sig, from_ancestor_ns))
return 0;
/*
* Tracers may want to know about even ignored signals.
*/
return !tracehook_consider_ignored_signal(t, sig, handler);
return !tracehook_consider_ignored_signal(t, sig);
}
/*
@ -318,7 +329,7 @@ int unhandled_signal(struct task_struct *tsk, int sig)
return 1;
if (handler != SIG_IGN && handler != SIG_DFL)
return 0;
return !tracehook_consider_fatal_signal(tsk, sig, handler);
return !tracehook_consider_fatal_signal(tsk, sig);
}
@ -624,7 +635,7 @@ static int check_kill_permission(int sig, struct siginfo *info,
* Returns true if the signal should be actually delivered, otherwise
* it should be dropped.
*/
static int prepare_signal(int sig, struct task_struct *p)
static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns)
{
struct signal_struct *signal = p->signal;
struct task_struct *t;
@ -708,7 +719,7 @@ static int prepare_signal(int sig, struct task_struct *p)
}
}
return !sig_ignored(p, sig);
return !sig_ignored(p, sig, from_ancestor_ns);
}
/*
@ -777,7 +788,7 @@ static void complete_signal(int sig, struct task_struct *p, int group)
!(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) &&
!sigismember(&t->real_blocked, sig) &&
(sig == SIGKILL ||
!tracehook_consider_fatal_signal(t, sig, SIG_DFL))) {
!tracehook_consider_fatal_signal(t, sig))) {
/*
* This signal will be fatal to the whole group.
*/
@ -813,8 +824,8 @@ static inline int legacy_queue(struct sigpending *signals, int sig)
return (sig < SIGRTMIN) && sigismember(&signals->signal, sig);
}
static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
int group)
static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
int group, int from_ancestor_ns)
{
struct sigpending *pending;
struct sigqueue *q;
@ -822,7 +833,8 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
trace_sched_signal_send(sig, t);
assert_spin_locked(&t->sighand->siglock);
if (!prepare_signal(sig, t))
if (!prepare_signal(sig, t, from_ancestor_ns))
return 0;
pending = group ? &t->signal->shared_pending : &t->pending;
@ -871,6 +883,8 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
break;
default:
copy_siginfo(&q->info, info);
if (from_ancestor_ns)
q->info.si_pid = 0;
break;
}
} else if (!is_si_special(info)) {
@ -889,6 +903,20 @@ out_set:
return 0;
}
static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
int group)
{
int from_ancestor_ns = 0;
#ifdef CONFIG_PID_NS
if (!is_si_special(info) && SI_FROMUSER(info) &&
task_pid_nr_ns(current, task_active_pid_ns(t)) <= 0)
from_ancestor_ns = 1;
#endif
return __send_signal(sig, info, t, group, from_ancestor_ns);
}
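/*
 * Illustrative note: task_pid_nr_ns() yields 0 when 'current' has no pid
 * in the target task's namespace, i.e. the sender lives in an ancestor pid
 * namespace.  Such signals are flagged so that __send_signal() clears
 * q->info.si_pid (the sender's pid is meaningless inside the container)
 * and so that sig_task_ignored() refuses to let a container-init discard
 * them.
 */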
int print_fatal_signals;
static void print_fatal_signal(struct pt_regs *regs, int signr)
@ -1133,7 +1161,7 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid,
if (sig && p->sighand) {
unsigned long flags;
spin_lock_irqsave(&p->sighand->siglock, flags);
ret = __group_send_sig_info(sig, info, p);
ret = __send_signal(sig, info, p, 1, 0);
spin_unlock_irqrestore(&p->sighand->siglock, flags);
}
out_unlock:
@ -1320,7 +1348,7 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)
goto ret;
ret = 1; /* the signal is ignored */
if (!prepare_signal(sig, t))
if (!prepare_signal(sig, t, 0))
goto out;
ret = 0;
@ -1575,7 +1603,15 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
read_lock(&tasklist_lock);
if (may_ptrace_stop()) {
do_notify_parent_cldstop(current, CLD_TRAPPED);
/*
* Don't want to allow preemption here, because
* sys_ptrace() needs this task to be inactive.
*
* XXX: implement read_unlock_no_resched().
*/
preempt_disable();
read_unlock(&tasklist_lock);
preempt_enable_no_resched();
schedule();
} else {
/*
@ -1836,9 +1872,16 @@ relock:
/*
* Global init gets no signals it doesn't want.
* Container-init gets no signals it doesn't want from the same
* container.
*
* Note that if global/container-init sees a sig_kernel_only()
* signal here, the signal must have been generated internally
* or must have come from an ancestor namespace. In either
* case, the signal cannot be dropped.
*/
if (unlikely(signal->flags & SIGNAL_UNKILLABLE) &&
!signal_group_exit(signal))
!sig_kernel_only(signr))
continue;
if (sig_kernel_stop(signr)) {

640
kernel/slow-work.c Normal file
View file

@ -0,0 +1,640 @@
/* Worker thread pool for slow items, such as filesystem lookups or mkdirs
*
* Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public Licence
* as published by the Free Software Foundation; either version
* 2 of the Licence, or (at your option) any later version.
*
* See Documentation/slow-work.txt
*/
#include <linux/module.h>
#include <linux/slow-work.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/wait.h>
#define SLOW_WORK_CULL_TIMEOUT (5 * HZ) /* cull threads 5s after running out of
* things to do */
#define SLOW_WORK_OOM_TIMEOUT (5 * HZ) /* can't start new threads for 5s after
* OOM */
static void slow_work_cull_timeout(unsigned long);
static void slow_work_oom_timeout(unsigned long);
#ifdef CONFIG_SYSCTL
static int slow_work_min_threads_sysctl(struct ctl_table *, int, struct file *,
void __user *, size_t *, loff_t *);
static int slow_work_max_threads_sysctl(struct ctl_table *, int , struct file *,
void __user *, size_t *, loff_t *);
#endif
/*
* The pool of threads has at least min threads in it as long as someone is
* using the facility, and may have as many as max.
*
* A portion of the pool may be processing very slow operations.
*/
static unsigned slow_work_min_threads = 2;
static unsigned slow_work_max_threads = 4;
static unsigned vslow_work_proportion = 50; /* % of threads that may process
* very slow work */
#ifdef CONFIG_SYSCTL
static const int slow_work_min_min_threads = 2;
static int slow_work_max_max_threads = 255;
static const int slow_work_min_vslow = 1;
static const int slow_work_max_vslow = 99;
ctl_table slow_work_sysctls[] = {
{
.ctl_name = CTL_UNNUMBERED,
.procname = "min-threads",
.data = &slow_work_min_threads,
.maxlen = sizeof(unsigned),
.mode = 0644,
.proc_handler = slow_work_min_threads_sysctl,
.extra1 = (void *) &slow_work_min_min_threads,
.extra2 = &slow_work_max_threads,
},
{
.ctl_name = CTL_UNNUMBERED,
.procname = "max-threads",
.data = &slow_work_max_threads,
.maxlen = sizeof(unsigned),
.mode = 0644,
.proc_handler = slow_work_max_threads_sysctl,
.extra1 = &slow_work_min_threads,
.extra2 = (void *) &slow_work_max_max_threads,
},
{
.ctl_name = CTL_UNNUMBERED,
.procname = "vslow-percentage",
.data = &vslow_work_proportion,
.maxlen = sizeof(unsigned),
.mode = 0644,
.proc_handler = &proc_dointvec_minmax,
.extra1 = (void *) &slow_work_min_vslow,
.extra2 = (void *) &slow_work_max_vslow,
},
{ .ctl_name = 0 }
};
#endif
/*
* The active state of the thread pool
*/
static atomic_t slow_work_thread_count;
static atomic_t vslow_work_executing_count;
static bool slow_work_may_not_start_new_thread;
static bool slow_work_cull; /* cull a thread due to lack of activity */
static DEFINE_TIMER(slow_work_cull_timer, slow_work_cull_timeout, 0, 0);
static DEFINE_TIMER(slow_work_oom_timer, slow_work_oom_timeout, 0, 0);
static struct slow_work slow_work_new_thread; /* new thread starter */
/*
* The queues of work items and the lock governing access to them. These are
* shared between all the CPUs. It doesn't make sense to have per-CPU queues
* as the number of threads bears no relation to the number of CPUs.
*
* There are two queues of work items: one for slow work items, and one for
* very slow work items.
*/
static LIST_HEAD(slow_work_queue);
static LIST_HEAD(vslow_work_queue);
static DEFINE_SPINLOCK(slow_work_queue_lock);
/*
* The thread controls. A variable used to signal to the threads that they
* should exit when the queue is empty, a waitqueue used by the threads to wait
* for signals, and a completion set by the last thread to exit.
*/
static bool slow_work_threads_should_exit;
static DECLARE_WAIT_QUEUE_HEAD(slow_work_thread_wq);
static DECLARE_COMPLETION(slow_work_last_thread_exited);
/*
* The number of users of the thread pool and its lock. Whilst this is zero we
* have no threads hanging around, and when this reaches zero, we wait for all
* active or queued work items to complete and kill all the threads we do have.
*/
static int slow_work_user_count;
static DEFINE_MUTEX(slow_work_user_lock);
/*
* Calculate the maximum number of active threads in the pool that are
* permitted to process very slow work items.
*
* The answer is rounded up to at least 1, but may not equal or exceed the
* maximum number of threads in the pool. This means we always have at
* least one thread that can process slow work items, and we always have at
* least one thread that won't get tied up doing so.
*/
static unsigned slow_work_calc_vsmax(void)
{
unsigned vsmax;
vsmax = atomic_read(&slow_work_thread_count) * vslow_work_proportion;
vsmax /= 100;
vsmax = max(vsmax, 1U);
return min(vsmax, slow_work_max_threads - 1);
}
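/*
 * Worked example (illustrative numbers): with 4 threads in the pool and the
 * default vslow_work_proportion of 50, vsmax = 4 * 50 / 100 = 2, so at most
 * two threads may be tied up in very slow items, leaving two free for
 * ordinary slow work.
 */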
/*
* Attempt to execute stuff queued on a slow thread. Return true if we managed
* it, false if there was nothing to do.
*/
static bool slow_work_execute(void)
{
struct slow_work *work = NULL;
unsigned vsmax;
bool very_slow;
vsmax = slow_work_calc_vsmax();
/* see if we can schedule a new thread to be started if we're not
* keeping up with the work */
if (!waitqueue_active(&slow_work_thread_wq) &&
(!list_empty(&slow_work_queue) || !list_empty(&vslow_work_queue)) &&
atomic_read(&slow_work_thread_count) < slow_work_max_threads &&
!slow_work_may_not_start_new_thread)
slow_work_enqueue(&slow_work_new_thread);
/* find something to execute */
spin_lock_irq(&slow_work_queue_lock);
if (!list_empty(&vslow_work_queue) &&
atomic_read(&vslow_work_executing_count) < vsmax) {
work = list_entry(vslow_work_queue.next,
struct slow_work, link);
if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags))
BUG();
list_del_init(&work->link);
atomic_inc(&vslow_work_executing_count);
very_slow = true;
} else if (!list_empty(&slow_work_queue)) {
work = list_entry(slow_work_queue.next,
struct slow_work, link);
if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags))
BUG();
list_del_init(&work->link);
very_slow = false;
} else {
very_slow = false; /* avoid the compiler warning */
}
spin_unlock_irq(&slow_work_queue_lock);
if (!work)
return false;
if (!test_and_clear_bit(SLOW_WORK_PENDING, &work->flags))
BUG();
work->ops->execute(work);
if (very_slow)
atomic_dec(&vslow_work_executing_count);
clear_bit_unlock(SLOW_WORK_EXECUTING, &work->flags);
/* if someone tried to enqueue the item whilst we were executing it,
* then it'll be left unenqueued to avoid multiple threads trying to
* execute it simultaneously
*
* there is, however, a race between us testing the pending flag and
* getting the spinlock, and between the enqueuer setting the pending
* flag and getting the spinlock, so we use a deferral bit to tell us
* if the enqueuer got there first
*/
if (test_bit(SLOW_WORK_PENDING, &work->flags)) {
spin_lock_irq(&slow_work_queue_lock);
if (!test_bit(SLOW_WORK_EXECUTING, &work->flags) &&
test_and_clear_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags))
goto auto_requeue;
spin_unlock_irq(&slow_work_queue_lock);
}
work->ops->put_ref(work);
return true;
auto_requeue:
/* we must complete the enqueue operation
* - we transfer our ref on the item back to the appropriate queue
* - don't wake another thread up as we're awake already
*/
if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags))
list_add_tail(&work->link, &vslow_work_queue);
else
list_add_tail(&work->link, &slow_work_queue);
spin_unlock_irq(&slow_work_queue_lock);
return true;
}
/**
* slow_work_enqueue - Schedule a slow work item for processing
* @work: The work item to queue
*
* Schedule a slow work item for processing. If the item is already undergoing
* execution, this guarantees not to re-enter the execution routine until the
* first execution finishes.
*
* The item is pinned by this function as it retains a reference to it, managed
* through the item operations. The item is unpinned once it has been
* executed.
*
* An item may hog the thread that is running it for a relatively large amount
* of time, sufficient, for example, to perform several lookup, mkdir, create
* and setxattr operations. It may sleep on I/O and may sleep to obtain locks.
*
* Conversely, if a number of items are awaiting processing, it may take some
* time before any given item is given attention. The number of threads in the
* pool may be increased to deal with demand, but only up to a limit.
*
* If SLOW_WORK_VERY_SLOW is set on the work item, then it will be placed in
* the very slow queue, from which only a portion of the threads will be
* allowed to pick items to execute. This ensures that very slow items won't
* overly block ones that are just ordinarily slow.
*
* Returns 0 if successful, -EAGAIN if not.
*/
int slow_work_enqueue(struct slow_work *work)
{
unsigned long flags;
BUG_ON(slow_work_user_count <= 0);
BUG_ON(!work);
BUG_ON(!work->ops);
BUG_ON(!work->ops->get_ref);
/* when honouring an enqueue request, we only promise that we will run
* the work function in the future; we do not promise to run it once
* per enqueue request
*
* we use the PENDING bit to merge together repeat requests without
* having to disable IRQs and take the spinlock, whilst still
* maintaining our promise
*/
if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) {
spin_lock_irqsave(&slow_work_queue_lock, flags);
/* we promise that we will not attempt to execute the work
* function in more than one thread simultaneously
*
* this, however, leaves us with a problem if we're asked to
* enqueue the work whilst someone is executing the work
* function as simply queueing the work immediately means that
* another thread may try executing it whilst it is already
* under execution
*
* to deal with this, we set the ENQ_DEFERRED bit instead of
* enqueueing, and the thread currently executing the work
* function will enqueue the work item when the work function
* returns and it has cleared the EXECUTING bit
*/
if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) {
set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags);
} else {
if (work->ops->get_ref(work) < 0)
goto cant_get_ref;
if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags))
list_add_tail(&work->link, &vslow_work_queue);
else
list_add_tail(&work->link, &slow_work_queue);
wake_up(&slow_work_thread_wq);
}
spin_unlock_irqrestore(&slow_work_queue_lock, flags);
}
return 0;
cant_get_ref:
spin_unlock_irqrestore(&slow_work_queue_lock, flags);
return -EAGAIN;
}
EXPORT_SYMBOL(slow_work_enqueue);
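/*
 * Usage sketch (hypothetical names, illustrative only): a user of the
 * facility embeds a struct slow_work in its own object, supplies the three
 * operations, and enqueues it.  slow_work_register_user() must have been
 * called first, since enqueueing asserts slow_work_user_count > 0.
 */
#if 0
struct my_object {
	atomic_t		usage;
	struct slow_work	work;
};

static int my_get_ref(struct slow_work *work)
{
	atomic_inc(&container_of(work, struct my_object, work)->usage);
	return 0;
}

static void my_put_ref(struct slow_work *work)
{
	atomic_dec(&container_of(work, struct my_object, work)->usage);
}

static void my_execute(struct slow_work *work)
{
	/* the slow part: may sleep, do I/O and take locks */
}

static const struct slow_work_ops my_ops = {
	.get_ref	= my_get_ref,
	.put_ref	= my_put_ref,
	.execute	= my_execute,
};

static void my_kick(struct my_object *obj)
{
	slow_work_init(&obj->work, &my_ops);
	/* -EAGAIN here means get_ref failed and nothing was queued */
	slow_work_enqueue(&obj->work);
}
#endif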
/*
* Worker thread culling algorithm
*/
static bool slow_work_cull_thread(void)
{
unsigned long flags;
bool do_cull = false;
spin_lock_irqsave(&slow_work_queue_lock, flags);
if (slow_work_cull) {
slow_work_cull = false;
if (list_empty(&slow_work_queue) &&
list_empty(&vslow_work_queue) &&
atomic_read(&slow_work_thread_count) >
slow_work_min_threads) {
mod_timer(&slow_work_cull_timer,
jiffies + SLOW_WORK_CULL_TIMEOUT);
do_cull = true;
}
}
spin_unlock_irqrestore(&slow_work_queue_lock, flags);
return do_cull;
}
/*
* Determine if there is slow work available for dispatch
*/
static inline bool slow_work_available(int vsmax)
{
return !list_empty(&slow_work_queue) ||
(!list_empty(&vslow_work_queue) &&
atomic_read(&vslow_work_executing_count) < vsmax);
}
/*
* Worker thread dispatcher
*/
static int slow_work_thread(void *_data)
{
int vsmax;
DEFINE_WAIT(wait);
set_freezable();
set_user_nice(current, -5);
for (;;) {
vsmax = vslow_work_proportion;
vsmax *= atomic_read(&slow_work_thread_count);
vsmax /= 100;
prepare_to_wait(&slow_work_thread_wq, &wait,
TASK_INTERRUPTIBLE);
if (!freezing(current) &&
!slow_work_threads_should_exit &&
!slow_work_available(vsmax) &&
!slow_work_cull)
schedule();
finish_wait(&slow_work_thread_wq, &wait);
try_to_freeze();
vsmax = vslow_work_proportion;
vsmax *= atomic_read(&slow_work_thread_count);
vsmax /= 100;
if (slow_work_available(vsmax) && slow_work_execute()) {
cond_resched();
if (list_empty(&slow_work_queue) &&
list_empty(&vslow_work_queue) &&
atomic_read(&slow_work_thread_count) >
slow_work_min_threads)
mod_timer(&slow_work_cull_timer,
jiffies + SLOW_WORK_CULL_TIMEOUT);
continue;
}
if (slow_work_threads_should_exit)
break;
if (slow_work_cull && slow_work_cull_thread())
break;
}
if (atomic_dec_and_test(&slow_work_thread_count))
complete_and_exit(&slow_work_last_thread_exited, 0);
return 0;
}
/*
* Handle thread cull timer expiration
*/
static void slow_work_cull_timeout(unsigned long data)
{
slow_work_cull = true;
wake_up(&slow_work_thread_wq);
}
/*
* Get a reference on slow work thread starter
*/
static int slow_work_new_thread_get_ref(struct slow_work *work)
{
return 0;
}
/*
* Drop a reference on slow work thread starter
*/
static void slow_work_new_thread_put_ref(struct slow_work *work)
{
}
/*
* Start a new slow work thread
*/
static void slow_work_new_thread_execute(struct slow_work *work)
{
struct task_struct *p;
if (slow_work_threads_should_exit)
return;
if (atomic_read(&slow_work_thread_count) >= slow_work_max_threads)
return;
if (!mutex_trylock(&slow_work_user_lock))
return;
slow_work_may_not_start_new_thread = true;
atomic_inc(&slow_work_thread_count);
p = kthread_run(slow_work_thread, NULL, "kslowd");
if (IS_ERR(p)) {
printk(KERN_DEBUG "Slow work thread pool: OOM\n");
if (atomic_dec_and_test(&slow_work_thread_count))
BUG(); /* we're running on a slow work thread... */
mod_timer(&slow_work_oom_timer,
jiffies + SLOW_WORK_OOM_TIMEOUT);
} else {
/* ratelimit the starting of new threads */
mod_timer(&slow_work_oom_timer, jiffies + 1);
}
mutex_unlock(&slow_work_user_lock);
}
static const struct slow_work_ops slow_work_new_thread_ops = {
.get_ref = slow_work_new_thread_get_ref,
.put_ref = slow_work_new_thread_put_ref,
.execute = slow_work_new_thread_execute,
};
/*
* post-OOM new thread start suppression expiration
*/
static void slow_work_oom_timeout(unsigned long data)
{
slow_work_may_not_start_new_thread = false;
}
#ifdef CONFIG_SYSCTL
/*
* Handle adjustment of the minimum number of threads
*/
static int slow_work_min_threads_sysctl(struct ctl_table *table, int write,
struct file *filp, void __user *buffer,
size_t *lenp, loff_t *ppos)
{
int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
int n;
if (ret == 0) {
mutex_lock(&slow_work_user_lock);
if (slow_work_user_count > 0) {
/* see if we need to start or stop threads */
n = atomic_read(&slow_work_thread_count) -
slow_work_min_threads;
if (n < 0 && !slow_work_may_not_start_new_thread)
slow_work_enqueue(&slow_work_new_thread);
else if (n > 0)
mod_timer(&slow_work_cull_timer,
jiffies + SLOW_WORK_CULL_TIMEOUT);
}
mutex_unlock(&slow_work_user_lock);
}
return ret;
}
/*
* Handle adjustment of the maximum number of threads
*/
static int slow_work_max_threads_sysctl(struct ctl_table *table, int write,
struct file *filp, void __user *buffer,
size_t *lenp, loff_t *ppos)
{
int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
int n;
if (ret == 0) {
mutex_lock(&slow_work_user_lock);
if (slow_work_user_count > 0) {
/* see if we need to stop threads */
n = slow_work_max_threads -
atomic_read(&slow_work_thread_count);
if (n < 0)
mod_timer(&slow_work_cull_timer,
jiffies + SLOW_WORK_CULL_TIMEOUT);
}
mutex_unlock(&slow_work_user_lock);
}
return ret;
}
#endif /* CONFIG_SYSCTL */
/**
* slow_work_register_user - Register a user of the facility
*
* Register a user of the facility, starting up the initial threads if there
* aren't any other users at this point. This will return 0 if successful, or
* an error if not.
*/
int slow_work_register_user(void)
{
struct task_struct *p;
int loop;
mutex_lock(&slow_work_user_lock);
if (slow_work_user_count == 0) {
printk(KERN_NOTICE "Slow work thread pool: Starting up\n");
init_completion(&slow_work_last_thread_exited);
slow_work_threads_should_exit = false;
slow_work_init(&slow_work_new_thread,
&slow_work_new_thread_ops);
slow_work_may_not_start_new_thread = false;
slow_work_cull = false;
/* start the minimum number of threads */
for (loop = 0; loop < slow_work_min_threads; loop++) {
atomic_inc(&slow_work_thread_count);
p = kthread_run(slow_work_thread, NULL, "kslowd");
if (IS_ERR(p))
goto error;
}
printk(KERN_NOTICE "Slow work thread pool: Ready\n");
}
slow_work_user_count++;
mutex_unlock(&slow_work_user_lock);
return 0;
error:
if (atomic_dec_and_test(&slow_work_thread_count))
complete(&slow_work_last_thread_exited);
if (loop > 0) {
printk(KERN_ERR "Slow work thread pool:"
" Aborting startup on ENOMEM\n");
slow_work_threads_should_exit = true;
wake_up_all(&slow_work_thread_wq);
wait_for_completion(&slow_work_last_thread_exited);
printk(KERN_ERR "Slow work thread pool: Aborted\n");
}
mutex_unlock(&slow_work_user_lock);
return PTR_ERR(p);
}
EXPORT_SYMBOL(slow_work_register_user);
/**
* slow_work_unregister_user - Unregister a user of the facility
*
* Unregister a user of the facility, killing all the threads if this was the
* last one.
*/
void slow_work_unregister_user(void)
{
mutex_lock(&slow_work_user_lock);
BUG_ON(slow_work_user_count <= 0);
slow_work_user_count--;
if (slow_work_user_count == 0) {
printk(KERN_NOTICE "Slow work thread pool: Shutting down\n");
slow_work_threads_should_exit = true;
wake_up_all(&slow_work_thread_wq);
wait_for_completion(&slow_work_last_thread_exited);
printk(KERN_NOTICE "Slow work thread pool:"
" Shut down complete\n");
}
del_timer_sync(&slow_work_cull_timer);
mutex_unlock(&slow_work_user_lock);
}
EXPORT_SYMBOL(slow_work_unregister_user);
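/*
 * Lifecycle sketch (hypothetical module, illustrative only): users pair the
 * register/unregister calls around their use of the facility; the first
 * registration spins the pool up and the last unregistration tears it down.
 */
#if 0
static int __init my_module_init(void)
{
	int ret = slow_work_register_user();
	if (ret < 0)
		return ret;
	/* ... allocate objects, slow_work_init() them, enqueue ... */
	return 0;
}

static void __exit my_module_exit(void)
{
	/* all of this module's items must be complete or unqueued by now */
	slow_work_unregister_user();
}
#endif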
/*
* Initialise the slow work facility
*/
static int __init init_slow_work(void)
{
unsigned nr_cpus = num_possible_cpus();
if (slow_work_max_threads < nr_cpus)
slow_work_max_threads = nr_cpus;
#ifdef CONFIG_SYSCTL
if (slow_work_max_max_threads < nr_cpus * 2)
slow_work_max_max_threads = nr_cpus * 2;
#endif
return 0;
}
subsys_initcall(init_slow_work);

View file

@ -2,40 +2,82 @@
* Generic helpers for smp ipi calls
*
* (C) Jens Axboe <jens.axboe@oracle.com> 2008
*
*/
#include <linux/init.h>
#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/rcupdate.h>
#include <linux/rculist.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/init.h>
#include <linux/smp.h>
#include <linux/cpu.h>
static DEFINE_PER_CPU(struct call_single_queue, call_single_queue);
static LIST_HEAD(call_function_queue);
__cacheline_aligned_in_smp DEFINE_SPINLOCK(call_function_lock);
static struct {
struct list_head queue;
spinlock_t lock;
} call_function __cacheline_aligned_in_smp =
{
.queue = LIST_HEAD_INIT(call_function.queue),
.lock = __SPIN_LOCK_UNLOCKED(call_function.lock),
};
enum {
CSD_FLAG_WAIT = 0x01,
CSD_FLAG_ALLOC = 0x02,
CSD_FLAG_LOCK = 0x04,
CSD_FLAG_LOCK = 0x01,
};
struct call_function_data {
struct call_single_data csd;
spinlock_t lock;
unsigned int refs;
struct rcu_head rcu_head;
unsigned long cpumask_bits[];
struct call_single_data csd;
spinlock_t lock;
unsigned int refs;
cpumask_var_t cpumask;
};
struct call_single_queue {
struct list_head list;
spinlock_t lock;
struct list_head list;
spinlock_t lock;
};
static DEFINE_PER_CPU(struct call_function_data, cfd_data) = {
.lock = __SPIN_LOCK_UNLOCKED(cfd_data.lock),
};
static int
hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
long cpu = (long)hcpu;
struct call_function_data *cfd = &per_cpu(cfd_data, cpu);
switch (action) {
case CPU_UP_PREPARE:
case CPU_UP_PREPARE_FROZEN:
if (!alloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL,
cpu_to_node(cpu)))
return NOTIFY_BAD;
break;
#ifdef CONFIG_CPU_HOTPLUG
case CPU_UP_CANCELED:
case CPU_UP_CANCELED_FROZEN:
case CPU_DEAD:
case CPU_DEAD_FROZEN:
free_cpumask_var(cfd->cpumask);
break;
#endif
};
return NOTIFY_OK;
}
static struct notifier_block __cpuinitdata hotplug_cfd_notifier = {
.notifier_call = hotplug_cfd,
};
static int __cpuinit init_call_single_data(void)
{
void *cpu = (void *)(long)smp_processor_id();
int i;
for_each_possible_cpu(i) {
@ -44,29 +86,63 @@ static int __cpuinit init_call_single_data(void)
spin_lock_init(&q->lock);
INIT_LIST_HEAD(&q->list);
}
hotplug_cfd(&hotplug_cfd_notifier, CPU_UP_PREPARE, cpu);
register_cpu_notifier(&hotplug_cfd_notifier);
return 0;
}
early_initcall(init_call_single_data);
static void csd_flag_wait(struct call_single_data *data)
/*
* csd_lock/csd_unlock used to serialize access to per-cpu csd resources
*
* For non-synchronous ipi calls the csd can still be in use by the
* previous function call. For multi-cpu calls it's even more interesting
* as we'll have to ensure no other cpu is observing our csd.
*/
static void csd_lock_wait(struct call_single_data *data)
{
/* Wait for response */
do {
if (!(data->flags & CSD_FLAG_WAIT))
break;
while (data->flags & CSD_FLAG_LOCK)
cpu_relax();
} while (1);
}
static void csd_lock(struct call_single_data *data)
{
csd_lock_wait(data);
data->flags = CSD_FLAG_LOCK;
/*
* prevent CPU from reordering the above assignment
* to ->flags with any subsequent assignments to other
* fields of the specified call_single_data structure:
*/
smp_mb();
}
static void csd_unlock(struct call_single_data *data)
{
WARN_ON(!(data->flags & CSD_FLAG_LOCK));
/*
* ensure we're all done before releasing data:
*/
smp_mb();
data->flags &= ~CSD_FLAG_LOCK;
}
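/*
 * Illustrative flow (mirrors the callers below): a sender csd_lock()s a
 * csd, fills in ->func and ->info, and hands it to generic_exec_single();
 * the IPI handler csd_unlock()s it after ->func has run.  That unlock is
 * what both a synchronous sender's csd_lock_wait() and the next sender's
 * csd_lock() spin on.
 */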
/*
* Insert a previously allocated call_single_data element for execution
* on the given CPU. data must already have ->func, ->info, and ->flags set.
* Insert a previously allocated call_single_data element
* for execution on the given CPU. data must already have
* ->func, ->info, and ->flags set.
*/
static void generic_exec_single(int cpu, struct call_single_data *data)
static
void generic_exec_single(int cpu, struct call_single_data *data, int wait)
{
struct call_single_queue *dst = &per_cpu(call_single_queue, cpu);
int wait = data->flags & CSD_FLAG_WAIT, ipi;
unsigned long flags;
int ipi;
spin_lock_irqsave(&dst->lock, flags);
ipi = list_empty(&dst->list);
@ -74,24 +150,21 @@ static void generic_exec_single(int cpu, struct call_single_data *data)
spin_unlock_irqrestore(&dst->lock, flags);
/*
* Make the list addition visible before sending the ipi.
* The list addition should be visible before sending the IPI
* handler locks the list to pull the entry off it because of
* normal cache coherency rules implied by spinlocks.
*
* If IPIs can go out of order to the cache coherency protocol
* in an architecture, sufficient synchronisation should be added
* to arch code to make it appear to obey cache coherency WRT
* locking and barrier primitives. Generic code isn't really
* equipped to do the right thing...
*/
smp_mb();
if (ipi)
arch_send_call_function_single_ipi(cpu);
if (wait)
csd_flag_wait(data);
}
static void rcu_free_call_data(struct rcu_head *head)
{
struct call_function_data *data;
data = container_of(head, struct call_function_data, rcu_head);
kfree(data);
csd_lock_wait(data);
}
/*
@ -104,99 +177,83 @@ void generic_smp_call_function_interrupt(void)
int cpu = get_cpu();
/*
* It's ok to use list_for_each_rcu() here even though we may delete
* 'pos', since list_del_rcu() doesn't clear ->next
* Ensure entry is visible on call_function_queue after we have
* entered the IPI. See comment in smp_call_function_many.
* If we don't have this, then we may miss an entry on the list
* and never get another IPI to process it.
*/
rcu_read_lock();
list_for_each_entry_rcu(data, &call_function_queue, csd.list) {
smp_mb();
/*
* It's ok to use list_for_each_rcu() here even though we may
* delete 'pos', since list_del_rcu() doesn't clear ->next
*/
list_for_each_entry_rcu(data, &call_function.queue, csd.list) {
int refs;
if (!cpumask_test_cpu(cpu, to_cpumask(data->cpumask_bits)))
spin_lock(&data->lock);
if (!cpumask_test_cpu(cpu, data->cpumask)) {
spin_unlock(&data->lock);
continue;
}
cpumask_clear_cpu(cpu, data->cpumask);
spin_unlock(&data->lock);
data->csd.func(data->csd.info);
spin_lock(&data->lock);
cpumask_clear_cpu(cpu, to_cpumask(data->cpumask_bits));
WARN_ON(data->refs == 0);
data->refs--;
refs = data->refs;
refs = --data->refs;
if (!refs) {
spin_lock(&call_function.lock);
list_del_rcu(&data->csd.list);
spin_unlock(&call_function.lock);
}
spin_unlock(&data->lock);
if (refs)
continue;
spin_lock(&call_function_lock);
list_del_rcu(&data->csd.list);
spin_unlock(&call_function_lock);
if (data->csd.flags & CSD_FLAG_WAIT) {
/*
* serialize stores to data with the flag clear
* and wakeup
*/
smp_wmb();
data->csd.flags &= ~CSD_FLAG_WAIT;
}
if (data->csd.flags & CSD_FLAG_ALLOC)
call_rcu(&data->rcu_head, rcu_free_call_data);
csd_unlock(&data->csd);
}
rcu_read_unlock();
put_cpu();
}
/*
* Invoked by arch to handle an IPI for call function single. Must be called
* from the arch with interrupts disabled.
* Invoked by arch to handle an IPI for call function single. Must be
* called from the arch with interrupts disabled.
*/
void generic_smp_call_function_single_interrupt(void)
{
struct call_single_queue *q = &__get_cpu_var(call_single_queue);
unsigned int data_flags;
LIST_HEAD(list);
/*
* Need to see other stores to list head for checking whether
* list is empty without holding q->lock
*/
smp_read_barrier_depends();
while (!list_empty(&q->list)) {
unsigned int data_flags;
spin_lock(&q->lock);
list_replace_init(&q->list, &list);
spin_unlock(&q->lock);
spin_lock(&q->lock);
list_replace_init(&q->list, &list);
spin_unlock(&q->lock);
while (!list_empty(&list)) {
struct call_single_data *data;
while (!list_empty(&list)) {
struct call_single_data *data;
data = list_entry(list.next, struct call_single_data, list);
list_del(&data->list);
data = list_entry(list.next, struct call_single_data,
list);
list_del(&data->list);
/*
* 'data' can be invalid after this call if
* flags == 0 (when called through
* generic_exec_single()), so save them away before
* making the call.
*/
data_flags = data->flags;
data->func(data->info);
if (data_flags & CSD_FLAG_WAIT) {
smp_wmb();
data->flags &= ~CSD_FLAG_WAIT;
} else if (data_flags & CSD_FLAG_LOCK) {
smp_wmb();
data->flags &= ~CSD_FLAG_LOCK;
} else if (data_flags & CSD_FLAG_ALLOC)
kfree(data);
}
/*
* See comment on outer loop
* 'data' can be invalid after this call if flags == 0
* (when called through generic_exec_single()),
* so save them away before making the call:
*/
smp_read_barrier_depends();
data_flags = data->flags;
data->func(data->info);
/*
* Unlocked CSDs are valid through generic_exec_single():
*/
if (data_flags & CSD_FLAG_LOCK)
csd_unlock(data);
}
}
@ -215,65 +272,45 @@ static DEFINE_PER_CPU(struct call_single_data, csd_data);
int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
int wait)
{
struct call_single_data d;
struct call_single_data d = {
.flags = 0,
};
unsigned long flags;
/* prevent preemption and reschedule on another processor,
as well as CPU removal */
int me = get_cpu();
int this_cpu;
int err = 0;
/* Can deadlock when called with interrupts disabled */
WARN_ON(irqs_disabled());
/*
* prevent preemption and reschedule on another processor,
* as well as CPU removal
*/
this_cpu = get_cpu();
if (cpu == me) {
/* Can deadlock when called with interrupts disabled */
WARN_ON_ONCE(irqs_disabled() && !oops_in_progress);
if (cpu == this_cpu) {
local_irq_save(flags);
func(info);
local_irq_restore(flags);
} else if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) {
struct call_single_data *data;
if (!wait) {
/*
* We are calling a function on a single CPU
* and we are not going to wait for it to finish.
* We first try to allocate the data, but if we
* fail, we fall back to use a per cpu data to pass
* the information to that CPU. Since all callers
* of this code will use the same data, we must
* synchronize the callers to prevent a new caller
* from corrupting the data before the callee
* can access it.
*
* The CSD_FLAG_LOCK is used to let us know when
* the IPI handler is done with the data.
* The first caller will set it, and the callee
* will clear it. The next caller must wait for
* it to clear before we set it again. This
* will make sure the callee is done with the
* data before a new caller will use it.
*/
data = kmalloc(sizeof(*data), GFP_ATOMIC);
if (data)
data->flags = CSD_FLAG_ALLOC;
else {
data = &per_cpu(csd_data, me);
while (data->flags & CSD_FLAG_LOCK)
cpu_relax();
data->flags = CSD_FLAG_LOCK;
}
} else {
data = &d;
data->flags = CSD_FLAG_WAIT;
}
data->func = func;
data->info = info;
generic_exec_single(cpu, data);
} else {
err = -ENXIO; /* CPU not online */
if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) {
struct call_single_data *data = &d;
if (!wait)
data = &__get_cpu_var(csd_data);
csd_lock(data);
data->func = func;
data->info = info;
generic_exec_single(cpu, data, wait);
} else {
err = -ENXIO; /* CPU not online */
}
}
put_cpu();
return err;
}
EXPORT_SYMBOL(smp_call_function_single);
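/*
 * Usage sketch (hypothetical names, illustrative only): run a fast,
 * non-blocking function on another CPU and wait for its result.
 */
#if 0
static void read_remote_value(void *info)
{
	*(unsigned long *)info = 42;	/* e.g. sample a CPU-local counter */
}

static unsigned long sample_cpu(int cpu)
{
	unsigned long val = 0;

	/* wait == 1, so val is stable once this returns; the call
	 * returns -ENXIO if the CPU is offline */
	smp_call_function_single(cpu, read_remote_value, &val, 1);
	return val;
}
#endif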
@ -283,23 +320,26 @@ EXPORT_SYMBOL(smp_call_function_single);
* @cpu: The CPU to run on.
* @data: Pre-allocated and setup data structure
*
* Like smp_call_function_single(), but allow caller to pass in a pre-allocated
* data structure. Useful for embedding @data inside other structures, for
* instance.
*
* Like smp_call_function_single(), but allow caller to pass in a
* pre-allocated data structure. Useful for embedding @data inside
* other structures, for instance.
*/
void __smp_call_function_single(int cpu, struct call_single_data *data)
void __smp_call_function_single(int cpu, struct call_single_data *data,
int wait)
{
/* Can deadlock when called with interrupts disabled */
WARN_ON((data->flags & CSD_FLAG_WAIT) && irqs_disabled());
csd_lock(data);
generic_exec_single(cpu, data);
/* Can deadlock when called with interrupts disabled */
WARN_ON_ONCE(wait && irqs_disabled() && !oops_in_progress);
generic_exec_single(cpu, data, wait);
}
/* FIXME: Shim for archs using old arch_send_call_function_ipi API. */
/* Deprecated: shim for archs using old arch_send_call_function_ipi API. */
#ifndef arch_send_call_function_ipi_mask
#define arch_send_call_function_ipi_mask(maskp) \
arch_send_call_function_ipi(*(maskp))
# define arch_send_call_function_ipi_mask(maskp) \
arch_send_call_function_ipi(*(maskp))
#endif
/**
@ -307,7 +347,8 @@ void __smp_call_function_single(int cpu, struct call_single_data *data)
* @mask: The set of cpus to run on (only runs on online subset).
* @func: The function to run. This must be fast and non-blocking.
* @info: An arbitrary pointer to pass to the function.
* @wait: If true, wait (atomically) until function has completed on other CPUs.
* @wait: If true, wait (atomically) until function has completed
* on other CPUs.
*
* If @wait is true, then returns once @func has returned. Note that @wait
* will be implicitly turned on in case of allocation failures, since
@ -318,27 +359,27 @@ void __smp_call_function_single(int cpu, struct call_single_data *data)
* must be disabled when calling this function.
*/
void smp_call_function_many(const struct cpumask *mask,
void (*func)(void *), void *info,
bool wait)
void (*func)(void *), void *info, bool wait)
{
struct call_function_data *data;
unsigned long flags;
int cpu, next_cpu;
int cpu, next_cpu, this_cpu = smp_processor_id();
/* Can deadlock when called with interrupts disabled */
WARN_ON(irqs_disabled());
WARN_ON_ONCE(irqs_disabled() && !oops_in_progress);
/* So, what's a CPU they want? Ignoring this one. */
/* So, what's a CPU they want? Ignoring this one. */
cpu = cpumask_first_and(mask, cpu_online_mask);
if (cpu == smp_processor_id())
if (cpu == this_cpu)
cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
/* No online cpus? We're done. */
if (cpu >= nr_cpu_ids)
return;
/* Do we have another CPU which isn't us? */
next_cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
if (next_cpu == smp_processor_id())
if (next_cpu == this_cpu)
next_cpu = cpumask_next_and(next_cpu, mask, cpu_online_mask);
/* Fastpath: do that cpu by itself. */
@ -347,43 +388,40 @@ void smp_call_function_many(const struct cpumask *mask,
return;
}
data = kmalloc(sizeof(*data) + cpumask_size(), GFP_ATOMIC);
if (unlikely(!data)) {
/* Slow path. */
for_each_online_cpu(cpu) {
if (cpu == smp_processor_id())
continue;
if (cpumask_test_cpu(cpu, mask))
smp_call_function_single(cpu, func, info, wait);
}
return;
}
data = &__get_cpu_var(cfd_data);
csd_lock(&data->csd);
spin_lock_init(&data->lock);
data->csd.flags = CSD_FLAG_ALLOC;
if (wait)
data->csd.flags |= CSD_FLAG_WAIT;
spin_lock_irqsave(&data->lock, flags);
data->csd.func = func;
data->csd.info = info;
cpumask_and(to_cpumask(data->cpumask_bits), mask, cpu_online_mask);
cpumask_clear_cpu(smp_processor_id(), to_cpumask(data->cpumask_bits));
data->refs = cpumask_weight(to_cpumask(data->cpumask_bits));
cpumask_and(data->cpumask, mask, cpu_online_mask);
cpumask_clear_cpu(this_cpu, data->cpumask);
data->refs = cpumask_weight(data->cpumask);
spin_lock_irqsave(&call_function_lock, flags);
list_add_tail_rcu(&data->csd.list, &call_function_queue);
spin_unlock_irqrestore(&call_function_lock, flags);
spin_lock(&call_function.lock);
/*
* Place entry at the _HEAD_ of the list, so that any cpu still
* observing the entry in generic_smp_call_function_interrupt()
* will not miss any other list entries:
*/
list_add_rcu(&data->csd.list, &call_function.queue);
spin_unlock(&call_function.lock);
spin_unlock_irqrestore(&data->lock, flags);
/*
* Make the list addition visible before sending the ipi.
* (IPIs must obey or appear to obey normal Linux cache
* coherency rules -- see comment in generic_exec_single).
*/
smp_mb();
/* Send a message to all CPUs in the map */
arch_send_call_function_ipi_mask(to_cpumask(data->cpumask_bits));
arch_send_call_function_ipi_mask(data->cpumask);
/* optionally wait for the CPUs to complete */
/* Optionally wait for the CPUs to complete */
if (wait)
csd_flag_wait(&data->csd);
csd_lock_wait(&data->csd);
}
EXPORT_SYMBOL(smp_call_function_many);
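A hedged usage sketch (drain_local() is an illustrative name): run a fast, non-blocking callback on every other online CPU, with preemption disabled across the call as the function's comment requires:
static void drain_local(void *info)
{
	/* runs from IPI context on each target CPU; must not block */
}
static void drain_all_other_cpus(void)
{
	preempt_disable();
	smp_call_function_many(cpu_online_mask, drain_local, NULL, true);
	preempt_enable();
}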
@ -391,7 +429,8 @@ EXPORT_SYMBOL(smp_call_function_many);
* smp_call_function(): Run a function on all other CPUs.
* @func: The function to run. This must be fast and non-blocking.
* @info: An arbitrary pointer to pass to the function.
* @wait: If true, wait (atomically) until function has completed on other CPUs.
* @wait: If true, wait (atomically) until function has completed
* on other CPUs.
*
* Returns 0.
*
@ -407,26 +446,27 @@ int smp_call_function(void (*func)(void *), void *info, int wait)
preempt_disable();
smp_call_function_many(cpu_online_mask, func, info, wait);
preempt_enable();
return 0;
}
EXPORT_SYMBOL(smp_call_function);
void ipi_call_lock(void)
{
spin_lock(&call_function_lock);
spin_lock(&call_function.lock);
}
void ipi_call_unlock(void)
{
spin_unlock(&call_function_lock);
spin_unlock(&call_function.lock);
}
void ipi_call_lock_irq(void)
{
spin_lock_irq(&call_function_lock);
spin_lock_irq(&call_function.lock);
}
void ipi_call_unlock_irq(void)
{
spin_unlock_irq(&call_function_lock);
spin_unlock_irq(&call_function.lock);
}

View file

@ -21,8 +21,10 @@
#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/rcupdate.h>
#include <linux/ftrace.h>
#include <linux/smp.h>
#include <linux/tick.h>
#include <trace/irq.h>
#include <asm/irq.h>
/*
@ -52,6 +54,11 @@ static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp
static DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
char *softirq_to_name[NR_SOFTIRQS] = {
"HI", "TIMER", "NET_TX", "NET_RX", "BLOCK",
"TASKLET", "SCHED", "HRTIMER", "RCU"
};
/*
* we cannot loop indefinitely here to avoid userspace starvation,
* but we also don't want to introduce a worst case 1/HZ latency
@ -79,13 +86,23 @@ static void __local_bh_disable(unsigned long ip)
WARN_ON_ONCE(in_irq());
raw_local_irq_save(flags);
add_preempt_count(SOFTIRQ_OFFSET);
/*
* The preempt tracer hooks into add_preempt_count and will break
* lockdep because it calls back into lockdep after SOFTIRQ_OFFSET
* is set and before current->softirq_enabled is cleared.
* We must manually increment preempt_count here and manually
* call the trace_preempt_off later.
*/
preempt_count() += SOFTIRQ_OFFSET;
/*
* Were softirqs turned off above:
*/
if (softirq_count() == SOFTIRQ_OFFSET)
trace_softirqs_off(ip);
raw_local_irq_restore(flags);
if (preempt_count() == SOFTIRQ_OFFSET)
trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
}
#else /* !CONFIG_TRACE_IRQFLAGS */
static inline void __local_bh_disable(unsigned long ip)
@ -169,6 +186,9 @@ EXPORT_SYMBOL(local_bh_enable_ip);
*/
#define MAX_SOFTIRQ_RESTART 10
DEFINE_TRACE(softirq_entry);
DEFINE_TRACE(softirq_exit);
asmlinkage void __do_softirq(void)
{
struct softirq_action *h;
@ -180,7 +200,7 @@ asmlinkage void __do_softirq(void)
account_system_vtime(current);
__local_bh_disable((unsigned long)__builtin_return_address(0));
trace_softirq_enter();
lockdep_softirq_enter();
cpu = smp_processor_id();
restart:
@ -195,12 +215,14 @@ restart:
if (pending & 1) {
int prev_count = preempt_count();
trace_softirq_entry(h, softirq_vec);
h->action(h);
trace_softirq_exit(h, softirq_vec);
if (unlikely(prev_count != preempt_count())) {
printk(KERN_ERR "huh, entered softirq %td %p"
printk(KERN_ERR "huh, entered softirq %td %s %p"
"with preempt_count %08x,"
" exited with %08x?\n", h - softirq_vec,
softirq_to_name[h - softirq_vec],
h->action, prev_count, preempt_count());
preempt_count() = prev_count;
}
@ -220,7 +242,7 @@ restart:
if (pending)
wakeup_softirqd();
trace_softirq_exit();
lockdep_softirq_exit();
account_system_vtime(current);
_local_bh_enable();
@ -496,7 +518,7 @@ static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softir
cp->flags = 0;
cp->priv = softirq;
__smp_call_function_single(cpu, cp);
__smp_call_function_single(cpu, cp, 0);
return 0;
}
return 1;

View file

@ -121,7 +121,8 @@ unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock)
local_irq_save(flags);
preempt_disable();
rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock);
LOCK_CONTENDED_FLAGS(lock, _raw_read_trylock, _raw_read_lock,
_raw_read_lock_flags, &flags);
return flags;
}
EXPORT_SYMBOL(_read_lock_irqsave);
@ -151,7 +152,8 @@ unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock)
local_irq_save(flags);
preempt_disable();
rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock);
LOCK_CONTENDED_FLAGS(lock, _raw_write_trylock, _raw_write_lock,
_raw_write_lock_flags, &flags);
return flags;
}
EXPORT_SYMBOL(_write_lock_irqsave);
@ -299,16 +301,8 @@ unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclas
local_irq_save(flags);
preempt_disable();
spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
/*
* On lockdep we dont want the hand-coded irq-enable of
* _raw_spin_lock_flags() code, because lockdep assumes
* that interrupts are not re-enabled during lock-acquire:
*/
#ifdef CONFIG_LOCKDEP
LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
#else
_raw_spin_lock_flags(lock, &flags);
#endif
LOCK_CONTENDED_FLAGS(lock, _raw_spin_trylock, _raw_spin_lock,
_raw_spin_lock_flags, &flags);
return flags;
}
EXPORT_SYMBOL(_spin_lock_irqsave_nested);

View file

@ -44,7 +44,7 @@ static DEFINE_MUTEX(setup_lock);
static int refcount;
static struct workqueue_struct *stop_machine_wq;
static struct stop_machine_data active, idle;
static const cpumask_t *active_cpus;
static const struct cpumask *active_cpus;
static void *stop_machine_work;
static void set_state(enum stopmachine_state newstate)

View file

@ -34,6 +34,7 @@
#include <linux/seccomp.h>
#include <linux/cpu.h>
#include <linux/ptrace.h>
#include <linux/fs_struct.h>
#include <linux/compat.h>
#include <linux/syscalls.h>
@ -1013,10 +1014,8 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)
if (err)
goto out;
if (task_pgrp(p) != pgrp) {
if (task_pgrp(p) != pgrp)
change_pid(p, PIDTYPE_PGID, pgrp);
set_task_pgrp(p, pid_nr(pgrp));
}
err = 0;
out:

View file

@ -48,6 +48,7 @@
#include <linux/acpi.h>
#include <linux/reboot.h>
#include <linux/ftrace.h>
#include <linux/slow-work.h>
#include <asm/uaccess.h>
#include <asm/processor.h>
@ -95,12 +96,9 @@ static int sixty = 60;
static int neg_one = -1;
#endif
#if defined(CONFIG_MMU) && defined(CONFIG_FILE_LOCKING)
static int two = 2;
#endif
static int zero;
static int one = 1;
static int two = 2;
static unsigned long one_ul = 1;
static int one_hundred = 100;
@ -900,6 +898,14 @@ static struct ctl_table kern_table[] = {
.proc_handler = &scan_unevictable_handler,
},
#endif
#ifdef CONFIG_SLOW_WORK
{
.ctl_name = CTL_UNNUMBERED,
.procname = "slow-work",
.mode = 0555,
.child = slow_work_sysctls,
},
#endif
/*
* NOTE: do not add new entries to this table unless you have read
* Documentation/sysctl/ctl_unnumbered.txt
@ -1010,7 +1016,7 @@ static struct ctl_table vm_table[] = {
.data = &dirty_expire_interval,
.maxlen = sizeof(dirty_expire_interval),
.mode = 0644,
.proc_handler = &proc_dointvec_userhz_jiffies,
.proc_handler = &proc_dointvec,
},
{
.ctl_name = VM_NR_PDFLUSH_THREADS,
@ -1373,10 +1379,7 @@ static struct ctl_table fs_table[] = {
.data = &lease_break_time,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = &proc_dointvec_minmax,
.strategy = &sysctl_intvec,
.extra1 = &zero,
.extra2 = &two,
.proc_handler = &proc_dointvec,
},
#endif
#ifdef CONFIG_AIO
@ -1417,7 +1420,10 @@ static struct ctl_table fs_table[] = {
.data = &suid_dumpable,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = &proc_dointvec,
.proc_handler = &proc_dointvec_minmax,
.strategy = &sysctl_intvec,
.extra1 = &zero,
.extra2 = &two,
},
#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE)
{

View file

@ -219,6 +219,7 @@ static const struct trans_ctl_table trans_net_ipv4_conf_vars_table[] = {
{ NET_IPV4_CONF_ARP_IGNORE, "arp_ignore" },
{ NET_IPV4_CONF_PROMOTE_SECONDARIES, "promote_secondaries" },
{ NET_IPV4_CONF_ARP_ACCEPT, "arp_accept" },
{ NET_IPV4_CONF_ARP_NOTIFY, "arp_notify" },
{}
};

View file

@ -1,4 +1,4 @@
obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o
obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o
obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o
obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o

View file

@ -68,6 +68,17 @@ void clockevents_set_mode(struct clock_event_device *dev,
if (dev->mode != mode) {
dev->set_mode(mode, dev);
dev->mode = mode;
/*
* A nsec2cyc multiplicator of 0 is invalid and we'd crash
* on it, so fix it up and emit a warning:
*/
if (mode == CLOCK_EVT_MODE_ONESHOT) {
if (unlikely(!dev->mult)) {
dev->mult = 1;
WARN_ON(1);
}
}
}
}
@ -168,15 +179,6 @@ void clockevents_register_device(struct clock_event_device *dev)
BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
BUG_ON(!dev->cpumask);
/*
* A nsec2cyc multiplicator of 0 is invalid and we'd crash
* on it, so fix it up and emit a warning:
*/
if (unlikely(!dev->mult)) {
dev->mult = 1;
WARN_ON(1);
}
spin_lock(&clockevents_lock);
list_add(&dev->list, &clockevent_devices);

View file

@ -31,6 +31,82 @@
#include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */
#include <linux/tick.h>
void timecounter_init(struct timecounter *tc,
const struct cyclecounter *cc,
u64 start_tstamp)
{
tc->cc = cc;
tc->cycle_last = cc->read(cc);
tc->nsec = start_tstamp;
}
EXPORT_SYMBOL(timecounter_init);
/**
* timecounter_read_delta - get nanoseconds since last call of this function
* @tc: Pointer to time counter
*
* When the underlying cycle counter runs over, this will be handled
* correctly as long as it does not run over more than once between
* calls.
*
* The first call to this function for a new time counter initializes
* the time tracking and returns an undefined result.
*/
static u64 timecounter_read_delta(struct timecounter *tc)
{
cycle_t cycle_now, cycle_delta;
u64 ns_offset;
/* read cycle counter: */
cycle_now = tc->cc->read(tc->cc);
/* calculate the delta since the last timecounter_read_delta(): */
cycle_delta = (cycle_now - tc->cycle_last) & tc->cc->mask;
/* convert to nanoseconds: */
ns_offset = cyclecounter_cyc2ns(tc->cc, cycle_delta);
/* update time stamp of timecounter_read_delta() call: */
tc->cycle_last = cycle_now;
return ns_offset;
}
u64 timecounter_read(struct timecounter *tc)
{
u64 nsec;
/* increment time by nanoseconds since last call */
nsec = timecounter_read_delta(tc);
nsec += tc->nsec;
tc->nsec = nsec;
return nsec;
}
EXPORT_SYMBOL(timecounter_read);
u64 timecounter_cyc2time(struct timecounter *tc,
cycle_t cycle_tstamp)
{
u64 cycle_delta = (cycle_tstamp - tc->cycle_last) & tc->cc->mask;
u64 nsec;
/*
* Instead of always treating cycle_tstamp as more recent
* than tc->cycle_last, detect when it is too far in the
* future and treat it as old time stamp instead.
*/
if (cycle_delta > tc->cc->mask / 2) {
cycle_delta = (tc->cycle_last - cycle_tstamp) & tc->cc->mask;
nsec = tc->nsec - cyclecounter_cyc2ns(tc->cc, cycle_delta);
} else {
nsec = cyclecounter_cyc2ns(tc->cc, cycle_delta) + tc->nsec;
}
return nsec;
}
EXPORT_SYMBOL(timecounter_cyc2time);
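A minimal sketch of how a driver might feed a free-running hardware counter through this API; the register read and the mult/shift values are illustrative assumptions, not something this patch defines:
static cycle_t example_read(const struct cyclecounter *cc)
{
	return 0;	/* would read a free-running hardware register */
}
static struct cyclecounter example_cc = {
	.read	= example_read,
	.mask	= CLOCKSOURCE_MASK(32),	/* 32-bit wide counter */
	.mult	= 1000,			/* assumed: 1 cycle == 1 usec */
	.shift	= 0,
};
static struct timecounter example_tc;
static void example_setup(void)
{
	/* start the nanosecond count at 0 */
	timecounter_init(&example_tc, &example_cc, 0);
}
static u64 example_now_ns(void)
{
	/* nanoseconds accumulated since example_setup() */
	return timecounter_read(&example_tc);
}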
/* XXX - Would like a better way for initializing curr_clocksource */
extern struct clocksource clocksource_jiffies;

View file

@ -1,71 +1,129 @@
/*
* linux/kernel/time/ntp.c
*
* NTP state machine interfaces and logic.
*
* This code was mainly moved from kernel/timer.c and kernel/time.c
* Please see those files for relevant copyright info and historical
* changelogs.
*/
#include <linux/mm.h>
#include <linux/time.h>
#include <linux/timex.h>
#include <linux/jiffies.h>
#include <linux/hrtimer.h>
#include <linux/capability.h>
#include <linux/math64.h>
#include <linux/clocksource.h>
#include <linux/workqueue.h>
#include <asm/timex.h>
#include <linux/hrtimer.h>
#include <linux/jiffies.h>
#include <linux/math64.h>
#include <linux/timex.h>
#include <linux/time.h>
#include <linux/mm.h>
/*
* Timekeeping variables
* NTP timekeeping variables:
*/
unsigned long tick_usec = TICK_USEC; /* USER_HZ period (usec) */
unsigned long tick_nsec; /* ACTHZ period (nsec) */
u64 tick_length;
static u64 tick_length_base;
static struct hrtimer leap_timer;
/* USER_HZ period (usecs): */
unsigned long tick_usec = TICK_USEC;
#define MAX_TICKADJ 500 /* microsecs */
#define MAX_TICKADJ_SCALED (((u64)(MAX_TICKADJ * NSEC_PER_USEC) << \
NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ)
/* ACTHZ period (nsecs): */
unsigned long tick_nsec;
u64 tick_length;
static u64 tick_length_base;
static struct hrtimer leap_timer;
#define MAX_TICKADJ 500LL /* usecs */
#define MAX_TICKADJ_SCALED \
(((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ)
/*
* phase-lock loop variables
*/
/* TIME_ERROR prevents overwriting the CMOS clock */
static int time_state = TIME_OK; /* clock synchronization status */
int time_status = STA_UNSYNC; /* clock status bits */
static long time_tai; /* TAI offset (s) */
static s64 time_offset; /* time adjustment (ns) */
static long time_constant = 2; /* pll time constant */
long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */
long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */
static s64 time_freq; /* frequency offset (scaled ns/s)*/
static long time_reftime; /* time at last adjustment (s) */
long time_adjust;
static long ntp_tick_adj;
/*
* clock synchronization status
*
* (TIME_ERROR prevents overwriting the CMOS clock)
*/
static int time_state = TIME_OK;
/* clock status bits: */
int time_status = STA_UNSYNC;
/* TAI offset (secs): */
static long time_tai;
/* time adjustment (nsecs): */
static s64 time_offset;
/* pll time constant: */
static long time_constant = 2;
/* maximum error (usecs): */
long time_maxerror = NTP_PHASE_LIMIT;
/* estimated error (usecs): */
long time_esterror = NTP_PHASE_LIMIT;
/* frequency offset (scaled nsecs/secs): */
static s64 time_freq;
/* time at last adjustment (secs): */
static long time_reftime;
long time_adjust;
/* constant (boot-param configurable) NTP tick adjustment (upscaled) */
static s64 ntp_tick_adj;
/*
* NTP methods:
*/
/*
* Update (tick_length, tick_length_base, tick_nsec), based
* on (tick_usec, ntp_tick_adj, time_freq):
*/
static void ntp_update_frequency(void)
{
u64 second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ)
<< NTP_SCALE_SHIFT;
second_length += (s64)ntp_tick_adj << NTP_SCALE_SHIFT;
second_length += time_freq;
u64 second_length;
u64 new_base;
tick_length_base = second_length;
second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ)
<< NTP_SCALE_SHIFT;
tick_nsec = div_u64(second_length, HZ) >> NTP_SCALE_SHIFT;
tick_length_base = div_u64(tick_length_base, NTP_INTERVAL_FREQ);
second_length += ntp_tick_adj;
second_length += time_freq;
tick_nsec = div_u64(second_length, HZ) >> NTP_SCALE_SHIFT;
new_base = div_u64(second_length, NTP_INTERVAL_FREQ);
/*
* Don't wait for the next second_overflow, apply
* the change to the tick length immediately:
*/
tick_length += new_base - tick_length_base;
tick_length_base = new_base;
}
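A worked example of the rewritten path, assuming HZ = 1000, USER_HZ = 100, NTP_INTERVAL_FREQ == HZ and the default tick_usec = 10000, with time_freq and ntp_tick_adj both zero:
second_length = (10000 * 1000 * 100) << NTP_SCALE_SHIFT   /* 1e9 ns/s, upscaled */
tick_nsec     = (second_length / HZ) >> NTP_SCALE_SHIFT   /* = 1000000 ns */
new_base      = second_length / NTP_INTERVAL_FREQ         /* per-tick length, upscaled */
Any difference between new_base and the previous tick_length_base is folded into tick_length immediately, instead of waiting for the next second_overflow().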
static inline s64 ntp_update_offset_fll(s64 offset64, long secs)
{
time_status &= ~STA_MODE;
if (secs < MINSEC)
return 0;
if (!(time_status & STA_FLL) && (secs <= MAXSEC))
return 0;
time_status |= STA_MODE;
return div_s64(offset64 << (NTP_SCALE_SHIFT - SHIFT_FLL), secs);
}
static void ntp_update_offset(long offset)
{
long mtemp;
s64 freq_adj;
s64 offset64;
long secs;
if (!(time_status & STA_PLL))
return;
@ -84,24 +142,23 @@ static void ntp_update_offset(long offset)
* Select how the frequency is to be controlled
* and in which mode (PLL or FLL).
*/
if (time_status & STA_FREQHOLD || time_reftime == 0)
time_reftime = xtime.tv_sec;
mtemp = xtime.tv_sec - time_reftime;
secs = xtime.tv_sec - time_reftime;
if (unlikely(time_status & STA_FREQHOLD))
secs = 0;
time_reftime = xtime.tv_sec;
freq_adj = (s64)offset * mtemp;
freq_adj <<= NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant);
time_status &= ~STA_MODE;
if (mtemp >= MINSEC && (time_status & STA_FLL || mtemp > MAXSEC)) {
freq_adj += div_s64((s64)offset << (NTP_SCALE_SHIFT - SHIFT_FLL),
mtemp);
time_status |= STA_MODE;
}
freq_adj += time_freq;
freq_adj = min(freq_adj, MAXFREQ_SCALED);
time_freq = max(freq_adj, -MAXFREQ_SCALED);
offset64 = offset;
freq_adj = (offset64 * secs) <<
(NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant));
time_offset = div_s64((s64)offset << NTP_SCALE_SHIFT, NTP_INTERVAL_FREQ);
freq_adj += ntp_update_offset_fll(offset64, secs);
freq_adj = min(freq_adj + time_freq, MAXFREQ_SCALED);
time_freq = max(freq_adj, -MAXFREQ_SCALED);
time_offset = div_s64(offset64 << NTP_SCALE_SHIFT, NTP_INTERVAL_FREQ);
}
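Restated as equations, the two correction terms computed above (both upscaled by NTP_SCALE_SHIFT) are:
PLL term: freq_adj  = offset * secs * 2^(NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant))
FLL term: freq_adj += offset * 2^(NTP_SCALE_SHIFT - SHIFT_FLL) / secs
where the FLL term only contributes when secs >= MINSEC and either STA_FLL is set or secs > MAXSEC; the sum is then clamped to +/- MAXFREQ_SCALED before becoming the new time_freq.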
/**
@ -111,15 +168,15 @@ static void ntp_update_offset(long offset)
*/
void ntp_clear(void)
{
time_adjust = 0; /* stop active adjtime() */
time_status |= STA_UNSYNC;
time_maxerror = NTP_PHASE_LIMIT;
time_esterror = NTP_PHASE_LIMIT;
time_adjust = 0; /* stop active adjtime() */
time_status |= STA_UNSYNC;
time_maxerror = NTP_PHASE_LIMIT;
time_esterror = NTP_PHASE_LIMIT;
ntp_update_frequency();
tick_length = tick_length_base;
time_offset = 0;
tick_length = tick_length_base;
time_offset = 0;
}
/*
@ -140,8 +197,8 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
xtime.tv_sec--;
wall_to_monotonic.tv_sec++;
time_state = TIME_OOP;
printk(KERN_NOTICE "Clock: "
"inserting leap second 23:59:60 UTC\n");
printk(KERN_NOTICE
"Clock: inserting leap second 23:59:60 UTC\n");
hrtimer_add_expires_ns(&leap_timer, NSEC_PER_SEC);
res = HRTIMER_RESTART;
break;
@ -150,8 +207,8 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
time_tai--;
wall_to_monotonic.tv_sec--;
time_state = TIME_WAIT;
printk(KERN_NOTICE "Clock: "
"deleting leap second 23:59:59 UTC\n");
printk(KERN_NOTICE
"Clock: deleting leap second 23:59:59 UTC\n");
break;
case TIME_OOP:
time_tai++;
@ -179,7 +236,7 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
*/
void second_overflow(void)
{
s64 time_adj;
s64 delta;
/* Bump the maxerror field */
time_maxerror += MAXFREQ / NSEC_PER_USEC;
@ -192,24 +249,30 @@ void second_overflow(void)
* Compute the phase adjustment for the next second. The offset is
* reduced by a fixed factor times the time constant.
*/
tick_length = tick_length_base;
time_adj = shift_right(time_offset, SHIFT_PLL + time_constant);
time_offset -= time_adj;
tick_length += time_adj;
tick_length = tick_length_base;
if (unlikely(time_adjust)) {
if (time_adjust > MAX_TICKADJ) {
time_adjust -= MAX_TICKADJ;
tick_length += MAX_TICKADJ_SCALED;
} else if (time_adjust < -MAX_TICKADJ) {
time_adjust += MAX_TICKADJ;
tick_length -= MAX_TICKADJ_SCALED;
} else {
tick_length += (s64)(time_adjust * NSEC_PER_USEC /
NTP_INTERVAL_FREQ) << NTP_SCALE_SHIFT;
time_adjust = 0;
}
delta = shift_right(time_offset, SHIFT_PLL + time_constant);
time_offset -= delta;
tick_length += delta;
if (!time_adjust)
return;
if (time_adjust > MAX_TICKADJ) {
time_adjust -= MAX_TICKADJ;
tick_length += MAX_TICKADJ_SCALED;
return;
}
if (time_adjust < -MAX_TICKADJ) {
time_adjust += MAX_TICKADJ;
tick_length -= MAX_TICKADJ_SCALED;
return;
}
tick_length += (s64)(time_adjust * NSEC_PER_USEC / NTP_INTERVAL_FREQ)
<< NTP_SCALE_SHIFT;
time_adjust = 0;
}
#ifdef CONFIG_GENERIC_CMOS_UPDATE
@ -233,12 +296,13 @@ static void sync_cmos_clock(struct work_struct *work)
* This code is run on a timer. If the clock is set, that timer
* may not expire at the correct time. Thus, we adjust...
*/
if (!ntp_synced())
if (!ntp_synced()) {
/*
* Not synced, exit, do not restart a timer (if one is
* running, let it run out).
*/
return;
}
getnstimeofday(&now);
if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2)
@ -270,7 +334,116 @@ static void notify_cmos_timer(void)
static inline void notify_cmos_timer(void) { }
#endif
/* adjtimex mainly allows reading (and writing, if superuser) of
/*
* Start the leap seconds timer:
*/
static inline void ntp_start_leap_timer(struct timespec *ts)
{
long now = ts->tv_sec;
if (time_status & STA_INS) {
time_state = TIME_INS;
now += 86400 - now % 86400;
hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS);
return;
}
if (time_status & STA_DEL) {
time_state = TIME_DEL;
now += 86400 - (now + 1) % 86400;
hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS);
}
}
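A quick check of the midnight arithmetic above: for an insertion with ts->tv_sec = 86461 (00:01:01 on day two of the epoch), now % 86400 = 61, so now becomes 172800 and the timer fires at the next UTC midnight, where 23:59:60 is inserted. For a deletion, the (now + 1) term makes the timer fire at 172799, i.e. at 23:59:59, the second being deleted.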
/*
* Propagate a new txc->status value into the NTP state:
*/
static inline void process_adj_status(struct timex *txc, struct timespec *ts)
{
if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) {
time_state = TIME_OK;
time_status = STA_UNSYNC;
}
/*
* If we turn on PLL adjustments then reset the
* reference time to current time.
*/
if (!(time_status & STA_PLL) && (txc->status & STA_PLL))
time_reftime = xtime.tv_sec;
/* only set allowed bits */
time_status &= STA_RONLY;
time_status |= txc->status & ~STA_RONLY;
switch (time_state) {
case TIME_OK:
ntp_start_leap_timer(ts);
break;
case TIME_INS:
case TIME_DEL:
time_state = TIME_OK;
ntp_start_leap_timer(ts);
case TIME_WAIT:
if (!(time_status & (STA_INS | STA_DEL)))
time_state = TIME_OK;
break;
case TIME_OOP:
hrtimer_restart(&leap_timer);
break;
}
}
/*
* Called with the xtime lock held, so we can access and modify
* all the global NTP state:
*/
static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts)
{
if (txc->modes & ADJ_STATUS)
process_adj_status(txc, ts);
if (txc->modes & ADJ_NANO)
time_status |= STA_NANO;
if (txc->modes & ADJ_MICRO)
time_status &= ~STA_NANO;
if (txc->modes & ADJ_FREQUENCY) {
time_freq = txc->freq * PPM_SCALE;
time_freq = min(time_freq, MAXFREQ_SCALED);
time_freq = max(time_freq, -MAXFREQ_SCALED);
}
if (txc->modes & ADJ_MAXERROR)
time_maxerror = txc->maxerror;
if (txc->modes & ADJ_ESTERROR)
time_esterror = txc->esterror;
if (txc->modes & ADJ_TIMECONST) {
time_constant = txc->constant;
if (!(time_status & STA_NANO))
time_constant += 4;
time_constant = min(time_constant, (long)MAXTC);
time_constant = max(time_constant, 0l);
}
if (txc->modes & ADJ_TAI && txc->constant > 0)
time_tai = txc->constant;
if (txc->modes & ADJ_OFFSET)
ntp_update_offset(txc->offset);
if (txc->modes & ADJ_TICK)
tick_usec = txc->tick;
if (txc->modes & (ADJ_TICK|ADJ_FREQUENCY|ADJ_OFFSET))
ntp_update_frequency();
}
/*
* adjtimex mainly allows reading (and writing, if superuser) of
* kernel time-keeping variables. Used by xntpd.
*/
int do_adjtimex(struct timex *txc)
@ -291,11 +464,14 @@ int do_adjtimex(struct timex *txc)
if (txc->modes && !capable(CAP_SYS_TIME))
return -EPERM;
/* if the quartz is off by more than 10% something is VERY wrong! */
/*
* if the quartz is off by more than 10% then
* something is VERY wrong!
*/
if (txc->modes & ADJ_TICK &&
(txc->tick < 900000/USER_HZ ||
txc->tick > 1100000/USER_HZ))
return -EINVAL;
return -EINVAL;
if (txc->modes & ADJ_STATUS && time_state != TIME_OK)
hrtimer_cancel(&leap_timer);
@ -305,7 +481,6 @@ int do_adjtimex(struct timex *txc)
write_seqlock_irq(&xtime_lock);
/* If there are input parameters, then process them */
if (txc->modes & ADJ_ADJTIME) {
long save_adjust = time_adjust;
@ -315,98 +490,24 @@ int do_adjtimex(struct timex *txc)
ntp_update_frequency();
}
txc->offset = save_adjust;
goto adj_done;
}
if (txc->modes) {
long sec;
} else {
if (txc->modes & ADJ_STATUS) {
if ((time_status & STA_PLL) &&
!(txc->status & STA_PLL)) {
time_state = TIME_OK;
time_status = STA_UNSYNC;
}
/* only set allowed bits */
time_status &= STA_RONLY;
time_status |= txc->status & ~STA_RONLY;
/* If there are input parameters, then process them: */
if (txc->modes)
process_adjtimex_modes(txc, &ts);
switch (time_state) {
case TIME_OK:
start_timer:
sec = ts.tv_sec;
if (time_status & STA_INS) {
time_state = TIME_INS;
sec += 86400 - sec % 86400;
hrtimer_start(&leap_timer, ktime_set(sec, 0), HRTIMER_MODE_ABS);
} else if (time_status & STA_DEL) {
time_state = TIME_DEL;
sec += 86400 - (sec + 1) % 86400;
hrtimer_start(&leap_timer, ktime_set(sec, 0), HRTIMER_MODE_ABS);
}
break;
case TIME_INS:
case TIME_DEL:
time_state = TIME_OK;
goto start_timer;
break;
case TIME_WAIT:
if (!(time_status & (STA_INS | STA_DEL)))
time_state = TIME_OK;
break;
case TIME_OOP:
hrtimer_restart(&leap_timer);
break;
}
}
if (txc->modes & ADJ_NANO)
time_status |= STA_NANO;
if (txc->modes & ADJ_MICRO)
time_status &= ~STA_NANO;
if (txc->modes & ADJ_FREQUENCY) {
time_freq = (s64)txc->freq * PPM_SCALE;
time_freq = min(time_freq, MAXFREQ_SCALED);
time_freq = max(time_freq, -MAXFREQ_SCALED);
}
if (txc->modes & ADJ_MAXERROR)
time_maxerror = txc->maxerror;
if (txc->modes & ADJ_ESTERROR)
time_esterror = txc->esterror;
if (txc->modes & ADJ_TIMECONST) {
time_constant = txc->constant;
if (!(time_status & STA_NANO))
time_constant += 4;
time_constant = min(time_constant, (long)MAXTC);
time_constant = max(time_constant, 0l);
}
if (txc->modes & ADJ_TAI && txc->constant > 0)
time_tai = txc->constant;
if (txc->modes & ADJ_OFFSET)
ntp_update_offset(txc->offset);
if (txc->modes & ADJ_TICK)
tick_usec = txc->tick;
if (txc->modes & (ADJ_TICK|ADJ_FREQUENCY|ADJ_OFFSET))
ntp_update_frequency();
}
txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ,
txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ,
NTP_SCALE_SHIFT);
if (!(time_status & STA_NANO))
txc->offset /= NSEC_PER_USEC;
if (!(time_status & STA_NANO))
txc->offset /= NSEC_PER_USEC;
}
adj_done:
result = time_state; /* mostly `TIME_OK' */
if (time_status & (STA_UNSYNC|STA_CLOCKERR))
result = TIME_ERROR;
txc->freq = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) *
(s64)PPM_SCALE_INV, NTP_SCALE_SHIFT);
PPM_SCALE_INV, NTP_SCALE_SHIFT);
txc->maxerror = time_maxerror;
txc->esterror = time_esterror;
txc->status = time_status;
@ -425,6 +526,7 @@ adj_done:
txc->calcnt = 0;
txc->errcnt = 0;
txc->stbcnt = 0;
write_sequnlock_irq(&xtime_lock);
txc->time.tv_sec = ts.tv_sec;
@ -440,6 +542,8 @@ adj_done:
static int __init ntp_tick_adj_setup(char *str)
{
ntp_tick_adj = simple_strtol(str, NULL, 0);
ntp_tick_adj <<= NTP_SCALE_SHIFT;
return 1;
}

kernel/time/timecompare.c (new file, 191 lines)
View file

@ -0,0 +1,191 @@
/*
* Copyright (C) 2009 Intel Corporation.
* Author: Patrick Ohly <patrick.ohly@intel.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include <linux/timecompare.h>
#include <linux/module.h>
#include <linux/math64.h>
/*
* fixed point arithmetic scale factor for skew
*
* Usually one would measure skew in ppb (parts per billion, 1e9), but
* using a power of 2 simplifies the math.
*/
#define TIMECOMPARE_SKEW_RESOLUTION (((s64)1)<<30)
ktime_t timecompare_transform(struct timecompare *sync,
u64 source_tstamp)
{
u64 nsec;
nsec = source_tstamp + sync->offset;
nsec += (s64)(source_tstamp - sync->last_update) * sync->skew /
TIMECOMPARE_SKEW_RESOLUTION;
return ns_to_ktime(nsec);
}
EXPORT_SYMBOL(timecompare_transform);
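A worked example of the skew correction: for a source clock measured to run 0.1% slow against the target, skew would be roughly 0.001 * 2^30 ~= 1073742, so a source timestamp one second (1e9 ns) past last_update picks up an extra 1e9 * 1073742 / 2^30 ~= 1000000 ns, i.e. the missing millisecond, on top of the constant offset.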
int timecompare_offset(struct timecompare *sync,
s64 *offset,
u64 *source_tstamp)
{
u64 start_source = 0, end_source = 0;
struct {
s64 offset;
s64 duration_target;
} buffer[10], sample, *samples;
int counter = 0, i;
int used;
int index;
int num_samples = sync->num_samples;
if (num_samples > sizeof(buffer)/sizeof(buffer[0])) {
samples = kmalloc(sizeof(*samples) * num_samples, GFP_ATOMIC);
if (!samples) {
samples = buffer;
num_samples = sizeof(buffer)/sizeof(buffer[0]);
}
} else {
samples = buffer;
}
/* run until we have enough valid samples, but do not try forever */
i = 0;
counter = 0;
while (1) {
u64 ts;
ktime_t start, end;
start = sync->target();
ts = timecounter_read(sync->source);
end = sync->target();
if (!i)
start_source = ts;
/* ignore negative durations */
sample.duration_target = ktime_to_ns(ktime_sub(end, start));
if (sample.duration_target >= 0) {
/*
* assume symmetric delay to and from source:
* average target time corresponds to measured
* source time
*/
sample.offset =
ktime_to_ns(ktime_add(end, start)) / 2 -
ts;
/* simple insertion sort based on duration */
index = counter - 1;
while (index >= 0) {
if (samples[index].duration_target <
sample.duration_target)
break;
samples[index + 1] = samples[index];
index--;
}
samples[index + 1] = sample;
counter++;
}
i++;
if (counter >= num_samples || i >= 100000) {
end_source = ts;
break;
}
}
*source_tstamp = (end_source + start_source) / 2;
/* remove outliers by only using 75% of the samples */
used = counter * 3 / 4;
if (!used)
used = counter;
if (used) {
/* calculate average */
s64 off = 0;
for (index = 0; index < used; index++)
off += samples[index].offset;
*offset = div_s64(off, used);
}
if (samples && samples != buffer)
kfree(samples);
return used;
}
EXPORT_SYMBOL(timecompare_offset);
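For instance, with 10 valid samples collected and kept sorted in ascending duration_target order, used = 10 * 3 / 4 = 7, so the reported offset is the mean of the 7 offsets measured with the lowest round-trip latency; the 3 slowest readings, the ones most likely to have been disturbed, are discarded.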
void __timecompare_update(struct timecompare *sync,
u64 source_tstamp)
{
s64 offset;
u64 average_time;
if (!timecompare_offset(sync, &offset, &average_time))
return;
if (!sync->last_update) {
sync->last_update = average_time;
sync->offset = offset;
sync->skew = 0;
} else {
s64 delta_nsec = average_time - sync->last_update;
/* avoid division by negative or small deltas */
if (delta_nsec >= 10000) {
s64 delta_offset_nsec = offset - sync->offset;
s64 skew; /* delta_offset_nsec *
TIMECOMPARE_SKEW_RESOLUTION /
delta_nsec */
u64 divisor;
/* div_s64() is limited to 32 bit divisor */
skew = delta_offset_nsec * TIMECOMPARE_SKEW_RESOLUTION;
divisor = delta_nsec;
while (unlikely(divisor >= ((s64)1) << 32)) {
/* divide both by 2; beware, right shift
of negative value has undefined
behavior and can only be used for
the positive divisor */
skew = div_s64(skew, 2);
divisor >>= 1;
}
skew = div_s64(skew, divisor);
/*
* Calculate new overall skew as 4/16 the
* old value and 12/16 the new one. This is
* a rather arbitrary tradeoff between
* only using the latest measurement (0/16 and
* 16/16) and even more weight on past measurements.
*/
#define TIMECOMPARE_NEW_SKEW_PER_16 12
sync->skew =
div_s64((16 - TIMECOMPARE_NEW_SKEW_PER_16) *
sync->skew +
TIMECOMPARE_NEW_SKEW_PER_16 * skew,
16);
sync->last_update = average_time;
sync->offset = offset;
}
}
}
EXPORT_SYMBOL(__timecompare_update);
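The weighting at the end is an exponentially weighted moving average: new_skew = (4 * old_skew + 12 * measured_skew) / 16, i.e. a weight of 12/16 = 0.75 on the newest measurement and 4/16 = 0.25 on the accumulated history.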

View file

@ -491,14 +491,18 @@ static inline void debug_timer_free(struct timer_list *timer)
debug_object_free(timer, &timer_debug_descr);
}
static void __init_timer(struct timer_list *timer);
static void __init_timer(struct timer_list *timer,
const char *name,
struct lock_class_key *key);
void init_timer_on_stack(struct timer_list *timer)
void init_timer_on_stack_key(struct timer_list *timer,
const char *name,
struct lock_class_key *key)
{
debug_object_init_on_stack(timer, &timer_debug_descr);
__init_timer(timer);
__init_timer(timer, name, key);
}
EXPORT_SYMBOL_GPL(init_timer_on_stack);
EXPORT_SYMBOL_GPL(init_timer_on_stack_key);
void destroy_timer_on_stack(struct timer_list *timer)
{
@ -512,7 +516,9 @@ static inline void debug_timer_activate(struct timer_list *timer) { }
static inline void debug_timer_deactivate(struct timer_list *timer) { }
#endif
static void __init_timer(struct timer_list *timer)
static void __init_timer(struct timer_list *timer,
const char *name,
struct lock_class_key *key)
{
timer->entry.next = NULL;
timer->base = __raw_get_cpu_var(tvec_bases);
@ -521,6 +527,7 @@ static void __init_timer(struct timer_list *timer)
timer->start_pid = -1;
memset(timer->start_comm, 0, TASK_COMM_LEN);
#endif
lockdep_init_map(&timer->lockdep_map, name, key, 0);
}
/**
@ -530,19 +537,23 @@ static void __init_timer(struct timer_list *timer)
* init_timer() must be done to a timer prior to calling *any* of the
* other timer functions.
*/
void init_timer(struct timer_list *timer)
void init_timer_key(struct timer_list *timer,
const char *name,
struct lock_class_key *key)
{
debug_timer_init(timer);
__init_timer(timer);
__init_timer(timer, name, key);
}
EXPORT_SYMBOL(init_timer);
EXPORT_SYMBOL(init_timer_key);
void init_timer_deferrable(struct timer_list *timer)
void init_timer_deferrable_key(struct timer_list *timer,
const char *name,
struct lock_class_key *key)
{
init_timer(timer);
init_timer_key(timer, name, key);
timer_set_deferrable(timer);
}
EXPORT_SYMBOL(init_timer_deferrable);
EXPORT_SYMBOL(init_timer_deferrable_key);
static inline void detach_timer(struct timer_list *timer,
int clear_pending)
@ -589,11 +600,14 @@ static struct tvec_base *lock_timer_base(struct timer_list *timer,
}
}
int __mod_timer(struct timer_list *timer, unsigned long expires)
static inline int
__mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
{
struct tvec_base *base, *new_base;
unsigned long flags;
int ret = 0;
int ret;
ret = 0;
timer_stats_timer_set_start_info(timer);
BUG_ON(!timer->function);
@ -603,6 +617,9 @@ int __mod_timer(struct timer_list *timer, unsigned long expires)
if (timer_pending(timer)) {
detach_timer(timer, 0);
ret = 1;
} else {
if (pending_only)
goto out_unlock;
}
debug_timer_activate(timer);
@ -629,12 +646,83 @@ int __mod_timer(struct timer_list *timer, unsigned long expires)
timer->expires = expires;
internal_add_timer(base, timer);
out_unlock:
spin_unlock_irqrestore(&base->lock, flags);
return ret;
}
EXPORT_SYMBOL(__mod_timer);
/**
* mod_timer_pending - modify a pending timer's timeout
* @timer: the pending timer to be modified
* @expires: new timeout in jiffies
*
* mod_timer_pending() is the same for pending timers as mod_timer(),
* but will not re-activate and modify already deleted timers.
*
* It is useful for unserialized use of timers.
*/
int mod_timer_pending(struct timer_list *timer, unsigned long expires)
{
return __mod_timer(timer, expires, true);
}
EXPORT_SYMBOL(mod_timer_pending);
/**
* mod_timer - modify a timer's timeout
* @timer: the timer to be modified
* @expires: new timeout in jiffies
*
* mod_timer() is a more efficient way to update the expire field of an
* active timer (if the timer is inactive it will be activated)
*
* mod_timer(timer, expires) is equivalent to:
*
* del_timer(timer); timer->expires = expires; add_timer(timer);
*
* Note that if there are multiple unserialized concurrent users of the
* same timer, then mod_timer() is the only safe way to modify the timeout,
* since add_timer() cannot modify an already running timer.
*
* The function returns whether it has modified a pending timer or not.
* (ie. mod_timer() of an inactive timer returns 0, mod_timer() of an
* active timer returns 1.)
*/
int mod_timer(struct timer_list *timer, unsigned long expires)
{
/*
* This is a common optimization triggered by the
* networking code - if the timer is re-modified
* to be the same thing then just return:
*/
if (timer->expires == expires && timer_pending(timer))
return 1;
return __mod_timer(timer, expires, false);
}
EXPORT_SYMBOL(mod_timer);
/**
* add_timer - start a timer
* @timer: the timer to be added
*
* The kernel will do a ->function(->data) callback from the
* timer interrupt at the ->expires point in the future. The
* current time is 'jiffies'.
*
* The timer's ->expires, ->function (and if the handler uses it, ->data)
* fields must be set prior to calling this function.
*
* Timers with an ->expires field in the past will be executed in the next
* timer tick.
*/
void add_timer(struct timer_list *timer)
{
BUG_ON(timer_pending(timer));
mod_timer(timer, timer->expires);
}
EXPORT_SYMBOL(add_timer);
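A minimal usage sketch of the classic API these helpers keep working (names are illustrative; init_timer() here presumably becomes a macro wrapping init_timer_key(), as the header side of this patch is not shown):
static void example_timeout(unsigned long data)
{
	/* runs in softirq context once the timer expires */
}
static struct timer_list example_timer;
static void example_arm(void)
{
	init_timer(&example_timer);
	example_timer.function = example_timeout;
	example_timer.data = 0;
	mod_timer(&example_timer, jiffies + HZ);	/* ~1 second from now */
}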
/**
* add_timer_on - start a timer on a particular CPU
@ -666,44 +754,6 @@ void add_timer_on(struct timer_list *timer, int cpu)
spin_unlock_irqrestore(&base->lock, flags);
}
/**
* mod_timer - modify a timer's timeout
* @timer: the timer to be modified
* @expires: new timeout in jiffies
*
* mod_timer() is a more efficient way to update the expire field of an
* active timer (if the timer is inactive it will be activated)
*
* mod_timer(timer, expires) is equivalent to:
*
* del_timer(timer); timer->expires = expires; add_timer(timer);
*
* Note that if there are multiple unserialized concurrent users of the
* same timer, then mod_timer() is the only safe way to modify the timeout,
* since add_timer() cannot modify an already running timer.
*
* The function returns whether it has modified a pending timer or not.
* (ie. mod_timer() of an inactive timer returns 0, mod_timer() of an
* active timer returns 1.)
*/
int mod_timer(struct timer_list *timer, unsigned long expires)
{
BUG_ON(!timer->function);
timer_stats_timer_set_start_info(timer);
/*
* This is a common optimization triggered by the
* networking code - if the timer is re-modified
* to be the same thing then just return:
*/
if (timer->expires == expires && timer_pending(timer))
return 1;
return __mod_timer(timer, expires);
}
EXPORT_SYMBOL(mod_timer);
/**
* del_timer - deactive a timer.
* @timer: the timer to be deactivated
@ -733,7 +783,6 @@ int del_timer(struct timer_list *timer)
return ret;
}
EXPORT_SYMBOL(del_timer);
#ifdef CONFIG_SMP
@ -767,7 +816,6 @@ out:
return ret;
}
EXPORT_SYMBOL(try_to_del_timer_sync);
/**
@ -789,6 +837,15 @@ EXPORT_SYMBOL(try_to_del_timer_sync);
*/
int del_timer_sync(struct timer_list *timer)
{
#ifdef CONFIG_LOCKDEP
unsigned long flags;
local_irq_save(flags);
lock_map_acquire(&timer->lockdep_map);
lock_map_release(&timer->lockdep_map);
local_irq_restore(flags);
#endif
for (;;) {
int ret = try_to_del_timer_sync(timer);
if (ret >= 0)
@ -796,7 +853,6 @@ int del_timer_sync(struct timer_list *timer)
cpu_relax();
}
}
EXPORT_SYMBOL(del_timer_sync);
#endif
@ -861,10 +917,36 @@ static inline void __run_timers(struct tvec_base *base)
set_running_timer(base, timer);
detach_timer(timer, 1);
spin_unlock_irq(&base->lock);
{
int preempt_count = preempt_count();
#ifdef CONFIG_LOCKDEP
/*
* It is permissible to free the timer from
* inside the function that is called from
* it, this we need to take into account for
* lockdep too. To avoid bogus "held lock
* freed" warnings as well as problems when
* looking into timer->lockdep_map, make a
* copy and use that here.
*/
struct lockdep_map lockdep_map =
timer->lockdep_map;
#endif
/*
* Couple the lock chain with the lock chain at
* del_timer_sync() by acquiring the lock_map
* around the fn() call here and in
* del_timer_sync().
*/
lock_map_acquire(&lockdep_map);
fn(data);
lock_map_release(&lockdep_map);
if (preempt_count != preempt_count()) {
printk(KERN_ERR "huh, entered %p "
"with preempt_count %08x, exited"
@ -1268,7 +1350,7 @@ signed long __sched schedule_timeout(signed long timeout)
expire = timeout + jiffies;
setup_timer_on_stack(&timer, process_timeout, (unsigned long)current);
__mod_timer(&timer, expire);
__mod_timer(&timer, expire, false);
schedule();
del_singleshot_timer_sync(&timer);

View file

@ -9,6 +9,9 @@ config USER_STACKTRACE_SUPPORT
config NOP_TRACER
bool
config HAVE_FTRACE_NMI_ENTER
bool
config HAVE_FUNCTION_TRACER
bool
@ -31,12 +34,20 @@ config HAVE_FTRACE_MCOUNT_RECORD
config HAVE_HW_BRANCH_TRACER
bool
config HAVE_FTRACE_SYSCALLS
bool
config TRACER_MAX_TRACE
bool
config RING_BUFFER
bool
config FTRACE_NMI_ENTER
bool
depends on HAVE_FTRACE_NMI_ENTER
default y
config TRACING
bool
select DEBUG_FS
@ -44,13 +55,29 @@ config TRACING
select STACKTRACE if STACKTRACE_SUPPORT
select TRACEPOINTS
select NOP_TRACER
select BINARY_PRINTF
#
# Minimum requirements an architecture has to meet for us to
# be able to offer generic tracing facilities:
#
config TRACING_SUPPORT
bool
# PPC32 has no irqflags tracing support, but it can use most of the
# tracers anyway, they were tested to build and work. Note that new
# exceptions to this list aren't welcomed, better implement the
# irqflags tracing for your architecture.
depends on TRACE_IRQFLAGS_SUPPORT || PPC32
depends on STACKTRACE_SUPPORT
default y
if TRACING_SUPPORT
menu "Tracers"
config FUNCTION_TRACER
bool "Kernel Function Tracer"
depends on HAVE_FUNCTION_TRACER
depends on DEBUG_KERNEL
select FRAME_POINTER
select KALLSYMS
select TRACING
@ -72,18 +99,16 @@ config FUNCTION_GRAPH_TRACER
help
Enable the kernel to trace a function at both its return
and its entry.
It's first purpose is to trace the duration of functions and
draw a call graph for each thread with some informations like
the return value.
This is done by setting the current return address on the current
task structure into a stack of calls.
Its first purpose is to trace the duration of functions and
draw a call graph for each thread with some information like
the return value. This is done by setting the current return
address on the current task structure into a stack of calls.
config IRQSOFF_TRACER
bool "Interrupts-off Latency Tracer"
default n
depends on TRACE_IRQFLAGS_SUPPORT
depends on GENERIC_TIME
depends on DEBUG_KERNEL
select TRACE_IRQFLAGS
select TRACING
select TRACER_MAX_TRACE
@ -106,7 +131,6 @@ config PREEMPT_TRACER
default n
depends on GENERIC_TIME
depends on PREEMPT
depends on DEBUG_KERNEL
select TRACING
select TRACER_MAX_TRACE
help
@ -127,13 +151,13 @@ config SYSPROF_TRACER
bool "Sysprof Tracer"
depends on X86
select TRACING
select CONTEXT_SWITCH_TRACER
help
This tracer provides the trace needed by the 'Sysprof' userspace
tool.
config SCHED_TRACER
bool "Scheduling Latency Tracer"
depends on DEBUG_KERNEL
select TRACING
select CONTEXT_SWITCH_TRACER
select TRACER_MAX_TRACE
@ -143,16 +167,30 @@ config SCHED_TRACER
config CONTEXT_SWITCH_TRACER
bool "Trace process context switches"
depends on DEBUG_KERNEL
select TRACING
select MARKERS
help
This tracer gets called from the context switch and records
all switching of tasks.
config EVENT_TRACER
bool "Trace various events in the kernel"
select TRACING
help
This tracer hooks to various trace points in the kernel
allowing the user to pick and choose which trace points they
want to trace.
config FTRACE_SYSCALLS
bool "Trace syscalls"
depends on HAVE_FTRACE_SYSCALLS
select TRACING
select KALLSYMS
help
Basic tracer to catch the syscall entry and exit events.
config BOOT_TRACER
bool "Trace boot initcalls"
depends on DEBUG_KERNEL
select TRACING
select CONTEXT_SWITCH_TRACER
help
@ -165,13 +203,11 @@ config BOOT_TRACER
representation of the delays during initcalls - but the raw
/debug/tracing/trace text output is readable too.
( Note that tracing self tests can't be enabled if this tracer is
selected, because the self-tests are an initcall as well and that
would invalidate the boot trace. )
You must pass in ftrace=initcall to the kernel command line
to enable this on bootup.
config TRACE_BRANCH_PROFILING
bool "Trace likely/unlikely profiler"
depends on DEBUG_KERNEL
select TRACING
help
This tracer profiles all the likely and unlikely macros
@ -224,7 +260,6 @@ config BRANCH_TRACER
config POWER_TRACER
bool "Trace power consumption behavior"
depends on DEBUG_KERNEL
depends on X86
select TRACING
help
@ -236,7 +271,6 @@ config POWER_TRACER
config STACK_TRACER
bool "Trace max stack"
depends on HAVE_FUNCTION_TRACER
depends on DEBUG_KERNEL
select FUNCTION_TRACER
select STACKTRACE
select KALLSYMS
@ -266,11 +300,66 @@ config HW_BRANCH_TRACER
This tracer records all branches on the system in a circular
buffer giving access to the last N branches for each cpu.
config KMEMTRACE
bool "Trace SLAB allocations"
select TRACING
help
kmemtrace provides tracing for slab allocator functions, such as
kmalloc, kfree, kmem_cache_alloc, kmem_cache_free, etc. Collected
data is then fed to the userspace application in order to analyse
allocation hotspots, internal fragmentation and so on, making it
possible to see how well an allocator performs, as well as debug
and profile kernel code.
This requires a userspace application. See
Documentation/vm/kmemtrace.txt for more information.
Saying Y will make the kernel somewhat larger and slower. However,
if you disable kmemtrace at run-time or boot-time, the performance
impact is minimal (depending on the arch the kernel is built for).
If unsure, say N.
config WORKQUEUE_TRACER
bool "Trace workqueues"
select TRACING
help
The workqueue tracer provides some statistical information
about each cpu workqueue thread, such as the number of works
inserted and executed since its creation. It can help to
evaluate the amount of work each of them has to perform. For
example, it can help a developer decide whether to use a per
cpu workqueue instead of a singlethreaded one.
config BLK_DEV_IO_TRACE
bool "Support for tracing block io actions"
depends on SYSFS
depends on BLOCK
select RELAY
select DEBUG_FS
select TRACEPOINTS
select TRACING
select STACKTRACE
help
Say Y here if you want to be able to trace the block layer actions
on a given queue. Tracing allows you to see any traffic happening
on a block device queue. For more information (and the userspace
support tools needed), fetch the blktrace tools from:
git://git.kernel.dk/blktrace.git
Tracing also is possible using the ftrace interface, e.g.:
echo 1 > /sys/block/sda/sda1/trace/enable
echo blk > /sys/kernel/debug/tracing/current_tracer
cat /sys/kernel/debug/tracing/trace_pipe
If unsure, say N.
config DYNAMIC_FTRACE
bool "enable/disable ftrace tracepoints dynamically"
depends on FUNCTION_TRACER
depends on HAVE_DYNAMIC_FTRACE
depends on DEBUG_KERNEL
default y
help
This option will modify all the calls to ftrace dynamically
@ -296,7 +385,7 @@ config FTRACE_SELFTEST
config FTRACE_STARTUP_TEST
bool "Perform a startup test on ftrace"
depends on TRACING && DEBUG_KERNEL && !BOOT_TRACER
depends on TRACING
select FTRACE_SELFTEST
help
This option performs a series of startup tests on ftrace. On bootup
@ -306,7 +395,7 @@ config FTRACE_STARTUP_TEST
config MMIOTRACE
bool "Memory mapped IO tracing"
depends on HAVE_MMIOTRACE_SUPPORT && DEBUG_KERNEL && PCI
depends on HAVE_MMIOTRACE_SUPPORT && PCI
select TRACING
help
Mmiotrace traces Memory Mapped I/O access and is meant for
@ -328,3 +417,6 @@ config MMIOTRACE_TEST
Say N, unless you absolutely know what you are doing.
endmenu
endif # TRACING_SUPPORT

View file

@ -19,6 +19,10 @@ obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o
obj-$(CONFIG_RING_BUFFER) += ring_buffer.o
obj-$(CONFIG_TRACING) += trace.o
obj-$(CONFIG_TRACING) += trace_clock.o
obj-$(CONFIG_TRACING) += trace_output.o
obj-$(CONFIG_TRACING) += trace_stat.o
obj-$(CONFIG_TRACING) += trace_printk.o
obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
obj-$(CONFIG_SYSPROF_TRACER) += trace_sysprof.o
obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o
@ -33,5 +37,14 @@ obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o
obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o
obj-$(CONFIG_POWER_TRACER) += trace_power.o
obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
obj-$(CONFIG_EVENT_TRACER) += trace_events.o
obj-$(CONFIG_EVENT_TRACER) += events.o
obj-$(CONFIG_EVENT_TRACER) += trace_export.o
obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o
obj-$(CONFIG_EVENT_TRACER) += trace_events_filter.o
libftrace-y := ftrace.o

kernel/trace/blktrace.c (new file, 1549 lines)

File diff suppressed because it is too large

kernel/trace/events.c (new file, 14 lines)
View file

@ -0,0 +1,14 @@
/*
* This is the place to register all trace points as events.
*/
#include <linux/stringify.h>
#include <trace/trace_events.h>
#include "trace_output.h"
#include "trace_events_stage_1.h"
#include "trace_events_stage_2.h"
#include "trace_events_stage_3.h"

File diff suppressed because it is too large

kernel/trace/kmemtrace.c (new file, 339 lines)
View file

@ -0,0 +1,339 @@
/*
* Memory allocator tracing
*
* Copyright (C) 2008 Eduard - Gabriel Munteanu
* Copyright (C) 2008 Pekka Enberg <penberg@cs.helsinki.fi>
* Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
*/
#include <linux/dcache.h>
#include <linux/debugfs.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <trace/kmemtrace.h>
#include "trace.h"
#include "trace_output.h"
/* Select an alternative, minimalistic output rather than the original one */
#define TRACE_KMEM_OPT_MINIMAL 0x1
static struct tracer_opt kmem_opts[] = {
/* Default disable the minimalistic output */
{ TRACER_OPT(kmem_minimalistic, TRACE_KMEM_OPT_MINIMAL) },
{ }
};
static struct tracer_flags kmem_tracer_flags = {
.val = 0,
.opts = kmem_opts
};
static bool kmem_tracing_enabled __read_mostly;
static struct trace_array *kmemtrace_array;
static int kmem_trace_init(struct trace_array *tr)
{
int cpu;
kmemtrace_array = tr;
for_each_cpu_mask(cpu, cpu_possible_map)
tracing_reset(tr, cpu);
kmem_tracing_enabled = true;
return 0;
}
static void kmem_trace_reset(struct trace_array *tr)
{
kmem_tracing_enabled = false;
}
static void kmemtrace_headers(struct seq_file *s)
{
/* Don't need headers for the original kmemtrace output */
if (!(kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL))
return;
seq_printf(s, "#\n");
seq_printf(s, "# ALLOC TYPE REQ GIVEN FLAGS "
" POINTER NODE CALLER\n");
seq_printf(s, "# FREE | | | | "
" | | | |\n");
seq_printf(s, "# |\n\n");
}
/*
* The following two functions give the original output from kmemtrace,
* or something close to it... perhaps a few details are still missing
*/
static enum print_line_t
kmemtrace_print_alloc_original(struct trace_iterator *iter,
struct kmemtrace_alloc_entry *entry)
{
struct trace_seq *s = &iter->seq;
int ret;
/* Taken from the old linux/kmemtrace.h */
ret = trace_seq_printf(s, "type_id %d call_site %lu ptr %lu "
"bytes_req %lu bytes_alloc %lu gfp_flags %lu node %d\n",
entry->type_id, entry->call_site, (unsigned long) entry->ptr,
(unsigned long) entry->bytes_req, (unsigned long) entry->bytes_alloc,
(unsigned long) entry->gfp_flags, entry->node);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
return TRACE_TYPE_HANDLED;
}
static enum print_line_t
kmemtrace_print_free_original(struct trace_iterator *iter,
struct kmemtrace_free_entry *entry)
{
struct trace_seq *s = &iter->seq;
int ret;
/* Taken from the old linux/kmemtrace.h */
ret = trace_seq_printf(s, "type_id %d call_site %lu ptr %lu\n",
entry->type_id, entry->call_site, (unsigned long) entry->ptr);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
return TRACE_TYPE_HANDLED;
}
/* The other two functions below provide a more minimalistic output */
static enum print_line_t
kmemtrace_print_alloc_compress(struct trace_iterator *iter,
struct kmemtrace_alloc_entry *entry)
{
struct trace_seq *s = &iter->seq;
int ret;
/* Alloc entry */
ret = trace_seq_printf(s, " + ");
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
/* Type */
switch (entry->type_id) {
case KMEMTRACE_TYPE_KMALLOC:
ret = trace_seq_printf(s, "K ");
break;
case KMEMTRACE_TYPE_CACHE:
ret = trace_seq_printf(s, "C ");
break;
case KMEMTRACE_TYPE_PAGES:
ret = trace_seq_printf(s, "P ");
break;
default:
ret = trace_seq_printf(s, "? ");
}
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
/* Requested */
ret = trace_seq_printf(s, "%4zu ", entry->bytes_req);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
/* Allocated */
ret = trace_seq_printf(s, "%4zu ", entry->bytes_alloc);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
/* Flags
* TODO: would be better to see the names of the GFP flags
*/
ret = trace_seq_printf(s, "%08x ", entry->gfp_flags);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
/* Pointer to allocated */
ret = trace_seq_printf(s, "0x%tx ", (ptrdiff_t)entry->ptr);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
/* Node */
ret = trace_seq_printf(s, "%4d ", entry->node);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
/* Call site */
ret = seq_print_ip_sym(s, entry->call_site, 0);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
if (!trace_seq_printf(s, "\n"))
return TRACE_TYPE_PARTIAL_LINE;
return TRACE_TYPE_HANDLED;
}
static enum print_line_t
kmemtrace_print_free_compress(struct trace_iterator *iter,
struct kmemtrace_free_entry *entry)
{
struct trace_seq *s = &iter->seq;
int ret;
/* Free entry */
ret = trace_seq_printf(s, " - ");
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
/* Type */
switch (entry->type_id) {
case KMEMTRACE_TYPE_KMALLOC:
ret = trace_seq_printf(s, "K ");
break;
case KMEMTRACE_TYPE_CACHE:
ret = trace_seq_printf(s, "C ");
break;
case KMEMTRACE_TYPE_PAGES:
ret = trace_seq_printf(s, "P ");
break;
default:
ret = trace_seq_printf(s, "? ");
}
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
/* Skip requested/allocated/flags */
ret = trace_seq_printf(s, " ");
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
/* Pointer to allocated */
ret = trace_seq_printf(s, "0x%tx ", (ptrdiff_t)entry->ptr);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
/* Skip node */
ret = trace_seq_printf(s, " ");
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
/* Call site */
ret = seq_print_ip_sym(s, entry->call_site, 0);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
if (!trace_seq_printf(s, "\n"))
return TRACE_TYPE_PARTIAL_LINE;
return TRACE_TYPE_HANDLED;
}
static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter)
{
struct trace_entry *entry = iter->ent;
switch (entry->type) {
case TRACE_KMEM_ALLOC: {
struct kmemtrace_alloc_entry *field;
trace_assign_type(field, entry);
if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)
return kmemtrace_print_alloc_compress(iter, field);
else
return kmemtrace_print_alloc_original(iter, field);
}
case TRACE_KMEM_FREE: {
struct kmemtrace_free_entry *field;
trace_assign_type(field, entry);
if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)
return kmemtrace_print_free_compress(iter, field);
else
return kmemtrace_print_free_original(iter, field);
}
default:
return TRACE_TYPE_UNHANDLED;
}
}
/* Trace allocations */
void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id,
unsigned long call_site,
const void *ptr,
size_t bytes_req,
size_t bytes_alloc,
gfp_t gfp_flags,
int node)
{
struct ring_buffer_event *event;
struct kmemtrace_alloc_entry *entry;
struct trace_array *tr = kmemtrace_array;
if (!kmem_tracing_enabled)
return;
event = trace_buffer_lock_reserve(tr, TRACE_KMEM_ALLOC,
sizeof(*entry), 0, 0);
if (!event)
return;
entry = ring_buffer_event_data(event);
entry->type_id = type_id;
entry->call_site = call_site;
entry->ptr = ptr;
entry->bytes_req = bytes_req;
entry->bytes_alloc = bytes_alloc;
entry->gfp_flags = gfp_flags;
entry->node = node;
trace_buffer_unlock_commit(tr, event, 0, 0);
}
EXPORT_SYMBOL(kmemtrace_mark_alloc_node);
void kmemtrace_mark_free(enum kmemtrace_type_id type_id,
unsigned long call_site,
const void *ptr)
{
struct ring_buffer_event *event;
struct kmemtrace_free_entry *entry;
struct trace_array *tr = kmemtrace_array;
if (!kmem_tracing_enabled)
return;
event = trace_buffer_lock_reserve(tr, TRACE_KMEM_FREE,
sizeof(*entry), 0, 0);
if (!event)
return;
entry = ring_buffer_event_data(event);
entry->type_id = type_id;
entry->call_site = call_site;
entry->ptr = ptr;
trace_buffer_unlock_commit(tr, event, 0, 0);
}
EXPORT_SYMBOL(kmemtrace_mark_free);
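/*
 * A minimal sketch (not part of the original file) of how an allocator
 * hook could feed the two markers above. The wrapper names are
 * hypothetical; in the kernel the calls come from the slab allocators
 * themselves.
 */
static inline void *example_traced_kmalloc(size_t size, gfp_t flags)
{
	void *ret = kmalloc(size, flags);

	/* ksize() reports what was actually handed out, not just requested */
	kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC, _RET_IP_, ret,
				  size, ret ? ksize(ret) : 0, flags, -1);
	return ret;
}

static inline void example_traced_kfree(const void *ptr)
{
	kmemtrace_mark_free(KMEMTRACE_TYPE_KMALLOC, _RET_IP_, ptr);
	kfree(ptr);
}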
static struct tracer kmem_tracer __read_mostly = {
.name = "kmemtrace",
.init = kmem_trace_init,
.reset = kmem_trace_reset,
.print_line = kmemtrace_print_line,
.print_header = kmemtrace_headers,
.flags = &kmem_tracer_flags
};
void kmemtrace_init(void)
{
/* earliest opportunity to start kmem tracing */
}
static int __init init_kmem_tracer(void)
{
return register_tracer(&kmem_tracer);
}
device_initcall(init_kmem_tracer);

File diff suppressed because it is too large

File diff suppressed because it is too large


@ -9,6 +9,8 @@
#include <linux/mmiotrace.h>
#include <linux/ftrace.h>
#include <trace/boot.h>
#include <trace/kmemtrace.h>
#include <trace/power.h>
enum trace_type {
__TRACE_FIRST_TYPE = 0,
@ -16,9 +18,9 @@ enum trace_type {
TRACE_FN,
TRACE_CTX,
TRACE_WAKE,
TRACE_CONT,
TRACE_STACK,
TRACE_PRINT,
TRACE_BPRINT,
TRACE_SPECIAL,
TRACE_MMIO_RW,
TRACE_MMIO_MAP,
@ -29,9 +31,14 @@ enum trace_type {
TRACE_GRAPH_ENT,
TRACE_USER_STACK,
TRACE_HW_BRANCHES,
TRACE_SYSCALL_ENTER,
TRACE_SYSCALL_EXIT,
TRACE_KMEM_ALLOC,
TRACE_KMEM_FREE,
TRACE_POWER,
TRACE_BLK,
__TRACE_LAST_TYPE
__TRACE_LAST_TYPE,
};
/*
@ -42,7 +49,6 @@ enum trace_type {
*/
struct trace_entry {
unsigned char type;
unsigned char cpu;
unsigned char flags;
unsigned char preempt_count;
int pid;
@ -60,13 +66,13 @@ struct ftrace_entry {
/* Function call entry */
struct ftrace_graph_ent_entry {
struct trace_entry ent;
struct trace_entry ent;
struct ftrace_graph_ent graph_ent;
};
/* Function return entry */
struct ftrace_graph_ret_entry {
struct trace_entry ent;
struct trace_entry ent;
struct ftrace_graph_ret ret;
};
extern struct tracer boot_tracer;
@ -112,12 +118,18 @@ struct userstack_entry {
};
/*
* ftrace_printk entry:
* trace_printk entry:
*/
struct bprint_entry {
struct trace_entry ent;
unsigned long ip;
const char *fmt;
u32 buf[];
};
struct print_entry {
struct trace_entry ent;
unsigned long ip;
int depth;
char buf[];
};
@ -170,15 +182,45 @@ struct trace_power {
struct power_trace state_data;
};
struct kmemtrace_alloc_entry {
struct trace_entry ent;
enum kmemtrace_type_id type_id;
unsigned long call_site;
const void *ptr;
size_t bytes_req;
size_t bytes_alloc;
gfp_t gfp_flags;
int node;
};
struct kmemtrace_free_entry {
struct trace_entry ent;
enum kmemtrace_type_id type_id;
unsigned long call_site;
const void *ptr;
};
struct syscall_trace_enter {
struct trace_entry ent;
int nr;
unsigned long args[];
};
struct syscall_trace_exit {
struct trace_entry ent;
int nr;
unsigned long ret;
};
/*
* trace_flag_type is an enumeration that holds different
* states when a trace occurs. These are:
* IRQS_OFF - interrupts were disabled
* IRQS_NOSUPPORT - arch does not support irqs_disabled_flags
* IRQS_NOSUPPORT - arch does not support irqs_disabled_flags
* NEED_RESCED - reschedule is requested
* HARDIRQ - inside an interrupt handler
* SOFTIRQ - inside a softirq handler
* CONT - multiple entries hold the trace item
*/
enum trace_flag_type {
TRACE_FLAG_IRQS_OFF = 0x01,
@ -186,7 +228,6 @@ enum trace_flag_type {
TRACE_FLAG_NEED_RESCHED = 0x04,
TRACE_FLAG_HARDIRQ = 0x08,
TRACE_FLAG_SOFTIRQ = 0x10,
TRACE_FLAG_CONT = 0x20,
};
#define TRACE_BUF_SIZE 1024
@ -198,6 +239,7 @@ enum trace_flag_type {
*/
struct trace_array_cpu {
atomic_t disabled;
void *buffer_page; /* ring buffer spare */
/* these fields get copied into max-trace: */
unsigned long trace_idx;
@ -262,10 +304,10 @@ extern void __ftrace_bad_type(void);
do { \
IF_ASSIGN(var, ent, struct ftrace_entry, TRACE_FN); \
IF_ASSIGN(var, ent, struct ctx_switch_entry, 0); \
IF_ASSIGN(var, ent, struct trace_field_cont, TRACE_CONT); \
IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK); \
IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\
IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \
IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \
IF_ASSIGN(var, ent, struct special_entry, 0); \
IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \
TRACE_MMIO_RW); \
@ -279,7 +321,15 @@ extern void __ftrace_bad_type(void);
IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \
TRACE_GRAPH_RET); \
IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\
IF_ASSIGN(var, ent, struct trace_power, TRACE_POWER); \
IF_ASSIGN(var, ent, struct trace_power, TRACE_POWER); \
IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \
TRACE_KMEM_ALLOC); \
IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \
TRACE_KMEM_FREE); \
IF_ASSIGN(var, ent, struct syscall_trace_enter, \
TRACE_SYSCALL_ENTER); \
IF_ASSIGN(var, ent, struct syscall_trace_exit, \
TRACE_SYSCALL_EXIT); \
__ftrace_bad_type(); \
} while (0)
@ -287,7 +337,8 @@ extern void __ftrace_bad_type(void);
enum print_line_t {
TRACE_TYPE_PARTIAL_LINE = 0, /* Retry after flushing the seq */
TRACE_TYPE_HANDLED = 1,
TRACE_TYPE_UNHANDLED = 2 /* Relay to other output functions */
TRACE_TYPE_UNHANDLED = 2, /* Relay to other output functions */
TRACE_TYPE_NO_CONSUME = 3 /* Handled but ask to not consume */
};
@ -297,8 +348,8 @@ enum print_line_t {
* flags value in struct tracer_flags.
*/
struct tracer_opt {
const char *name; /* Will appear on the trace_options file */
u32 bit; /* Mask assigned in val field in tracer_flags */
const char *name; /* Will appear on the trace_options file */
u32 bit; /* Mask assigned in val field in tracer_flags */
};
/*
@ -307,28 +358,51 @@ struct tracer_opt {
*/
struct tracer_flags {
u32 val;
struct tracer_opt *opts;
struct tracer_opt *opts;
};
/* Makes more easy to define a tracer opt */
#define TRACER_OPT(s, b) .name = #s, .bit = b
/*
* A specific tracer, represented by methods that operate on a trace array:
/**
* struct tracer - a specific tracer and its callbacks to interact with debugfs
* @name: the name chosen to select it on the available_tracers file
* @init: called when one switches to this tracer (echo name > current_tracer)
* @reset: called when one switches to another tracer
* @start: called when tracing is unpaused (echo 1 > tracing_enabled)
* @stop: called when tracing is paused (echo 0 > tracing_enabled)
* @open: called when the trace file is opened
* @pipe_open: called when the trace_pipe file is opened
* @wait_pipe: override how the user waits for traces on trace_pipe
* @close: called when the trace file is released
* @read: override the default read callback on trace_pipe
* @splice_read: override the default splice_read callback on trace_pipe
* @selftest: selftest to run on boot (see trace_selftest.c)
 * @print_header: override the first lines that describe your columns
* @print_line: callback that prints a trace
* @set_flag: signals one of your private flags changed (trace_options file)
* @flags: your private flags
*/
struct tracer {
const char *name;
/* Your tracer should raise a warning if init fails */
int (*init)(struct trace_array *tr);
void (*reset)(struct trace_array *tr);
void (*start)(struct trace_array *tr);
void (*stop)(struct trace_array *tr);
void (*open)(struct trace_iterator *iter);
void (*pipe_open)(struct trace_iterator *iter);
void (*wait_pipe)(struct trace_iterator *iter);
void (*close)(struct trace_iterator *iter);
ssize_t (*read)(struct trace_iterator *iter,
struct file *filp, char __user *ubuf,
size_t cnt, loff_t *ppos);
ssize_t (*splice_read)(struct trace_iterator *iter,
struct file *filp,
loff_t *ppos,
struct pipe_inode_info *pipe,
size_t len,
unsigned int flags);
#ifdef CONFIG_FTRACE_STARTUP_TEST
int (*selftest)(struct tracer *trace,
struct trace_array *tr);
@ -339,7 +413,8 @@ struct tracer {
int (*set_flag)(u32 old_flags, u32 bit, int set);
struct tracer *next;
int print_max;
struct tracer_flags *flags;
struct tracer_flags *flags;
struct tracer_stat *stats;
};
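/*
 * A minimal sketch (not from the original header): the smallest viable
 * tracer needs only a name and the init/reset pair; every other
 * callback documented above is optional. The "example" tracer here is
 * hypothetical and compiled out.
 */
#if 0
static int example_trace_init(struct trace_array *tr)
{
	return 0;		/* nothing to set up */
}

static void example_trace_reset(struct trace_array *tr)
{
}

static struct tracer example_tracer __read_mostly = {
	.name	= "example",
	.init	= example_trace_init,
	.reset	= example_trace_reset,
};
#endif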
struct trace_seq {
@ -348,6 +423,16 @@ struct trace_seq {
unsigned int readpos;
};
static inline void
trace_seq_init(struct trace_seq *s)
{
s->len = 0;
s->readpos = 0;
}
#define TRACE_PIPE_ALL_CPU -1
/*
* Trace iterator - used by printout routines who present trace
* results to users and which routines might sleep, etc:
@ -356,6 +441,8 @@ struct trace_iterator {
struct trace_array *tr;
struct tracer *trace;
void *private;
int cpu_file;
struct mutex mutex;
struct ring_buffer_iter *buffer_iter[NR_CPUS];
/* The below is zeroed out in pipe_read */
@ -371,6 +458,7 @@ struct trace_iterator {
cpumask_var_t started;
};
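/*
 * Sketch of the calling convention (hypothetical handler, compiled
 * out): a print_line-style callback writes into iter->seq and reports
 * TRACE_TYPE_PARTIAL_LINE when the one-page seq buffer fills, so the
 * core can flush and retry.
 */
#if 0
static enum print_line_t example_print_line(struct trace_iterator *iter)
{
	struct trace_seq *s = &iter->seq;

	if (!trace_seq_printf(s, "answer: %d\n", 42))
		return TRACE_TYPE_PARTIAL_LINE;
	return TRACE_TYPE_HANDLED;
}
#endif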
int tracer_init(struct tracer *t, struct trace_array *tr);
int tracing_is_enabled(void);
void trace_wake_up(void);
void tracing_reset(struct trace_array *tr, int cpu);
@ -379,26 +467,50 @@ int tracing_open_generic(struct inode *inode, struct file *filp);
struct dentry *tracing_init_dentry(void);
void init_tracer_sysprof_debugfs(struct dentry *d_tracer);
struct ring_buffer_event;
struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr,
unsigned char type,
unsigned long len,
unsigned long flags,
int pc);
void trace_buffer_unlock_commit(struct trace_array *tr,
struct ring_buffer_event *event,
unsigned long flags, int pc);
struct ring_buffer_event *
trace_current_buffer_lock_reserve(unsigned char type, unsigned long len,
unsigned long flags, int pc);
void trace_current_buffer_unlock_commit(struct ring_buffer_event *event,
unsigned long flags, int pc);
void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event,
unsigned long flags, int pc);
struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
struct trace_array_cpu *data);
struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
int *ent_cpu, u64 *ent_ts);
void tracing_generic_entry_update(struct trace_entry *entry,
unsigned long flags,
int pc);
void default_wait_pipe(struct trace_iterator *iter);
void poll_wait_pipe(struct trace_iterator *iter);
void ftrace(struct trace_array *tr,
struct trace_array_cpu *data,
unsigned long ip,
unsigned long parent_ip,
unsigned long flags, int pc);
void tracing_sched_switch_trace(struct trace_array *tr,
struct trace_array_cpu *data,
struct task_struct *prev,
struct task_struct *next,
unsigned long flags, int pc);
void tracing_record_cmdline(struct task_struct *tsk);
void tracing_sched_wakeup_trace(struct trace_array *tr,
struct trace_array_cpu *data,
struct task_struct *wakee,
struct task_struct *cur,
unsigned long flags, int pc);
@ -408,14 +520,12 @@ void trace_special(struct trace_array *tr,
unsigned long arg2,
unsigned long arg3, int pc);
void trace_function(struct trace_array *tr,
struct trace_array_cpu *data,
unsigned long ip,
unsigned long parent_ip,
unsigned long flags, int pc);
void trace_graph_return(struct ftrace_graph_ret *trace);
int trace_graph_entry(struct ftrace_graph_ent *trace);
void trace_hw_branch(struct trace_array *tr, u64 from, u64 to);
void tracing_start_cmdline_record(void);
void tracing_stop_cmdline_record(void);
@ -434,15 +544,11 @@ void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu);
void update_max_tr_single(struct trace_array *tr,
struct task_struct *tsk, int cpu);
extern cycle_t ftrace_now(int cpu);
void __trace_stack(struct trace_array *tr,
unsigned long flags,
int skip, int pc);
#ifdef CONFIG_FUNCTION_TRACER
void tracing_start_function_trace(void);
void tracing_stop_function_trace(void);
#else
# define tracing_start_function_trace() do { } while (0)
# define tracing_stop_function_trace() do { } while (0)
#endif
extern cycle_t ftrace_now(int cpu);
#ifdef CONFIG_CONTEXT_SWITCH_TRACER
typedef void
@ -456,10 +562,10 @@ struct tracer_switch_ops {
void *private;
struct tracer_switch_ops *next;
};
char *trace_find_cmdline(int pid);
#endif /* CONFIG_CONTEXT_SWITCH_TRACER */
extern void trace_find_cmdline(int pid, char comm[]);
#ifdef CONFIG_DYNAMIC_FTRACE
extern unsigned long ftrace_update_tot_cnt;
#define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func
@ -469,6 +575,8 @@ extern int DYN_FTRACE_TEST_NAME(void);
#ifdef CONFIG_FTRACE_STARTUP_TEST
extern int trace_selftest_startup_function(struct tracer *trace,
struct trace_array *tr);
extern int trace_selftest_startup_function_graph(struct tracer *trace,
struct trace_array *tr);
extern int trace_selftest_startup_irqsoff(struct tracer *trace,
struct trace_array *tr);
extern int trace_selftest_startup_preemptoff(struct tracer *trace,
@ -488,18 +596,11 @@ extern int trace_selftest_startup_branch(struct tracer *trace,
#endif /* CONFIG_FTRACE_STARTUP_TEST */
extern void *head_page(struct trace_array_cpu *data);
extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...);
extern void trace_seq_print_cont(struct trace_seq *s,
struct trace_iterator *iter);
extern int
seq_print_ip_sym(struct trace_seq *s, unsigned long ip,
unsigned long sym_flags);
extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf,
size_t cnt);
extern long ns2usecs(cycle_t nsec);
extern int
trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args);
trace_vbprintk(unsigned long ip, const char *fmt, va_list args);
extern int
trace_vprintk(unsigned long ip, const char *fmt, va_list args);
extern unsigned long trace_flags;
@ -580,7 +681,11 @@ enum trace_iterator_flags {
TRACE_ITER_ANNOTATE = 0x2000,
TRACE_ITER_USERSTACKTRACE = 0x4000,
TRACE_ITER_SYM_USEROBJ = 0x8000,
TRACE_ITER_PRINTK_MSGONLY = 0x10000
TRACE_ITER_PRINTK_MSGONLY = 0x10000,
TRACE_ITER_CONTEXT_INFO = 0x20000, /* Print pid/cpu/time */
TRACE_ITER_LATENCY_FMT = 0x40000,
TRACE_ITER_GLOBAL_CLK = 0x80000,
TRACE_ITER_SLEEP_TIME = 0x100000,
};
/*
@ -601,12 +706,12 @@ extern struct tracer nop_trace;
* preempt_enable (after a disable), a schedule might take place
* causing an infinite recursion.
*
* To prevent this, we read the need_recshed flag before
* To prevent this, we read the need_resched flag before
* disabling preemption. When we want to enable preemption we
* check the flag, if it is set, then we call preempt_enable_no_resched.
* Otherwise, we call preempt_enable.
*
* The rational for doing the above is that if need resched is set
* The rationale for doing the above is that if need_resched is set
* and we have yet to reschedule, we are either in an atomic location
* (where we do not need to check for scheduling) or we are inside
* the scheduler and do not want to resched.
@ -627,7 +732,7 @@ static inline int ftrace_preempt_disable(void)
*
* This is a scheduler safe way to enable preemption and not miss
* any preemption checks. The disabled saved the state of preemption.
* If resched is set, then we were either inside an atomic or
* If resched is set, then we are either inside an atomic or
* are inside the scheduler (we would have already scheduled
* otherwise). In this case, we do not want to call normal
* preempt_enable, but preempt_enable_no_resched instead.
@ -664,4 +769,118 @@ static inline void trace_branch_disable(void)
}
#endif /* CONFIG_BRANCH_TRACER */
/* set ring buffers to default size if not already done so */
int tracing_update_buffers(void);
/* trace event type bit fields, not numeric */
enum {
TRACE_EVENT_TYPE_PRINTF = 1,
TRACE_EVENT_TYPE_RAW = 2,
};
struct ftrace_event_field {
struct list_head link;
char *name;
char *type;
int offset;
int size;
};
struct ftrace_event_call {
char *name;
char *system;
struct dentry *dir;
int enabled;
int (*regfunc)(void);
void (*unregfunc)(void);
int id;
int (*raw_init)(void);
int (*show_format)(struct trace_seq *s);
int (*define_fields)(void);
struct list_head fields;
struct filter_pred **preds;
#ifdef CONFIG_EVENT_PROFILE
atomic_t profile_count;
int (*profile_enable)(struct ftrace_event_call *);
void (*profile_disable)(struct ftrace_event_call *);
#endif
};
struct event_subsystem {
struct list_head list;
const char *name;
struct dentry *entry;
struct filter_pred **preds;
};
#define events_for_each(event) \
for (event = __start_ftrace_events; \
(unsigned long)event < (unsigned long)__stop_ftrace_events; \
event++)
#define MAX_FILTER_PRED 8
struct filter_pred;
typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event);
struct filter_pred {
filter_pred_fn_t fn;
u64 val;
char *str_val;
int str_len;
char *field_name;
int offset;
int not;
int or;
int compound;
int clear;
};
int trace_define_field(struct ftrace_event_call *call, char *type,
char *name, int offset, int size);
extern void filter_free_pred(struct filter_pred *pred);
extern void filter_print_preds(struct filter_pred **preds,
struct trace_seq *s);
extern int filter_parse(char **pbuf, struct filter_pred *pred);
extern int filter_add_pred(struct ftrace_event_call *call,
struct filter_pred *pred);
extern void filter_free_preds(struct ftrace_event_call *call);
extern int filter_match_preds(struct ftrace_event_call *call, void *rec);
extern void filter_free_subsystem_preds(struct event_subsystem *system);
extern int filter_add_subsystem_pred(struct event_subsystem *system,
struct filter_pred *pred);
void event_trace_printk(unsigned long ip, const char *fmt, ...);
extern struct ftrace_event_call __start_ftrace_events[];
extern struct ftrace_event_call __stop_ftrace_events[];
#define for_each_event(event) \
for (event = __start_ftrace_events; \
(unsigned long)event < (unsigned long)__stop_ftrace_events; \
event++)
extern const char *__start___trace_bprintk_fmt[];
extern const char *__stop___trace_bprintk_fmt[];
/*
* The double __builtin_constant_p is because gcc will give us an error
* if we try to assign fmt to the static variable when fmt is not a
* constant, even though the outer if statement would optimize it away.
*/
#define event_trace_printk(ip, fmt, args...) \
do { \
__trace_printk_check_format(fmt, ##args); \
tracing_record_cmdline(current); \
if (__builtin_constant_p(fmt)) { \
static const char *trace_printk_fmt \
__attribute__((section("__trace_printk_fmt"))) = \
__builtin_constant_p(fmt) ? fmt : NULL; \
\
__trace_bprintk(ip, trace_printk_fmt, ##args); \
} else \
__trace_printk(ip, fmt, ##args); \
} while (0)
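/*
 * Illustration (hypothetical caller, compiled out): with a constant
 * format the macro above places fmt in the __trace_printk_fmt section
 * and takes the binary __trace_bprintk() path; a format only known at
 * run time falls back to plain __trace_printk().
 */
#if 0
static void example_event_printk(unsigned long ip, const char *run_fmt)
{
	event_trace_printk(ip, "fast binary path: %d\n", 1);
	event_trace_printk(ip, run_fmt, 2);	/* slow string path */
}
#endif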
#endif /* _LINUX_KERNEL_TRACE_H */


@ -11,6 +11,7 @@
#include <linux/kallsyms.h>
#include "trace.h"
#include "trace_output.h"
static struct trace_array *boot_trace;
static bool pre_initcalls_finished;
@ -27,13 +28,13 @@ void start_boot_trace(void)
void enable_boot_trace(void)
{
if (pre_initcalls_finished)
if (boot_trace && pre_initcalls_finished)
tracing_start_sched_switch_record();
}
void disable_boot_trace(void)
{
if (pre_initcalls_finished)
if (boot_trace && pre_initcalls_finished)
tracing_stop_sched_switch_record();
}
@ -42,6 +43,9 @@ static int boot_trace_init(struct trace_array *tr)
int cpu;
boot_trace = tr;
if (!tr)
return 0;
for_each_cpu(cpu, cpu_possible_mask)
tracing_reset(tr, cpu);
@ -128,10 +132,9 @@ void trace_boot_call(struct boot_trace_call *bt, initcall_t fn)
{
struct ring_buffer_event *event;
struct trace_boot_call *entry;
unsigned long irq_flags;
struct trace_array *tr = boot_trace;
if (!pre_initcalls_finished)
if (!tr || !pre_initcalls_finished)
return;
/* Get its name now since this function could
@ -140,18 +143,13 @@ void trace_boot_call(struct boot_trace_call *bt, initcall_t fn)
sprint_symbol(bt->func, (unsigned long)fn);
preempt_disable();
event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
&irq_flags);
event = trace_buffer_lock_reserve(tr, TRACE_BOOT_CALL,
sizeof(*entry), 0, 0);
if (!event)
goto out;
entry = ring_buffer_event_data(event);
tracing_generic_entry_update(&entry->ent, 0, 0);
entry->ent.type = TRACE_BOOT_CALL;
entry->boot_call = *bt;
ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
trace_wake_up();
trace_buffer_unlock_commit(tr, event, 0, 0);
out:
preempt_enable();
}
@ -160,27 +158,21 @@ void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn)
{
struct ring_buffer_event *event;
struct trace_boot_ret *entry;
unsigned long irq_flags;
struct trace_array *tr = boot_trace;
if (!pre_initcalls_finished)
if (!tr || !pre_initcalls_finished)
return;
sprint_symbol(bt->func, (unsigned long)fn);
preempt_disable();
event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
&irq_flags);
event = trace_buffer_lock_reserve(tr, TRACE_BOOT_RET,
sizeof(*entry), 0, 0);
if (!event)
goto out;
entry = ring_buffer_event_data(event);
tracing_generic_entry_update(&entry->ent, 0, 0);
entry->ent.type = TRACE_BOOT_RET;
entry->boot_ret = *bt;
ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
trace_wake_up();
trace_buffer_unlock_commit(tr, event, 0, 0);
out:
preempt_enable();
}


@ -14,12 +14,17 @@
#include <linux/hash.h>
#include <linux/fs.h>
#include <asm/local.h>
#include "trace.h"
#include "trace_stat.h"
#include "trace_output.h"
#ifdef CONFIG_BRANCH_TRACER
static struct tracer branch_trace;
static int branch_tracing_enabled __read_mostly;
static DEFINE_MUTEX(branch_tracing_mutex);
static struct trace_array *branch_tracer;
static void
@ -28,7 +33,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
struct trace_array *tr = branch_tracer;
struct ring_buffer_event *event;
struct trace_branch *entry;
unsigned long flags, irq_flags;
unsigned long flags;
int cpu, pc;
const char *p;
@ -47,15 +52,13 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
if (atomic_inc_return(&tr->data[cpu]->disabled) != 1)
goto out;
event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
&irq_flags);
pc = preempt_count();
event = trace_buffer_lock_reserve(tr, TRACE_BRANCH,
sizeof(*entry), flags, pc);
if (!event)
goto out;
pc = preempt_count();
entry = ring_buffer_event_data(event);
tracing_generic_entry_update(&entry->ent, flags, pc);
entry->ent.type = TRACE_BRANCH;
/* Strip off the path, only save the file */
p = f->file + strlen(f->file);
@ -70,7 +73,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
entry->line = f->line;
entry->correct = val == expect;
ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
ring_buffer_unlock_commit(tr->buffer, event);
out:
atomic_dec(&tr->data[cpu]->disabled);
@ -88,8 +91,6 @@ void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect)
int enable_branch_tracing(struct trace_array *tr)
{
int ret = 0;
mutex_lock(&branch_tracing_mutex);
branch_tracer = tr;
/*
@ -100,7 +101,7 @@ int enable_branch_tracing(struct trace_array *tr)
branch_tracing_enabled++;
mutex_unlock(&branch_tracing_mutex);
return ret;
return 0;
}
void disable_branch_tracing(void)
@ -128,11 +129,6 @@ static void stop_branch_trace(struct trace_array *tr)
static int branch_trace_init(struct trace_array *tr)
{
int cpu;
for_each_online_cpu(cpu)
tracing_reset(tr, cpu);
start_branch_trace(tr);
return 0;
}
@ -142,22 +138,53 @@ static void branch_trace_reset(struct trace_array *tr)
stop_branch_trace(tr);
}
struct tracer branch_trace __read_mostly =
static enum print_line_t trace_branch_print(struct trace_iterator *iter,
int flags)
{
struct trace_branch *field;
trace_assign_type(field, iter->ent);
if (trace_seq_printf(&iter->seq, "[%s] %s:%s:%d\n",
field->correct ? " ok " : " MISS ",
field->func,
field->file,
field->line))
return TRACE_TYPE_HANDLED;
return TRACE_TYPE_PARTIAL_LINE;
}
static struct trace_event trace_branch_event = {
.type = TRACE_BRANCH,
.trace = trace_branch_print,
};
static struct tracer branch_trace __read_mostly =
{
.name = "branch",
.init = branch_trace_init,
.reset = branch_trace_reset,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_branch,
#endif
#endif /* CONFIG_FTRACE_SELFTEST */
};
__init static int init_branch_trace(void)
__init static int init_branch_tracer(void)
{
int ret;
ret = register_ftrace_event(&trace_branch_event);
if (!ret) {
printk(KERN_WARNING "Warning: could not register "
"branch events\n");
return 1;
}
return register_tracer(&branch_trace);
}
device_initcall(init_branch_tracer);
device_initcall(init_branch_trace);
#else
static inline
void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect)
@ -183,65 +210,38 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect)
}
EXPORT_SYMBOL(ftrace_likely_update);
struct ftrace_pointer {
void *start;
void *stop;
int hit;
};
extern unsigned long __start_annotated_branch_profile[];
extern unsigned long __stop_annotated_branch_profile[];
static void *
t_next(struct seq_file *m, void *v, loff_t *pos)
static int annotated_branch_stat_headers(struct seq_file *m)
{
const struct ftrace_pointer *f = m->private;
struct ftrace_branch_data *p = v;
(*pos)++;
if (v == (void *)1)
return f->start;
++p;
if ((void *)p >= (void *)f->stop)
return NULL;
return p;
}
static void *t_start(struct seq_file *m, loff_t *pos)
{
void *t = (void *)1;
loff_t l = 0;
for (; t && l < *pos; t = t_next(m, t, &l))
;
return t;
}
static void t_stop(struct seq_file *m, void *p)
{
}
static int t_show(struct seq_file *m, void *v)
{
const struct ftrace_pointer *fp = m->private;
struct ftrace_branch_data *p = v;
const char *f;
long percent;
if (v == (void *)1) {
if (fp->hit)
seq_printf(m, " miss hit %% ");
else
seq_printf(m, " correct incorrect %% ");
seq_printf(m, " Function "
seq_printf(m, " correct incorrect %% ");
seq_printf(m, " Function "
" File Line\n"
" ------- --------- - "
" -------- "
" ---- ----\n");
return 0;
}
return 0;
}
static inline long get_incorrect_percent(struct ftrace_branch_data *p)
{
long percent;
if (p->correct) {
percent = p->incorrect * 100;
percent /= p->correct + p->incorrect;
} else
percent = p->incorrect ? 100 : -1;
return percent;
}
static int branch_stat_show(struct seq_file *m, void *v)
{
struct ftrace_branch_data *p = v;
const char *f;
long percent;
/* Only print the file, not the path */
f = p->file + strlen(p->file);
@ -252,11 +252,7 @@ static int t_show(struct seq_file *m, void *v)
/*
* The miss is overlayed on correct, and hit on incorrect.
*/
if (p->correct) {
percent = p->incorrect * 100;
percent /= p->correct + p->incorrect;
} else
percent = p->incorrect ? 100 : -1;
percent = get_incorrect_percent(p);
seq_printf(m, "%8lu %8lu ", p->correct, p->incorrect);
if (percent < 0)
@ -267,76 +263,118 @@ static int t_show(struct seq_file *m, void *v)
return 0;
}
static struct seq_operations tracing_likely_seq_ops = {
.start = t_start,
.next = t_next,
.stop = t_stop,
.show = t_show,
static void *annotated_branch_stat_start(void)
{
return __start_annotated_branch_profile;
}
static void *
annotated_branch_stat_next(void *v, int idx)
{
struct ftrace_branch_data *p = v;
++p;
if ((void *)p >= (void *)__stop_annotated_branch_profile)
return NULL;
return p;
}
static int annotated_branch_stat_cmp(void *p1, void *p2)
{
struct ftrace_branch_data *a = p1;
struct ftrace_branch_data *b = p2;
long percent_a, percent_b;
percent_a = get_incorrect_percent(a);
percent_b = get_incorrect_percent(b);
if (percent_a < percent_b)
return -1;
if (percent_a > percent_b)
return 1;
else
return 0;
}
static struct tracer_stat annotated_branch_stats = {
.name = "branch_annotated",
.stat_start = annotated_branch_stat_start,
.stat_next = annotated_branch_stat_next,
.stat_cmp = annotated_branch_stat_cmp,
.stat_headers = annotated_branch_stat_headers,
.stat_show = branch_stat_show
};
static int tracing_branch_open(struct inode *inode, struct file *file)
__init static int init_annotated_branch_stats(void)
{
int ret;
ret = seq_open(file, &tracing_likely_seq_ops);
ret = register_stat_tracer(&annotated_branch_stats);
if (!ret) {
struct seq_file *m = file->private_data;
m->private = (void *)inode->i_private;
printk(KERN_WARNING "Warning: could not register "
"annotated branches stats\n");
return 1;
}
return ret;
return 0;
}
static const struct file_operations tracing_branch_fops = {
.open = tracing_branch_open,
.read = seq_read,
.llseek = seq_lseek,
};
fs_initcall(init_annotated_branch_stats);
#ifdef CONFIG_PROFILE_ALL_BRANCHES
extern unsigned long __start_branch_profile[];
extern unsigned long __stop_branch_profile[];
static const struct ftrace_pointer ftrace_branch_pos = {
.start = __start_branch_profile,
.stop = __stop_branch_profile,
.hit = 1,
};
#endif /* CONFIG_PROFILE_ALL_BRANCHES */
extern unsigned long __start_annotated_branch_profile[];
extern unsigned long __stop_annotated_branch_profile[];
static const struct ftrace_pointer ftrace_annotated_branch_pos = {
.start = __start_annotated_branch_profile,
.stop = __stop_annotated_branch_profile,
};
static __init int ftrace_branch_init(void)
static int all_branch_stat_headers(struct seq_file *m)
{
struct dentry *d_tracer;
struct dentry *entry;
d_tracer = tracing_init_dentry();
entry = debugfs_create_file("profile_annotated_branch", 0444, d_tracer,
(void *)&ftrace_annotated_branch_pos,
&tracing_branch_fops);
if (!entry)
pr_warning("Could not create debugfs "
"'profile_annotatet_branch' entry\n");
#ifdef CONFIG_PROFILE_ALL_BRANCHES
entry = debugfs_create_file("profile_branch", 0444, d_tracer,
(void *)&ftrace_branch_pos,
&tracing_branch_fops);
if (!entry)
pr_warning("Could not create debugfs"
" 'profile_branch' entry\n");
#endif
seq_printf(m, " miss hit %% ");
seq_printf(m, " Function "
" File Line\n"
" ------- --------- - "
" -------- "
" ---- ----\n");
return 0;
}
device_initcall(ftrace_branch_init);
static void *all_branch_stat_start(void)
{
return __start_branch_profile;
}
static void *
all_branch_stat_next(void *v, int idx)
{
struct ftrace_branch_data *p = v;
++p;
if ((void *)p >= (void *)__stop_branch_profile)
return NULL;
return p;
}
static struct tracer_stat all_branch_stats = {
.name = "branch_all",
.stat_start = all_branch_stat_start,
.stat_next = all_branch_stat_next,
.stat_headers = all_branch_stat_headers,
.stat_show = branch_stat_show
};
__init static int all_annotated_branch_stats(void)
{
int ret;
ret = register_stat_tracer(&all_branch_stats);
if (ret) {
printk(KERN_WARNING "Warning: could not register "
"all branches stats\n");
return 1;
}
return 0;
}
fs_initcall(all_annotated_branch_stats);
#endif /* CONFIG_PROFILE_ALL_BRANCHES */
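/*
 * Usage note (assuming debugfs is mounted at /sys/kernel/debug): the
 * stat tracers registered above surface their tables through the
 * trace_stat directory, keyed by their .name fields:
 *
 *	/sys/kernel/debug/tracing/trace_stat/branch_annotated
 *	/sys/kernel/debug/tracing/trace_stat/branch_all
 */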

kernel/trace/trace_clock.c (new file, 109 lines)

@ -0,0 +1,109 @@
/*
* tracing clocks
*
* Copyright (C) 2009 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
*
* Implements 3 trace clock variants, with differing scalability/precision
* tradeoffs:
*
* - local: CPU-local trace clock
* - medium: scalable global clock with some jitter
* - global: globally monotonic, serialized clock
*
 * Tracer plugins will choose a default from these clocks.
*/
#include <linux/spinlock.h>
#include <linux/hardirq.h>
#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/sched.h>
#include <linux/ktime.h>
#include <linux/trace_clock.h>
/*
* trace_clock_local(): the simplest and least coherent tracing clock.
*
* Useful for tracing that does not cross to other CPUs nor
* does it go through idle events.
*/
u64 notrace trace_clock_local(void)
{
unsigned long flags;
u64 clock;
/*
* sched_clock() is an architecture implemented, fast, scalable,
* lockless clock. It is not guaranteed to be coherent across
* CPUs, nor across CPU idle events.
*/
raw_local_irq_save(flags);
clock = sched_clock();
raw_local_irq_restore(flags);
return clock;
}
/*
* trace_clock(): 'inbetween' trace clock. Not completely serialized,
* but not completely incorrect when crossing CPUs either.
*
* This is based on cpu_clock(), which will allow at most ~1 jiffy of
* jitter between CPUs. So it's a pretty scalable clock, but there
* can be offsets in the trace data.
*/
u64 notrace trace_clock(void)
{
return cpu_clock(raw_smp_processor_id());
}
/*
* trace_clock_global(): special globally coherent trace clock
*
* It has higher overhead than the other trace clocks but is still
* an order of magnitude faster than GTOD derived hardware clocks.
*
* Used by plugins that need globally coherent timestamps.
*/
static u64 prev_trace_clock_time;
static raw_spinlock_t trace_clock_lock ____cacheline_aligned_in_smp =
(raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
u64 notrace trace_clock_global(void)
{
unsigned long flags;
int this_cpu;
u64 now;
raw_local_irq_save(flags);
this_cpu = raw_smp_processor_id();
now = cpu_clock(this_cpu);
/*
* If in an NMI context then don't risk lockups and return the
* cpu_clock() time:
*/
if (unlikely(in_nmi()))
goto out;
__raw_spin_lock(&trace_clock_lock);
/*
* TODO: if this happens often then maybe we should reset
* my_scd->clock to prev_trace_clock_time+1, to make sure
* we start ticking with the local clock from now on?
*/
if ((s64)(now - prev_trace_clock_time) < 0)
now = prev_trace_clock_time + 1;
prev_trace_clock_time = now;
__raw_spin_unlock(&trace_clock_lock);
out:
raw_local_irq_restore(flags);
return now;
}
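/*
 * A small sketch (not in the original file) contrasting the three
 * variants above; each call is safe from nearly any context, they
 * differ only in cross-CPU coherency and cost.
 */
static void __maybe_unused trace_clock_demo(void)
{
	u64 local  = trace_clock_local();	/* fastest, per-CPU only */
	u64 medium = trace_clock();		/* ~1 jiffy of cross-CPU jitter */
	u64 global = trace_clock_global();	/* serialized, globally monotonic */

	pr_info("trace clocks: local=%llu medium=%llu global=%llu\n",
		(unsigned long long)local,
		(unsigned long long)medium,
		(unsigned long long)global);
}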


@ -0,0 +1,31 @@
/*
* trace event based perf counter profiling
*
* Copyright (C) 2009 Red Hat Inc, Peter Zijlstra <pzijlstr@redhat.com>
*
*/
#include "trace.h"
int ftrace_profile_enable(int event_id)
{
struct ftrace_event_call *event;
for_each_event(event) {
if (event->id == event_id)
return event->profile_enable(event);
}
return -EINVAL;
}
void ftrace_profile_disable(int event_id)
{
struct ftrace_event_call *event;
for_each_event(event) {
if (event->id == event_id)
return event->profile_disable(event);
}
}
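/*
 * Usage sketch (hypothetical caller, e.g. the perf counter code that
 * owns the id): bracket a counting window with the two helpers above.
 */
static int __maybe_unused example_profile_window(int event_id)
{
	int err = ftrace_profile_enable(event_id);

	if (err)
		return err;	/* no event registered under this id */

	/* ... counting window runs here ... */

	ftrace_profile_disable(event_id);
	return 0;
}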


@ -0,0 +1,173 @@
#undef TRACE_SYSTEM
#define TRACE_SYSTEM ftrace
/*
* We cheat and use the proto type field as the ID
* and args as the entry type (minus 'struct')
*/
TRACE_EVENT_FORMAT(function, TRACE_FN, ftrace_entry, ignore,
TRACE_STRUCT(
TRACE_FIELD(unsigned long, ip, ip)
TRACE_FIELD(unsigned long, parent_ip, parent_ip)
),
TP_RAW_FMT(" %lx <-- %lx")
);
TRACE_EVENT_FORMAT(funcgraph_entry, TRACE_GRAPH_ENT,
ftrace_graph_ent_entry, ignore,
TRACE_STRUCT(
TRACE_FIELD(unsigned long, graph_ent.func, func)
TRACE_FIELD(int, graph_ent.depth, depth)
),
TP_RAW_FMT("--> %lx (%d)")
);
TRACE_EVENT_FORMAT(funcgraph_exit, TRACE_GRAPH_RET,
ftrace_graph_ret_entry, ignore,
TRACE_STRUCT(
TRACE_FIELD(unsigned long, ret.func, func)
TRACE_FIELD(int, ret.depth, depth)
),
TP_RAW_FMT("<-- %lx (%d)")
);
TRACE_EVENT_FORMAT(wakeup, TRACE_WAKE, ctx_switch_entry, ignore,
TRACE_STRUCT(
TRACE_FIELD(unsigned int, prev_pid, prev_pid)
TRACE_FIELD(unsigned char, prev_prio, prev_prio)
TRACE_FIELD(unsigned char, prev_state, prev_state)
TRACE_FIELD(unsigned int, next_pid, next_pid)
TRACE_FIELD(unsigned char, next_prio, next_prio)
TRACE_FIELD(unsigned char, next_state, next_state)
TRACE_FIELD(unsigned int, next_cpu, next_cpu)
),
TP_RAW_FMT("%u:%u:%u ==+ %u:%u:%u [%03u]")
);
TRACE_EVENT_FORMAT(context_switch, TRACE_CTX, ctx_switch_entry, ignore,
TRACE_STRUCT(
TRACE_FIELD(unsigned int, prev_pid, prev_pid)
TRACE_FIELD(unsigned char, prev_prio, prev_prio)
TRACE_FIELD(unsigned char, prev_state, prev_state)
TRACE_FIELD(unsigned int, next_pid, next_pid)
TRACE_FIELD(unsigned char, next_prio, next_prio)
TRACE_FIELD(unsigned char, next_state, next_state)
TRACE_FIELD(unsigned int, next_cpu, next_cpu)
),
TP_RAW_FMT("%u:%u:%u ==+ %u:%u:%u [%03u]")
);
TRACE_EVENT_FORMAT(special, TRACE_SPECIAL, special_entry, ignore,
TRACE_STRUCT(
TRACE_FIELD(unsigned long, arg1, arg1)
TRACE_FIELD(unsigned long, arg2, arg2)
TRACE_FIELD(unsigned long, arg3, arg3)
),
TP_RAW_FMT("(%08lx) (%08lx) (%08lx)")
);
/*
* Stack-trace entry:
*/
/* #define FTRACE_STACK_ENTRIES 8 */
TRACE_EVENT_FORMAT(kernel_stack, TRACE_STACK, stack_entry, ignore,
TRACE_STRUCT(
TRACE_FIELD(unsigned long, caller[0], stack0)
TRACE_FIELD(unsigned long, caller[1], stack1)
TRACE_FIELD(unsigned long, caller[2], stack2)
TRACE_FIELD(unsigned long, caller[3], stack3)
TRACE_FIELD(unsigned long, caller[4], stack4)
TRACE_FIELD(unsigned long, caller[5], stack5)
TRACE_FIELD(unsigned long, caller[6], stack6)
TRACE_FIELD(unsigned long, caller[7], stack7)
),
TP_RAW_FMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
"\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n")
);
TRACE_EVENT_FORMAT(user_stack, TRACE_USER_STACK, userstack_entry, ignore,
TRACE_STRUCT(
TRACE_FIELD(unsigned long, caller[0], stack0)
TRACE_FIELD(unsigned long, caller[1], stack1)
TRACE_FIELD(unsigned long, caller[2], stack2)
TRACE_FIELD(unsigned long, caller[3], stack3)
TRACE_FIELD(unsigned long, caller[4], stack4)
TRACE_FIELD(unsigned long, caller[5], stack5)
TRACE_FIELD(unsigned long, caller[6], stack6)
TRACE_FIELD(unsigned long, caller[7], stack7)
),
TP_RAW_FMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
"\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n")
);
TRACE_EVENT_FORMAT(bprint, TRACE_BPRINT, bprint_entry, ignore,
TRACE_STRUCT(
TRACE_FIELD(unsigned long, ip, ip)
TRACE_FIELD(char *, fmt, fmt)
TRACE_FIELD_ZERO_CHAR(buf)
),
TP_RAW_FMT("%08lx (%d) fmt:%p %s")
);
TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore,
TRACE_STRUCT(
TRACE_FIELD(unsigned long, ip, ip)
TRACE_FIELD_ZERO_CHAR(buf)
),
TP_RAW_FMT("%08lx (%d) fmt:%p %s")
);
TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore,
TRACE_STRUCT(
TRACE_FIELD(unsigned int, line, line)
TRACE_FIELD_SPECIAL(char func[TRACE_FUNC_SIZE+1], func, func)
TRACE_FIELD_SPECIAL(char file[TRACE_FUNC_SIZE+1], file, file)
TRACE_FIELD(char, correct, correct)
),
TP_RAW_FMT("%u:%s:%s (%u)")
);
TRACE_EVENT_FORMAT(hw_branch, TRACE_HW_BRANCHES, hw_branch_entry, ignore,
TRACE_STRUCT(
TRACE_FIELD(u64, from, from)
TRACE_FIELD(u64, to, to)
),
TP_RAW_FMT("from: %llx to: %llx")
);
TRACE_EVENT_FORMAT(power, TRACE_POWER, trace_power, ignore,
TRACE_STRUCT(
TRACE_FIELD(ktime_t, state_data.stamp, stamp)
TRACE_FIELD(ktime_t, state_data.end, end)
TRACE_FIELD(int, state_data.type, type)
TRACE_FIELD(int, state_data.state, state)
),
TP_RAW_FMT("%llx->%llx type:%u state:%u")
);
TRACE_EVENT_FORMAT(kmem_alloc, TRACE_KMEM_ALLOC, kmemtrace_alloc_entry, ignore,
TRACE_STRUCT(
TRACE_FIELD(enum kmemtrace_type_id, type_id, type_id)
TRACE_FIELD(unsigned long, call_site, call_site)
TRACE_FIELD(const void *, ptr, ptr)
TRACE_FIELD(size_t, bytes_req, bytes_req)
TRACE_FIELD(size_t, bytes_alloc, bytes_alloc)
TRACE_FIELD(gfp_t, gfp_flags, gfp_flags)
TRACE_FIELD(int, node, node)
),
TP_RAW_FMT("type:%u call_site:%lx ptr:%p req:%lu alloc:%lu"
" flags:%x node:%d")
);
TRACE_EVENT_FORMAT(kmem_free, TRACE_KMEM_FREE, kmemtrace_free_entry, ignore,
TRACE_STRUCT(
TRACE_FIELD(enum kmemtrace_type_id, type_id, type_id)
TRACE_FIELD(unsigned long, call_site, call_site)
TRACE_FIELD(const void *, ptr, ptr)
),
TP_RAW_FMT("type:%u call_site:%lx ptr:%p")
);
#undef TRACE_SYSTEM

kernel/trace/trace_events.c (new file, 824 lines)

@ -0,0 +1,824 @@
/*
* event tracer
*
* Copyright (C) 2008 Red Hat Inc, Steven Rostedt <srostedt@redhat.com>
*
* - Added format output of fields of the trace point.
* This was based off of work by Tom Zanussi <tzanussi@gmail.com>.
*
*/
#include <linux/debugfs.h>
#include <linux/uaccess.h>
#include <linux/module.h>
#include <linux/ctype.h>
#include "trace_output.h"
#define TRACE_SYSTEM "TRACE_SYSTEM"
static DEFINE_MUTEX(event_mutex);
int trace_define_field(struct ftrace_event_call *call, char *type,
char *name, int offset, int size)
{
struct ftrace_event_field *field;
field = kzalloc(sizeof(*field), GFP_KERNEL);
if (!field)
goto err;
field->name = kstrdup(name, GFP_KERNEL);
if (!field->name)
goto err;
field->type = kstrdup(type, GFP_KERNEL);
if (!field->type)
goto err;
field->offset = offset;
field->size = size;
list_add(&field->link, &call->fields);
return 0;
err:
if (field) {
kfree(field->name);
kfree(field->type);
}
kfree(field);
return -ENOMEM;
}
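/*
 * Sketch of a ->define_fields() callback built on trace_define_field()
 * (compiled out; the event and its entry layout are hypothetical, real
 * callers are generated by the event macros):
 */
#if 0
struct example_entry {
	struct trace_entry	ent;
	int			value;
};

static struct ftrace_event_call example_call = {
	.name	= "example",
	.fields	= LIST_HEAD_INIT(example_call.fields),
};

static int example_define_fields(void)
{
	struct example_entry field;

	return trace_define_field(&example_call, "int", "value",
				  offsetof(struct example_entry, value),
				  sizeof(field.value));
}
#endif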
static void ftrace_clear_events(void)
{
struct ftrace_event_call *call = (void *)__start_ftrace_events;
while ((unsigned long)call < (unsigned long)__stop_ftrace_events) {
if (call->enabled) {
call->enabled = 0;
call->unregfunc();
}
call++;
}
}
static void ftrace_event_enable_disable(struct ftrace_event_call *call,
int enable)
{
switch (enable) {
case 0:
if (call->enabled) {
call->enabled = 0;
call->unregfunc();
}
break;
case 1:
if (!call->enabled) {
call->enabled = 1;
call->regfunc();
}
break;
}
}
static int ftrace_set_clr_event(char *buf, int set)
{
struct ftrace_event_call *call = __start_ftrace_events;
char *event = NULL, *sub = NULL, *match;
int ret = -EINVAL;
/*
* The buf format can be <subsystem>:<event-name>
* *:<event-name> means any event by that name.
* :<event-name> is the same.
*
* <subsystem>:* means all events in that subsystem
* <subsystem>: means the same.
*
* <name> (no ':') means all events in a subsystem with
* the name <name> or any event that matches <name>
*/
match = strsep(&buf, ":");
if (buf) {
sub = match;
event = buf;
match = NULL;
if (!strlen(sub) || strcmp(sub, "*") == 0)
sub = NULL;
if (!strlen(event) || strcmp(event, "*") == 0)
event = NULL;
}
mutex_lock(&event_mutex);
for_each_event(call) {
if (!call->name || !call->regfunc)
continue;
if (match &&
strcmp(match, call->name) != 0 &&
strcmp(match, call->system) != 0)
continue;
if (sub && strcmp(sub, call->system) != 0)
continue;
if (event && strcmp(event, call->name) != 0)
continue;
ftrace_event_enable_disable(call, set);
ret = 0;
}
mutex_unlock(&event_mutex);
return ret;
}
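/*
 * Sketch of the accepted formats (compiled out; the event names are
 * hypothetical, and the buffers must be writable since strsep()
 * modifies them in place):
 */
#if 0
static void example_set_clr_event(void)
{
	char one[]    = "irq:irq_handler_entry";	/* one named event */
	char whole[]  = "irq:*";			/* a whole subsystem */
	char byname[] = "irq_handler_entry";		/* match in any subsystem */

	ftrace_set_clr_event(one, 1);		/* enable */
	ftrace_set_clr_event(whole, 1);
	ftrace_set_clr_event(byname, 0);	/* disable */
}
#endif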
/* 128 should be much more than enough */
#define EVENT_BUF_SIZE 127
static ssize_t
ftrace_event_write(struct file *file, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
size_t read = 0;
int i, set = 1;
ssize_t ret;
char *buf;
char ch;
if (!cnt)
return 0;
ret = tracing_update_buffers();
if (ret < 0)
return ret;
ret = get_user(ch, ubuf++);
if (ret)
return ret;
read++;
cnt--;
/* skip white space */
while (cnt && isspace(ch)) {
ret = get_user(ch, ubuf++);
if (ret)
return ret;
read++;
cnt--;
}
/* Only white space found? */
if (isspace(ch)) {
file->f_pos += read;
ret = read;
return ret;
}
buf = kmalloc(EVENT_BUF_SIZE+1, GFP_KERNEL);
if (!buf)
return -ENOMEM;
if (cnt > EVENT_BUF_SIZE)
cnt = EVENT_BUF_SIZE;
i = 0;
while (cnt && !isspace(ch)) {
if (!i && ch == '!')
set = 0;
else
buf[i++] = ch;
ret = get_user(ch, ubuf++);
if (ret)
goto out_free;
read++;
cnt--;
}
buf[i] = 0;
file->f_pos += read;
ret = ftrace_set_clr_event(buf, set);
if (ret)
goto out_free;
ret = read;
out_free:
kfree(buf);
return ret;
}
static void *
t_next(struct seq_file *m, void *v, loff_t *pos)
{
struct ftrace_event_call *call = m->private;
struct ftrace_event_call *next = call;
(*pos)++;
for (;;) {
if ((unsigned long)call >= (unsigned long)__stop_ftrace_events)
return NULL;
/*
* The ftrace subsystem is for showing formats only.
* They can not be enabled or disabled via the event files.
*/
if (call->regfunc)
break;
call++;
next = call;
}
m->private = ++next;
return call;
}
static void *t_start(struct seq_file *m, loff_t *pos)
{
return t_next(m, NULL, pos);
}
static void *
s_next(struct seq_file *m, void *v, loff_t *pos)
{
struct ftrace_event_call *call = m->private;
struct ftrace_event_call *next;
(*pos)++;
retry:
if ((unsigned long)call >= (unsigned long)__stop_ftrace_events)
return NULL;
if (!call->enabled) {
call++;
goto retry;
}
next = call;
m->private = ++next;
return call;
}
static void *s_start(struct seq_file *m, loff_t *pos)
{
return s_next(m, NULL, pos);
}
static int t_show(struct seq_file *m, void *v)
{
struct ftrace_event_call *call = v;
if (strcmp(call->system, TRACE_SYSTEM) != 0)
seq_printf(m, "%s:", call->system);
seq_printf(m, "%s\n", call->name);
return 0;
}
static void t_stop(struct seq_file *m, void *p)
{
}
static int
ftrace_event_seq_open(struct inode *inode, struct file *file)
{
int ret;
const struct seq_operations *seq_ops;
if ((file->f_mode & FMODE_WRITE) &&
!(file->f_flags & O_APPEND))
ftrace_clear_events();
seq_ops = inode->i_private;
ret = seq_open(file, seq_ops);
if (!ret) {
struct seq_file *m = file->private_data;
m->private = __start_ftrace_events;
}
return ret;
}
static ssize_t
event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
loff_t *ppos)
{
struct ftrace_event_call *call = filp->private_data;
char *buf;
if (call->enabled)
buf = "1\n";
else
buf = "0\n";
return simple_read_from_buffer(ubuf, cnt, ppos, buf, 2);
}
static ssize_t
event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
loff_t *ppos)
{
struct ftrace_event_call *call = filp->private_data;
char buf[64];
unsigned long val;
int ret;
if (cnt >= sizeof(buf))
return -EINVAL;
if (copy_from_user(&buf, ubuf, cnt))
return -EFAULT;
buf[cnt] = 0;
ret = strict_strtoul(buf, 10, &val);
if (ret < 0)
return ret;
ret = tracing_update_buffers();
if (ret < 0)
return ret;
switch (val) {
case 0:
case 1:
mutex_lock(&event_mutex);
ftrace_event_enable_disable(call, val);
mutex_unlock(&event_mutex);
break;
default:
return -EINVAL;
}
*ppos += cnt;
return cnt;
}
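/*
 * Usage note (assuming the usual debugfs mount point): the two handlers
 * above back the per-event 'enable' file, e.g.
 *
 *	echo 1 > /sys/kernel/debug/tracing/events/<system>/<event>/enable
 *	cat /sys/kernel/debug/tracing/events/<system>/<event>/enable
 */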
#undef FIELD
#define FIELD(type, name) \
#type, "common_" #name, offsetof(typeof(field), name), \
sizeof(field.name)
static int trace_write_header(struct trace_seq *s)
{
struct trace_entry field;
/* struct trace_entry */
return trace_seq_printf(s,
"\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
"\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
"\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
"\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
"\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
"\n",
FIELD(unsigned char, type),
FIELD(unsigned char, flags),
FIELD(unsigned char, preempt_count),
FIELD(int, pid),
FIELD(int, tgid));
}
static ssize_t
event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
loff_t *ppos)
{
struct ftrace_event_call *call = filp->private_data;
struct trace_seq *s;
char *buf;
int r;
if (*ppos)
return 0;
s = kmalloc(sizeof(*s), GFP_KERNEL);
if (!s)
return -ENOMEM;
trace_seq_init(s);
/* If any of the first writes fail, so will the show_format. */
trace_seq_printf(s, "name: %s\n", call->name);
trace_seq_printf(s, "ID: %d\n", call->id);
trace_seq_printf(s, "format:\n");
trace_write_header(s);
r = call->show_format(s);
if (!r) {
/*
* ug! The format output is bigger than a PAGE!!
*/
buf = "FORMAT TOO BIG\n";
r = simple_read_from_buffer(ubuf, cnt, ppos,
buf, strlen(buf));
goto out;
}
r = simple_read_from_buffer(ubuf, cnt, ppos,
s->buffer, s->len);
out:
kfree(s);
return r;
}
static ssize_t
event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
{
struct ftrace_event_call *call = filp->private_data;
struct trace_seq *s;
int r;
if (*ppos)
return 0;
s = kmalloc(sizeof(*s), GFP_KERNEL);
if (!s)
return -ENOMEM;
trace_seq_init(s);
trace_seq_printf(s, "%d\n", call->id);
r = simple_read_from_buffer(ubuf, cnt, ppos,
s->buffer, s->len);
kfree(s);
return r;
}
static ssize_t
event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
loff_t *ppos)
{
struct ftrace_event_call *call = filp->private_data;
struct trace_seq *s;
int r;
if (*ppos)
return 0;
s = kmalloc(sizeof(*s), GFP_KERNEL);
if (!s)
return -ENOMEM;
trace_seq_init(s);
filter_print_preds(call->preds, s);
r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
kfree(s);
return r;
}
static ssize_t
event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
loff_t *ppos)
{
struct ftrace_event_call *call = filp->private_data;
char buf[64], *pbuf = buf;
struct filter_pred *pred;
int err;
if (cnt >= sizeof(buf))
return -EINVAL;
if (copy_from_user(&buf, ubuf, cnt))
return -EFAULT;
pred = kzalloc(sizeof(*pred), GFP_KERNEL);
if (!pred)
return -ENOMEM;
err = filter_parse(&pbuf, pred);
if (err < 0) {
filter_free_pred(pred);
return err;
}
if (pred->clear) {
filter_free_preds(call);
filter_free_pred(pred);
return cnt;
}
if (filter_add_pred(call, pred)) {
filter_free_pred(pred);
return -EINVAL;
}
*ppos += cnt;
return cnt;
}
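/*
 * Usage note (field names are hypothetical; assuming the usual debugfs
 * mount point): the read/write pair above backs the per-event 'filter'
 * file. A predicate is "<field> ==|!= <value>", and a write that
 * filter_parse() flags with pred->clear drops all predicates:
 *
 *	echo 'pid == 1' > .../events/<system>/<event>/filter
 *	echo 0 > .../events/<system>/<event>/filter
 */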
static ssize_t
subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
loff_t *ppos)
{
struct event_subsystem *system = filp->private_data;
struct trace_seq *s;
int r;
if (*ppos)
return 0;
s = kmalloc(sizeof(*s), GFP_KERNEL);
if (!s)
return -ENOMEM;
trace_seq_init(s);
filter_print_preds(system->preds, s);
r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
kfree(s);
return r;
}
static ssize_t
subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
loff_t *ppos)
{
struct event_subsystem *system = filp->private_data;
char buf[64], *pbuf = buf;
struct filter_pred *pred;
int err;
if (cnt >= sizeof(buf))
return -EINVAL;
if (copy_from_user(&buf, ubuf, cnt))
return -EFAULT;
pred = kzalloc(sizeof(*pred), GFP_KERNEL);
if (!pred)
return -ENOMEM;
err = filter_parse(&pbuf, pred);
if (err < 0) {
filter_free_pred(pred);
return err;
}
if (pred->clear) {
filter_free_subsystem_preds(system);
filter_free_pred(pred);
return cnt;
}
if (filter_add_subsystem_pred(system, pred)) {
filter_free_subsystem_preds(system);
filter_free_pred(pred);
return -EINVAL;
}
*ppos += cnt;
return cnt;
}
static const struct seq_operations show_event_seq_ops = {
.start = t_start,
.next = t_next,
.show = t_show,
.stop = t_stop,
};
static const struct seq_operations show_set_event_seq_ops = {
.start = s_start,
.next = s_next,
.show = t_show,
.stop = t_stop,
};
static const struct file_operations ftrace_avail_fops = {
.open = ftrace_event_seq_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release,
};
static const struct file_operations ftrace_set_event_fops = {
.open = ftrace_event_seq_open,
.read = seq_read,
.write = ftrace_event_write,
.llseek = seq_lseek,
.release = seq_release,
};
static const struct file_operations ftrace_enable_fops = {
.open = tracing_open_generic,
.read = event_enable_read,
.write = event_enable_write,
};
static const struct file_operations ftrace_event_format_fops = {
.open = tracing_open_generic,
.read = event_format_read,
};
static const struct file_operations ftrace_event_id_fops = {
.open = tracing_open_generic,
.read = event_id_read,
};
static const struct file_operations ftrace_event_filter_fops = {
.open = tracing_open_generic,
.read = event_filter_read,
.write = event_filter_write,
};
static const struct file_operations ftrace_subsystem_filter_fops = {
.open = tracing_open_generic,
.read = subsystem_filter_read,
.write = subsystem_filter_write,
};
static struct dentry *event_trace_events_dir(void)
{
static struct dentry *d_tracer;
static struct dentry *d_events;
if (d_events)
return d_events;
d_tracer = tracing_init_dentry();
if (!d_tracer)
return NULL;
d_events = debugfs_create_dir("events", d_tracer);
if (!d_events)
pr_warning("Could not create debugfs "
"'events' directory\n");
return d_events;
}
static LIST_HEAD(event_subsystems);
static struct dentry *
event_subsystem_dir(const char *name, struct dentry *d_events)
{
struct event_subsystem *system;
/* First see if we did not already create this dir */
list_for_each_entry(system, &event_subsystems, list) {
if (strcmp(system->name, name) == 0)
return system->entry;
}
/* need to create new entry */
system = kmalloc(sizeof(*system), GFP_KERNEL);
if (!system) {
pr_warning("No memory to create event subsystem %s\n",
name);
return d_events;
}
system->entry = debugfs_create_dir(name, d_events);
if (!system->entry) {
pr_warning("Could not create event subsystem %s\n",
name);
kfree(system);
return d_events;
}
system->name = name;
list_add(&system->list, &event_subsystems);
system->preds = NULL;
return system->entry;
}
static int
event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
{
struct dentry *entry;
int ret;
/*
* If the trace point header did not define TRACE_SYSTEM
* then the system would be called "TRACE_SYSTEM".
*/
if (strcmp(call->system, "TRACE_SYSTEM") != 0)
d_events = event_subsystem_dir(call->system, d_events);
if (call->raw_init) {
ret = call->raw_init();
if (ret < 0) {
pr_warning("Could not initialize trace point"
" events/%s\n", call->name);
return ret;
}
}
call->dir = debugfs_create_dir(call->name, d_events);
if (!call->dir) {
pr_warning("Could not create debugfs "
"'%s' directory\n", call->name);
return -1;
}
if (call->regfunc) {
entry = debugfs_create_file("enable", 0644, call->dir, call,
&ftrace_enable_fops);
if (!entry)
pr_warning("Could not create debugfs "
"'%s/enable' entry\n", call->name);
}
if (call->id) {
entry = debugfs_create_file("id", 0444, call->dir, call,
&ftrace_event_id_fops);
if (!entry)
pr_warning("Could not create debugfs '%s/id' entry\n",
call->name);
}
if (call->define_fields) {
ret = call->define_fields();
if (ret < 0) {
pr_warning("Could not initialize trace point"
" events/%s\n", call->name);
return ret;
}
entry = debugfs_create_file("filter", 0644, call->dir, call,
&ftrace_event_filter_fops);
if (!entry)
pr_warning("Could not create debugfs "
"'%s/filter' entry\n", call->name);
}
/* A trace may not want to export its format */
if (!call->show_format)
return 0;
entry = debugfs_create_file("format", 0444, call->dir, call,
&ftrace_event_format_fops);
if (!entry)
pr_warning("Could not create debugfs "
"'%s/format' entry\n", call->name);
return 0;
}
static __init int event_trace_init(void)
{
struct ftrace_event_call *call = __start_ftrace_events;
struct dentry *d_tracer;
struct dentry *entry;
struct dentry *d_events;
d_tracer = tracing_init_dentry();
if (!d_tracer)
return 0;
entry = debugfs_create_file("available_events", 0444, d_tracer,
(void *)&show_event_seq_ops,
&ftrace_avail_fops);
if (!entry)
pr_warning("Could not create debugfs "
"'available_events' entry\n");
entry = debugfs_create_file("set_event", 0644, d_tracer,
(void *)&show_set_event_seq_ops,
&ftrace_set_event_fops);
if (!entry)
pr_warning("Could not create debugfs "
"'set_event' entry\n");
d_events = event_trace_events_dir();
if (!d_events)
return 0;
for_each_event(call) {
/* The linker may leave blanks */
if (!call->name)
continue;
event_create_dir(call, d_events);
}
return 0;
}
fs_initcall(event_trace_init);


@ -0,0 +1,427 @@
/*
* trace_events_filter - generic event filtering
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
* Copyright (C) 2009 Tom Zanussi <tzanussi@gmail.com>
*/
#include <linux/debugfs.h>
#include <linux/uaccess.h>
#include <linux/module.h>
#include <linux/ctype.h>
#include "trace.h"
#include "trace_output.h"
static int filter_pred_64(struct filter_pred *pred, void *event)
{
u64 *addr = (u64 *)(event + pred->offset);
u64 val = (u64)pred->val;
int match;
match = (val == *addr) ^ pred->not;
return match;
}
static int filter_pred_32(struct filter_pred *pred, void *event)
{
u32 *addr = (u32 *)(event + pred->offset);
u32 val = (u32)pred->val;
int match;
match = (val == *addr) ^ pred->not;
return match;
}
static int filter_pred_16(struct filter_pred *pred, void *event)
{
u16 *addr = (u16 *)(event + pred->offset);
u16 val = (u16)pred->val;
int match;
match = (val == *addr) ^ pred->not;
return match;
}
static int filter_pred_8(struct filter_pred *pred, void *event)
{
u8 *addr = (u8 *)(event + pred->offset);
u8 val = (u8)pred->val;
int match;
match = (val == *addr) ^ pred->not;
return match;
}
static int filter_pred_string(struct filter_pred *pred, void *event)
{
char *addr = (char *)(event + pred->offset);
int cmp, match;
cmp = strncmp(addr, pred->str_val, pred->str_len);
match = (!cmp) ^ pred->not;
return match;
}
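
Each fixed-size predicate above uses the same branch-free idiom: XOR-ing the equality result with the not flag turns an "==" test into a "!=" test. In isolation:

#include <stdio.h>

int main(void)
{
	int eq = (3 == 3);	/* comparison result: 1 */
	int not;

	for (not = 0; not <= 1; not++)
		printf("not=%d -> match=%d\n", not, eq ^ not);
	/* not=0 (operator ==) -> match=1; not=1 (operator !=) -> match=0 */
	return 0;
}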
/* return 1 if event matches, 0 otherwise (discard) */
int filter_match_preds(struct ftrace_event_call *call, void *rec)
{
int i, matched, and_failed = 0;
struct filter_pred *pred;
for (i = 0; i < MAX_FILTER_PRED; i++) {
if (call->preds[i]) {
pred = call->preds[i];
if (and_failed && !pred->or)
continue;
matched = pred->fn(pred, rec);
if (!matched && !pred->or) {
and_failed = 1;
continue;
} else if (matched && pred->or)
return 1;
} else
break;
}
if (and_failed)
return 0;
return 1;
}
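
The walk short-circuits: once an AND-joined term fails, the rest of that chain is skipped until an OR-joined term appears, and a matching OR term accepts immediately. A minimal userspace sketch of the same evaluation over an int event (demo_pred and demo_match are invented stand-ins, not kernel API):

#include <stdio.h>

struct demo_pred {
	int val;	/* value to compare against */
	int or;		/* 1: OR-joined with what precedes it */
	int not;	/* 1: "!=" instead of "==" */
};

static int demo_match(const struct demo_pred *preds, int n, int event)
{
	int i, matched, and_failed = 0;

	for (i = 0; i < n; i++) {
		if (and_failed && !preds[i].or)
			continue;	/* skip the rest of a failed AND chain */
		matched = (event == preds[i].val) ^ preds[i].not;
		if (!matched && !preds[i].or)
			and_failed = 1;
		else if (matched && preds[i].or)
			return 1;	/* an OR term matched: accept now */
	}
	return and_failed ? 0 : 1;
}

int main(void)
{
	/* (x == 1 && x != 2) || x == 7 */
	const struct demo_pred p[] = { {1, 0, 0}, {2, 0, 1}, {7, 1, 0} };

	printf("%d %d %d\n", demo_match(p, 3, 1),
	       demo_match(p, 3, 2), demo_match(p, 3, 7));
	/* prints: 1 0 1 */
	return 0;
}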
void filter_print_preds(struct filter_pred **preds, struct trace_seq *s)
{
char *field_name;
struct filter_pred *pred;
int i;
if (!preds) {
trace_seq_printf(s, "none\n");
return;
}
for (i = 0; i < MAX_FILTER_PRED; i++) {
if (preds[i]) {
pred = preds[i];
field_name = pred->field_name;
if (i)
trace_seq_printf(s, pred->or ? "|| " : "&& ");
trace_seq_printf(s, "%s ", field_name);
trace_seq_printf(s, pred->not ? "!= " : "== ");
if (pred->str_val)
trace_seq_printf(s, "%s\n", pred->str_val);
else
trace_seq_printf(s, "%llu\n", pred->val);
} else
break;
}
}
static struct ftrace_event_field *
find_event_field(struct ftrace_event_call *call, char *name)
{
struct ftrace_event_field *field;
list_for_each_entry(field, &call->fields, link) {
if (!strcmp(field->name, name))
return field;
}
return NULL;
}
void filter_free_pred(struct filter_pred *pred)
{
if (!pred)
return;
kfree(pred->field_name);
kfree(pred->str_val);
kfree(pred);
}
void filter_free_preds(struct ftrace_event_call *call)
{
int i;
if (call->preds) {
for (i = 0; i < MAX_FILTER_PRED; i++)
filter_free_pred(call->preds[i]);
kfree(call->preds);
call->preds = NULL;
}
}
void filter_free_subsystem_preds(struct event_subsystem *system)
{
struct ftrace_event_call *call = __start_ftrace_events;
int i;
if (system->preds) {
for (i = 0; i < MAX_FILTER_PRED; i++)
filter_free_pred(system->preds[i]);
kfree(system->preds);
system->preds = NULL;
}
events_for_each(call) {
if (!call->name || !call->regfunc)
continue;
if (!strcmp(call->system, system->name))
filter_free_preds(call);
}
}
static int __filter_add_pred(struct ftrace_event_call *call,
struct filter_pred *pred)
{
int i;
if (call->preds && !pred->compound)
filter_free_preds(call);
if (!call->preds) {
call->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred),
GFP_KERNEL);
if (!call->preds)
return -ENOMEM;
}
for (i = 0; i < MAX_FILTER_PRED; i++) {
if (!call->preds[i]) {
call->preds[i] = pred;
return 0;
}
}
return -ENOMEM;
}
static int is_string_field(const char *type)
{
if (strchr(type, '[') && strstr(type, "char"))
return 1;
return 0;
}
int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred)
{
struct ftrace_event_field *field;
field = find_event_field(call, pred->field_name);
if (!field)
return -EINVAL;
pred->offset = field->offset;
if (is_string_field(field->type)) {
if (!pred->str_val)
return -EINVAL;
pred->fn = filter_pred_string;
pred->str_len = field->size;
return __filter_add_pred(call, pred);
} else {
if (pred->str_val)
return -EINVAL;
}
switch (field->size) {
case 8:
pred->fn = filter_pred_64;
break;
case 4:
pred->fn = filter_pred_32;
break;
case 2:
pred->fn = filter_pred_16;
break;
case 1:
pred->fn = filter_pred_8;
break;
default:
return -EINVAL;
}
return __filter_add_pred(call, pred);
}
static struct filter_pred *copy_pred(struct filter_pred *pred)
{
struct filter_pred *new_pred = kmalloc(sizeof(*pred), GFP_KERNEL);
if (!new_pred)
return NULL;
memcpy(new_pred, pred, sizeof(*pred));
if (pred->field_name) {
new_pred->field_name = kstrdup(pred->field_name, GFP_KERNEL);
if (!new_pred->field_name) {
kfree(new_pred);
return NULL;
}
}
if (pred->str_val) {
new_pred->str_val = kstrdup(pred->str_val, GFP_KERNEL);
if (!new_pred->str_val) {
filter_free_pred(new_pred);
return NULL;
}
}
return new_pred;
}
int filter_add_subsystem_pred(struct event_subsystem *system,
struct filter_pred *pred)
{
struct ftrace_event_call *call = __start_ftrace_events;
struct filter_pred *event_pred;
int i;
if (system->preds && !pred->compound)
filter_free_subsystem_preds(system);
if (!system->preds) {
system->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred),
GFP_KERNEL);
if (!system->preds)
return -ENOMEM;
}
for (i = 0; i < MAX_FILTER_PRED; i++) {
if (!system->preds[i]) {
system->preds[i] = pred;
break;
}
}
if (i == MAX_FILTER_PRED)
return -EINVAL;
events_for_each(call) {
int err;
if (!call->name || !call->regfunc)
continue;
if (strcmp(call->system, system->name))
continue;
if (!find_event_field(call, pred->field_name))
continue;
event_pred = copy_pred(pred);
if (!event_pred)
goto oom;
err = filter_add_pred(call, event_pred);
if (err)
filter_free_pred(event_pred);
if (err == -ENOMEM)
goto oom;
}
return 0;
oom:
system->preds[i] = NULL;
return -ENOMEM;
}
int filter_parse(char **pbuf, struct filter_pred *pred)
{
char *tmp, *tok, *val_str = NULL;
int tok_n = 0;
/* One term per call: "0" clears the filter, otherwise
 * [&& or ||] <field> ==|!= <value> */
while ((tok = strsep(pbuf, " \n"))) {
if (tok_n == 0) {
if (!strcmp(tok, "0")) {
pred->clear = 1;
return 0;
} else if (!strcmp(tok, "&&")) {
pred->or = 0;
pred->compound = 1;
} else if (!strcmp(tok, "||")) {
pred->or = 1;
pred->compound = 1;
} else
pred->field_name = tok;
tok_n = 1;
continue;
}
if (tok_n == 1) {
if (!pred->field_name)
pred->field_name = tok;
else if (!strcmp(tok, "!="))
pred->not = 1;
else if (!strcmp(tok, "=="))
pred->not = 0;
else {
pred->field_name = NULL;
return -EINVAL;
}
tok_n = 2;
continue;
}
if (tok_n == 2) {
if (pred->compound) {
if (!strcmp(tok, "!="))
pred->not = 1;
else if (!strcmp(tok, "=="))
pred->not = 0;
else {
pred->field_name = NULL;
return -EINVAL;
}
} else {
val_str = tok;
break; /* done */
}
tok_n = 3;
continue;
}
if (tok_n == 3) {
val_str = tok;
break; /* done */
}
}
if (!val_str) {
/* incomplete input, e.g. a missing value */
pred->field_name = NULL;
return -EINVAL;
}
pred->field_name = kstrdup(pred->field_name, GFP_KERNEL);
if (!pred->field_name)
return -ENOMEM;
pred->val = simple_strtoull(val_str, &tmp, 10);
if (tmp == val_str) {
pred->str_val = kstrdup(val_str, GFP_KERNEL);
if (!pred->str_val)
return -ENOMEM;
}
return 0;
}
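
To make the grammar concrete, here is the same strsep()-driven tokenization as a userspace sketch (the field name is an example such as a scheduler event might have; filter_parse() itself steps through the same tokens via its tok_n state):

#define _DEFAULT_SOURCE
#include <stdio.h>
#include <string.h>

int main(void)
{
	/* What a compound term written to the "filter" file looks like. */
	char buf[] = "&& next_comm == bash";
	char *p = buf, *tok;
	int n = 0;

	while ((tok = strsep(&p, " \n")))
		if (*tok)
			printf("token %d: '%s'\n", n++, tok);
	/* token 0: '&&'        -> compound, AND-joined (pred->or = 0)
	 * token 1: 'next_comm' -> pred->field_name
	 * token 2: '=='        -> pred->not = 0
	 * token 3: 'bash'      -> non-numeric, kept as pred->str_val */
	return 0;
}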

View file

@ -0,0 +1,39 @@
/*
* Stage 1 of the trace events.
*
* Override the macros in <trace/trace_event_types.h> to include the following:
*
* struct ftrace_raw_<call> {
* struct trace_entry ent;
* <type> <item>;
* <type2> <item2>[<len>];
* [...]
* };
*
* The <type> <item> is created by the __field(type, item) macro or
* the __array(type2, item2, len) macro.
* We simply do "type item;", and that will create the fields
* in the structure.
*/
#undef TRACE_FORMAT
#define TRACE_FORMAT(call, proto, args, fmt)
#undef __array
#define __array(type, item, len) type item[len];
#undef __field
#define __field(type, item) type item;
#undef TP_STRUCT__entry
#define TP_STRUCT__entry(args...) args
#undef TRACE_EVENT
#define TRACE_EVENT(name, proto, args, tstruct, assign, print) \
struct ftrace_raw_##name { \
struct trace_entry ent; \
tstruct \
}; \
static struct ftrace_event_call event_##name
#include <trace/trace_event_types.h>
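
As a concrete illustration, a hypothetical event defined as

TRACE_EVENT(foo, ...,
	TP_STRUCT__entry(
		__field(int, bar)
		__array(char, name, 16)
	), ...)

expands at this stage to roughly:

struct ftrace_raw_foo {
	struct trace_entry	ent;
	int			bar;
	char			name[16];
};
static struct ftrace_event_call event_foo;

(foo, bar and name are invented for the example; real events come from <trace/trace_event_types.h>.)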

View file

@ -0,0 +1,176 @@
/*
* Stage 2 of the trace events.
*
* Override the macros in <trace/trace_event_types.h> to include the following:
*
* enum print_line_t
* ftrace_raw_output_<call>(struct trace_iterator *iter, int flags)
* {
* struct trace_seq *s = &iter->seq;
* struct ftrace_raw_<call> *field; <-- defined in stage 1
* struct trace_entry *entry;
* int ret;
*
* entry = iter->ent;
*
* if (entry->type != event_<call>.id) {
* WARN_ON_ONCE(1);
* return TRACE_TYPE_UNHANDLED;
* }
*
* field = (typeof(field))entry;
*
* ret = trace_seq_printf(s, <TP_printk> "\n");
* if (!ret)
* return TRACE_TYPE_PARTIAL_LINE;
*
* return TRACE_TYPE_HANDLED;
* }
*
* This is the method used to print the raw event to the trace
* output format. Note, this is not needed if the data is read
* in binary.
*/
#undef __entry
#define __entry field
#undef TP_printk
#define TP_printk(fmt, args...) fmt "\n", args
#undef TRACE_EVENT
#define TRACE_EVENT(call, proto, args, tstruct, assign, print) \
enum print_line_t \
ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \
{ \
struct trace_seq *s = &iter->seq; \
struct ftrace_raw_##call *field; \
struct trace_entry *entry; \
int ret; \
\
entry = iter->ent; \
\
if (entry->type != event_##call.id) { \
WARN_ON_ONCE(1); \
return TRACE_TYPE_UNHANDLED; \
} \
\
field = (typeof(field))entry; \
\
ret = trace_seq_printf(s, #call ": " print); \
if (!ret) \
return TRACE_TYPE_PARTIAL_LINE; \
\
return TRACE_TYPE_HANDLED; \
}
#include <trace/trace_event_types.h>
/*
* Set up how the trace point's format is shown.
*
* int
* ftrace_format_##call(struct trace_seq *s)
* {
* struct ftrace_raw_##call field;
* int ret;
*
* ret = trace_seq_printf(s, #type " " #item ";"
* " offset:%u; size:%u;\n",
* offsetof(struct ftrace_raw_##call, item),
* sizeof(field.type));
*
* }
*/
#undef TP_STRUCT__entry
#define TP_STRUCT__entry(args...) args
#undef __field
#define __field(type, item) \
ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
"offset:%u;\tsize:%u;\n", \
(unsigned int)offsetof(typeof(field), item), \
(unsigned int)sizeof(field.item)); \
if (!ret) \
return 0;
#undef __array
#define __array(type, item, len) \
ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \
"offset:%u;\tsize:%u;\n", \
(unsigned int)offsetof(typeof(field), item), \
(unsigned int)sizeof(field.item)); \
if (!ret) \
return 0;
#undef __entry
#define __entry "REC"
#undef TP_printk
#define TP_printk(fmt, args...) "%s, %s\n", #fmt, #args
#undef TP_fast_assign
#define TP_fast_assign(args...) args
#undef TRACE_EVENT
#define TRACE_EVENT(call, proto, args, tstruct, func, print) \
static int \
ftrace_format_##call(struct trace_seq *s) \
{ \
struct ftrace_raw_##call field; \
int ret; \
\
tstruct; \
\
trace_seq_printf(s, "\nprint fmt: " print); \
\
return ret; \
}
#include <trace/trace_event_types.h>
#undef __field
#define __field(type, item) \
ret = trace_define_field(event_call, #type, #item, \
offsetof(typeof(field), item), \
sizeof(field.item)); \
if (ret) \
return ret;
#undef __array
#define __array(type, item, len) \
ret = trace_define_field(event_call, #type "[" #len "]", #item, \
offsetof(typeof(field), item), \
sizeof(field.item)); \
if (ret) \
return ret;
#define __common_field(type, item) \
ret = trace_define_field(event_call, #type, "common_" #item, \
offsetof(typeof(field.ent), item), \
sizeof(field.ent.item)); \
if (ret) \
return ret;
#undef TRACE_EVENT
#define TRACE_EVENT(call, proto, args, tstruct, func, print) \
int \
ftrace_define_fields_##call(void) \
{ \
struct ftrace_raw_##call field; \
struct ftrace_event_call *event_call = &event_##call; \
int ret; \
\
__common_field(unsigned char, type); \
__common_field(unsigned char, flags); \
__common_field(unsigned char, preempt_count); \
__common_field(int, pid); \
__common_field(int, tgid); \
\
tstruct; \
\
return ret; \
}
#include <trace/trace_event_types.h>
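
Stage 2's format macros are what produce the per-event debugfs "format" file. For the hypothetical foo event from stage 1, the generated ftrace_format_foo() would emit lines along these lines (offsets depend on struct trace_entry's layout, so the numbers are illustrative):

	field:int bar;	offset:12;	size:4;
	field:char name[16];	offset:16;	size:16;

	print fmt: "bar %d", REC->bar

The last line assumes the event's TP_printk was TP_printk("bar %d", __entry->bar); with __entry redefined to "REC" above, the stringified format and arguments come out exactly like that.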

View file

@ -0,0 +1,281 @@
/*
* Stage 3 of the trace events.
*
* Override the macros in <trace/trace_event_types.h> to include the following:
*
* static void ftrace_event_<call>(proto)
* {
* event_trace_printk(_RET_IP_, "<call>: " <fmt>);
* }
*
* static int ftrace_reg_event_<call>(void)
* {
* int ret;
*
* ret = register_trace_<call>(ftrace_event_<call>);
* if (ret)
* pr_info("event trace: Could not activate trace point "
* "probe to <call>");
* return ret;
* }
*
* static void ftrace_unreg_event_<call>(void)
* {
* unregister_trace_<call>(ftrace_event_<call>);
* }
*
* For those macros defined with TRACE_FORMAT:
*
* static struct ftrace_event_call __used
* __attribute__((__aligned__(4)))
* __attribute__((section("_ftrace_events"))) event_<call> = {
* .name = "<call>",
* .regfunc = ftrace_reg_event_<call>,
* .unregfunc = ftrace_unreg_event_<call>,
* }
*
*
* For those macros defined with TRACE_EVENT:
*
* static struct ftrace_event_call event_<call>;
*
* static void ftrace_raw_event_<call>(proto)
* {
* struct ring_buffer_event *event;
* struct ftrace_raw_<call> *entry; <-- defined in stage 1
* unsigned long irq_flags;
* int pc;
*
* local_save_flags(irq_flags);
* pc = preempt_count();
*
* event = trace_current_buffer_lock_reserve(event_<call>.id,
* sizeof(struct ftrace_raw_<call>),
* irq_flags, pc);
* if (!event)
* return;
* entry = ring_buffer_event_data(event);
*
* <assign>; <-- Here we assign the entries by the __field and
* __array macros.
*
* trace_current_buffer_unlock_commit(event, irq_flags, pc);
* }
*
* static int ftrace_raw_reg_event_<call>(void)
* {
* int ret;
*
* ret = register_trace_<call>(ftrace_raw_event_<call>);
* if (ret)
* pr_info("event trace: Could not activate trace point "
* "probe to <call>");
* return ret;
* }
*
* static void ftrace_unreg_event_<call>(void)
* {
* unregister_trace_<call>(ftrace_raw_event_<call>);
* }
*
* static struct trace_event ftrace_event_type_<call> = {
* .trace = ftrace_raw_output_<call>, <-- stage 2
* };
*
* static int ftrace_raw_init_event_<call>(void)
* {
* int id;
*
* id = register_ftrace_event(&ftrace_event_type_<call>);
* if (!id)
* return -ENODEV;
* event_<call>.id = id;
* return 0;
* }
*
* static struct ftrace_event_call __used
* __attribute__((__aligned__(4)))
* __attribute__((section("_ftrace_events"))) event_<call> = {
* .name = "<call>",
* .system = "<system>",
* .raw_init = ftrace_raw_init_event_<call>,
* .regfunc = ftrace_reg_event_<call>,
* .unregfunc = ftrace_unreg_event_<call>,
* .show_format = ftrace_format_<call>,
* }
*
*/
#undef TP_FMT
#define TP_FMT(fmt, args...) fmt "\n", ##args
#ifdef CONFIG_EVENT_PROFILE
#define _TRACE_PROFILE(call, proto, args) \
static void ftrace_profile_##call(proto) \
{ \
extern void perf_tpcounter_event(int); \
perf_tpcounter_event(event_##call.id); \
} \
\
static int ftrace_profile_enable_##call(struct ftrace_event_call *call) \
{ \
int ret = 0; \
\
if (!atomic_inc_return(&call->profile_count)) \
ret = register_trace_##call(ftrace_profile_##call); \
\
return ret; \
} \
\
static void ftrace_profile_disable_##call(struct ftrace_event_call *call) \
{ \
if (atomic_add_negative(-1, &call->profile_count)) \
unregister_trace_##call(ftrace_profile_##call); \
}
#define _TRACE_PROFILE_INIT(call) \
.profile_count = ATOMIC_INIT(-1), \
.profile_enable = ftrace_profile_enable_##call, \
.profile_disable = ftrace_profile_disable_##call,
#else
#define _TRACE_PROFILE(call, proto, args)
#define _TRACE_PROFILE_INIT(call)
#endif
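
The counter starts at -1, so atomic_inc_return() hands 0 to exactly one caller (the first enabler, which registers the probe) and atomic_add_negative(-1, ...) fires for exactly one caller (the last disabler, which unregisters it). The same 0-crossing pattern restated with C11 atomics in a userspace sketch:

#include <stdatomic.h>
#include <stdio.h>

static atomic_int profile_count = -1;	/* like .profile_count above */

static void enable(void)
{
	/* atomic_fetch_add returns the old value; old + 1 == 0 means
	 * we performed the -1 -> 0 transition: first enabler. */
	if (atomic_fetch_add(&profile_count, 1) + 1 == 0)
		printf("register probe (first enabler)\n");
}

static void disable(void)
{
	/* old - 1 < 0 means we performed the 0 -> -1 transition. */
	if (atomic_fetch_add(&profile_count, -1) - 1 < 0)
		printf("unregister probe (last disabler)\n");
}

int main(void)
{
	enable();
	enable();
	disable();
	disable();
	return 0;
}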
#define _TRACE_FORMAT(call, proto, args, fmt) \
static void ftrace_event_##call(proto) \
{ \
event_trace_printk(_RET_IP_, #call ": " fmt); \
} \
\
static int ftrace_reg_event_##call(void) \
{ \
int ret; \
\
ret = register_trace_##call(ftrace_event_##call); \
if (ret) \
pr_info("event trace: Could not activate trace point " \
"probe to " #call "\n"); \
return ret; \
} \
\
static void ftrace_unreg_event_##call(void) \
{ \
unregister_trace_##call(ftrace_event_##call); \
} \
\
static struct ftrace_event_call event_##call; \
\
static int ftrace_init_event_##call(void) \
{ \
int id; \
\
id = register_ftrace_event(NULL); \
if (!id) \
return -ENODEV; \
event_##call.id = id; \
return 0; \
}
#undef TRACE_FORMAT
#define TRACE_FORMAT(call, proto, args, fmt) \
_TRACE_FORMAT(call, PARAMS(proto), PARAMS(args), PARAMS(fmt)) \
_TRACE_PROFILE(call, PARAMS(proto), PARAMS(args)) \
static struct ftrace_event_call __used \
__attribute__((__aligned__(4))) \
__attribute__((section("_ftrace_events"))) event_##call = { \
.name = #call, \
.system = __stringify(TRACE_SYSTEM), \
.raw_init = ftrace_init_event_##call, \
.regfunc = ftrace_reg_event_##call, \
.unregfunc = ftrace_unreg_event_##call, \
_TRACE_PROFILE_INIT(call) \
}
#undef __entry
#define __entry entry
#undef TRACE_EVENT
#define TRACE_EVENT(call, proto, args, tstruct, assign, print) \
_TRACE_PROFILE(call, PARAMS(proto), PARAMS(args)) \
\
static struct ftrace_event_call event_##call; \
\
static void ftrace_raw_event_##call(proto) \
{ \
struct ftrace_event_call *call = &event_##call; \
struct ring_buffer_event *event; \
struct ftrace_raw_##call *entry; \
unsigned long irq_flags; \
int pc; \
\
local_save_flags(irq_flags); \
pc = preempt_count(); \
\
event = trace_current_buffer_lock_reserve(event_##call.id, \
sizeof(struct ftrace_raw_##call), \
irq_flags, pc); \
if (!event) \
return; \
entry = ring_buffer_event_data(event); \
\
assign; \
\
if (call->preds && !filter_match_preds(call, entry)) \
ring_buffer_event_discard(event); \
\
trace_nowake_buffer_unlock_commit(event, irq_flags, pc); \
\
} \
\
static int ftrace_raw_reg_event_##call(void) \
{ \
int ret; \
\
ret = register_trace_##call(ftrace_raw_event_##call); \
if (ret) \
pr_info("event trace: Could not activate trace point " \
"probe to " #call "\n"); \
return ret; \
} \
\
static void ftrace_raw_unreg_event_##call(void) \
{ \
unregister_trace_##call(ftrace_raw_event_##call); \
} \
\
static struct trace_event ftrace_event_type_##call = { \
.trace = ftrace_raw_output_##call, \
}; \
\
static int ftrace_raw_init_event_##call(void) \
{ \
int id; \
\
id = register_ftrace_event(&ftrace_event_type_##call); \
if (!id) \
return -ENODEV; \
event_##call.id = id; \
INIT_LIST_HEAD(&event_##call.fields); \
return 0; \
} \
\
static struct ftrace_event_call __used \
__attribute__((__aligned__(4))) \
__attribute__((section("_ftrace_events"))) event_##call = { \
.name = #call, \
.system = __stringify(TRACE_SYSTEM), \
.raw_init = ftrace_raw_init_event_##call, \
.regfunc = ftrace_raw_reg_event_##call, \
.unregfunc = ftrace_raw_unreg_event_##call, \
.show_format = ftrace_format_##call, \
.define_fields = ftrace_define_fields_##call, \
_TRACE_PROFILE_INIT(call) \
}
#include <trace/trace_event_types.h>
#undef _TRACE_PROFILE
#undef _TRACE_PROFILE_INIT

kernel/trace/trace_export.c (new file, 102 lines)
View file

@ -0,0 +1,102 @@
/*
* trace_export.c - export basic ftrace utilities to user space
*
* Copyright (C) 2009 Steven Rostedt <srostedt@redhat.com>
*/
#include <linux/stringify.h>
#include <linux/kallsyms.h>
#include <linux/seq_file.h>
#include <linux/debugfs.h>
#include <linux/uaccess.h>
#include <linux/ftrace.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/fs.h>
#include "trace_output.h"
#undef TRACE_STRUCT
#define TRACE_STRUCT(args...) args
#undef TRACE_FIELD
#define TRACE_FIELD(type, item, assign) \
ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
"offset:%u;\tsize:%u;\n", \
(unsigned int)offsetof(typeof(field), item), \
(unsigned int)sizeof(field.item)); \
if (!ret) \
return 0;
#undef TRACE_FIELD_SPECIAL
#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \
ret = trace_seq_printf(s, "\tfield special:" #type_item ";\t" \
"offset:%u;\tsize:%u;\n", \
(unsigned int)offsetof(typeof(field), item), \
(unsigned int)sizeof(field.item)); \
if (!ret) \
return 0;
#undef TRACE_FIELD_ZERO_CHAR
#define TRACE_FIELD_ZERO_CHAR(item) \
ret = trace_seq_printf(s, "\tfield: char " #item ";\t" \
"offset:%u;\tsize:0;\n", \
(unsigned int)offsetof(typeof(field), item)); \
if (!ret) \
return 0;
#undef TP_RAW_FMT
#define TP_RAW_FMT(args...) args
#undef TRACE_EVENT_FORMAT
#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \
static int \
ftrace_format_##call(struct trace_seq *s) \
{ \
struct args field; \
int ret; \
\
tstruct; \
\
trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt); \
\
return ret; \
}
#include "trace_event_types.h"
#undef TRACE_ZERO_CHAR
#define TRACE_ZERO_CHAR(arg)
#undef TRACE_FIELD
#define TRACE_FIELD(type, item, assign)\
entry->item = assign;
#undef TRACE_FIELD
#define TRACE_FIELD(type, item, assign)\
entry->item = assign;
#undef TP_CMD
#define TP_CMD(cmd...) cmd
#undef TRACE_ENTRY
#define TRACE_ENTRY entry
#undef TRACE_FIELD_SPECIAL
#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \
cmd;
#undef TRACE_EVENT_FORMAT
#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \
\
static struct ftrace_event_call __used \
__attribute__((__aligned__(4))) \
__attribute__((section("_ftrace_events"))) event_##call = { \
.name = #call, \
.id = proto, \
.system = __stringify(TRACE_SYSTEM), \
.show_format = ftrace_format_##call, \
}
#include "trace_event_types.h"

View file

@ -9,6 +9,7 @@
* Copyright (C) 2004-2006 Ingo Molnar
* Copyright (C) 2004 William Lee Irwin III
*/
#include <linux/ring_buffer.h>
#include <linux/debugfs.h>
#include <linux/uaccess.h>
#include <linux/ftrace.h>
@ -16,31 +17,29 @@
#include "trace.h"
static void start_function_trace(struct trace_array *tr)
/* function tracing enabled */
static int ftrace_function_enabled;
static struct trace_array *func_trace;
static void tracing_start_function_trace(void);
static void tracing_stop_function_trace(void);
static int function_trace_init(struct trace_array *tr)
{
func_trace = tr;
tr->cpu = get_cpu();
tracing_reset_online_cpus(tr);
put_cpu();
tracing_start_cmdline_record();
tracing_start_function_trace();
}
static void stop_function_trace(struct trace_array *tr)
{
tracing_stop_function_trace();
tracing_stop_cmdline_record();
}
static int function_trace_init(struct trace_array *tr)
{
start_function_trace(tr);
return 0;
}
static void function_trace_reset(struct trace_array *tr)
{
stop_function_trace(tr);
tracing_stop_function_trace();
tracing_stop_cmdline_record();
}
static void function_trace_start(struct trace_array *tr)
@ -48,20 +47,358 @@ static void function_trace_start(struct trace_array *tr)
tracing_reset_online_cpus(tr);
}
static void
function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip)
{
struct trace_array *tr = func_trace;
struct trace_array_cpu *data;
unsigned long flags;
long disabled;
int cpu, resched;
int pc;
if (unlikely(!ftrace_function_enabled))
return;
pc = preempt_count();
resched = ftrace_preempt_disable();
local_save_flags(flags);
cpu = raw_smp_processor_id();
data = tr->data[cpu];
disabled = atomic_inc_return(&data->disabled);
if (likely(disabled == 1))
trace_function(tr, ip, parent_ip, flags, pc);
atomic_dec(&data->disabled);
ftrace_preempt_enable(resched);
}
static void
function_trace_call(unsigned long ip, unsigned long parent_ip)
{
struct trace_array *tr = func_trace;
struct trace_array_cpu *data;
unsigned long flags;
long disabled;
int cpu;
int pc;
if (unlikely(!ftrace_function_enabled))
return;
/*
* Need to use raw, since this must be called before the
* recursive protection is performed.
*/
local_irq_save(flags);
cpu = raw_smp_processor_id();
data = tr->data[cpu];
disabled = atomic_inc_return(&data->disabled);
if (likely(disabled == 1)) {
pc = preempt_count();
trace_function(tr, ip, parent_ip, flags, pc);
}
atomic_dec(&data->disabled);
local_irq_restore(flags);
}
static void
function_stack_trace_call(unsigned long ip, unsigned long parent_ip)
{
struct trace_array *tr = func_trace;
struct trace_array_cpu *data;
unsigned long flags;
long disabled;
int cpu;
int pc;
if (unlikely(!ftrace_function_enabled))
return;
/*
* Need to use raw, since this must be called before the
* recursive protection is performed.
*/
local_irq_save(flags);
cpu = raw_smp_processor_id();
data = tr->data[cpu];
disabled = atomic_inc_return(&data->disabled);
if (likely(disabled == 1)) {
pc = preempt_count();
trace_function(tr, ip, parent_ip, flags, pc);
/*
* skip over 5 funcs:
* __ftrace_trace_stack,
* __trace_stack,
* function_stack_trace_call
* ftrace_list_func
* ftrace_call
*/
__trace_stack(tr, flags, 5, pc);
}
atomic_dec(&data->disabled);
local_irq_restore(flags);
}
static struct ftrace_ops trace_ops __read_mostly =
{
.func = function_trace_call,
};
static struct ftrace_ops trace_stack_ops __read_mostly =
{
.func = function_stack_trace_call,
};
/* Our two options */
enum {
TRACE_FUNC_OPT_STACK = 0x1,
};
static struct tracer_opt func_opts[] = {
#ifdef CONFIG_STACKTRACE
{ TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) },
#endif
{ } /* Always set a last empty entry */
};
static struct tracer_flags func_flags = {
.val = 0, /* By default: all flags disabled */
.opts = func_opts
};
static void tracing_start_function_trace(void)
{
ftrace_function_enabled = 0;
if (trace_flags & TRACE_ITER_PREEMPTONLY)
trace_ops.func = function_trace_call_preempt_only;
else
trace_ops.func = function_trace_call;
if (func_flags.val & TRACE_FUNC_OPT_STACK)
register_ftrace_function(&trace_stack_ops);
else
register_ftrace_function(&trace_ops);
ftrace_function_enabled = 1;
}
static void tracing_stop_function_trace(void)
{
ftrace_function_enabled = 0;
/* OK if they are not registered */
unregister_ftrace_function(&trace_stack_ops);
unregister_ftrace_function(&trace_ops);
}
static int func_set_flag(u32 old_flags, u32 bit, int set)
{
if (bit == TRACE_FUNC_OPT_STACK) {
/* do nothing if already set */
if (!!set == !!(func_flags.val & TRACE_FUNC_OPT_STACK))
return 0;
if (set) {
unregister_ftrace_function(&trace_ops);
register_ftrace_function(&trace_stack_ops);
} else {
unregister_ftrace_function(&trace_stack_ops);
register_ftrace_function(&trace_ops);
}
return 0;
}
return -EINVAL;
}
static struct tracer function_trace __read_mostly =
{
.name = "function",
.init = function_trace_init,
.reset = function_trace_reset,
.start = function_trace_start,
.name = "function",
.init = function_trace_init,
.reset = function_trace_reset,
.start = function_trace_start,
.wait_pipe = poll_wait_pipe,
.flags = &func_flags,
.set_flag = func_set_flag,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_function,
.selftest = trace_selftest_startup_function,
#endif
};
static __init int init_function_trace(void)
#ifdef CONFIG_DYNAMIC_FTRACE
static void
ftrace_traceon(unsigned long ip, unsigned long parent_ip, void **data)
{
return register_tracer(&function_trace);
long *count = (long *)data;
if (tracing_is_on())
return;
if (!*count)
return;
if (*count != -1)
(*count)--;
tracing_on();
}
static void
ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data)
{
long *count = (long *)data;
if (!tracing_is_on())
return;
if (!*count)
return;
if (*count != -1)
(*count)--;
tracing_off();
}
static int
ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,
struct ftrace_probe_ops *ops, void *data);
static struct ftrace_probe_ops traceon_probe_ops = {
.func = ftrace_traceon,
.print = ftrace_trace_onoff_print,
};
static struct ftrace_probe_ops traceoff_probe_ops = {
.func = ftrace_traceoff,
.print = ftrace_trace_onoff_print,
};
static int
ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,
struct ftrace_probe_ops *ops, void *data)
{
char str[KSYM_SYMBOL_LEN];
long count = (long)data;
kallsyms_lookup(ip, NULL, NULL, NULL, str);
seq_printf(m, "%s:", str);
if (ops == &traceon_probe_ops)
seq_printf(m, "traceon");
else
seq_printf(m, "traceoff");
if (count == -1)
seq_printf(m, ":unlimited\n");
else
seq_printf(m, ":count=%ld", count);
seq_putc(m, '\n');
return 0;
}
static int
ftrace_trace_onoff_unreg(char *glob, char *cmd, char *param)
{
struct ftrace_probe_ops *ops;
/* we register both traceon and traceoff to this callback */
if (strcmp(cmd, "traceon") == 0)
ops = &traceon_probe_ops;
else
ops = &traceoff_probe_ops;
unregister_ftrace_function_probe_func(glob, ops);
return 0;
}
static int
ftrace_trace_onoff_callback(char *glob, char *cmd, char *param, int enable)
{
struct ftrace_probe_ops *ops;
void *count = (void *)-1;
char *number;
int ret;
/* hash funcs only work with set_ftrace_filter */
if (!enable)
return -EINVAL;
if (glob[0] == '!')
return ftrace_trace_onoff_unreg(glob+1, cmd, param);
/* we register both traceon and traceoff to this callback */
if (strcmp(cmd, "traceon") == 0)
ops = &traceon_probe_ops;
else
ops = &traceoff_probe_ops;
if (!param)
goto out_reg;
number = strsep(&param, ":");
if (!strlen(number))
goto out_reg;
/*
* We use the callback data field (which is a pointer)
* as our counter.
*/
ret = strict_strtoul(number, 0, (unsigned long *)&count);
if (ret)
return ret;
out_reg:
ret = register_ftrace_function_probe(glob, ops, count);
return ret;
}
static struct ftrace_func_command ftrace_traceon_cmd = {
.name = "traceon",
.func = ftrace_trace_onoff_callback,
};
static struct ftrace_func_command ftrace_traceoff_cmd = {
.name = "traceoff",
.func = ftrace_trace_onoff_callback,
};
static int __init init_func_cmd_traceon(void)
{
int ret;
ret = register_ftrace_command(&ftrace_traceoff_cmd);
if (ret)
return ret;
ret = register_ftrace_command(&ftrace_traceon_cmd);
if (ret)
unregister_ftrace_command(&ftrace_traceoff_cmd);
return ret;
}
#else
static inline int init_func_cmd_traceon(void)
{
return 0;
}
#endif /* CONFIG_DYNAMIC_FTRACE */
static __init int init_function_trace(void)
{
init_func_cmd_traceon();
return register_tracer(&function_trace);
}
device_initcall(init_function_trace);
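
For reference, the commands registered above are driven by writing "<function>:traceon[:count]" or "<function>:traceoff[:count]" to set_ftrace_filter. A userspace sketch that arms tracing to shut off after five hits of a function (the debugfs mount point and the function name are typical examples, and root is required):

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/kernel/debug/tracing/set_ftrace_filter", "w");

	if (!f)
		return 1;
	/* glob:command:count, parsed by ftrace_trace_onoff_callback() */
	fprintf(f, "schedule:traceoff:5\n");
	fclose(f);
	return 0;
}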

View file

@ -1,7 +1,7 @@
/*
*
* Function graph tracer.
* Copyright (c) 2008 Frederic Weisbecker <fweisbec@gmail.com>
* Copyright (c) 2008-2009 Frederic Weisbecker <fweisbec@gmail.com>
* Mostly borrowed from function tracer which
* is Copyright (c) Steven Rostedt <srostedt@redhat.com>
*
@ -12,6 +12,12 @@
#include <linux/fs.h>
#include "trace.h"
#include "trace_output.h"
struct fgraph_data {
pid_t last_pid;
int depth;
};
#define TRACE_GRAPH_INDENT 2
@ -20,9 +26,11 @@
#define TRACE_GRAPH_PRINT_CPU 0x2
#define TRACE_GRAPH_PRINT_OVERHEAD 0x4
#define TRACE_GRAPH_PRINT_PROC 0x8
#define TRACE_GRAPH_PRINT_DURATION 0x10
#define TRACE_GRAPH_PRINT_ABS_TIME 0X20
static struct tracer_opt trace_opts[] = {
/* Display overruns ? */
/* Display overruns? (for self-debugging purposes) */
{ TRACER_OPT(funcgraph-overrun, TRACE_GRAPH_PRINT_OVERRUN) },
/* Display CPU ? */
{ TRACER_OPT(funcgraph-cpu, TRACE_GRAPH_PRINT_CPU) },
@ -30,26 +38,103 @@ static struct tracer_opt trace_opts[] = {
{ TRACER_OPT(funcgraph-overhead, TRACE_GRAPH_PRINT_OVERHEAD) },
/* Display proc name/pid */
{ TRACER_OPT(funcgraph-proc, TRACE_GRAPH_PRINT_PROC) },
/* Display duration of execution */
{ TRACER_OPT(funcgraph-duration, TRACE_GRAPH_PRINT_DURATION) },
/* Display absolute time of an entry */
{ TRACER_OPT(funcgraph-abstime, TRACE_GRAPH_PRINT_ABS_TIME) },
{ } /* Empty entry */
};
static struct tracer_flags tracer_flags = {
/* Don't display overruns and proc by default */
.val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD,
.val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD |
TRACE_GRAPH_PRINT_DURATION,
.opts = trace_opts
};
/* pid on the last trace processed */
static pid_t last_pid[NR_CPUS] = { [0 ... NR_CPUS-1] = -1 };
/* Add a function return address to the trace stack on thread info. */
int
ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth)
{
unsigned long long calltime;
int index;
if (!current->ret_stack)
return -EBUSY;
/* The return trace stack is full */
if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) {
atomic_inc(&current->trace_overrun);
return -EBUSY;
}
calltime = trace_clock_local();
index = ++current->curr_ret_stack;
barrier();
current->ret_stack[index].ret = ret;
current->ret_stack[index].func = func;
current->ret_stack[index].calltime = calltime;
*depth = index;
return 0;
}
/* Retrieve a function return address from the trace stack on thread info. */
void
ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret)
{
int index;
index = current->curr_ret_stack;
if (unlikely(index < 0)) {
ftrace_graph_stop();
WARN_ON(1);
/* Might as well panic, otherwise we have nowhere to go */
*ret = (unsigned long)panic;
return;
}
*ret = current->ret_stack[index].ret;
trace->func = current->ret_stack[index].func;
trace->calltime = current->ret_stack[index].calltime;
trace->overrun = atomic_read(&current->trace_overrun);
trace->depth = index;
barrier();
current->curr_ret_stack--;
}
/*
* Send the trace to the ring-buffer.
* @return the original return address.
*/
unsigned long ftrace_return_to_handler(void)
{
struct ftrace_graph_ret trace;
unsigned long ret;
ftrace_pop_return_trace(&trace, &ret);
trace.rettime = trace_clock_local();
ftrace_graph_return(&trace);
if (unlikely(!ret)) {
ftrace_graph_stop();
WARN_ON(1);
/* Might as well panic. What else to do? */
ret = (unsigned long)panic;
}
return ret;
}
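
Together, push and pop maintain a bounded per-task shadow stack of return addresses, counting an overrun instead of overflowing when calls nest too deep. A userspace model of just that stack discipline (fixed depth and all names invented):

#include <stdio.h>

#define DEPTH 4	/* stands in for FTRACE_RETFUNC_DEPTH */

struct ret_entry { unsigned long ret; unsigned long func; };

static struct ret_entry ret_stack[DEPTH];
static int curr = -1;	/* like current->curr_ret_stack */
static int overrun;	/* like current->trace_overrun */

static int push_return(unsigned long ret, unsigned long func)
{
	if (curr == DEPTH - 1) {
		overrun++;	/* stack full: record it and refuse */
		return -1;
	}
	curr++;
	ret_stack[curr].ret = ret;
	ret_stack[curr].func = func;
	return 0;
}

static unsigned long pop_return(void)
{
	unsigned long ret = ret_stack[curr].ret;

	curr--;
	return ret;
}

int main(void)
{
	unsigned long inner, outer;

	push_return(0x1000, 0xa);	/* outer call site */
	push_return(0x2000, 0xb);	/* nested call site */
	inner = pop_return();		/* LIFO: 0x2000 first */
	outer = pop_return();		/* then 0x1000 */
	printf("%#lx %#lx overruns=%d\n", inner, outer, overrun);
	return 0;
}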
static int graph_trace_init(struct trace_array *tr)
{
int cpu, ret;
for_each_online_cpu(cpu)
tracing_reset(tr, cpu);
ret = register_ftrace_graph(&trace_graph_return,
int ret = register_ftrace_graph(&trace_graph_return,
&trace_graph_entry);
if (ret)
return ret;
@ -112,15 +197,15 @@ print_graph_cpu(struct trace_seq *s, int cpu)
static enum print_line_t
print_graph_proc(struct trace_seq *s, pid_t pid)
{
int i;
int ret;
int len;
char comm[8];
int spaces = 0;
char comm[TASK_COMM_LEN];
/* sign + log10(MAX_INT) + '\0' */
char pid_str[11];
int spaces = 0;
int ret;
int len;
int i;
strncpy(comm, trace_find_cmdline(pid), 7);
trace_find_cmdline(pid, comm);
comm[7] = '\0';
sprintf(pid_str, "%d", pid);
@ -153,17 +238,25 @@ print_graph_proc(struct trace_seq *s, pid_t pid)
/* If the pid changed since the last trace, output this event */
static enum print_line_t
verif_pid(struct trace_seq *s, pid_t pid, int cpu)
verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data)
{
pid_t prev_pid;
pid_t *last_pid;
int ret;
if (last_pid[cpu] != -1 && last_pid[cpu] == pid)
if (!data)
return TRACE_TYPE_HANDLED;
prev_pid = last_pid[cpu];
last_pid[cpu] = pid;
last_pid = &(per_cpu_ptr(data, cpu)->last_pid);
if (*last_pid == pid)
return TRACE_TYPE_HANDLED;
prev_pid = *last_pid;
*last_pid = pid;
if (prev_pid == -1)
return TRACE_TYPE_HANDLED;
/*
* Context-switch trace line:
@ -175,34 +268,34 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu)
ret = trace_seq_printf(s,
" ------------------------------------------\n");
if (!ret)
TRACE_TYPE_PARTIAL_LINE;
return TRACE_TYPE_PARTIAL_LINE;
ret = print_graph_cpu(s, cpu);
if (ret == TRACE_TYPE_PARTIAL_LINE)
TRACE_TYPE_PARTIAL_LINE;
return TRACE_TYPE_PARTIAL_LINE;
ret = print_graph_proc(s, prev_pid);
if (ret == TRACE_TYPE_PARTIAL_LINE)
TRACE_TYPE_PARTIAL_LINE;
return TRACE_TYPE_PARTIAL_LINE;
ret = trace_seq_printf(s, " => ");
if (!ret)
TRACE_TYPE_PARTIAL_LINE;
return TRACE_TYPE_PARTIAL_LINE;
ret = print_graph_proc(s, pid);
if (ret == TRACE_TYPE_PARTIAL_LINE)
TRACE_TYPE_PARTIAL_LINE;
return TRACE_TYPE_PARTIAL_LINE;
ret = trace_seq_printf(s,
"\n ------------------------------------------\n\n");
if (!ret)
TRACE_TYPE_PARTIAL_LINE;
return TRACE_TYPE_PARTIAL_LINE;
return ret;
return TRACE_TYPE_HANDLED;
}
static bool
trace_branch_is_leaf(struct trace_iterator *iter,
static struct ftrace_graph_ret_entry *
get_return_for_leaf(struct trace_iterator *iter,
struct ftrace_graph_ent_entry *curr)
{
struct ring_buffer_iter *ring_iter;
@ -211,65 +304,123 @@ trace_branch_is_leaf(struct trace_iterator *iter,
ring_iter = iter->buffer_iter[iter->cpu];
if (!ring_iter)
return false;
event = ring_buffer_iter_peek(ring_iter, NULL);
/* First peek to compare current entry and the next one */
if (ring_iter)
event = ring_buffer_iter_peek(ring_iter, NULL);
else {
/* We need to consume the current entry to see the next one */
ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL);
event = ring_buffer_peek(iter->tr->buffer, iter->cpu,
NULL);
}
if (!event)
return false;
return NULL;
next = ring_buffer_event_data(event);
if (next->ent.type != TRACE_GRAPH_RET)
return false;
return NULL;
if (curr->ent.pid != next->ent.pid ||
curr->graph_ent.func != next->ret.func)
return false;
return NULL;
return true;
/* this is a leaf, now advance the iterator */
if (ring_iter)
ring_buffer_read(ring_iter, NULL);
return next;
}
/* Signal an overhead of time execution to the output */
static int
print_graph_overhead(unsigned long long duration, struct trace_seq *s)
{
/* If the duration is not displayed, we don't need anything */
if (!(tracer_flags.val & TRACE_GRAPH_PRINT_DURATION))
return 1;
/* Non-nested entry or return */
if (duration == -1)
return trace_seq_printf(s, " ");
if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
/* Duration exceeded 100 usecs */
if (duration > 100000ULL)
return trace_seq_printf(s, "! ");
/* Duration exceeded 10 usecs */
if (duration > 10000ULL)
return trace_seq_printf(s, "+ ");
}
return trace_seq_printf(s, " ");
}
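
The duration compared here is in nanoseconds, so the markers correspond to 10 us ("+") and 100 us ("!"). A standalone userspace restatement of the marker logic:

#include <stdio.h>

static const char *overhead_marker(unsigned long long ns)
{
	if (ns > 100000ULL)
		return "! ";	/* over 100 usecs */
	if (ns > 10000ULL)
		return "+ ";	/* over 10 usecs */
	return "  ";
}

int main(void)
{
	printf("[%s][%s][%s]\n", overhead_marker(5000),
	       overhead_marker(50000), overhead_marker(500000));
	/* prints: [  ][+ ][! ] */
	return 0;
}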
static int print_graph_abs_time(u64 t, struct trace_seq *s)
{
unsigned long usecs_rem;
usecs_rem = do_div(t, NSEC_PER_SEC);
usecs_rem /= 1000;
return trace_seq_printf(s, "%5lu.%06lu | ",
(unsigned long)t, usecs_rem);
}
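
The same seconds.microseconds split in plain C (do_div() is the kernel's 64-bit divide-with-remainder helper; userspace can use / and % directly):

#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL

int main(void)
{
	unsigned long long t = 5123456789ULL;	/* 5.123456789 s in ns */
	unsigned long usecs_rem = (unsigned long)((t % NSEC_PER_SEC) / 1000);

	printf("%5llu.%06lu |  \n", t / NSEC_PER_SEC, usecs_rem);
	/* prints: "    5.123456 |  " */
	return 0;
}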
static enum print_line_t
print_graph_irq(struct trace_seq *s, unsigned long addr,
enum trace_type type, int cpu, pid_t pid)
print_graph_irq(struct trace_iterator *iter, unsigned long addr,
enum trace_type type, int cpu, pid_t pid)
{
int ret;
struct trace_seq *s = &iter->seq;
if (addr < (unsigned long)__irqentry_text_start ||
addr >= (unsigned long)__irqentry_text_end)
return TRACE_TYPE_UNHANDLED;
if (type == TRACE_GRAPH_ENT) {
ret = trace_seq_printf(s, "==========> | ");
} else {
/* Cpu */
if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
ret = print_graph_cpu(s, cpu);
if (ret == TRACE_TYPE_PARTIAL_LINE)
return TRACE_TYPE_PARTIAL_LINE;
}
/* Proc */
if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) {
ret = print_graph_proc(s, pid);
if (ret == TRACE_TYPE_PARTIAL_LINE)
return TRACE_TYPE_PARTIAL_LINE;
ret = trace_seq_printf(s, " | ");
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
}
/* No overhead */
if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
ret = trace_seq_printf(s, " ");
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
}
ret = trace_seq_printf(s, "<========== |\n");
/* Absolute time */
if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) {
ret = print_graph_abs_time(iter->ts, s);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
}
/* Cpu */
if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
ret = print_graph_cpu(s, cpu);
if (ret == TRACE_TYPE_PARTIAL_LINE)
return TRACE_TYPE_PARTIAL_LINE;
}
/* Proc */
if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) {
ret = print_graph_proc(s, pid);
if (ret == TRACE_TYPE_PARTIAL_LINE)
return TRACE_TYPE_PARTIAL_LINE;
ret = trace_seq_printf(s, " | ");
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
}
/* No overhead */
ret = print_graph_overhead(-1, s);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
if (type == TRACE_GRAPH_ENT)
ret = trace_seq_printf(s, "==========>");
else
ret = trace_seq_printf(s, "<==========");
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
/* Don't close the duration column if we don't have one */
if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION)
trace_seq_printf(s, " |");
ret = trace_seq_printf(s, "\n");
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
return TRACE_TYPE_HANDLED;
@ -288,7 +439,7 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s)
sprintf(msecs_str, "%lu", (unsigned long) duration);
/* Print msecs */
ret = trace_seq_printf(s, msecs_str);
ret = trace_seq_printf(s, "%s", msecs_str);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
@ -321,52 +472,47 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s)
}
/* Signal a overhead of time execution to the output */
static int
print_graph_overhead(unsigned long long duration, struct trace_seq *s)
{
/* Duration exceeded 100 msecs */
if (duration > 100000ULL)
return trace_seq_printf(s, "! ");
/* Duration exceeded 10 msecs */
if (duration > 10000ULL)
return trace_seq_printf(s, "+ ");
return trace_seq_printf(s, " ");
}
/* Case of a leaf function on its call entry */
static enum print_line_t
print_graph_entry_leaf(struct trace_iterator *iter,
struct ftrace_graph_ent_entry *entry, struct trace_seq *s)
struct ftrace_graph_ent_entry *entry,
struct ftrace_graph_ret_entry *ret_entry, struct trace_seq *s)
{
struct ftrace_graph_ret_entry *ret_entry;
struct fgraph_data *data = iter->private;
struct ftrace_graph_ret *graph_ret;
struct ring_buffer_event *event;
struct ftrace_graph_ent *call;
unsigned long long duration;
int ret;
int i;
event = ring_buffer_read(iter->buffer_iter[iter->cpu], NULL);
ret_entry = ring_buffer_event_data(event);
graph_ret = &ret_entry->ret;
call = &entry->graph_ent;
duration = graph_ret->rettime - graph_ret->calltime;
/* Overhead */
if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
ret = print_graph_overhead(duration, s);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
if (data) {
int cpu = iter->cpu;
int *depth = &(per_cpu_ptr(data, cpu)->depth);
/*
* Comments display at + 1 to depth. Since
* this is a leaf function, keep the comments
* equal to this depth.
*/
*depth = call->depth - 1;
}
/* Duration */
ret = print_graph_duration(duration, s);
if (ret == TRACE_TYPE_PARTIAL_LINE)
/* Overhead */
ret = print_graph_overhead(duration, s);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
/* Duration */
if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) {
ret = print_graph_duration(duration, s);
if (ret == TRACE_TYPE_PARTIAL_LINE)
return TRACE_TYPE_PARTIAL_LINE;
}
/* Function */
for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) {
ret = trace_seq_printf(s, " ");
@ -386,33 +532,34 @@ print_graph_entry_leaf(struct trace_iterator *iter,
}
static enum print_line_t
print_graph_entry_nested(struct ftrace_graph_ent_entry *entry,
struct trace_seq *s, pid_t pid, int cpu)
print_graph_entry_nested(struct trace_iterator *iter,
struct ftrace_graph_ent_entry *entry,
struct trace_seq *s, int cpu)
{
int i;
int ret;
struct ftrace_graph_ent *call = &entry->graph_ent;
struct fgraph_data *data = iter->private;
int ret;
int i;
/* No overhead */
if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
ret = trace_seq_printf(s, " ");
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
if (data) {
int cpu = iter->cpu;
int *depth = &(per_cpu_ptr(data, cpu)->depth);
*depth = call->depth;
}
/* Interrupt */
ret = print_graph_irq(s, call->func, TRACE_GRAPH_ENT, cpu, pid);
if (ret == TRACE_TYPE_UNHANDLED) {
/* No time */
/* No overhead */
ret = print_graph_overhead(-1, s);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
/* No time */
if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) {
ret = trace_seq_printf(s, " | ");
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
} else {
if (ret == TRACE_TYPE_PARTIAL_LINE)
return TRACE_TYPE_PARTIAL_LINE;
}
/* Function */
for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) {
ret = trace_seq_printf(s, " ");
@ -428,20 +575,40 @@ print_graph_entry_nested(struct ftrace_graph_ent_entry *entry,
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
return TRACE_TYPE_HANDLED;
/*
* we already consumed the current entry to check the next one
* and see if this is a leaf.
*/
return TRACE_TYPE_NO_CONSUME;
}
static enum print_line_t
print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
struct trace_iterator *iter, int cpu)
print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
int type, unsigned long addr)
{
int ret;
struct fgraph_data *data = iter->private;
struct trace_entry *ent = iter->ent;
int cpu = iter->cpu;
int ret;
/* Pid */
if (verif_pid(s, ent->pid, cpu) == TRACE_TYPE_PARTIAL_LINE)
if (verif_pid(s, ent->pid, cpu, data) == TRACE_TYPE_PARTIAL_LINE)
return TRACE_TYPE_PARTIAL_LINE;
if (type) {
/* Interrupt */
ret = print_graph_irq(iter, addr, type, cpu, ent->pid);
if (ret == TRACE_TYPE_PARTIAL_LINE)
return TRACE_TYPE_PARTIAL_LINE;
}
/* Absolute time */
if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) {
ret = print_graph_abs_time(iter->ts, s);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
}
/* Cpu */
if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
ret = print_graph_cpu(s, cpu);
@ -460,54 +627,65 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
return TRACE_TYPE_PARTIAL_LINE;
}
if (trace_branch_is_leaf(iter, field))
return print_graph_entry_leaf(iter, field, s);
return 0;
}
static enum print_line_t
print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
struct trace_iterator *iter)
{
int cpu = iter->cpu;
struct ftrace_graph_ent *call = &field->graph_ent;
struct ftrace_graph_ret_entry *leaf_ret;
if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func))
return TRACE_TYPE_PARTIAL_LINE;
leaf_ret = get_return_for_leaf(iter, field);
if (leaf_ret)
return print_graph_entry_leaf(iter, field, leaf_ret, s);
else
return print_graph_entry_nested(field, s, iter->ent->pid, cpu);
return print_graph_entry_nested(iter, field, s, cpu);
}
static enum print_line_t
print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
struct trace_entry *ent, int cpu)
struct trace_entry *ent, struct trace_iterator *iter)
{
int i;
int ret;
unsigned long long duration = trace->rettime - trace->calltime;
struct fgraph_data *data = iter->private;
pid_t pid = ent->pid;
int cpu = iter->cpu;
int ret;
int i;
/* Pid */
if (verif_pid(s, ent->pid, cpu) == TRACE_TYPE_PARTIAL_LINE)
if (data) {
int cpu = iter->cpu;
int *depth = &(per_cpu_ptr(data, cpu)->depth);
/*
* Comments display at + 1 to depth. This is the
* return from a function, we now want the comments
* to display at the same level of the bracket.
*/
*depth = trace->depth - 1;
}
if (print_graph_prologue(iter, s, 0, 0))
return TRACE_TYPE_PARTIAL_LINE;
/* Cpu */
if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
ret = print_graph_cpu(s, cpu);
if (ret == TRACE_TYPE_PARTIAL_LINE)
return TRACE_TYPE_PARTIAL_LINE;
}
/* Proc */
if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) {
ret = print_graph_proc(s, ent->pid);
if (ret == TRACE_TYPE_PARTIAL_LINE)
return TRACE_TYPE_PARTIAL_LINE;
ret = trace_seq_printf(s, " | ");
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
}
/* Overhead */
if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
ret = print_graph_overhead(duration, s);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
}
ret = print_graph_overhead(duration, s);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
/* Duration */
ret = print_graph_duration(duration, s);
if (ret == TRACE_TYPE_PARTIAL_LINE)
return TRACE_TYPE_PARTIAL_LINE;
if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) {
ret = print_graph_duration(duration, s);
if (ret == TRACE_TYPE_PARTIAL_LINE)
return TRACE_TYPE_PARTIAL_LINE;
}
/* Closing brace */
for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) {
@ -528,7 +706,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
return TRACE_TYPE_PARTIAL_LINE;
}
ret = print_graph_irq(s, trace->func, TRACE_GRAPH_RET, cpu, ent->pid);
ret = print_graph_irq(iter, trace->func, TRACE_GRAPH_RET, cpu, pid);
if (ret == TRACE_TYPE_PARTIAL_LINE)
return TRACE_TYPE_PARTIAL_LINE;
@ -536,61 +714,73 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
}
static enum print_line_t
print_graph_comment(struct print_entry *trace, struct trace_seq *s,
struct trace_entry *ent, struct trace_iterator *iter)
print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
struct trace_iterator *iter)
{
int i;
unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
struct fgraph_data *data = iter->private;
struct trace_event *event;
int depth = 0;
int ret;
int i;
/* Pid */
if (verif_pid(s, ent->pid, iter->cpu) == TRACE_TYPE_PARTIAL_LINE)
if (data)
depth = per_cpu_ptr(data, iter->cpu)->depth;
if (print_graph_prologue(iter, s, 0, 0))
return TRACE_TYPE_PARTIAL_LINE;
/* Cpu */
if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
ret = print_graph_cpu(s, iter->cpu);
if (ret == TRACE_TYPE_PARTIAL_LINE)
return TRACE_TYPE_PARTIAL_LINE;
}
/* Proc */
if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) {
ret = print_graph_proc(s, ent->pid);
if (ret == TRACE_TYPE_PARTIAL_LINE)
return TRACE_TYPE_PARTIAL_LINE;
ret = trace_seq_printf(s, " | ");
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
}
/* No overhead */
if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
ret = trace_seq_printf(s, " ");
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
}
/* No time */
ret = trace_seq_printf(s, " | ");
ret = print_graph_overhead(-1, s);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
/* No time */
if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) {
ret = trace_seq_printf(s, " | ");
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
}
/* Indentation */
if (trace->depth > 0)
for (i = 0; i < (trace->depth + 1) * TRACE_GRAPH_INDENT; i++) {
if (depth > 0)
for (i = 0; i < (depth + 1) * TRACE_GRAPH_INDENT; i++) {
ret = trace_seq_printf(s, " ");
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
}
/* The comment */
ret = trace_seq_printf(s, "/* %s", trace->buf);
ret = trace_seq_printf(s, "/* ");
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
if (ent->flags & TRACE_FLAG_CONT)
trace_seq_print_cont(s, iter);
switch (iter->ent->type) {
case TRACE_BPRINT:
ret = trace_print_bprintk_msg_only(iter);
if (ret != TRACE_TYPE_HANDLED)
return ret;
break;
case TRACE_PRINT:
ret = trace_print_printk_msg_only(iter);
if (ret != TRACE_TYPE_HANDLED)
return ret;
break;
default:
event = ftrace_find_event(ent->type);
if (!event)
return TRACE_TYPE_UNHANDLED;
ret = event->trace(iter, sym_flags);
if (ret != TRACE_TYPE_HANDLED)
return ret;
}
/* Strip ending newline */
if (s->buffer[s->len - 1] == '\n') {
s->buffer[s->len - 1] = '\0';
s->len--;
}
ret = trace_seq_printf(s, " */\n");
if (!ret)
@ -603,62 +793,91 @@ print_graph_comment(struct print_entry *trace, struct trace_seq *s,
enum print_line_t
print_graph_function(struct trace_iterator *iter)
{
struct trace_seq *s = &iter->seq;
struct trace_entry *entry = iter->ent;
struct trace_seq *s = &iter->seq;
switch (entry->type) {
case TRACE_GRAPH_ENT: {
struct ftrace_graph_ent_entry *field;
trace_assign_type(field, entry);
return print_graph_entry(field, s, iter,
iter->cpu);
return print_graph_entry(field, s, iter);
}
case TRACE_GRAPH_RET: {
struct ftrace_graph_ret_entry *field;
trace_assign_type(field, entry);
return print_graph_return(&field->ret, s, entry, iter->cpu);
}
case TRACE_PRINT: {
struct print_entry *field;
trace_assign_type(field, entry);
return print_graph_comment(field, s, entry, iter);
return print_graph_return(&field->ret, s, entry, iter);
}
default:
return TRACE_TYPE_UNHANDLED;
return print_graph_comment(s, entry, iter);
}
return TRACE_TYPE_HANDLED;
}
static void print_graph_headers(struct seq_file *s)
{
/* 1st line */
seq_printf(s, "# ");
if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME)
seq_printf(s, " TIME ");
if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU)
seq_printf(s, "CPU ");
seq_printf(s, "CPU");
if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC)
seq_printf(s, "TASK/PID ");
if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD)
seq_printf(s, "OVERHEAD/");
seq_printf(s, "DURATION FUNCTION CALLS\n");
seq_printf(s, " TASK/PID ");
if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION)
seq_printf(s, " DURATION ");
seq_printf(s, " FUNCTION CALLS\n");
/* 2nd line */
seq_printf(s, "# ");
if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME)
seq_printf(s, " | ");
if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU)
seq_printf(s, "| ");
seq_printf(s, "| ");
if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC)
seq_printf(s, "| | ");
if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
seq_printf(s, "| ");
seq_printf(s, "| | | | |\n");
} else
seq_printf(s, " | | | | |\n");
seq_printf(s, " | | ");
if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION)
seq_printf(s, " | | ");
seq_printf(s, " | | | |\n");
}
static void graph_trace_open(struct trace_iterator *iter)
{
/* pid and depth on the last trace processed */
struct fgraph_data *data = alloc_percpu(struct fgraph_data);
int cpu;
if (!data)
pr_warning("function graph tracer: not enough memory\n");
else
for_each_possible_cpu(cpu) {
pid_t *pid = &(per_cpu_ptr(data, cpu)->last_pid);
int *depth = &(per_cpu_ptr(data, cpu)->depth);
*pid = -1;
*depth = 0;
}
iter->private = data;
}
static void graph_trace_close(struct trace_iterator *iter)
{
free_percpu(iter->private);
}
static struct tracer graph_trace __read_mostly = {
.name = "function_graph",
.init = graph_trace_init,
.reset = graph_trace_reset,
.name = "function_graph",
.open = graph_trace_open,
.close = graph_trace_close,
.wait_pipe = poll_wait_pipe,
.init = graph_trace_init,
.reset = graph_trace_reset,
.print_line = print_graph_function,
.print_header = print_graph_headers,
.flags = &tracer_flags,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_function_graph,
#endif
};
static __init int init_graph_trace(void)

View file

@ -1,30 +1,53 @@
/*
* h/w branch tracer for x86 based on bts
*
* Copyright (C) 2008 Markus Metzger <markus.t.metzger@gmail.com>
*
* Copyright (C) 2008-2009 Intel Corporation.
* Markus Metzger <markus.t.metzger@gmail.com>, 2008-2009
*/
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/spinlock.h>
#include <linux/kallsyms.h>
#include <linux/debugfs.h>
#include <linux/ftrace.h>
#include <linux/kallsyms.h>
#include <linux/module.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/fs.h>
#include <asm/ds.h>
#include "trace.h"
#include "trace_output.h"
#define SIZEOF_BTS (1 << 13)
/*
* The tracer lock protects the below per-cpu tracer array.
* It needs to be held to:
* - start tracing on all cpus
* - stop tracing on all cpus
* - start tracing on a single hotplug cpu
* - stop tracing on a single hotplug cpu
* - read the trace from all cpus
* - read the trace from a single cpu
*/
static DEFINE_SPINLOCK(bts_tracer_lock);
static DEFINE_PER_CPU(struct bts_tracer *, tracer);
static DEFINE_PER_CPU(unsigned char[SIZEOF_BTS], buffer);
#define this_tracer per_cpu(tracer, smp_processor_id())
#define this_buffer per_cpu(buffer, smp_processor_id())
static int __read_mostly trace_hw_branches_enabled;
static struct trace_array *hw_branch_trace __read_mostly;
/*
* Start tracing on the current cpu.
* The argument is ignored.
*
* pre: bts_tracer_lock must be locked.
*/
static void bts_trace_start_cpu(void *arg)
{
if (this_tracer)
@ -42,14 +65,20 @@ static void bts_trace_start_cpu(void *arg)
static void bts_trace_start(struct trace_array *tr)
{
int cpu;
spin_lock(&bts_tracer_lock);
tracing_reset_online_cpus(tr);
on_each_cpu(bts_trace_start_cpu, NULL, 1);
trace_hw_branches_enabled = 1;
for_each_cpu(cpu, cpu_possible_mask)
smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1);
spin_unlock(&bts_tracer_lock);
}
/*
* Stop tracing on the current cpu.
* The argument is ignored.
*
* pre: bts_tracer_lock must be locked.
*/
static void bts_trace_stop_cpu(void *arg)
{
if (this_tracer) {
@ -60,26 +89,60 @@ static void bts_trace_stop_cpu(void *arg)
static void bts_trace_stop(struct trace_array *tr)
{
int cpu;
spin_lock(&bts_tracer_lock);
for_each_cpu(cpu, cpu_possible_mask)
smp_call_function_single(cpu, bts_trace_stop_cpu, NULL, 1);
trace_hw_branches_enabled = 0;
on_each_cpu(bts_trace_stop_cpu, NULL, 1);
spin_unlock(&bts_tracer_lock);
}
static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb,
unsigned long action, void *hcpu)
{
unsigned int cpu = (unsigned long)hcpu;
spin_lock(&bts_tracer_lock);
if (!trace_hw_branches_enabled)
goto out;
switch (action) {
case CPU_ONLINE:
case CPU_DOWN_FAILED:
smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1);
break;
case CPU_DOWN_PREPARE:
smp_call_function_single(cpu, bts_trace_stop_cpu, NULL, 1);
break;
}
out:
spin_unlock(&bts_tracer_lock);
return NOTIFY_DONE;
}
static struct notifier_block bts_hotcpu_notifier __cpuinitdata = {
.notifier_call = bts_hotcpu_handler
};
static int bts_trace_init(struct trace_array *tr)
{
tracing_reset_online_cpus(tr);
hw_branch_trace = tr;
bts_trace_start(tr);
return 0;
}
static void bts_trace_reset(struct trace_array *tr)
{
bts_trace_stop(tr);
}
static void bts_trace_print_header(struct seq_file *m)
{
seq_puts(m,
"# CPU# FROM TO FUNCTION\n");
seq_puts(m,
"# | | | |\n");
seq_puts(m, "# CPU# TO <- FROM\n");
}
static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
@ -87,15 +150,15 @@ static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
struct trace_entry *entry = iter->ent;
struct trace_seq *seq = &iter->seq;
struct hw_branch_entry *it;
unsigned long symflags = TRACE_ITER_SYM_OFFSET;
trace_assign_type(it, entry);
if (entry->type == TRACE_HW_BRANCHES) {
if (trace_seq_printf(seq, "%4d ", entry->cpu) &&
trace_seq_printf(seq, "0x%016llx -> 0x%016llx ",
it->from, it->to) &&
(!it->from ||
seq_print_ip_sym(seq, it->from, /* sym_flags = */ 0)) &&
if (trace_seq_printf(seq, "%4d ", iter->cpu) &&
seq_print_ip_sym(seq, it->to, symflags) &&
trace_seq_printf(seq, "\t <- ") &&
seq_print_ip_sym(seq, it->from, symflags) &&
trace_seq_printf(seq, "\n"))
return TRACE_TYPE_HANDLED;
return TRACE_TYPE_PARTIAL_LINE;
@ -103,26 +166,42 @@ static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
return TRACE_TYPE_UNHANDLED;
}
void trace_hw_branch(struct trace_array *tr, u64 from, u64 to)
void trace_hw_branch(u64 from, u64 to)
{
struct trace_array *tr = hw_branch_trace;
struct ring_buffer_event *event;
struct hw_branch_entry *entry;
unsigned long irq;
unsigned long irq1;
int cpu;
event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), &irq);
if (!event)
if (unlikely(!tr))
return;
if (unlikely(!trace_hw_branches_enabled))
return;
local_irq_save(irq1);
cpu = raw_smp_processor_id();
if (atomic_inc_return(&tr->data[cpu]->disabled) != 1)
goto out;
event = trace_buffer_lock_reserve(tr, TRACE_HW_BRANCHES,
sizeof(*entry), 0, 0);
if (!event)
goto out;
entry = ring_buffer_event_data(event);
tracing_generic_entry_update(&entry->ent, 0, from);
entry->ent.type = TRACE_HW_BRANCHES;
entry->ent.cpu = smp_processor_id();
entry->from = from;
entry->to = to;
ring_buffer_unlock_commit(tr->buffer, event, irq);
trace_buffer_unlock_commit(tr, event, 0, 0);
out:
atomic_dec(&tr->data[cpu]->disabled);
local_irq_restore(irq1);
}
static void trace_bts_at(struct trace_array *tr,
const struct bts_trace *trace, void *at)
static void trace_bts_at(const struct bts_trace *trace, void *at)
{
struct bts_struct bts;
int err = 0;
@ -137,18 +216,29 @@ static void trace_bts_at(struct trace_array *tr,
switch (bts.qualifier) {
case BTS_BRANCH:
trace_hw_branch(tr, bts.variant.lbr.from, bts.variant.lbr.to);
trace_hw_branch(bts.variant.lbr.from, bts.variant.lbr.to);
break;
}
}
/*
* Collect the trace on the current cpu and write it into the ftrace buffer.
*
* pre: bts_tracer_lock must be locked
*/
static void trace_bts_cpu(void *arg)
{
struct trace_array *tr = (struct trace_array *) arg;
const struct bts_trace *trace;
unsigned char *at;
if (!this_tracer)
if (unlikely(!tr))
return;
if (unlikely(atomic_read(&tr->data[raw_smp_processor_id()]->disabled)))
return;
if (unlikely(!this_tracer))
return;
ds_suspend_bts(this_tracer);
@ -158,11 +248,11 @@ static void trace_bts_cpu(void *arg)
for (at = trace->ds.top; (void *)at < trace->ds.end;
at += trace->ds.size)
trace_bts_at(tr, trace, at);
trace_bts_at(trace, at);
for (at = trace->ds.begin; (void *)at < trace->ds.top;
at += trace->ds.size)
trace_bts_at(tr, trace, at);
trace_bts_at(trace, at);
out:
ds_resume_bts(this_tracer);
@ -170,26 +260,43 @@ out:
static void trace_bts_prepare(struct trace_iterator *iter)
{
int cpu;
spin_lock(&bts_tracer_lock);
for_each_cpu(cpu, cpu_possible_mask)
smp_call_function_single(cpu, trace_bts_cpu, iter->tr, 1);
on_each_cpu(trace_bts_cpu, iter->tr, 1);
spin_unlock(&bts_tracer_lock);
}
static void trace_bts_close(struct trace_iterator *iter)
{
tracing_reset_online_cpus(iter->tr);
}
void trace_hw_branch_oops(void)
{
spin_lock(&bts_tracer_lock);
trace_bts_cpu(hw_branch_trace);
spin_unlock(&bts_tracer_lock);
}
struct tracer bts_tracer __read_mostly =
{
.name = "hw-branch-tracer",
.init = bts_trace_init,
.reset = bts_trace_stop,
.reset = bts_trace_reset,
.print_header = bts_trace_print_header,
.print_line = bts_trace_print_line,
.start = bts_trace_start,
.stop = bts_trace_stop,
.open = trace_bts_prepare
.open = trace_bts_prepare,
.close = trace_bts_close
};
__init static int init_bts_trace(void)
{
register_hotcpu_notifier(&bts_hotcpu_notifier);
return register_tracer(&bts_tracer);
}
device_initcall(init_bts_trace);

View file

@ -1,5 +1,5 @@
/*
* trace irqs off criticall timings
* trace irqs off critical timings
*
* Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
* Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
@ -32,6 +32,8 @@ enum {
static int trace_type __read_mostly;
static int save_lat_flag;
#ifdef CONFIG_PREEMPT_TRACER
static inline int
preempt_trace(void)
@ -95,7 +97,7 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
disabled = atomic_inc_return(&data->disabled);
if (likely(disabled == 1))
trace_function(tr, data, ip, parent_ip, flags, preempt_count());
trace_function(tr, ip, parent_ip, flags, preempt_count());
atomic_dec(&data->disabled);
}
@ -153,7 +155,7 @@ check_critical_timing(struct trace_array *tr,
if (!report_latency(delta))
goto out_unlock;
trace_function(tr, data, CALLER_ADDR0, parent_ip, flags, pc);
trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
latency = nsecs_to_usecs(delta);
@ -177,7 +179,7 @@ out:
data->critical_sequence = max_sequence;
data->preempt_timestamp = ftrace_now(cpu);
tracing_reset(tr, cpu);
trace_function(tr, data, CALLER_ADDR0, parent_ip, flags, pc);
trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
}
static inline void
@ -210,7 +212,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip)
local_save_flags(flags);
trace_function(tr, data, ip, parent_ip, flags, preempt_count());
trace_function(tr, ip, parent_ip, flags, preempt_count());
per_cpu(tracing_cpu, cpu) = 1;
@ -244,7 +246,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip)
atomic_inc(&data->disabled);
local_save_flags(flags);
trace_function(tr, data, ip, parent_ip, flags, preempt_count());
trace_function(tr, ip, parent_ip, flags, preempt_count());
check_critical_timing(tr, data, parent_ip ? : ip, cpu);
data->critical_start = 0;
atomic_dec(&data->disabled);
@ -353,33 +355,26 @@ void trace_preempt_off(unsigned long a0, unsigned long a1)
}
#endif /* CONFIG_PREEMPT_TRACER */
/*
* save_tracer_enabled is used to save the state of the tracer_enabled
* variable when we disable it when we open a trace output file.
*/
static int save_tracer_enabled;
static void start_irqsoff_tracer(struct trace_array *tr)
{
register_ftrace_function(&trace_ops);
if (tracing_is_enabled()) {
if (tracing_is_enabled())
tracer_enabled = 1;
save_tracer_enabled = 1;
} else {
else
tracer_enabled = 0;
save_tracer_enabled = 0;
}
}
static void stop_irqsoff_tracer(struct trace_array *tr)
{
tracer_enabled = 0;
save_tracer_enabled = 0;
unregister_ftrace_function(&trace_ops);
}
static void __irqsoff_tracer_init(struct trace_array *tr)
{
save_lat_flag = trace_flags & TRACE_ITER_LATENCY_FMT;
trace_flags |= TRACE_ITER_LATENCY_FMT;
tracing_max_latency = 0;
irqsoff_trace = tr;
/* make sure that the tracer is visible */
@ -390,30 +385,19 @@ static void __irqsoff_tracer_init(struct trace_array *tr)
static void irqsoff_tracer_reset(struct trace_array *tr)
{
stop_irqsoff_tracer(tr);
if (!save_lat_flag)
trace_flags &= ~TRACE_ITER_LATENCY_FMT;
}
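/*
 * The save/restore idiom added above, reduced to a sketch (identifiers
 * illustrative): remember whether the user already had the latency format
 * bit set, force it on while the tracer runs, and clear it on reset only
 * if it was previously off.
 */
static int example_saved_lat;
static void example_flags_init(void)
{
	example_saved_lat = trace_flags & TRACE_ITER_LATENCY_FMT;
	trace_flags |= TRACE_ITER_LATENCY_FMT;
}
static void example_flags_reset(void)
{
	if (!example_saved_lat)
		trace_flags &= ~TRACE_ITER_LATENCY_FMT;
}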
static void irqsoff_tracer_start(struct trace_array *tr)
{
tracer_enabled = 1;
save_tracer_enabled = 1;
}
static void irqsoff_tracer_stop(struct trace_array *tr)
{
tracer_enabled = 0;
save_tracer_enabled = 0;
}
static void irqsoff_tracer_open(struct trace_iterator *iter)
{
/* stop the trace while dumping */
tracer_enabled = 0;
}
static void irqsoff_tracer_close(struct trace_iterator *iter)
{
/* restart tracing */
tracer_enabled = save_tracer_enabled;
}
#ifdef CONFIG_IRQSOFF_TRACER
@ -431,8 +415,6 @@ static struct tracer irqsoff_tracer __read_mostly =
.reset = irqsoff_tracer_reset,
.start = irqsoff_tracer_start,
.stop = irqsoff_tracer_stop,
.open = irqsoff_tracer_open,
.close = irqsoff_tracer_close,
.print_max = 1,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_irqsoff,
@ -459,8 +441,6 @@ static struct tracer preemptoff_tracer __read_mostly =
.reset = irqsoff_tracer_reset,
.start = irqsoff_tracer_start,
.stop = irqsoff_tracer_stop,
.open = irqsoff_tracer_open,
.close = irqsoff_tracer_close,
.print_max = 1,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_preemptoff,
@ -489,8 +469,6 @@ static struct tracer preemptirqsoff_tracer __read_mostly =
.reset = irqsoff_tracer_reset,
.start = irqsoff_tracer_start,
.stop = irqsoff_tracer_stop,
.open = irqsoff_tracer_open,
.close = irqsoff_tracer_close,
.print_max = 1,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_preemptirqsoff,

View file

@ -12,6 +12,7 @@
#include <asm/atomic.h>
#include "trace.h"
#include "trace_output.h"
struct header_iter {
struct pci_dev *dev;
@ -183,21 +184,22 @@ static enum print_line_t mmio_print_rw(struct trace_iterator *iter)
switch (rw->opcode) {
case MMIO_READ:
ret = trace_seq_printf(s,
"R %d %lu.%06lu %d 0x%llx 0x%lx 0x%lx %d\n",
"R %d %u.%06lu %d 0x%llx 0x%lx 0x%lx %d\n",
rw->width, secs, usec_rem, rw->map_id,
(unsigned long long)rw->phys,
rw->value, rw->pc, 0);
break;
case MMIO_WRITE:
ret = trace_seq_printf(s,
"W %d %lu.%06lu %d 0x%llx 0x%lx 0x%lx %d\n",
"W %d %u.%06lu %d 0x%llx 0x%lx 0x%lx %d\n",
rw->width, secs, usec_rem, rw->map_id,
(unsigned long long)rw->phys,
rw->value, rw->pc, 0);
break;
case MMIO_UNKNOWN_OP:
ret = trace_seq_printf(s,
"UNKNOWN %lu.%06lu %d 0x%llx %02x,%02x,%02x 0x%lx %d\n",
"UNKNOWN %u.%06lu %d 0x%llx %02lx,%02lx,"
"%02lx 0x%lx %d\n",
secs, usec_rem, rw->map_id,
(unsigned long long)rw->phys,
(rw->value >> 16) & 0xff, (rw->value >> 8) & 0xff,
@ -229,14 +231,14 @@ static enum print_line_t mmio_print_map(struct trace_iterator *iter)
switch (m->opcode) {
case MMIO_PROBE:
ret = trace_seq_printf(s,
"MAP %lu.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n",
"MAP %u.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n",
secs, usec_rem, m->map_id,
(unsigned long long)m->phys, m->virt, m->len,
0UL, 0);
break;
case MMIO_UNPROBE:
ret = trace_seq_printf(s,
"UNMAP %lu.%06lu %d 0x%lx %d\n",
"UNMAP %u.%06lu %d 0x%lx %d\n",
secs, usec_rem, m->map_id, 0UL, 0);
break;
default:
@ -255,18 +257,15 @@ static enum print_line_t mmio_print_mark(struct trace_iterator *iter)
const char *msg = print->buf;
struct trace_seq *s = &iter->seq;
unsigned long long t = ns2usecs(iter->ts);
unsigned long usec_rem = do_div(t, 1000000ULL);
unsigned long usec_rem = do_div(t, USEC_PER_SEC);
unsigned secs = (unsigned long)t;
int ret;
/* The trailing newline must be in the message. */
ret = trace_seq_printf(s, "MARK %lu.%06lu %s", secs, usec_rem, msg);
ret = trace_seq_printf(s, "MARK %u.%06lu %s", secs, usec_rem, msg);
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;
if (entry->flags & TRACE_FLAG_CONT)
trace_seq_print_cont(s, iter);
return TRACE_TYPE_HANDLED;
}
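/*
 * Sketch of the timestamp split used by the printers above: do_div()
 * divides a u64 in place and returns the remainder, so nanoseconds become
 * "seconds.microseconds" in two steps. example_split_ts() is illustrative.
 */
#include <asm/div64.h>
#include <linux/time.h>
static void example_split_ts(u64 ns, unsigned *secs, unsigned long *usec_rem)
{
	unsigned long long t = ns;
	do_div(t, NSEC_PER_USEC);		/* t is now in microseconds */
	*usec_rem = do_div(t, USEC_PER_SEC);	/* remainder: 0..999999 */
	*secs = (unsigned)t;
}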
@ -308,21 +307,17 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
{
struct ring_buffer_event *event;
struct trace_mmiotrace_rw *entry;
unsigned long irq_flags;
int pc = preempt_count();
event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
&irq_flags);
event = trace_buffer_lock_reserve(tr, TRACE_MMIO_RW,
sizeof(*entry), 0, pc);
if (!event) {
atomic_inc(&dropped_count);
return;
}
entry = ring_buffer_event_data(event);
tracing_generic_entry_update(&entry->ent, 0, preempt_count());
entry->ent.type = TRACE_MMIO_RW;
entry->rw = *rw;
ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
trace_wake_up();
trace_buffer_unlock_commit(tr, event, 0, pc);
}
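/*
 * Skeleton of the reserve/fill/commit sequence both mmiotrace helpers are
 * converted to above. TRACE_EXAMPLE and struct example_entry are
 * hypothetical; the trace_buffer_* helpers are the ones the hunks use.
 */
struct example_entry {
	struct trace_entry	ent;	/* common fields, filled at reserve */
	u64			payload;
};
static void example_emit(struct trace_array *tr, u64 payload)
{
	struct ring_buffer_event *event;
	struct example_entry *entry;
	int pc = preempt_count();
	event = trace_buffer_lock_reserve(tr, TRACE_EXAMPLE,
					  sizeof(*entry), 0, pc);
	if (!event)
		return;			/* buffer full: event is dropped */
	entry = ring_buffer_event_data(event);
	entry->payload = payload;
	trace_buffer_unlock_commit(tr, event, 0, pc);
}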
void mmio_trace_rw(struct mmiotrace_rw *rw)
@ -338,21 +333,17 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
{
struct ring_buffer_event *event;
struct trace_mmiotrace_map *entry;
unsigned long irq_flags;
int pc = preempt_count();
event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
&irq_flags);
event = trace_buffer_lock_reserve(tr, TRACE_MMIO_MAP,
sizeof(*entry), 0, pc);
if (!event) {
atomic_inc(&dropped_count);
return;
}
entry = ring_buffer_event_data(event);
tracing_generic_entry_update(&entry->ent, 0, preempt_count());
entry->ent.type = TRACE_MMIO_MAP;
entry->map = *map;
ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
trace_wake_up();
trace_buffer_unlock_commit(tr, event, 0, pc);
}
void mmio_trace_mapping(struct mmiotrace_map *map)
@ -368,5 +359,5 @@ void mmio_trace_mapping(struct mmiotrace_map *map)
int mmio_trace_printk(const char *fmt, va_list args)
{
return trace_vprintk(0, -1, fmt, args);
return trace_vprintk(0, fmt, args);
}

View file

@ -47,12 +47,7 @@ static void stop_nop_trace(struct trace_array *tr)
static int nop_trace_init(struct trace_array *tr)
{
int cpu;
ctx_trace = tr;
for_each_online_cpu(cpu)
tracing_reset(tr, cpu);
start_nop_trace(tr);
return 0;
}
@ -96,6 +91,7 @@ struct tracer nop_trace __read_mostly =
.name = "nop",
.init = nop_trace_init,
.reset = nop_trace_reset,
.wait_pipe = poll_wait_pipe,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_nop,
#endif

kernel/trace/trace_output.c (new file, 1017 lines)

File diff suppressed because it is too large

Some files were not shown because too many files have changed in this diff.