aboutsummaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile2
-rw-r--r--kernel/audit.c43
-rw-r--r--kernel/audit.h2
-rw-r--r--kernel/audit_tree.c2
-rw-r--r--kernel/audit_watch.c2
-rw-r--r--kernel/auditfilter.c10
-rw-r--r--kernel/cgroup.c71
-rw-r--r--kernel/cpu/Makefile1
-rw-r--r--kernel/cpuset.c10
-rw-r--r--kernel/events/core.c51
-rw-r--r--kernel/fork.c1
-rw-r--r--kernel/futex.c90
-rw-r--r--kernel/irq/irqdomain.c1
-rw-r--r--kernel/irq/manage.c3
-rw-r--r--kernel/irq_work.c6
-rw-r--r--kernel/ksysfs.c2
-rw-r--r--kernel/locking/Makefile3
-rw-r--r--kernel/locking/lockdep.c17
-rw-r--r--kernel/locking/locktorture.c452
-rw-r--r--kernel/locking/mcs_spinlock.c178
-rw-r--r--kernel/locking/mcs_spinlock.h129
-rw-r--r--kernel/locking/mutex-debug.c6
-rw-r--r--kernel/locking/mutex.c94
-rw-r--r--kernel/locking/rtmutex.c12
-rw-r--r--kernel/notifier.c2
-rw-r--r--kernel/power/console.c1
-rw-r--r--kernel/printk/printk.c2
-rw-r--r--kernel/profile.c4
-rw-r--r--kernel/rcu/Makefile2
-rw-r--r--kernel/rcu/rcu.h7
-rw-r--r--kernel/rcu/rcutorture.c (renamed from kernel/rcu/torture.c)1004
-rw-r--r--kernel/rcu/srcu.c11
-rw-r--r--kernel/rcu/tiny.c8
-rw-r--r--kernel/rcu/tiny_plugin.h4
-rw-r--r--kernel/rcu/tree.c80
-rw-r--r--kernel/rcu/tree.h4
-rw-r--r--kernel/rcu/tree_plugin.h19
-rw-r--r--kernel/rcu/tree_trace.c6
-rw-r--r--kernel/rcu/update.c5
-rw-r--r--kernel/sched/Makefile2
-rw-r--r--kernel/sched/auto_group.c2
-rw-r--r--kernel/sched/clock.c4
-rw-r--r--kernel/sched/core.c244
-rw-r--r--kernel/sched/cpudeadline.c6
-rw-r--r--kernel/sched/cputime.c4
-rw-r--r--kernel/sched/deadline.c76
-rw-r--r--kernel/sched/debug.c7
-rw-r--r--kernel/sched/fair.c610
-rw-r--r--kernel/sched/idle.c (renamed from kernel/cpu/idle.c)7
-rw-r--r--kernel/sched/idle_task.c25
-rw-r--r--kernel/sched/rt.c110
-rw-r--r--kernel/sched/sched.h66
-rw-r--r--kernel/sched/stop_task.c15
-rw-r--r--kernel/stop_machine.c2
-rw-r--r--kernel/sys.c8
-rw-r--r--kernel/sysctl.c7
-rw-r--r--kernel/time/sched_clock.c46
-rw-r--r--kernel/time/timekeeping.c3
-rw-r--r--kernel/torture.c719
-rw-r--r--kernel/trace/ring_buffer_benchmark.c6
-rw-r--r--kernel/trace/trace.c27
-rw-r--r--kernel/trace/trace_event_perf.c22
-rw-r--r--kernel/trace/trace_events.c16
-rw-r--r--kernel/trace/trace_export.c7
-rw-r--r--kernel/tracepoint.c7
-rw-r--r--kernel/user_namespace.c2
-rw-r--r--kernel/workqueue.c9
67 files changed, 2976 insertions, 1430 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index bc010ee272b..4fd847488b7 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -22,7 +22,6 @@ obj-y += sched/
obj-y += locking/
obj-y += power/
obj-y += printk/
-obj-y += cpu/
obj-y += irq/
obj-y += rcu/
@@ -93,6 +92,7 @@ obj-$(CONFIG_PADATA) += padata.o
obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
obj-$(CONFIG_JUMP_LABEL) += jump_label.o
obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o
+obj-$(CONFIG_TORTURE_TEST) += torture.o
$(obj)/configs.o: $(obj)/config_data.h
diff --git a/kernel/audit.c b/kernel/audit.c
index 34c5a2310fb..95a20f3f52f 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -182,7 +182,7 @@ struct audit_buffer {
struct audit_reply {
__u32 portid;
- pid_t pid;
+ struct net *net;
struct sk_buff *skb;
};
@@ -500,7 +500,7 @@ int audit_send_list(void *_dest)
{
struct audit_netlink_list *dest = _dest;
struct sk_buff *skb;
- struct net *net = get_net_ns_by_pid(dest->pid);
+ struct net *net = dest->net;
struct audit_net *aunet = net_generic(net, audit_net_id);
/* wait for parent to finish and send an ACK */
@@ -510,6 +510,7 @@ int audit_send_list(void *_dest)
while ((skb = __skb_dequeue(&dest->q)) != NULL)
netlink_unicast(aunet->nlsk, skb, dest->portid, 0);
+ put_net(net);
kfree(dest);
return 0;
@@ -543,7 +544,7 @@ out_kfree_skb:
static int audit_send_reply_thread(void *arg)
{
struct audit_reply *reply = (struct audit_reply *)arg;
- struct net *net = get_net_ns_by_pid(reply->pid);
+ struct net *net = reply->net;
struct audit_net *aunet = net_generic(net, audit_net_id);
mutex_lock(&audit_cmd_mutex);
@@ -552,12 +553,13 @@ static int audit_send_reply_thread(void *arg)
/* Ignore failure. It'll only happen if the sender goes away,
because our timeout is set to infinite. */
netlink_unicast(aunet->nlsk , reply->skb, reply->portid, 0);
+ put_net(net);
kfree(reply);
return 0;
}
/**
* audit_send_reply - send an audit reply message via netlink
- * @portid: netlink port to which to send reply
+ * @request_skb: skb of request we are replying to (used to target the reply)
* @seq: sequence number
* @type: audit message type
* @done: done (last) flag
@@ -568,9 +570,11 @@ static int audit_send_reply_thread(void *arg)
* Allocates an skb, builds the netlink message, and sends it to the port id.
* No failure notifications.
*/
-static void audit_send_reply(__u32 portid, int seq, int type, int done,
+static void audit_send_reply(struct sk_buff *request_skb, int seq, int type, int done,
int multi, const void *payload, int size)
{
+ u32 portid = NETLINK_CB(request_skb).portid;
+ struct net *net = sock_net(NETLINK_CB(request_skb).sk);
struct sk_buff *skb;
struct task_struct *tsk;
struct audit_reply *reply = kmalloc(sizeof(struct audit_reply),
@@ -583,8 +587,8 @@ static void audit_send_reply(__u32 portid, int seq, int type, int done,
if (!skb)
goto out;
+ reply->net = get_net(net);
reply->portid = portid;
- reply->pid = task_pid_vnr(current);
reply->skb = skb;
tsk = kthread_run(audit_send_reply_thread, reply, "audit_send_reply");
@@ -604,9 +608,19 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
int err = 0;
/* Only support the initial namespaces for now. */
+ /*
+ * We return ECONNREFUSED because it tricks userspace into thinking
+ * that audit was not configured into the kernel. Lots of users
+ * configure their PAM stack (because that's what the distro does)
+ * to reject login if unable to send messages to audit. If we return
+ * ECONNREFUSED the PAM stack thinks the kernel does not have audit
+ * configured in and will let login proceed. If we return EPERM
+ * userspace will reject all logins. This should be removed when we
+ * support non init namespaces!!
+ */
if ((current_user_ns() != &init_user_ns) ||
(task_active_pid_ns(current) != &init_pid_ns))
- return -EPERM;
+ return -ECONNREFUSED;
switch (msg_type) {
case AUDIT_LIST:
@@ -673,8 +687,7 @@ static int audit_get_feature(struct sk_buff *skb)
seq = nlmsg_hdr(skb)->nlmsg_seq;
- audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_GET, 0, 0,
- &af, sizeof(af));
+ audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &af, sizeof(af));
return 0;
}
@@ -794,8 +807,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
s.backlog = skb_queue_len(&audit_skb_queue);
s.version = AUDIT_VERSION_LATEST;
s.backlog_wait_time = audit_backlog_wait_time;
- audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_GET, 0, 0,
- &s, sizeof(s));
+ audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &s, sizeof(s));
break;
}
case AUDIT_SET: {
@@ -905,7 +917,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
seq, data, nlmsg_len(nlh));
break;
case AUDIT_LIST_RULES:
- err = audit_list_rules_send(NETLINK_CB(skb).portid, seq);
+ err = audit_list_rules_send(skb, seq);
break;
case AUDIT_TRIM:
audit_trim_trees();
@@ -970,8 +982,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
memcpy(sig_data->ctx, ctx, len);
security_release_secctx(ctx, len);
}
- audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_SIGNAL_INFO,
- 0, 0, sig_data, sizeof(*sig_data) + len);
+ audit_send_reply(skb, seq, AUDIT_SIGNAL_INFO, 0, 0,
+ sig_data, sizeof(*sig_data) + len);
kfree(sig_data);
break;
case AUDIT_TTY_GET: {
@@ -983,8 +995,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
s.log_passwd = tsk->signal->audit_tty_log_passwd;
spin_unlock(&tsk->sighand->siglock);
- audit_send_reply(NETLINK_CB(skb).portid, seq,
- AUDIT_TTY_GET, 0, 0, &s, sizeof(s));
+ audit_send_reply(skb, seq, AUDIT_TTY_GET, 0, 0, &s, sizeof(s));
break;
}
case AUDIT_TTY_SET: {
diff --git a/kernel/audit.h b/kernel/audit.h
index 57cc64d6771..8df13221460 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -247,7 +247,7 @@ extern void audit_panic(const char *message);
struct audit_netlink_list {
__u32 portid;
- pid_t pid;
+ struct net *net;
struct sk_buff_head q;
};
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 67ccf0e7cca..135944a7b28 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -916,7 +916,7 @@ static int audit_tree_handle_event(struct fsnotify_group *group,
struct fsnotify_mark *inode_mark,
struct fsnotify_mark *vfsmount_mark,
u32 mask, void *data, int data_type,
- const unsigned char *file_name)
+ const unsigned char *file_name, u32 cookie)
{
return 0;
}
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 2596fac5dcb..70b4554d2fb 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -471,7 +471,7 @@ static int audit_watch_handle_event(struct fsnotify_group *group,
struct fsnotify_mark *inode_mark,
struct fsnotify_mark *vfsmount_mark,
u32 mask, void *data, int data_type,
- const unsigned char *dname)
+ const unsigned char *dname, u32 cookie)
{
struct inode *inode;
struct audit_parent *parent;
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 14a78cca384..92062fd6cc8 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -29,6 +29,8 @@
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/security.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
#include "audit.h"
/*
@@ -1065,11 +1067,13 @@ int audit_rule_change(int type, __u32 portid, int seq, void *data,
/**
* audit_list_rules_send - list the audit rules
- * @portid: target portid for netlink audit messages
+ * @request_skb: skb of request we are replying to (used to target the reply)
* @seq: netlink audit message sequence (serial) number
*/
-int audit_list_rules_send(__u32 portid, int seq)
+int audit_list_rules_send(struct sk_buff *request_skb, int seq)
{
+ u32 portid = NETLINK_CB(request_skb).portid;
+ struct net *net = sock_net(NETLINK_CB(request_skb).sk);
struct task_struct *tsk;
struct audit_netlink_list *dest;
int err = 0;
@@ -1083,8 +1087,8 @@ int audit_list_rules_send(__u32 portid, int seq)
dest = kmalloc(sizeof(struct audit_netlink_list), GFP_KERNEL);
if (!dest)
return -ENOMEM;
+ dest->net = get_net(net);
dest->portid = portid;
- dest->pid = task_pid_vnr(current);
skb_queue_head_init(&dest->q);
mutex_lock(&audit_filter_mutex);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index e2f46ba37f7..0c753ddd223 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -886,7 +886,9 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
* per-subsystem and moved to css->id so that lookups are
* successful until the target css is released.
*/
+ mutex_lock(&cgroup_mutex);
idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
+ mutex_unlock(&cgroup_mutex);
cgrp->id = -1;
call_rcu(&cgrp->rcu_head, cgroup_free_rcu);
@@ -1566,10 +1568,10 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
mutex_lock(&cgroup_mutex);
mutex_lock(&cgroup_root_mutex);
- root_cgrp->id = idr_alloc(&root->cgroup_idr, root_cgrp,
- 0, 1, GFP_KERNEL);
- if (root_cgrp->id < 0)
+ ret = idr_alloc(&root->cgroup_idr, root_cgrp, 0, 1, GFP_KERNEL);
+ if (ret < 0)
goto unlock_drop;
+ root_cgrp->id = ret;
/* Check for name clashes with existing mounts */
ret = -EBUSY;
@@ -2763,10 +2765,7 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add)
*/
update_before = cgroup_serial_nr_next;
- mutex_unlock(&cgroup_mutex);
-
/* add/rm files for all cgroups created before */
- rcu_read_lock();
css_for_each_descendant_pre(css, cgroup_css(root, ss)) {
struct cgroup *cgrp = css->cgroup;
@@ -2775,23 +2774,19 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add)
inode = cgrp->dentry->d_inode;
dget(cgrp->dentry);
- rcu_read_unlock();
-
dput(prev);
prev = cgrp->dentry;
+ mutex_unlock(&cgroup_mutex);
mutex_lock(&inode->i_mutex);
mutex_lock(&cgroup_mutex);
if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp))
ret = cgroup_addrm_files(cgrp, cfts, is_add);
- mutex_unlock(&cgroup_mutex);
mutex_unlock(&inode->i_mutex);
-
- rcu_read_lock();
if (ret)
break;
}
- rcu_read_unlock();
+ mutex_unlock(&cgroup_mutex);
dput(prev);
deactivate_super(sb);
return ret;
@@ -2910,9 +2905,14 @@ static void cgroup_enable_task_cg_lists(void)
* We should check if the process is exiting, otherwise
* it will race with cgroup_exit() in that the list
* entry won't be deleted though the process has exited.
+ * Do it while holding siglock so that we don't end up
+ * racing against cgroup_exit().
*/
+ spin_lock_irq(&p->sighand->siglock);
if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list))
list_add(&p->cg_list, &task_css_set(p)->tasks);
+ spin_unlock_irq(&p->sighand->siglock);
+
task_unlock(p);
} while_each_thread(g, p);
read_unlock(&tasklist_lock);
@@ -4112,17 +4112,17 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
err = percpu_ref_init(&css->refcnt, css_release);
if (err)
- goto err_free;
+ goto err_free_css;
init_css(css, ss, cgrp);
err = cgroup_populate_dir(cgrp, 1 << ss->subsys_id);
if (err)
- goto err_free;
+ goto err_free_percpu_ref;
err = online_css(css);
if (err)
- goto err_free;
+ goto err_clear_dir;
dget(cgrp->dentry);
css_get(css->parent);
@@ -4138,8 +4138,11 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
return 0;
-err_free:
+err_clear_dir:
+ cgroup_clear_dir(css->cgroup, 1 << css->ss->subsys_id);
+err_free_percpu_ref:
percpu_ref_cancel_init(&css->refcnt);
+err_free_css:
ss->css_free(css);
return err;
}
@@ -4158,7 +4161,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
struct cgroup *cgrp;
struct cgroup_name *name;
struct cgroupfs_root *root = parent->root;
- int ssid, err = 0;
+ int ssid, err;
struct cgroup_subsys *ss;
struct super_block *sb = root->sb;
@@ -4168,19 +4171,13 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
return -ENOMEM;
name = cgroup_alloc_name(dentry);
- if (!name)
+ if (!name) {
+ err = -ENOMEM;
goto err_free_cgrp;
+ }
rcu_assign_pointer(cgrp->name, name);
/*
- * Temporarily set the pointer to NULL, so idr_find() won't return
- * a half-baked cgroup.
- */
- cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL);
- if (cgrp->id < 0)
- goto err_free_name;
-
- /*
* Only live parents can have children. Note that the liveliness
* check isn't strictly necessary because cgroup_mkdir() and
* cgroup_rmdir() are fully synchronized by i_mutex; however, do it
@@ -4189,7 +4186,17 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
*/
if (!cgroup_lock_live_group(parent)) {
err = -ENODEV;
- goto err_free_id;
+ goto err_free_name;
+ }
+
+ /*
+ * Temporarily set the pointer to NULL, so idr_find() won't return
+ * a half-baked cgroup.
+ */
+ cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL);
+ if (cgrp->id < 0) {
+ err = -ENOMEM;
+ goto err_unlock;
}
/* Grab a reference on the superblock so the hierarchy doesn't
@@ -4221,7 +4228,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
*/
err = cgroup_create_file(dentry, S_IFDIR | mode, sb);
if (err < 0)
- goto err_unlock;
+ goto err_free_id;
lockdep_assert_held(&dentry->d_inode->i_mutex);
cgrp->serial_nr = cgroup_serial_nr_next++;
@@ -4257,12 +4264,12 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
return 0;
-err_unlock:
- mutex_unlock(&cgroup_mutex);
- /* Release the reference count that we took on the superblock */
- deactivate_super(sb);
err_free_id:
idr_remove(&root->cgroup_idr, cgrp->id);
+ /* Release the reference count that we took on the superblock */
+ deactivate_super(sb);
+err_unlock:
+ mutex_unlock(&cgroup_mutex);
err_free_name:
kfree(rcu_dereference_raw(cgrp->name));
err_free_cgrp:
diff --git a/kernel/cpu/Makefile b/kernel/cpu/Makefile
deleted file mode 100644
index 59ab052ef7a..00000000000
--- a/kernel/cpu/Makefile
+++ /dev/null
@@ -1 +0,0 @@
-obj-y = idle.o
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 4410ac6a55f..e6b1b66afe5 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -974,12 +974,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
* Temporarilly set tasks mems_allowed to target nodes of migration,
* so that the migration code can allocate pages on these nodes.
*
- * Call holding cpuset_mutex, so current's cpuset won't change
- * during this call, as manage_mutex holds off any cpuset_attach()
- * calls. Therefore we don't need to take task_lock around the
- * call to guarantee_online_mems(), as we know no one is changing
- * our task's cpuset.
- *
* While the mm_struct we are migrating is typically from some
* other task, the task_struct mems_allowed that we are hacking
* is for our current task, which must allocate new pages for that
@@ -996,8 +990,10 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
+ rcu_read_lock();
mems_cs = effective_nodemask_cpuset(task_cs(tsk));
guarantee_online_mems(mems_cs, &tsk->mems_allowed);
+ rcu_read_unlock();
}
/*
@@ -2486,9 +2482,9 @@ int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
task_lock(current);
cs = nearest_hardwall_ancestor(task_cs(current));
+ allowed = node_isset(node, cs->mems_allowed);
task_unlock(current);
- allowed = node_isset(node, cs->mems_allowed);
mutex_unlock(&callback_mutex);
return allowed;
}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 56003c6edfd..661951ab8ae 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -231,11 +231,29 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
#define NR_ACCUMULATED_SAMPLES 128
static DEFINE_PER_CPU(u64, running_sample_length);
-void perf_sample_event_took(u64 sample_len_ns)
+static void perf_duration_warn(struct irq_work *w)
{
+ u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
u64 avg_local_sample_len;
u64 local_samples_len;
+
+ local_samples_len = __get_cpu_var(running_sample_length);
+ avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
+
+ printk_ratelimited(KERN_WARNING
+ "perf interrupt took too long (%lld > %lld), lowering "
+ "kernel.perf_event_max_sample_rate to %d\n",
+ avg_local_sample_len, allowed_ns >> 1,
+ sysctl_perf_event_sample_rate);
+}
+
+static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn);
+
+void perf_sample_event_took(u64 sample_len_ns)
+{
u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
+ u64 avg_local_sample_len;
+ u64 local_samples_len;
if (allowed_ns == 0)
return;
@@ -263,13 +281,14 @@ void perf_sample_event_took(u64 sample_len_ns)
sysctl_perf_event_sample_rate = max_samples_per_tick * HZ;
perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
- printk_ratelimited(KERN_WARNING
- "perf samples too long (%lld > %lld), lowering "
- "kernel.perf_event_max_sample_rate to %d\n",
- avg_local_sample_len, allowed_ns,
- sysctl_perf_event_sample_rate);
-
update_perf_cpu_limits();
+
+ if (!irq_work_queue(&perf_duration_work)) {
+ early_printk("perf interrupt took too long (%lld > %lld), lowering "
+ "kernel.perf_event_max_sample_rate to %d\n",
+ avg_local_sample_len, allowed_ns >> 1,
+ sysctl_perf_event_sample_rate);
+ }
}
static atomic64_t perf_event_id;
@@ -1714,7 +1733,7 @@ group_sched_in(struct perf_event *group_event,
struct perf_event_context *ctx)
{
struct perf_event *event, *partial_group = NULL;
- struct pmu *pmu = group_event->pmu;
+ struct pmu *pmu = ctx->pmu;
u64 now = ctx->time;
bool simulate = false;
@@ -2563,8 +2582,6 @@ static void perf_branch_stack_sched_in(struct task_struct *prev,
if (cpuctx->ctx.nr_branch_stack > 0
&& pmu->flush_branch_stack) {
- pmu = cpuctx->ctx.pmu;
-
perf_ctx_lock(cpuctx, cpuctx->task_ctx);
perf_pmu_disable(pmu);
@@ -6294,7 +6311,7 @@ static int perf_event_idx_default(struct perf_event *event)
* Ensures all contexts with the same task_ctx_nr have the same
* pmu_cpu_context too.
*/
-static void *find_pmu_context(int ctxn)
+static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
{
struct pmu *pmu;
@@ -7856,14 +7873,14 @@ static void perf_pmu_rotate_stop(struct pmu *pmu)
static void __perf_event_exit_context(void *__info)
{
struct perf_event_context *ctx = __info;
- struct perf_event *event, *tmp;
+ struct perf_event *event;
perf_pmu_rotate_stop(ctx->pmu);
- list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
- __perf_remove_from_context(event);
- list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry)
+ rcu_read_lock();
+ list_for_each_entry_rcu(event, &ctx->event_list, event_entry)
__perf_remove_from_context(event);
+ rcu_read_unlock();
}
static void perf_event_exit_cpu_context(int cpu)
@@ -7887,11 +7904,11 @@ static void perf_event_exit_cpu(int cpu)
{
struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
+ perf_event_exit_cpu_context(cpu);
+
mutex_lock(&swhash->hlist_mutex);
swevent_hlist_release(swhash);
mutex_unlock(&swhash->hlist_mutex);
-
- perf_event_exit_cpu_context(cpu);
}
#else
static inline void perf_event_exit_cpu(int cpu) { }
diff --git a/kernel/fork.c b/kernel/fork.c
index a17621c6cd4..332688e5e7b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -237,6 +237,7 @@ void __put_task_struct(struct task_struct *tsk)
WARN_ON(atomic_read(&tsk->usage));
WARN_ON(tsk == current);
+ task_numa_free(tsk);
security_task_free(tsk);
exit_creds(tsk);
delayacct_tsk_free(tsk);
diff --git a/kernel/futex.c b/kernel/futex.c
index 44a1261cb9f..67dacaf93e5 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -157,7 +157,9 @@
* enqueue.
*/
+#ifndef CONFIG_HAVE_FUTEX_CMPXCHG
int __read_mostly futex_cmpxchg_enabled;
+#endif
/*
* Futex flags used to encode options to functions and preserve them across
@@ -234,6 +236,7 @@ static const struct futex_q futex_q_init = {
* waiting on a futex.
*/
struct futex_hash_bucket {
+ atomic_t waiters;
spinlock_t lock;
struct plist_head chain;
} ____cacheline_aligned_in_smp;
@@ -253,22 +256,37 @@ static inline void futex_get_mm(union futex_key *key)
smp_mb__after_atomic_inc();
}
-static inline bool hb_waiters_pending(struct futex_hash_bucket *hb)
+/*
+ * Reflects a new waiter being added to the waitqueue.
+ */
+static inline void hb_waiters_inc(struct futex_hash_bucket *hb)
{
#ifdef CONFIG_SMP
+ atomic_inc(&hb->waiters);
/*
- * Tasks trying to enter the critical region are most likely
- * potential waiters that will be added to the plist. Ensure
- * that wakers won't miss to-be-slept tasks in the window between
- * the wait call and the actual plist_add.
+ * Full barrier (A), see the ordering comment above.
*/
- if (spin_is_locked(&hb->lock))
- return true;
- smp_rmb(); /* Make sure we check the lock state first */
+ smp_mb__after_atomic_inc();
+#endif
+}
+
+/*
+ * Reflects a waiter being removed from the waitqueue by wakeup
+ * paths.
+ */
+static inline void hb_waiters_dec(struct futex_hash_bucket *hb)
+{
+#ifdef CONFIG_SMP
+ atomic_dec(&hb->waiters);
+#endif
+}
- return !plist_head_empty(&hb->chain);
+static inline int hb_waiters_pending(struct futex_hash_bucket *hb)
+{
+#ifdef CONFIG_SMP
+ return atomic_read(&hb->waiters);
#else
- return true;
+ return 1;
#endif
}
@@ -954,6 +972,7 @@ static void __unqueue_futex(struct futex_q *q)
hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
plist_del(&q->list, &hb->chain);
+ hb_waiters_dec(hb);
}
/*
@@ -1257,7 +1276,9 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
*/
if (likely(&hb1->chain != &hb2->chain)) {
plist_del(&q->list, &hb1->chain);
+ hb_waiters_dec(hb1);
plist_add(&q->list, &hb2->chain);
+ hb_waiters_inc(hb2);
q->lock_ptr = &hb2->lock;
}
get_futex_key_refs(key2);
@@ -1600,6 +1621,17 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
struct futex_hash_bucket *hb;
hb = hash_futex(&q->key);
+
+ /*
+ * Increment the counter before taking the lock so that
+ * a potential waker won't miss a to-be-slept task that is
+ * waiting for the spinlock. This is safe as all queue_lock()
+ * users end up calling queue_me(). Similarly, for housekeeping,
+ * decrement the counter at queue_unlock() when some error has
+ * occurred and we don't end up adding the task to the list.
+ */
+ hb_waiters_inc(hb);
+
q->lock_ptr = &hb->lock;
spin_lock(&hb->lock); /* implies MB (A) */
@@ -1611,6 +1643,7 @@ queue_unlock(struct futex_hash_bucket *hb)
__releases(&hb->lock)
{
spin_unlock(&hb->lock);
+ hb_waiters_dec(hb);
}
/**
@@ -2342,6 +2375,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
* Unqueue the futex_q and determine which it was.
*/
plist_del(&q->list, &hb->chain);
+ hb_waiters_dec(hb);
/* Handle spurious wakeups gracefully */
ret = -EWOULDBLOCK;
@@ -2843,9 +2877,28 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
}
-static int __init futex_init(void)
+static void __init futex_detect_cmpxchg(void)
{
+#ifndef CONFIG_HAVE_FUTEX_CMPXCHG
u32 curval;
+
+ /*
+ * This will fail and we want it. Some arch implementations do
+ * runtime detection of the futex_atomic_cmpxchg_inatomic()
+ * functionality. We want to know that before we call in any
+ * of the complex code paths. Also we want to prevent
+ * registration of robust lists in that case. NULL is
+ * guaranteed to fault and we get -EFAULT on functional
+ * implementation, the non-functional ones will return
+ * -ENOSYS.
+ */
+ if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT)
+ futex_cmpxchg_enabled = 1;
+#endif
+}
+
+static int __init futex_init(void)
+{
unsigned int futex_shift;
unsigned long i;
@@ -2861,20 +2914,11 @@ static int __init futex_init(void)
&futex_shift, NULL,
futex_hashsize, futex_hashsize);
futex_hashsize = 1UL << futex_shift;
- /*
- * This will fail and we want it. Some arch implementations do
- * runtime detection of the futex_atomic_cmpxchg_inatomic()
- * functionality. We want to know that before we call in any
- * of the complex code paths. Also we want to prevent
- * registration of robust lists in that case. NULL is
- * guaranteed to fault and we get -EFAULT on functional
- * implementation, the non-functional ones will return
- * -ENOSYS.
- */
- if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT)
- futex_cmpxchg_enabled = 1;
+
+ futex_detect_cmpxchg();
for (i = 0; i < futex_hashsize; i++) {
+ atomic_set(&futex_queues[i].waiters, 0);
plist_head_init(&futex_queues[i].chain);
spin_lock_init(&futex_queues[i].lock);
}
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index cf68bb36fe5..f14033700c2 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -10,6 +10,7 @@
#include <linux/mutex.h>
#include <linux/of.h>
#include <linux/of_address.h>
+#include <linux/of_irq.h>
#include <linux/topology.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 481a13c43b1..d3bf660cb57 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -802,8 +802,7 @@ static irqreturn_t irq_thread_fn(struct irq_desc *desc,
static void wake_threads_waitq(struct irq_desc *desc)
{
- if (atomic_dec_and_test(&desc->threads_active) &&
- waitqueue_active(&desc->wait_for_threads))
+ if (atomic_dec_and_test(&desc->threads_active))
wake_up(&desc->wait_for_threads);
}
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index 55fcce6065c..a82170e2fa7 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -61,11 +61,11 @@ void __weak arch_irq_work_raise(void)
*
* Can be re-enqueued while the callback is still in progress.
*/
-void irq_work_queue(struct irq_work *work)
+bool irq_work_queue(struct irq_work *work)
{
/* Only queue if not already pending */
if (!irq_work_claim(work))
- return;
+ return false;
/* Queue the entry and raise the IPI if needed. */
preempt_disable();
@@ -83,6 +83,8 @@ void irq_work_queue(struct irq_work *work)
}
preempt_enable();
+
+ return true;
}
EXPORT_SYMBOL_GPL(irq_work_queue);
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index d945a949760..e660964086e 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -19,6 +19,8 @@
#include <linux/sched.h>
#include <linux/capability.h>
+#include <linux/rcupdate.h> /* rcu_expedited */
+
#define KERNEL_ATTR_RO(_name) \
static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index baab8e5e7f6..306a76b51e0 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -1,5 +1,5 @@
-obj-y += mutex.o semaphore.o rwsem.o lglock.o
+obj-y += mutex.o semaphore.o rwsem.o lglock.o mcs_spinlock.o
ifdef CONFIG_FUNCTION_TRACER
CFLAGS_REMOVE_lockdep.o = -pg
@@ -23,3 +23,4 @@ obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o
obj-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o
+obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index eb8a54783fa..bf0c6b0dd9c 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -1936,12 +1936,12 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
for (;;) {
int distance = curr->lockdep_depth - depth + 1;
- hlock = curr->held_locks + depth-1;
+ hlock = curr->held_locks + depth - 1;
/*
* Only non-recursive-read entries get new dependencies
* added:
*/
- if (hlock->read != 2) {
+ if (hlock->read != 2 && hlock->check) {
if (!check_prev_add(curr, hlock, next,
distance, trylock_loop))
return 0;
@@ -2098,7 +2098,7 @@ static int validate_chain(struct task_struct *curr, struct lockdep_map *lock,
* (If lookup_chain_cache() returns with 1 it acquires
* graph_lock for us)
*/
- if (!hlock->trylock && (hlock->check == 2) &&
+ if (!hlock->trylock && hlock->check &&
lookup_chain_cache(curr, hlock, chain_key)) {
/*
* Check whether last held lock:
@@ -2517,7 +2517,7 @@ mark_held_locks(struct task_struct *curr, enum mark_type mark)
BUG_ON(usage_bit >= LOCK_USAGE_STATES);
- if (hlock_class(hlock)->key == __lockdep_no_validate__.subkeys)
+ if (!hlock->check)
continue;
if (!mark_lock(curr, hlock, usage_bit))
@@ -3055,9 +3055,6 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
int class_idx;
u64 chain_key;
- if (!prove_locking)
- check = 1;
-
if (unlikely(!debug_locks))
return 0;
@@ -3069,8 +3066,8 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
return 0;
- if (lock->key == &__lockdep_no_validate__)
- check = 1;
+ if (!prove_locking || lock->key == &__lockdep_no_validate__)
+ check = 0;
if (subclass < NR_LOCKDEP_CACHING_CLASSES)
class = lock->class_cache[subclass];
@@ -3138,7 +3135,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
hlock->holdtime_stamp = lockstat_clock();
#endif
- if (check == 2 && !mark_irqflags(curr, hlock))
+ if (check && !mark_irqflags(curr, hlock))
return 0;
/* mark it as used: */
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
new file mode 100644
index 00000000000..f26b1a18e34
--- /dev/null
+++ b/kernel/locking/locktorture.c
@@ -0,0 +1,452 @@
+/*
+ * Module-based torture test facility for locking
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
+ *
+ * Copyright (C) IBM Corporation, 2014
+ *
+ * Author: Paul E. McKenney <paulmck@us.ibm.com>
+ * Based on kernel/rcu/torture.c.
+ */
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include <linux/err.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/interrupt.h>
+#include <linux/sched.h>
+#include <linux/atomic.h>
+#include <linux/bitops.h>
+#include <linux/completion.h>
+#include <linux/moduleparam.h>
+#include <linux/percpu.h>
+#include <linux/notifier.h>
+#include <linux/reboot.h>
+#include <linux/freezer.h>
+#include <linux/cpu.h>
+#include <linux/delay.h>
+#include <linux/stat.h>
+#include <linux/slab.h>
+#include <linux/trace_clock.h>
+#include <asm/byteorder.h>
+#include <linux/torture.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com>");
+
+torture_param(int, nwriters_stress, -1,
+ "Number of write-locking stress-test threads");
+torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)");
+torture_param(int, onoff_interval, 0,
+ "Time between CPU hotplugs (s), 0=disable");
+torture_param(int, shuffle_interval, 3,
+ "Number of jiffies between shuffles, 0=disable");
+torture_param(int, shutdown_secs, 0, "Shutdown time (j), <= zero to disable.");
+torture_param(int, stat_interval, 60,
+ "Number of seconds between stats printk()s");
+torture_param(int, stutter, 5, "Number of jiffies to run/halt test, 0=disable");
+torture_param(bool, verbose, true,
+ "Enable verbose debugging printk()s");
+
+static char *torture_type = "spin_lock";
+module_param(torture_type, charp, 0444);
+MODULE_PARM_DESC(torture_type,
+ "Type of lock to torture (spin_lock, spin_lock_irq, ...)");
+
+static atomic_t n_lock_torture_errors;
+
+static struct task_struct *stats_task;
+static struct task_struct **writer_tasks;
+
+static int nrealwriters_stress;
+static bool lock_is_write_held;
+
+struct lock_writer_stress_stats {
+ long n_write_lock_fail;
+ long n_write_lock_acquired;
+};
+static struct lock_writer_stress_stats *lwsa;
+
+#if defined(MODULE) || defined(CONFIG_LOCK_TORTURE_TEST_RUNNABLE)
+#define LOCKTORTURE_RUNNABLE_INIT 1
+#else
+#define LOCKTORTURE_RUNNABLE_INIT 0
+#endif
+int locktorture_runnable = LOCKTORTURE_RUNNABLE_INIT;
+module_param(locktorture_runnable, int, 0444);
+MODULE_PARM_DESC(locktorture_runnable, "Start locktorture at boot");
+
+/* Forward reference. */
+static void lock_torture_cleanup(void);
+
+/*
+ * Operations vector for selecting different types of tests.
+ */
+struct lock_torture_ops {
+ void (*init)(void);
+ int (*writelock)(void);
+ void (*write_delay)(struct torture_random_state *trsp);
+ void (*writeunlock)(void);
+ unsigned long flags;
+ const char *name;
+};
+
+static struct lock_torture_ops *cur_ops;
+
+/*
+ * Definitions for lock torture testing.
+ */
+
+static int torture_lock_busted_write_lock(void)
+{
+ return 0; /* BUGGY, do not use in real life!!! */
+}
+
+static void torture_lock_busted_write_delay(struct torture_random_state *trsp)
+{
+ const unsigned long longdelay_us = 100;
+
+ /* We want a long delay occasionally to force massive contention. */
+ if (!(torture_random(trsp) %
+ (nrealwriters_stress * 2000 * longdelay_us)))
+ mdelay(longdelay_us);
+#ifdef CONFIG_PREEMPT
+ if (!(torture_random(trsp) % (nrealwriters_stress * 20000)))
+ preempt_schedule(); /* Allow test to be preempted. */
+#endif
+}
+
+static void torture_lock_busted_write_unlock(void)
+{
+ /* BUGGY, do not use in real life!!! */
+}
+
+static struct lock_torture_ops lock_busted_ops = {
+ .writelock = torture_lock_busted_write_lock,
+ .write_delay = torture_lock_busted_write_delay,
+ .writeunlock = torture_lock_busted_write_unlock,
+ .name = "lock_busted"
+};
+
+static DEFINE_SPINLOCK(torture_spinlock);
+
+static int torture_spin_lock_write_lock(void) __acquires(torture_spinlock)
+{
+ spin_lock(&torture_spinlock);
+ return 0;
+}
+
+static void torture_spin_lock_write_delay(struct torture_random_state *trsp)
+{
+ const unsigned long shortdelay_us = 2;
+ const unsigned long longdelay_us = 100;
+
+ /* We want a short delay mostly to emulate likely code, and
+ * we want a long delay occasionally to force massive contention.
+ */
+ if (!(torture_random(trsp) %
+ (nrealwriters_stress * 2000 * longdelay_us)))
+ mdelay(longdelay_us);
+ if (!(torture_random(trsp) %
+ (nrealwriters_stress * 2 * shortdelay_us)))
+ udelay(shortdelay_us);
+#ifdef CONFIG_PREEMPT
+ if (!(torture_random(trsp) % (nrealwriters_stress * 20000)))
+ preempt_schedule(); /* Allow test to be preempted. */
+#endif
+}
+
+static void torture_spin_lock_write_unlock(void) __releases(torture_spinlock)
+{
+ spin_unlock(&torture_spinlock);
+}
+
+static struct lock_torture_ops spin_lock_ops = {
+ .writelock = torture_spin_lock_write_lock,
+ .write_delay = torture_spin_lock_write_delay,
+ .writeunlock = torture_spin_lock_write_unlock,
+ .name = "spin_lock"
+};
+
+static int torture_spin_lock_write_lock_irq(void)
+__acquires(torture_spinlock_irq)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&torture_spinlock, flags);
+ cur_ops->flags = flags;
+ return 0;
+}
+
+static void torture_lock_spin_write_unlock_irq(void)
+__releases(torture_spinlock)
+{
+ spin_unlock_irqrestore(&torture_spinlock, cur_ops->flags);
+}
+
+static struct lock_torture_ops spin_lock_irq_ops = {
+ .writelock = torture_spin_lock_write_lock_irq,
+ .write_delay = torture_spin_lock_write_delay,
+ .writeunlock = torture_lock_spin_write_unlock_irq,
+ .name = "spin_lock_irq"
+};
+
+/*
+ * Lock torture writer kthread. Repeatedly acquires and releases
+ * the lock, checking for duplicate acquisitions.
+ */
+static int lock_torture_writer(void *arg)
+{
+ struct lock_writer_stress_stats *lwsp = arg;
+ static DEFINE_TORTURE_RANDOM(rand);
+
+ VERBOSE_TOROUT_STRING("lock_torture_writer task started");
+ set_user_nice(current, 19);
+
+ do {
+ schedule_timeout_uninterruptible(1);
+ cur_ops->writelock();
+ if (WARN_ON_ONCE(lock_is_write_held))
+ lwsp->n_write_lock_fail++;
+ lock_is_write_held = 1;
+ lwsp->n_write_lock_acquired++;
+ cur_ops->write_delay(&rand);
+ lock_is_write_held = 0;
+ cur_ops->writeunlock();
+ stutter_wait("lock_torture_writer");
+ } while (!torture_must_stop());
+ torture_kthread_stopping("lock_torture_writer");
+ return 0;
+}
+
+/*
+ * Create an lock-torture-statistics message in the specified buffer.
+ */
+static void lock_torture_printk(char *page)
+{
+ bool fail = 0;
+ int i;
+ long max = 0;
+ long min = lwsa[0].n_write_lock_acquired;
+ long long sum = 0;
+
+ for (i = 0; i < nrealwriters_stress; i++) {
+ if (lwsa[i].n_write_lock_fail)
+ fail = true;
+ sum += lwsa[i].n_write_lock_acquired;
+ if (max < lwsa[i].n_write_lock_fail)
+ max = lwsa[i].n_write_lock_fail;
+ if (min > lwsa[i].n_write_lock_fail)
+ min = lwsa[i].n_write_lock_fail;
+ }
+ page += sprintf(page, "%s%s ", torture_type, TORTURE_FLAG);
+ page += sprintf(page,
+ "Writes: Total: %lld Max/Min: %ld/%ld %s Fail: %d %s\n",
+ sum, max, min, max / 2 > min ? "???" : "",
+ fail, fail ? "!!!" : "");
+ if (fail)
+ atomic_inc(&n_lock_torture_errors);
+}
+
+/*
+ * Print torture statistics. Caller must ensure that there is only one
+ * call to this function at a given time!!! This is normally accomplished
+ * by relying on the module system to only have one copy of the module
+ * loaded, and then by giving the lock_torture_stats kthread full control
+ * (or the init/cleanup functions when lock_torture_stats thread is not
+ * running).
+ */
+static void lock_torture_stats_print(void)
+{
+ int size = nrealwriters_stress * 200 + 8192;
+ char *buf;
+
+ buf = kmalloc(size, GFP_KERNEL);
+ if (!buf) {
+ pr_err("lock_torture_stats_print: Out of memory, need: %d",
+ size);
+ return;
+ }
+ lock_torture_printk(buf);
+ pr_alert("%s", buf);
+ kfree(buf);
+}
+
+/*
+ * Periodically prints torture statistics, if periodic statistics printing
+ * was specified via the stat_interval module parameter.
+ *
+ * No need to worry about fullstop here, since this one doesn't reference
+ * volatile state or register callbacks.
+ */
+static int lock_torture_stats(void *arg)
+{
+ VERBOSE_TOROUT_STRING("lock_torture_stats task started");
+ do {
+ schedule_timeout_interruptible(stat_interval * HZ);
+ lock_torture_stats_print();
+ torture_shutdown_absorb("lock_torture_stats");
+ } while (!torture_must_stop());
+ torture_kthread_stopping("lock_torture_stats");
+ return 0;
+}
+
+static inline void
+lock_torture_print_module_parms(struct lock_torture_ops *cur_ops,
+ const char *tag)
+{
+ pr_alert("%s" TORTURE_FLAG
+ "--- %s: nwriters_stress=%d stat_interval=%d verbose=%d shuffle_interval=%d stutter=%d shutdown_secs=%d onoff_interval=%d onoff_holdoff=%d\n",
+ torture_type, tag, nrealwriters_stress, stat_interval, verbose,
+ shuffle_interval, stutter, shutdown_secs,
+ onoff_interval, onoff_holdoff);
+}
+
+static void lock_torture_cleanup(void)
+{
+ int i;
+
+ if (torture_cleanup())
+ return;
+
+ if (writer_tasks) {
+ for (i = 0; i < nrealwriters_stress; i++)
+ torture_stop_kthread(lock_torture_writer,
+ writer_tasks[i]);
+ kfree(writer_tasks);
+ writer_tasks = NULL;
+ }
+
+ torture_stop_kthread(lock_torture_stats, stats_task);
+ lock_torture_stats_print(); /* -After- the stats thread is stopped! */
+
+ if (atomic_read(&n_lock_torture_errors))
+ lock_torture_print_module_parms(cur_ops,
+ "End of test: FAILURE");
+ else if (torture_onoff_failures())
+ lock_torture_print_module_parms(cur_ops,
+ "End of test: LOCK_HOTPLUG");
+ else
+ lock_torture_print_module_parms(cur_ops,
+ "End of test: SUCCESS");
+}
+
+static int __init lock_torture_init(void)
+{
+ int i;
+ int firsterr = 0;
+ static struct lock_torture_ops *torture_ops[] = {
+ &lock_busted_ops, &spin_lock_ops, &spin_lock_irq_ops,
+ };
+
+ torture_init_begin(torture_type, verbose, &locktorture_runnable);
+
+ /* Process args and tell the world that the torturer is on the job. */
+ for (i = 0; i < ARRAY_SIZE(torture_ops); i++) {
+ cur_ops = torture_ops[i];
+ if (strcmp(torture_type, cur_ops->name) == 0)
+ break;
+ }
+ if (i == ARRAY_SIZE(torture_ops)) {
+ pr_alert("lock-torture: invalid torture type: \"%s\"\n",
+ torture_type);
+ pr_alert("lock-torture types:");
+ for (i = 0; i < ARRAY_SIZE(torture_ops); i++)
+ pr_alert(" %s", torture_ops[i]->name);
+ pr_alert("\n");
+ torture_init_end();
+ return -EINVAL;
+ }
+ if (cur_ops->init)
+ cur_ops->init(); /* no "goto unwind" prior to this point!!! */
+
+ if (nwriters_stress >= 0)
+ nrealwriters_stress = nwriters_stress;
+ else
+ nrealwriters_stress = 2 * num_online_cpus();
+ lock_torture_print_module_parms(cur_ops, "Start of test");
+
+ /* Initialize the statistics so that each run gets its own numbers. */
+
+ lock_is_write_held = 0;
+ lwsa = kmalloc(sizeof(*lwsa) * nrealwriters_stress, GFP_KERNEL);
+ if (lwsa == NULL) {
+ VERBOSE_TOROUT_STRING("lwsa: Out of memory");
+ firsterr = -ENOMEM;
+ goto unwind;
+ }
+ for (i = 0; i < nrealwriters_stress; i++) {
+ lwsa[i].n_write_lock_fail = 0;
+ lwsa[i].n_write_lock_acquired = 0;
+ }
+
+ /* Start up the kthreads. */
+
+ if (onoff_interval > 0) {
+ firsterr = torture_onoff_init(onoff_holdoff * HZ,
+ onoff_interval * HZ);
+ if (firsterr)
+ goto unwind;
+ }
+ if (shuffle_interval > 0) {
+ firsterr = torture_shuffle_init(shuffle_interval);
+ if (firsterr)
+ goto unwind;
+ }
+ if (shutdown_secs > 0) {
+ firsterr = torture_shutdown_init(shutdown_secs,
+ lock_torture_cleanup);
+ if (firsterr)
+ goto unwind;
+ }
+ if (stutter > 0) {
+ firsterr = torture_stutter_init(stutter);
+ if (firsterr)
+ goto unwind;
+ }
+
+ writer_tasks = kzalloc(nrealwriters_stress * sizeof(writer_tasks[0]),
+ GFP_KERNEL);
+ if (writer_tasks == NULL) {
+ VERBOSE_TOROUT_ERRSTRING("writer_tasks: Out of memory");
+ firsterr = -ENOMEM;
+ goto unwind;
+ }
+ for (i = 0; i < nrealwriters_stress; i++) {
+ firsterr = torture_create_kthread(lock_torture_writer, &lwsa[i],
+ writer_tasks[i]);
+ if (firsterr)
+ goto unwind;
+ }
+ if (stat_interval > 0) {
+ firsterr = torture_create_kthread(lock_torture_stats, NULL,
+ stats_task);
+ if (firsterr)
+ goto unwind;
+ }
+ torture_init_end();
+ return 0;
+
+unwind:
+ torture_init_end();
+ lock_torture_cleanup();
+ return firsterr;
+}
+
+module_init(lock_torture_init);
+module_exit(lock_torture_cleanup);
diff --git a/kernel/locking/mcs_spinlock.c b/kernel/locking/mcs_spinlock.c
new file mode 100644
index 00000000000..838dc9e0066
--- /dev/null
+++ b/kernel/locking/mcs_spinlock.c
@@ -0,0 +1,178 @@
+
+#include <linux/percpu.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include "mcs_spinlock.h"
+
+#ifdef CONFIG_SMP
+
+/*
+ * An MCS like lock especially tailored for optimistic spinning for sleeping
+ * lock implementations (mutex, rwsem, etc).
+ *
+ * Using a single mcs node per CPU is safe because sleeping locks should not be
+ * called from interrupt context and we have preemption disabled while
+ * spinning.
+ */
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_queue, osq_node);
+
+/*
+ * Get a stable @node->next pointer, either for unlock() or unqueue() purposes.
+ * Can return NULL in case we were the last queued and we updated @lock instead.
+ */
+static inline struct optimistic_spin_queue *
+osq_wait_next(struct optimistic_spin_queue **lock,
+ struct optimistic_spin_queue *node,
+ struct optimistic_spin_queue *prev)
+{
+ struct optimistic_spin_queue *next = NULL;
+
+ for (;;) {
+ if (*lock == node && cmpxchg(lock, node, prev) == node) {
+ /*
+ * We were the last queued, we moved @lock back. @prev
+ * will now observe @lock and will complete its
+ * unlock()/unqueue().
+ */
+ break;
+ }
+
+ /*
+ * We must xchg() the @node->next value, because if we were to
+ * leave it in, a concurrent unlock()/unqueue() from
+ * @node->next might complete Step-A and think its @prev is
+ * still valid.
+ *
+ * If the concurrent unlock()/unqueue() wins the race, we'll
+ * wait for either @lock to point to us, through its Step-B, or
+ * wait for a new @node->next from its Step-C.
+ */
+ if (node->next) {
+ next = xchg(&node->next, NULL);
+ if (next)
+ break;
+ }
+
+ arch_mutex_cpu_relax();
+ }
+
+ return next;
+}
+
+bool osq_lock(struct optimistic_spin_queue **lock)
+{
+ struct optimistic_spin_queue *node = this_cpu_ptr(&osq_node);
+ struct optimistic_spin_queue *prev, *next;
+
+ node->locked = 0;
+ node->next = NULL;
+
+ node->prev = prev = xchg(lock, node);
+ if (likely(prev == NULL))
+ return true;
+
+ ACCESS_ONCE(prev->next) = node;
+
+ /*
+ * Normally @prev is untouchable after the above store; because at that
+ * moment unlock can proceed and wipe the node element from stack.
+ *
+ * However, since our nodes are static per-cpu storage, we're
+ * guaranteed their existence -- this allows us to apply
+ * cmpxchg in an attempt to undo our queueing.
+ */
+
+ while (!smp_load_acquire(&node->locked)) {
+ /*
+ * If we need to reschedule bail... so we can block.
+ */
+ if (need_resched())
+ goto unqueue;
+
+ arch_mutex_cpu_relax();
+ }
+ return true;
+
+unqueue:
+ /*
+ * Step - A -- stabilize @prev
+ *
+ * Undo our @prev->next assignment; this will make @prev's
+ * unlock()/unqueue() wait for a next pointer since @lock points to us
+ * (or later).
+ */
+
+ for (;;) {
+ if (prev->next == node &&
+ cmpxchg(&prev->next, node, NULL) == node)
+ break;
+
+ /*
+ * We can only fail the cmpxchg() racing against an unlock(),
+ * in which case we should observe @node->locked becomming
+ * true.
+ */
+ if (smp_load_acquire(&node->locked))
+ return true;
+
+ arch_mutex_cpu_relax();
+
+ /*
+ * Or we race against a concurrent unqueue()'s step-B, in which
+ * case its step-C will write us a new @node->prev pointer.
+ */
+ prev = ACCESS_ONCE(node->prev);
+ }
+
+ /*
+ * Step - B -- stabilize @next
+ *
+ * Similar to unlock(), wait for @node->next or move @lock from @node
+ * back to @prev.
+ */
+
+ next = osq_wait_next(lock, node, prev);
+ if (!next)
+ return false;
+
+ /*
+ * Step - C -- unlink
+ *
+ * @prev is stable because its still waiting for a new @prev->next
+ * pointer, @next is stable because our @node->next pointer is NULL and
+ * it will wait in Step-A.
+ */
+
+ ACCESS_ONCE(next->prev) = prev;
+ ACCESS_ONCE(prev->next) = next;
+
+ return false;
+}
+
+void osq_unlock(struct optimistic_spin_queue **lock)
+{
+ struct optimistic_spin_queue *node = this_cpu_ptr(&osq_node);
+ struct optimistic_spin_queue *next;
+
+ /*
+ * Fast path for the uncontended case.
+ */
+ if (likely(cmpxchg(lock, node, NULL) == node))
+ return;
+
+ /*
+ * Second most likely case.
+ */
+ next = xchg(&node->next, NULL);
+ if (next) {
+ ACCESS_ONCE(next->locked) = 1;
+ return;
+ }
+
+ next = osq_wait_next(lock, node, NULL);
+ if (next)
+ ACCESS_ONCE(next->locked) = 1;
+}
+
+#endif
+
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
new file mode 100644
index 00000000000..a2dbac4aca6
--- /dev/null
+++ b/kernel/locking/mcs_spinlock.h
@@ -0,0 +1,129 @@
+/*
+ * MCS lock defines
+ *
+ * This file contains the main data structure and API definitions of MCS lock.
+ *
+ * The MCS lock (proposed by Mellor-Crummey and Scott) is a simple spin-lock
+ * with the desirable properties of being fair, and with each cpu trying
+ * to acquire the lock spinning on a local variable.
+ * It avoids expensive cache bouncings that common test-and-set spin-lock
+ * implementations incur.
+ */
+#ifndef __LINUX_MCS_SPINLOCK_H
+#define __LINUX_MCS_SPINLOCK_H
+
+#include <asm/mcs_spinlock.h>
+
+struct mcs_spinlock {
+ struct mcs_spinlock *next;
+ int locked; /* 1 if lock acquired */
+};
+
+#ifndef arch_mcs_spin_lock_contended
+/*
+ * Using smp_load_acquire() provides a memory barrier that ensures
+ * subsequent operations happen after the lock is acquired.
+ */
+#define arch_mcs_spin_lock_contended(l) \
+do { \
+ while (!(smp_load_acquire(l))) \
+ arch_mutex_cpu_relax(); \
+} while (0)
+#endif
+
+#ifndef arch_mcs_spin_unlock_contended
+/*
+ * smp_store_release() provides a memory barrier to ensure all
+ * operations in the critical section has been completed before
+ * unlocking.
+ */
+#define arch_mcs_spin_unlock_contended(l) \
+ smp_store_release((l), 1)
+#endif
+
+/*
+ * Note: the smp_load_acquire/smp_store_release pair is not
+ * sufficient to form a full memory barrier across
+ * cpus for many architectures (except x86) for mcs_unlock and mcs_lock.
+ * For applications that need a full barrier across multiple cpus
+ * with mcs_unlock and mcs_lock pair, smp_mb__after_unlock_lock() should be
+ * used after mcs_lock.
+ */
+
+/*
+ * In order to acquire the lock, the caller should declare a local node and
+ * pass a reference of the node to this function in addition to the lock.
+ * If the lock has already been acquired, then this will proceed to spin
+ * on this node->locked until the previous lock holder sets the node->locked
+ * in mcs_spin_unlock().
+ *
+ * We don't inline mcs_spin_lock() so that perf can correctly account for the
+ * time spent in this lock function.
+ */
+static inline
+void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
+{
+ struct mcs_spinlock *prev;
+
+ /* Init node */
+ node->locked = 0;
+ node->next = NULL;
+
+ prev = xchg(lock, node);
+ if (likely(prev == NULL)) {
+ /*
+ * Lock acquired, don't need to set node->locked to 1. Threads
+ * only spin on its own node->locked value for lock acquisition.
+ * However, since this thread can immediately acquire the lock
+ * and does not proceed to spin on its own node->locked, this
+ * value won't be used. If a debug mode is needed to
+ * audit lock status, then set node->locked value here.
+ */
+ return;
+ }
+ ACCESS_ONCE(prev->next) = node;
+
+ /* Wait until the lock holder passes the lock down. */
+ arch_mcs_spin_lock_contended(&node->locked);
+}
+
+/*
+ * Releases the lock. The caller should pass in the corresponding node that
+ * was used to acquire the lock.
+ */
+static inline
+void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
+{
+ struct mcs_spinlock *next = ACCESS_ONCE(node->next);
+
+ if (likely(!next)) {
+ /*
+ * Release the lock by setting it to NULL
+ */
+ if (likely(cmpxchg(lock, node, NULL) == node))
+ return;
+ /* Wait until the next pointer is set */
+ while (!(next = ACCESS_ONCE(node->next)))
+ arch_mutex_cpu_relax();
+ }
+
+ /* Pass lock to next waiter. */
+ arch_mcs_spin_unlock_contended(&next->locked);
+}
+
+/*
+ * Cancellable version of the MCS lock above.
+ *
+ * Intended for adaptive spinning of sleeping locks:
+ * mutex_lock()/rwsem_down_{read,write}() etc.
+ */
+
+struct optimistic_spin_queue {
+ struct optimistic_spin_queue *next, *prev;
+ int locked; /* 1 if lock acquired */
+};
+
+extern bool osq_lock(struct optimistic_spin_queue **lock);
+extern void osq_unlock(struct optimistic_spin_queue **lock);
+
+#endif /* __LINUX_MCS_SPINLOCK_H */
diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c
index faf6f5b53e7..e1191c996c5 100644
--- a/kernel/locking/mutex-debug.c
+++ b/kernel/locking/mutex-debug.c
@@ -83,6 +83,12 @@ void debug_mutex_unlock(struct mutex *lock)
DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
mutex_clear_owner(lock);
+
+ /*
+ * __mutex_slowpath_needs_to_unlock() is explicitly 0 for debug
+ * mutexes so that we can do it here after we've verified state.
+ */
+ atomic_set(&lock->count, 1);
}
void debug_mutex_init(struct mutex *lock, const char *name,
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 4dd6e4c219d..14fe72cc8ce 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -25,6 +25,7 @@
#include <linux/spinlock.h>
#include <linux/interrupt.h>
#include <linux/debug_locks.h>
+#include "mcs_spinlock.h"
/*
* In the DEBUG case we are using the "NULL fastpath" for mutexes,
@@ -33,6 +34,13 @@
#ifdef CONFIG_DEBUG_MUTEXES
# include "mutex-debug.h"
# include <asm-generic/mutex-null.h>
+/*
+ * Must be 0 for the debug case so we do not do the unlock outside of the
+ * wait_lock region. debug_mutex_unlock() will do the actual unlock in this
+ * case.
+ */
+# undef __mutex_slowpath_needs_to_unlock
+# define __mutex_slowpath_needs_to_unlock() 0
#else
# include "mutex.h"
# include <asm/mutex.h>
@@ -52,7 +60,7 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
INIT_LIST_HEAD(&lock->wait_list);
mutex_clear_owner(lock);
#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
- lock->spin_mlock = NULL;
+ lock->osq = NULL;
#endif
debug_mutex_init(lock, name, key);
@@ -111,54 +119,7 @@ EXPORT_SYMBOL(mutex_lock);
* more or less simultaneously, the spinners need to acquire a MCS lock
* first before spinning on the owner field.
*
- * We don't inline mspin_lock() so that perf can correctly account for the
- * time spent in this lock function.
*/
-struct mspin_node {
- struct mspin_node *next ;
- int locked; /* 1 if lock acquired */
-};
-#define MLOCK(mutex) ((struct mspin_node **)&((mutex)->spin_mlock))
-
-static noinline
-void mspin_lock(struct mspin_node **lock, struct mspin_node *node)
-{
- struct mspin_node *prev;
-
- /* Init node */
- node->locked = 0;
- node->next = NULL;
-
- prev = xchg(lock, node);
- if (likely(prev == NULL)) {
- /* Lock acquired */
- node->locked = 1;
- return;
- }
- ACCESS_ONCE(prev->next) = node;
- smp_wmb();
- /* Wait until the lock holder passes the lock down */
- while (!ACCESS_ONCE(node->locked))
- arch_mutex_cpu_relax();
-}
-
-static void mspin_unlock(struct mspin_node **lock, struct mspin_node *node)
-{
- struct mspin_node *next = ACCESS_ONCE(node->next);
-
- if (likely(!next)) {
- /*
- * Release the lock by setting it to NULL
- */
- if (cmpxchg(lock, node, NULL) == node)
- return;
- /* Wait until the next pointer is set */
- while (!(next = ACCESS_ONCE(node->next)))
- arch_mutex_cpu_relax();
- }
- ACCESS_ONCE(next->locked) = 1;
- smp_wmb();
-}
/*
* Mutex spinning code migrated from kernel/sched/core.c
@@ -212,6 +173,9 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock)
struct task_struct *owner;
int retval = 1;
+ if (need_resched())
+ return 0;
+
rcu_read_lock();
owner = ACCESS_ONCE(lock->owner);
if (owner)
@@ -446,9 +410,11 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
if (!mutex_can_spin_on_owner(lock))
goto slowpath;
+ if (!osq_lock(&lock->osq))
+ goto slowpath;
+
for (;;) {
struct task_struct *owner;
- struct mspin_node node;
if (use_ww_ctx && ww_ctx->acquired > 0) {
struct ww_mutex *ww;
@@ -463,19 +429,16 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
* performed the optimistic spinning cannot be done.
*/
if (ACCESS_ONCE(ww->ctx))
- goto slowpath;
+ break;
}
/*
* If there's an owner, wait for it to either
* release the lock or go to sleep.
*/
- mspin_lock(MLOCK(lock), &node);
owner = ACCESS_ONCE(lock->owner);
- if (owner && !mutex_spin_on_owner(lock, owner)) {
- mspin_unlock(MLOCK(lock), &node);
- goto slowpath;
- }
+ if (owner && !mutex_spin_on_owner(lock, owner))
+ break;
if ((atomic_read(&lock->count) == 1) &&
(atomic_cmpxchg(&lock->count, 1, 0) == 1)) {
@@ -488,11 +451,10 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
}
mutex_set_owner(lock);
- mspin_unlock(MLOCK(lock), &node);
+ osq_unlock(&lock->osq);
preempt_enable();
return 0;
}
- mspin_unlock(MLOCK(lock), &node);
/*
* When there's no owner, we might have preempted between the
@@ -501,7 +463,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
* the owner complete.
*/
if (!owner && (need_resched() || rt_task(task)))
- goto slowpath;
+ break;
/*
* The cpu_relax() call is a compiler barrier which forces
@@ -511,7 +473,15 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
*/
arch_mutex_cpu_relax();
}
+ osq_unlock(&lock->osq);
slowpath:
+ /*
+ * If we fell out of the spin path because of need_resched(),
+ * reschedule now, before we try-lock the mutex. This avoids getting
+ * scheduled out right after we obtained the mutex.
+ */
+ if (need_resched())
+ schedule_preempt_disabled();
#endif
spin_lock_mutex(&lock->wait_lock, flags);
@@ -717,10 +687,6 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested)
struct mutex *lock = container_of(lock_count, struct mutex, count);
unsigned long flags;
- spin_lock_mutex(&lock->wait_lock, flags);
- mutex_release(&lock->dep_map, nested, _RET_IP_);
- debug_mutex_unlock(lock);
-
/*
* some architectures leave the lock unlocked in the fastpath failure
* case, others need to leave it locked. In the later case we have to
@@ -729,6 +695,10 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested)
if (__mutex_slowpath_needs_to_unlock())
atomic_set(&lock->count, 1);
+ spin_lock_mutex(&lock->wait_lock, flags);
+ mutex_release(&lock->dep_map, nested, _RET_IP_);
+ debug_mutex_unlock(lock);
+
if (!list_empty(&lock->wait_list)) {
/* get the first entry from the wait-list: */
struct mutex_waiter *waiter =
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 2e960a2bab8..aa4dff04b59 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -213,6 +213,18 @@ struct task_struct *rt_mutex_get_top_task(struct task_struct *task)
}
/*
+ * Called by sched_setscheduler() to check whether the priority change
+ * is overruled by a possible priority boosting.
+ */
+int rt_mutex_check_prio(struct task_struct *task, int newprio)
+{
+ if (!task_has_pi_waiters(task))
+ return 0;
+
+ return task_top_pi_waiter(task)->task->prio <= newprio;
+}
+
+/*
* Adjust the priority of a task, after its pi_waiters got modified.
*
* This can be both boosting and unboosting. task->pi_lock must be held.
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 2d5cc4ccff7..db4c8b08a50 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -309,7 +309,7 @@ int __blocking_notifier_call_chain(struct blocking_notifier_head *nh,
* racy then it does not matter what the result of the test
* is, we re-check the list after having taken the lock anyway:
*/
- if (rcu_dereference_raw(nh->head)) {
+ if (rcu_access_pointer(nh->head)) {
down_read(&nh->rwsem);
ret = notifier_call_chain(&nh->head, val, v, nr_to_call,
nr_calls);
diff --git a/kernel/power/console.c b/kernel/power/console.c
index eacb8bd8cab..aba9c545a0e 100644
--- a/kernel/power/console.c
+++ b/kernel/power/console.c
@@ -9,6 +9,7 @@
#include <linux/kbd_kern.h>
#include <linux/vt.h>
#include <linux/module.h>
+#include <linux/slab.h>
#include "power.h"
#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1)
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index b1d255f0413..4dae9cbe925 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1076,7 +1076,6 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
next_seq = log_next_seq;
len = 0;
- prev = 0;
while (len >= 0 && seq < next_seq) {
struct printk_log *msg = log_from_idx(idx);
int textlen;
@@ -2788,7 +2787,6 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
next_idx = idx;
l = 0;
- prev = 0;
while (seq < dumper->next_seq) {
struct printk_log *msg = log_from_idx(idx);
diff --git a/kernel/profile.c b/kernel/profile.c
index 6631e1ef55a..ebdd9c1a86b 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -549,14 +549,14 @@ static int create_hash_tables(void)
struct page *page;
page = alloc_pages_exact_node(node,
- GFP_KERNEL | __GFP_ZERO | GFP_THISNODE,
+ GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE,
0);
if (!page)
goto out_cleanup;
per_cpu(cpu_profile_hits, cpu)[1]
= (struct profile_hit *)page_address(page);
page = alloc_pages_exact_node(node,
- GFP_KERNEL | __GFP_ZERO | GFP_THISNODE,
+ GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE,
0);
if (!page)
goto out_cleanup;
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile
index 01e9ec37a3e..807ccfbf69b 100644
--- a/kernel/rcu/Makefile
+++ b/kernel/rcu/Makefile
@@ -1,5 +1,5 @@
obj-y += update.o srcu.o
-obj-$(CONFIG_RCU_TORTURE_TEST) += torture.o
+obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
obj-$(CONFIG_TREE_RCU) += tree.o
obj-$(CONFIG_TREE_PREEMPT_RCU) += tree.o
obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 79c3877e9c5..bfda2726ca4 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -12,8 +12,8 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
*
* Copyright IBM Corporation, 2011
*
@@ -23,6 +23,7 @@
#ifndef __LINUX_RCU_H
#define __LINUX_RCU_H
+#include <trace/events/rcu.h>
#ifdef CONFIG_RCU_TRACE
#define RCU_TRACE(stmt) stmt
#else /* #ifdef CONFIG_RCU_TRACE */
@@ -116,8 +117,6 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
}
}
-extern int rcu_expedited;
-
#ifdef CONFIG_RCU_STALL_COMMON
extern int rcu_cpu_stall_suppress;
diff --git a/kernel/rcu/torture.c b/kernel/rcu/rcutorture.c
index 732f8ae3086..bd30bc61bc0 100644
--- a/kernel/rcu/torture.c
+++ b/kernel/rcu/rcutorture.c
@@ -12,8 +12,8 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
*
* Copyright (C) IBM Corporation, 2005, 2006
*
@@ -48,110 +48,58 @@
#include <linux/slab.h>
#include <linux/trace_clock.h>
#include <asm/byteorder.h>
+#include <linux/torture.h>
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@freedesktop.org>");
-MODULE_ALIAS("rcutorture");
-#ifdef MODULE_PARAM_PREFIX
-#undef MODULE_PARAM_PREFIX
-#endif
-#define MODULE_PARAM_PREFIX "rcutorture."
-
-static int fqs_duration;
-module_param(fqs_duration, int, 0444);
-MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us), 0 to disable");
-static int fqs_holdoff;
-module_param(fqs_holdoff, int, 0444);
-MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)");
-static int fqs_stutter = 3;
-module_param(fqs_stutter, int, 0444);
-MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)");
-static bool gp_exp;
-module_param(gp_exp, bool, 0444);
-MODULE_PARM_DESC(gp_exp, "Use expedited GP wait primitives");
-static bool gp_normal;
-module_param(gp_normal, bool, 0444);
-MODULE_PARM_DESC(gp_normal, "Use normal (non-expedited) GP wait primitives");
-static int irqreader = 1;
-module_param(irqreader, int, 0444);
-MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers");
-static int n_barrier_cbs;
-module_param(n_barrier_cbs, int, 0444);
-MODULE_PARM_DESC(n_barrier_cbs, "# of callbacks/kthreads for barrier testing");
-static int nfakewriters = 4;
-module_param(nfakewriters, int, 0444);
-MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads");
-static int nreaders = -1;
-module_param(nreaders, int, 0444);
-MODULE_PARM_DESC(nreaders, "Number of RCU reader threads");
-static int object_debug;
-module_param(object_debug, int, 0444);
-MODULE_PARM_DESC(object_debug, "Enable debug-object double call_rcu() testing");
-static int onoff_holdoff;
-module_param(onoff_holdoff, int, 0444);
-MODULE_PARM_DESC(onoff_holdoff, "Time after boot before CPU hotplugs (s)");
-static int onoff_interval;
-module_param(onoff_interval, int, 0444);
-MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable");
-static int shuffle_interval = 3;
-module_param(shuffle_interval, int, 0444);
-MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles");
-static int shutdown_secs;
-module_param(shutdown_secs, int, 0444);
-MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), <= zero to disable.");
-static int stall_cpu;
-module_param(stall_cpu, int, 0444);
-MODULE_PARM_DESC(stall_cpu, "Stall duration (s), zero to disable.");
-static int stall_cpu_holdoff = 10;
-module_param(stall_cpu_holdoff, int, 0444);
-MODULE_PARM_DESC(stall_cpu_holdoff, "Time to wait before starting stall (s).");
-static int stat_interval = 60;
-module_param(stat_interval, int, 0644);
-MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s");
-static int stutter = 5;
-module_param(stutter, int, 0444);
-MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test");
-static int test_boost = 1;
-module_param(test_boost, int, 0444);
-MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes.");
-static int test_boost_duration = 4;
-module_param(test_boost_duration, int, 0444);
-MODULE_PARM_DESC(test_boost_duration, "Duration of each boost test, seconds.");
-static int test_boost_interval = 7;
-module_param(test_boost_interval, int, 0444);
-MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds.");
-static bool test_no_idle_hz = true;
-module_param(test_no_idle_hz, bool, 0444);
-MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs");
+
+torture_param(int, fqs_duration, 0,
+ "Duration of fqs bursts (us), 0 to disable");
+torture_param(int, fqs_holdoff, 0, "Holdoff time within fqs bursts (us)");
+torture_param(int, fqs_stutter, 3, "Wait time between fqs bursts (s)");
+torture_param(bool, gp_exp, false, "Use expedited GP wait primitives");
+torture_param(bool, gp_normal, false,
+ "Use normal (non-expedited) GP wait primitives");
+torture_param(int, irqreader, 1, "Allow RCU readers from irq handlers");
+torture_param(int, n_barrier_cbs, 0,
+ "# of callbacks/kthreads for barrier testing");
+torture_param(int, nfakewriters, 4, "Number of RCU fake writer threads");
+torture_param(int, nreaders, -1, "Number of RCU reader threads");
+torture_param(int, object_debug, 0,
+ "Enable debug-object double call_rcu() testing");
+torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)");
+torture_param(int, onoff_interval, 0,
+ "Time between CPU hotplugs (s), 0=disable");
+torture_param(int, shuffle_interval, 3, "Number of seconds between shuffles");
+torture_param(int, shutdown_secs, 0, "Shutdown time (s), <= zero to disable.");
+torture_param(int, stall_cpu, 0, "Stall duration (s), zero to disable.");
+torture_param(int, stall_cpu_holdoff, 10,
+ "Time to wait before starting stall (s).");
+torture_param(int, stat_interval, 60,
+ "Number of seconds between stats printk()s");
+torture_param(int, stutter, 5, "Number of seconds to run/halt test");
+torture_param(int, test_boost, 1, "Test RCU prio boost: 0=no, 1=maybe, 2=yes.");
+torture_param(int, test_boost_duration, 4,
+ "Duration of each boost test, seconds.");
+torture_param(int, test_boost_interval, 7,
+ "Interval between boost tests, seconds.");
+torture_param(bool, test_no_idle_hz, true,
+ "Test support for tickless idle CPUs");
+torture_param(bool, verbose, true,
+ "Enable verbose debugging printk()s");
+
static char *torture_type = "rcu";
module_param(torture_type, charp, 0444);
MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, ...)");
-static bool verbose;
-module_param(verbose, bool, 0444);
-MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s");
-
-#define TORTURE_FLAG "-torture:"
-#define PRINTK_STRING(s) \
- do { pr_alert("%s" TORTURE_FLAG s "\n", torture_type); } while (0)
-#define VERBOSE_PRINTK_STRING(s) \
- do { if (verbose) pr_alert("%s" TORTURE_FLAG s "\n", torture_type); } while (0)
-#define VERBOSE_PRINTK_ERRSTRING(s) \
- do { if (verbose) pr_alert("%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0)
static int nrealreaders;
static struct task_struct *writer_task;
static struct task_struct **fakewriter_tasks;
static struct task_struct **reader_tasks;
static struct task_struct *stats_task;
-static struct task_struct *shuffler_task;
-static struct task_struct *stutter_task;
static struct task_struct *fqs_task;
static struct task_struct *boost_tasks[NR_CPUS];
-static struct task_struct *shutdown_task;
-#ifdef CONFIG_HOTPLUG_CPU
-static struct task_struct *onoff_task;
-#endif /* #ifdef CONFIG_HOTPLUG_CPU */
static struct task_struct *stall_task;
static struct task_struct **barrier_cbs_tasks;
static struct task_struct *barrier_task;
@@ -170,10 +118,10 @@ static struct rcu_torture __rcu *rcu_torture_current;
static unsigned long rcu_torture_current_version;
static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN];
static DEFINE_SPINLOCK(rcu_torture_lock);
-static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) =
- { 0 };
-static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch) =
- { 0 };
+static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1],
+ rcu_torture_count) = { 0 };
+static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1],
+ rcu_torture_batch) = { 0 };
static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1];
static atomic_t n_rcu_torture_alloc;
static atomic_t n_rcu_torture_alloc_fail;
@@ -186,22 +134,9 @@ static long n_rcu_torture_boost_rterror;
static long n_rcu_torture_boost_failure;
static long n_rcu_torture_boosts;
static long n_rcu_torture_timers;
-static long n_offline_attempts;
-static long n_offline_successes;
-static unsigned long sum_offline;
-static int min_offline = -1;
-static int max_offline;
-static long n_online_attempts;
-static long n_online_successes;
-static unsigned long sum_online;
-static int min_online = -1;
-static int max_online;
static long n_barrier_attempts;
static long n_barrier_successes;
static struct list_head rcu_torture_removed;
-static cpumask_var_t shuffle_tmp_mask;
-
-static int stutter_pause_test;
#if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE)
#define RCUTORTURE_RUNNABLE_INIT 1
@@ -232,7 +167,6 @@ static u64 notrace rcu_trace_clock_local(void)
}
#endif /* #else #ifdef CONFIG_RCU_TRACE */
-static unsigned long shutdown_time; /* jiffies to system shutdown. */
static unsigned long boost_starttime; /* jiffies of next boost test start. */
DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
/* and boost task create/destroy. */
@@ -242,51 +176,6 @@ static atomic_t barrier_cbs_invoked; /* Barrier callbacks invoked. */
static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */
static DECLARE_WAIT_QUEUE_HEAD(barrier_wq);
-/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */
-
-#define FULLSTOP_DONTSTOP 0 /* Normal operation. */
-#define FULLSTOP_SHUTDOWN 1 /* System shutdown with rcutorture running. */
-#define FULLSTOP_RMMOD 2 /* Normal rmmod of rcutorture. */
-static int fullstop = FULLSTOP_RMMOD;
-/*
- * Protect fullstop transitions and spawning of kthreads.
- */
-static DEFINE_MUTEX(fullstop_mutex);
-
-/* Forward reference. */
-static void rcu_torture_cleanup(void);
-
-/*
- * Detect and respond to a system shutdown.
- */
-static int
-rcutorture_shutdown_notify(struct notifier_block *unused1,
- unsigned long unused2, void *unused3)
-{
- mutex_lock(&fullstop_mutex);
- if (fullstop == FULLSTOP_DONTSTOP)
- fullstop = FULLSTOP_SHUTDOWN;
- else
- pr_warn(/* but going down anyway, so... */
- "Concurrent 'rmmod rcutorture' and shutdown illegal!\n");
- mutex_unlock(&fullstop_mutex);
- return NOTIFY_DONE;
-}
-
-/*
- * Absorb kthreads into a kernel function that won't return, so that
- * they won't ever access module text or data again.
- */
-static void rcutorture_shutdown_absorb(const char *title)
-{
- if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) {
- pr_notice(
- "rcutorture thread %s parking due to system shutdown\n",
- title);
- schedule_timeout_uninterruptible(MAX_SCHEDULE_TIMEOUT);
- }
-}
-
/*
* Allocate an element from the rcu_tortures pool.
*/
@@ -320,44 +209,6 @@ rcu_torture_free(struct rcu_torture *p)
spin_unlock_bh(&rcu_torture_lock);
}
-struct rcu_random_state {
- unsigned long rrs_state;
- long rrs_count;
-};
-
-#define RCU_RANDOM_MULT 39916801 /* prime */
-#define RCU_RANDOM_ADD 479001701 /* prime */
-#define RCU_RANDOM_REFRESH 10000
-
-#define DEFINE_RCU_RANDOM(name) struct rcu_random_state name = { 0, 0 }
-
-/*
- * Crude but fast random-number generator. Uses a linear congruential
- * generator, with occasional help from cpu_clock().
- */
-static unsigned long
-rcu_random(struct rcu_random_state *rrsp)
-{
- if (--rrsp->rrs_count < 0) {
- rrsp->rrs_state += (unsigned long)local_clock();
- rrsp->rrs_count = RCU_RANDOM_REFRESH;
- }
- rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD;
- return swahw32(rrsp->rrs_state);
-}
-
-static void
-rcu_stutter_wait(const char *title)
-{
- while (stutter_pause_test || !rcutorture_runnable) {
- if (rcutorture_runnable)
- schedule_timeout_interruptible(1);
- else
- schedule_timeout_interruptible(round_jiffies_relative(HZ));
- rcutorture_shutdown_absorb(title);
- }
-}
-
/*
* Operations vector for selecting different types of tests.
*/
@@ -365,7 +216,7 @@ rcu_stutter_wait(const char *title)
struct rcu_torture_ops {
void (*init)(void);
int (*readlock)(void);
- void (*read_delay)(struct rcu_random_state *rrsp);
+ void (*read_delay)(struct torture_random_state *rrsp);
void (*readunlock)(int idx);
int (*completed)(void);
void (*deferred_free)(struct rcu_torture *p);
@@ -392,7 +243,7 @@ static int rcu_torture_read_lock(void) __acquires(RCU)
return 0;
}
-static void rcu_read_delay(struct rcu_random_state *rrsp)
+static void rcu_read_delay(struct torture_random_state *rrsp)
{
const unsigned long shortdelay_us = 200;
const unsigned long longdelay_ms = 50;
@@ -401,12 +252,13 @@ static void rcu_read_delay(struct rcu_random_state *rrsp)
* period, and we want a long delay occasionally to trigger
* force_quiescent_state. */
- if (!(rcu_random(rrsp) % (nrealreaders * 2000 * longdelay_ms)))
+ if (!(torture_random(rrsp) % (nrealreaders * 2000 * longdelay_ms)))
mdelay(longdelay_ms);
- if (!(rcu_random(rrsp) % (nrealreaders * 2 * shortdelay_us)))
+ if (!(torture_random(rrsp) % (nrealreaders * 2 * shortdelay_us)))
udelay(shortdelay_us);
#ifdef CONFIG_PREEMPT
- if (!preempt_count() && !(rcu_random(rrsp) % (nrealreaders * 20000)))
+ if (!preempt_count() &&
+ !(torture_random(rrsp) % (nrealreaders * 20000)))
preempt_schedule(); /* No QS if preempt_disable() in effect */
#endif
}
@@ -427,7 +279,7 @@ rcu_torture_cb(struct rcu_head *p)
int i;
struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu);
- if (fullstop != FULLSTOP_DONTSTOP) {
+ if (torture_must_stop_irq()) {
/* Test is ending, just drop callbacks on the floor. */
/* The next initialization will pick up the pieces. */
return;
@@ -520,6 +372,48 @@ static struct rcu_torture_ops rcu_bh_ops = {
};
/*
+ * Don't even think about trying any of these in real life!!!
+ * The names includes "busted", and they really means it!
+ * The only purpose of these functions is to provide a buggy RCU
+ * implementation to make sure that rcutorture correctly emits
+ * buggy-RCU error messages.
+ */
+static void rcu_busted_torture_deferred_free(struct rcu_torture *p)
+{
+ /* This is a deliberate bug for testing purposes only! */
+ rcu_torture_cb(&p->rtort_rcu);
+}
+
+static void synchronize_rcu_busted(void)
+{
+ /* This is a deliberate bug for testing purposes only! */
+}
+
+static void
+call_rcu_busted(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
+{
+ /* This is a deliberate bug for testing purposes only! */
+ func(head);
+}
+
+static struct rcu_torture_ops rcu_busted_ops = {
+ .init = rcu_sync_torture_init,
+ .readlock = rcu_torture_read_lock,
+ .read_delay = rcu_read_delay, /* just reuse rcu's version. */
+ .readunlock = rcu_torture_read_unlock,
+ .completed = rcu_no_completed,
+ .deferred_free = rcu_busted_torture_deferred_free,
+ .sync = synchronize_rcu_busted,
+ .exp_sync = synchronize_rcu_busted,
+ .call = call_rcu_busted,
+ .cb_barrier = NULL,
+ .fqs = NULL,
+ .stats = NULL,
+ .irq_capable = 1,
+ .name = "rcu_busted"
+};
+
+/*
* Definitions for srcu torture testing.
*/
@@ -530,7 +424,7 @@ static int srcu_torture_read_lock(void) __acquires(&srcu_ctl)
return srcu_read_lock(&srcu_ctl);
}
-static void srcu_read_delay(struct rcu_random_state *rrsp)
+static void srcu_read_delay(struct torture_random_state *rrsp)
{
long delay;
const long uspertick = 1000000 / HZ;
@@ -538,7 +432,8 @@ static void srcu_read_delay(struct rcu_random_state *rrsp)
/* We want there to be long-running readers, but not all the time. */
- delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay * uspertick);
+ delay = torture_random(rrsp) %
+ (nrealreaders * 2 * longdelay * uspertick);
if (!delay)
schedule_timeout_interruptible(longdelay);
else
@@ -677,12 +572,12 @@ static int rcu_torture_boost(void *arg)
struct rcu_boost_inflight rbi = { .inflight = 0 };
struct sched_param sp;
- VERBOSE_PRINTK_STRING("rcu_torture_boost started");
+ VERBOSE_TOROUT_STRING("rcu_torture_boost started");
/* Set real-time priority. */
sp.sched_priority = 1;
if (sched_setscheduler(current, SCHED_FIFO, &sp) < 0) {
- VERBOSE_PRINTK_STRING("rcu_torture_boost RT prio failed!");
+ VERBOSE_TOROUT_STRING("rcu_torture_boost RT prio failed!");
n_rcu_torture_boost_rterror++;
}
@@ -693,9 +588,8 @@ static int rcu_torture_boost(void *arg)
oldstarttime = boost_starttime;
while (ULONG_CMP_LT(jiffies, oldstarttime)) {
schedule_timeout_interruptible(oldstarttime - jiffies);
- rcu_stutter_wait("rcu_torture_boost");
- if (kthread_should_stop() ||
- fullstop != FULLSTOP_DONTSTOP)
+ stutter_wait("rcu_torture_boost");
+ if (torture_must_stop())
goto checkwait;
}
@@ -710,15 +604,14 @@ static int rcu_torture_boost(void *arg)
call_rcu(&rbi.rcu, rcu_torture_boost_cb);
if (jiffies - call_rcu_time >
test_boost_duration * HZ - HZ / 2) {
- VERBOSE_PRINTK_STRING("rcu_torture_boost boosting failed");
+ VERBOSE_TOROUT_STRING("rcu_torture_boost boosting failed");
n_rcu_torture_boost_failure++;
}
call_rcu_time = jiffies;
}
cond_resched();
- rcu_stutter_wait("rcu_torture_boost");
- if (kthread_should_stop() ||
- fullstop != FULLSTOP_DONTSTOP)
+ stutter_wait("rcu_torture_boost");
+ if (torture_must_stop())
goto checkwait;
}
@@ -742,16 +635,17 @@ static int rcu_torture_boost(void *arg)
}
/* Go do the stutter. */
-checkwait: rcu_stutter_wait("rcu_torture_boost");
- } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
+checkwait: stutter_wait("rcu_torture_boost");
+ } while (!torture_must_stop());
/* Clean up and exit. */
- VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping");
- rcutorture_shutdown_absorb("rcu_torture_boost");
- while (!kthread_should_stop() || rbi.inflight)
+ while (!kthread_should_stop() || rbi.inflight) {
+ torture_shutdown_absorb("rcu_torture_boost");
schedule_timeout_uninterruptible(1);
+ }
smp_mb(); /* order accesses to ->inflight before stack-frame death. */
destroy_rcu_head_on_stack(&rbi.rcu);
+ torture_kthread_stopping("rcu_torture_boost");
return 0;
}
@@ -766,7 +660,7 @@ rcu_torture_fqs(void *arg)
unsigned long fqs_resume_time;
int fqs_burst_remaining;
- VERBOSE_PRINTK_STRING("rcu_torture_fqs task started");
+ VERBOSE_TOROUT_STRING("rcu_torture_fqs task started");
do {
fqs_resume_time = jiffies + fqs_stutter * HZ;
while (ULONG_CMP_LT(jiffies, fqs_resume_time) &&
@@ -780,12 +674,9 @@ rcu_torture_fqs(void *arg)
udelay(fqs_holdoff);
fqs_burst_remaining -= fqs_holdoff;
}
- rcu_stutter_wait("rcu_torture_fqs");
- } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
- VERBOSE_PRINTK_STRING("rcu_torture_fqs task stopping");
- rcutorture_shutdown_absorb("rcu_torture_fqs");
- while (!kthread_should_stop())
- schedule_timeout_uninterruptible(1);
+ stutter_wait("rcu_torture_fqs");
+ } while (!torture_must_stop());
+ torture_kthread_stopping("rcu_torture_fqs");
return 0;
}
@@ -802,10 +693,10 @@ rcu_torture_writer(void *arg)
struct rcu_torture *rp;
struct rcu_torture *rp1;
struct rcu_torture *old_rp;
- static DEFINE_RCU_RANDOM(rand);
+ static DEFINE_TORTURE_RANDOM(rand);
- VERBOSE_PRINTK_STRING("rcu_torture_writer task started");
- set_user_nice(current, 19);
+ VERBOSE_TOROUT_STRING("rcu_torture_writer task started");
+ set_user_nice(current, MAX_NICE);
do {
schedule_timeout_uninterruptible(1);
@@ -813,7 +704,7 @@ rcu_torture_writer(void *arg)
if (rp == NULL)
continue;
rp->rtort_pipe_count = 0;
- udelay(rcu_random(&rand) & 0x3ff);
+ udelay(torture_random(&rand) & 0x3ff);
old_rp = rcu_dereference_check(rcu_torture_current,
current == writer_task);
rp->rtort_mbtest = 1;
@@ -826,7 +717,7 @@ rcu_torture_writer(void *arg)
atomic_inc(&rcu_torture_wcount[i]);
old_rp->rtort_pipe_count++;
if (gp_normal == gp_exp)
- exp = !!(rcu_random(&rand) & 0x80);
+ exp = !!(torture_random(&rand) & 0x80);
else
exp = gp_exp;
if (!exp) {
@@ -852,12 +743,9 @@ rcu_torture_writer(void *arg)
}
}
rcutorture_record_progress(++rcu_torture_current_version);
- rcu_stutter_wait("rcu_torture_writer");
- } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
- VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping");
- rcutorture_shutdown_absorb("rcu_torture_writer");
- while (!kthread_should_stop())
- schedule_timeout_uninterruptible(1);
+ stutter_wait("rcu_torture_writer");
+ } while (!torture_must_stop());
+ torture_kthread_stopping("rcu_torture_writer");
return 0;
}
@@ -868,19 +756,19 @@ rcu_torture_writer(void *arg)
static int
rcu_torture_fakewriter(void *arg)
{
- DEFINE_RCU_RANDOM(rand);
+ DEFINE_TORTURE_RANDOM(rand);
- VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task started");
- set_user_nice(current, 19);
+ VERBOSE_TOROUT_STRING("rcu_torture_fakewriter task started");
+ set_user_nice(current, MAX_NICE);
do {
- schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10);
- udelay(rcu_random(&rand) & 0x3ff);
+ schedule_timeout_uninterruptible(1 + torture_random(&rand)%10);
+ udelay(torture_random(&rand) & 0x3ff);
if (cur_ops->cb_barrier != NULL &&
- rcu_random(&rand) % (nfakewriters * 8) == 0) {
+ torture_random(&rand) % (nfakewriters * 8) == 0) {
cur_ops->cb_barrier();
} else if (gp_normal == gp_exp) {
- if (rcu_random(&rand) & 0x80)
+ if (torture_random(&rand) & 0x80)
cur_ops->sync();
else
cur_ops->exp_sync();
@@ -889,13 +777,10 @@ rcu_torture_fakewriter(void *arg)
} else {
cur_ops->exp_sync();
}
- rcu_stutter_wait("rcu_torture_fakewriter");
- } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
+ stutter_wait("rcu_torture_fakewriter");
+ } while (!torture_must_stop());
- VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task stopping");
- rcutorture_shutdown_absorb("rcu_torture_fakewriter");
- while (!kthread_should_stop())
- schedule_timeout_uninterruptible(1);
+ torture_kthread_stopping("rcu_torture_fakewriter");
return 0;
}
@@ -921,7 +806,7 @@ static void rcu_torture_timer(unsigned long unused)
int idx;
int completed;
int completed_end;
- static DEFINE_RCU_RANDOM(rand);
+ static DEFINE_TORTURE_RANDOM(rand);
static DEFINE_SPINLOCK(rand_lock);
struct rcu_torture *p;
int pipe_count;
@@ -980,14 +865,14 @@ rcu_torture_reader(void *arg)
int completed;
int completed_end;
int idx;
- DEFINE_RCU_RANDOM(rand);
+ DEFINE_TORTURE_RANDOM(rand);
struct rcu_torture *p;
int pipe_count;
struct timer_list t;
unsigned long long ts;
- VERBOSE_PRINTK_STRING("rcu_torture_reader task started");
- set_user_nice(current, 19);
+ VERBOSE_TOROUT_STRING("rcu_torture_reader task started");
+ set_user_nice(current, MAX_NICE);
if (irqreader && cur_ops->irq_capable)
setup_timer_on_stack(&t, rcu_torture_timer, 0);
@@ -1034,14 +919,11 @@ rcu_torture_reader(void *arg)
preempt_enable();
cur_ops->readunlock(idx);
schedule();
- rcu_stutter_wait("rcu_torture_reader");
- } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
- VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping");
- rcutorture_shutdown_absorb("rcu_torture_reader");
+ stutter_wait("rcu_torture_reader");
+ } while (!torture_must_stop());
if (irqreader && cur_ops->irq_capable)
del_timer_sync(&t);
- while (!kthread_should_stop())
- schedule_timeout_uninterruptible(1);
+ torture_kthread_stopping("rcu_torture_reader");
return 0;
}
@@ -1083,13 +965,7 @@ rcu_torture_printk(char *page)
n_rcu_torture_boost_failure,
n_rcu_torture_boosts,
n_rcu_torture_timers);
- page += sprintf(page,
- "onoff: %ld/%ld:%ld/%ld %d,%d:%d,%d %lu:%lu (HZ=%d) ",
- n_online_successes, n_online_attempts,
- n_offline_successes, n_offline_attempts,
- min_online, max_online,
- min_offline, max_offline,
- sum_online, sum_offline, HZ);
+ page = torture_onoff_stats(page);
page += sprintf(page, "barrier: %ld/%ld:%ld",
n_barrier_successes,
n_barrier_attempts,
@@ -1150,123 +1026,17 @@ rcu_torture_stats_print(void)
/*
* Periodically prints torture statistics, if periodic statistics printing
* was specified via the stat_interval module parameter.
- *
- * No need to worry about fullstop here, since this one doesn't reference
- * volatile state or register callbacks.
*/
static int
rcu_torture_stats(void *arg)
{
- VERBOSE_PRINTK_STRING("rcu_torture_stats task started");
+ VERBOSE_TOROUT_STRING("rcu_torture_stats task started");
do {
schedule_timeout_interruptible(stat_interval * HZ);
rcu_torture_stats_print();
- rcutorture_shutdown_absorb("rcu_torture_stats");
- } while (!kthread_should_stop());
- VERBOSE_PRINTK_STRING("rcu_torture_stats task stopping");
- return 0;
-}
-
-static int rcu_idle_cpu; /* Force all torture tasks off this CPU */
-
-/* Shuffle tasks such that we allow @rcu_idle_cpu to become idle. A special case
- * is when @rcu_idle_cpu = -1, when we allow the tasks to run on all CPUs.
- */
-static void rcu_torture_shuffle_tasks(void)
-{
- int i;
-
- cpumask_setall(shuffle_tmp_mask);
- get_online_cpus();
-
- /* No point in shuffling if there is only one online CPU (ex: UP) */
- if (num_online_cpus() == 1) {
- put_online_cpus();
- return;
- }
-
- if (rcu_idle_cpu != -1)
- cpumask_clear_cpu(rcu_idle_cpu, shuffle_tmp_mask);
-
- set_cpus_allowed_ptr(current, shuffle_tmp_mask);
-
- if (reader_tasks) {
- for (i = 0; i < nrealreaders; i++)
- if (reader_tasks[i])
- set_cpus_allowed_ptr(reader_tasks[i],
- shuffle_tmp_mask);
- }
- if (fakewriter_tasks) {
- for (i = 0; i < nfakewriters; i++)
- if (fakewriter_tasks[i])
- set_cpus_allowed_ptr(fakewriter_tasks[i],
- shuffle_tmp_mask);
- }
- if (writer_task)
- set_cpus_allowed_ptr(writer_task, shuffle_tmp_mask);
- if (stats_task)
- set_cpus_allowed_ptr(stats_task, shuffle_tmp_mask);
- if (stutter_task)
- set_cpus_allowed_ptr(stutter_task, shuffle_tmp_mask);
- if (fqs_task)
- set_cpus_allowed_ptr(fqs_task, shuffle_tmp_mask);
- if (shutdown_task)
- set_cpus_allowed_ptr(shutdown_task, shuffle_tmp_mask);
-#ifdef CONFIG_HOTPLUG_CPU
- if (onoff_task)
- set_cpus_allowed_ptr(onoff_task, shuffle_tmp_mask);
-#endif /* #ifdef CONFIG_HOTPLUG_CPU */
- if (stall_task)
- set_cpus_allowed_ptr(stall_task, shuffle_tmp_mask);
- if (barrier_cbs_tasks)
- for (i = 0; i < n_barrier_cbs; i++)
- if (barrier_cbs_tasks[i])
- set_cpus_allowed_ptr(barrier_cbs_tasks[i],
- shuffle_tmp_mask);
- if (barrier_task)
- set_cpus_allowed_ptr(barrier_task, shuffle_tmp_mask);
-
- if (rcu_idle_cpu == -1)
- rcu_idle_cpu = num_online_cpus() - 1;
- else
- rcu_idle_cpu--;
-
- put_online_cpus();
-}
-
-/* Shuffle tasks across CPUs, with the intent of allowing each CPU in the
- * system to become idle at a time and cut off its timer ticks. This is meant
- * to test the support for such tickless idle CPU in RCU.
- */
-static int
-rcu_torture_shuffle(void *arg)
-{
- VERBOSE_PRINTK_STRING("rcu_torture_shuffle task started");
- do {
- schedule_timeout_interruptible(shuffle_interval * HZ);
- rcu_torture_shuffle_tasks();
- rcutorture_shutdown_absorb("rcu_torture_shuffle");
- } while (!kthread_should_stop());
- VERBOSE_PRINTK_STRING("rcu_torture_shuffle task stopping");
- return 0;
-}
-
-/* Cause the rcutorture test to "stutter", starting and stopping all
- * threads periodically.
- */
-static int
-rcu_torture_stutter(void *arg)
-{
- VERBOSE_PRINTK_STRING("rcu_torture_stutter task started");
- do {
- schedule_timeout_interruptible(stutter * HZ);
- stutter_pause_test = 1;
- if (!kthread_should_stop())
- schedule_timeout_interruptible(stutter * HZ);
- stutter_pause_test = 0;
- rcutorture_shutdown_absorb("rcu_torture_stutter");
- } while (!kthread_should_stop());
- VERBOSE_PRINTK_STRING("rcu_torture_stutter task stopping");
+ torture_shutdown_absorb("rcu_torture_stats");
+ } while (!torture_must_stop());
+ torture_kthread_stopping("rcu_torture_stats");
return 0;
}
@@ -1293,10 +1063,6 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
onoff_interval, onoff_holdoff);
}
-static struct notifier_block rcutorture_shutdown_nb = {
- .notifier_call = rcutorture_shutdown_notify,
-};
-
static void rcutorture_booster_cleanup(int cpu)
{
struct task_struct *t;
@@ -1304,14 +1070,12 @@ static void rcutorture_booster_cleanup(int cpu)
if (boost_tasks[cpu] == NULL)
return;
mutex_lock(&boost_mutex);
- VERBOSE_PRINTK_STRING("Stopping rcu_torture_boost task");
t = boost_tasks[cpu];
boost_tasks[cpu] = NULL;
mutex_unlock(&boost_mutex);
/* This must be outside of the mutex, otherwise deadlock! */
- kthread_stop(t);
- boost_tasks[cpu] = NULL;
+ torture_stop_kthread(rcu_torture_boost, t);
}
static int rcutorture_booster_init(int cpu)
@@ -1323,13 +1087,13 @@ static int rcutorture_booster_init(int cpu)
/* Don't allow time recalculation while creating a new task. */
mutex_lock(&boost_mutex);
- VERBOSE_PRINTK_STRING("Creating rcu_torture_boost task");
+ VERBOSE_TOROUT_STRING("Creating rcu_torture_boost task");
boost_tasks[cpu] = kthread_create_on_node(rcu_torture_boost, NULL,
cpu_to_node(cpu),
"rcu_torture_boost");
if (IS_ERR(boost_tasks[cpu])) {
retval = PTR_ERR(boost_tasks[cpu]);
- VERBOSE_PRINTK_STRING("rcu_torture_boost task create failed");
+ VERBOSE_TOROUT_STRING("rcu_torture_boost task create failed");
n_rcu_torture_boost_ktrerror++;
boost_tasks[cpu] = NULL;
mutex_unlock(&boost_mutex);
@@ -1342,175 +1106,6 @@ static int rcutorture_booster_init(int cpu)
}
/*
- * Cause the rcutorture test to shutdown the system after the test has
- * run for the time specified by the shutdown_secs module parameter.
- */
-static int
-rcu_torture_shutdown(void *arg)
-{
- long delta;
- unsigned long jiffies_snap;
-
- VERBOSE_PRINTK_STRING("rcu_torture_shutdown task started");
- jiffies_snap = ACCESS_ONCE(jiffies);
- while (ULONG_CMP_LT(jiffies_snap, shutdown_time) &&
- !kthread_should_stop()) {
- delta = shutdown_time - jiffies_snap;
- if (verbose)
- pr_alert("%s" TORTURE_FLAG
- "rcu_torture_shutdown task: %lu jiffies remaining\n",
- torture_type, delta);
- schedule_timeout_interruptible(delta);
- jiffies_snap = ACCESS_ONCE(jiffies);
- }
- if (kthread_should_stop()) {
- VERBOSE_PRINTK_STRING("rcu_torture_shutdown task stopping");
- return 0;
- }
-
- /* OK, shut down the system. */
-
- VERBOSE_PRINTK_STRING("rcu_torture_shutdown task shutting down system");
- shutdown_task = NULL; /* Avoid self-kill deadlock. */
- rcu_torture_cleanup(); /* Get the success/failure message. */
- kernel_power_off(); /* Shut down the system. */
- return 0;
-}
-
-#ifdef CONFIG_HOTPLUG_CPU
-
-/*
- * Execute random CPU-hotplug operations at the interval specified
- * by the onoff_interval.
- */
-static int
-rcu_torture_onoff(void *arg)
-{
- int cpu;
- unsigned long delta;
- int maxcpu = -1;
- DEFINE_RCU_RANDOM(rand);
- int ret;
- unsigned long starttime;
-
- VERBOSE_PRINTK_STRING("rcu_torture_onoff task started");
- for_each_online_cpu(cpu)
- maxcpu = cpu;
- WARN_ON(maxcpu < 0);
- if (onoff_holdoff > 0) {
- VERBOSE_PRINTK_STRING("rcu_torture_onoff begin holdoff");
- schedule_timeout_interruptible(onoff_holdoff * HZ);
- VERBOSE_PRINTK_STRING("rcu_torture_onoff end holdoff");
- }
- while (!kthread_should_stop()) {
- cpu = (rcu_random(&rand) >> 4) % (maxcpu + 1);
- if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) {
- if (verbose)
- pr_alert("%s" TORTURE_FLAG
- "rcu_torture_onoff task: offlining %d\n",
- torture_type, cpu);
- starttime = jiffies;
- n_offline_attempts++;
- ret = cpu_down(cpu);
- if (ret) {
- if (verbose)
- pr_alert("%s" TORTURE_FLAG
- "rcu_torture_onoff task: offline %d failed: errno %d\n",
- torture_type, cpu, ret);
- } else {
- if (verbose)
- pr_alert("%s" TORTURE_FLAG
- "rcu_torture_onoff task: offlined %d\n",
- torture_type, cpu);
- n_offline_successes++;
- delta = jiffies - starttime;
- sum_offline += delta;
- if (min_offline < 0) {
- min_offline = delta;
- max_offline = delta;
- }
- if (min_offline > delta)
- min_offline = delta;
- if (max_offline < delta)
- max_offline = delta;
- }
- } else if (cpu_is_hotpluggable(cpu)) {
- if (verbose)
- pr_alert("%s" TORTURE_FLAG
- "rcu_torture_onoff task: onlining %d\n",
- torture_type, cpu);
- starttime = jiffies;
- n_online_attempts++;
- ret = cpu_up(cpu);
- if (ret) {
- if (verbose)
- pr_alert("%s" TORTURE_FLAG
- "rcu_torture_onoff task: online %d failed: errno %d\n",
- torture_type, cpu, ret);
- } else {
- if (verbose)
- pr_alert("%s" TORTURE_FLAG
- "rcu_torture_onoff task: onlined %d\n",
- torture_type, cpu);
- n_online_successes++;
- delta = jiffies - starttime;
- sum_online += delta;
- if (min_online < 0) {
- min_online = delta;
- max_online = delta;
- }
- if (min_online > delta)
- min_online = delta;
- if (max_online < delta)
- max_online = delta;
- }
- }
- schedule_timeout_interruptible(onoff_interval * HZ);
- }
- VERBOSE_PRINTK_STRING("rcu_torture_onoff task stopping");
- return 0;
-}
-
-static int
-rcu_torture_onoff_init(void)
-{
- int ret;
-
- if (onoff_interval <= 0)
- return 0;
- onoff_task = kthread_run(rcu_torture_onoff, NULL, "rcu_torture_onoff");
- if (IS_ERR(onoff_task)) {
- ret = PTR_ERR(onoff_task);
- onoff_task = NULL;
- return ret;
- }
- return 0;
-}
-
-static void rcu_torture_onoff_cleanup(void)
-{
- if (onoff_task == NULL)
- return;
- VERBOSE_PRINTK_STRING("Stopping rcu_torture_onoff task");
- kthread_stop(onoff_task);
- onoff_task = NULL;
-}
-
-#else /* #ifdef CONFIG_HOTPLUG_CPU */
-
-static int
-rcu_torture_onoff_init(void)
-{
- return 0;
-}
-
-static void rcu_torture_onoff_cleanup(void)
-{
-}
-
-#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
-
-/*
* CPU-stall kthread. It waits as specified by stall_cpu_holdoff, then
* induces a CPU stall for the time specified by stall_cpu.
*/
@@ -1518,11 +1113,11 @@ static int rcu_torture_stall(void *args)
{
unsigned long stop_at;
- VERBOSE_PRINTK_STRING("rcu_torture_stall task started");
+ VERBOSE_TOROUT_STRING("rcu_torture_stall task started");
if (stall_cpu_holdoff > 0) {
- VERBOSE_PRINTK_STRING("rcu_torture_stall begin holdoff");
+ VERBOSE_TOROUT_STRING("rcu_torture_stall begin holdoff");
schedule_timeout_interruptible(stall_cpu_holdoff * HZ);
- VERBOSE_PRINTK_STRING("rcu_torture_stall end holdoff");
+ VERBOSE_TOROUT_STRING("rcu_torture_stall end holdoff");
}
if (!kthread_should_stop()) {
stop_at = get_seconds() + stall_cpu;
@@ -1536,7 +1131,7 @@ static int rcu_torture_stall(void *args)
rcu_read_unlock();
pr_alert("rcu_torture_stall end.\n");
}
- rcutorture_shutdown_absorb("rcu_torture_stall");
+ torture_shutdown_absorb("rcu_torture_stall");
while (!kthread_should_stop())
schedule_timeout_interruptible(10 * HZ);
return 0;
@@ -1545,27 +1140,9 @@ static int rcu_torture_stall(void *args)
/* Spawn CPU-stall kthread, if stall_cpu specified. */
static int __init rcu_torture_stall_init(void)
{
- int ret;
-
if (stall_cpu <= 0)
return 0;
- stall_task = kthread_run(rcu_torture_stall, NULL, "rcu_torture_stall");
- if (IS_ERR(stall_task)) {
- ret = PTR_ERR(stall_task);
- stall_task = NULL;
- return ret;
- }
- return 0;
-}
-
-/* Clean up after the CPU-stall kthread, if one was spawned. */
-static void rcu_torture_stall_cleanup(void)
-{
- if (stall_task == NULL)
- return;
- VERBOSE_PRINTK_STRING("Stopping rcu_torture_stall_task.");
- kthread_stop(stall_task);
- stall_task = NULL;
+ return torture_create_kthread(rcu_torture_stall, NULL, stall_task);
}
/* Callback function for RCU barrier testing. */
@@ -1583,28 +1160,24 @@ static int rcu_torture_barrier_cbs(void *arg)
struct rcu_head rcu;
init_rcu_head_on_stack(&rcu);
- VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task started");
- set_user_nice(current, 19);
+ VERBOSE_TOROUT_STRING("rcu_torture_barrier_cbs task started");
+ set_user_nice(current, MAX_NICE);
do {
wait_event(barrier_cbs_wq[myid],
(newphase =
ACCESS_ONCE(barrier_phase)) != lastphase ||
- kthread_should_stop() ||
- fullstop != FULLSTOP_DONTSTOP);
+ torture_must_stop());
lastphase = newphase;
smp_mb(); /* ensure barrier_phase load before ->call(). */
- if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP)
+ if (torture_must_stop())
break;
cur_ops->call(&rcu, rcu_torture_barrier_cbf);
if (atomic_dec_and_test(&barrier_cbs_count))
wake_up(&barrier_wq);
- } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
- VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task stopping");
- rcutorture_shutdown_absorb("rcu_torture_barrier_cbs");
- while (!kthread_should_stop())
- schedule_timeout_interruptible(1);
+ } while (!torture_must_stop());
cur_ops->cb_barrier();
destroy_rcu_head_on_stack(&rcu);
+ torture_kthread_stopping("rcu_torture_barrier_cbs");
return 0;
}
@@ -1613,7 +1186,7 @@ static int rcu_torture_barrier(void *arg)
{
int i;
- VERBOSE_PRINTK_STRING("rcu_torture_barrier task starting");
+ VERBOSE_TOROUT_STRING("rcu_torture_barrier task starting");
do {
atomic_set(&barrier_cbs_invoked, 0);
atomic_set(&barrier_cbs_count, n_barrier_cbs);
@@ -1623,9 +1196,8 @@ static int rcu_torture_barrier(void *arg)
wake_up(&barrier_cbs_wq[i]);
wait_event(barrier_wq,
atomic_read(&barrier_cbs_count) == 0 ||
- kthread_should_stop() ||
- fullstop != FULLSTOP_DONTSTOP);
- if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP)
+ torture_must_stop());
+ if (torture_must_stop())
break;
n_barrier_attempts++;
cur_ops->cb_barrier(); /* Implies smp_mb() for wait_event(). */
@@ -1635,11 +1207,8 @@ static int rcu_torture_barrier(void *arg)
}
n_barrier_successes++;
schedule_timeout_interruptible(HZ / 10);
- } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
- VERBOSE_PRINTK_STRING("rcu_torture_barrier task stopping");
- rcutorture_shutdown_absorb("rcu_torture_barrier");
- while (!kthread_should_stop())
- schedule_timeout_interruptible(1);
+ } while (!torture_must_stop());
+ torture_kthread_stopping("rcu_torture_barrier");
return 0;
}
@@ -1672,24 +1241,13 @@ static int rcu_torture_barrier_init(void)
return -ENOMEM;
for (i = 0; i < n_barrier_cbs; i++) {
init_waitqueue_head(&barrier_cbs_wq[i]);
- barrier_cbs_tasks[i] = kthread_run(rcu_torture_barrier_cbs,
- (void *)(long)i,
- "rcu_torture_barrier_cbs");
- if (IS_ERR(barrier_cbs_tasks[i])) {
- ret = PTR_ERR(barrier_cbs_tasks[i]);
- VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier_cbs");
- barrier_cbs_tasks[i] = NULL;
+ ret = torture_create_kthread(rcu_torture_barrier_cbs,
+ (void *)(long)i,
+ barrier_cbs_tasks[i]);
+ if (ret)
return ret;
- }
}
- barrier_task = kthread_run(rcu_torture_barrier, NULL,
- "rcu_torture_barrier");
- if (IS_ERR(barrier_task)) {
- ret = PTR_ERR(barrier_task);
- VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier");
- barrier_task = NULL;
- }
- return 0;
+ return torture_create_kthread(rcu_torture_barrier, NULL, barrier_task);
}
/* Clean up after RCU barrier testing. */
@@ -1697,19 +1255,11 @@ static void rcu_torture_barrier_cleanup(void)
{
int i;
- if (barrier_task != NULL) {
- VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier task");
- kthread_stop(barrier_task);
- barrier_task = NULL;
- }
+ torture_stop_kthread(rcu_torture_barrier, barrier_task);
if (barrier_cbs_tasks != NULL) {
- for (i = 0; i < n_barrier_cbs; i++) {
- if (barrier_cbs_tasks[i] != NULL) {
- VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier_cbs task");
- kthread_stop(barrier_cbs_tasks[i]);
- barrier_cbs_tasks[i] = NULL;
- }
- }
+ for (i = 0; i < n_barrier_cbs; i++)
+ torture_stop_kthread(rcu_torture_barrier_cbs,
+ barrier_cbs_tasks[i]);
kfree(barrier_cbs_tasks);
barrier_cbs_tasks = NULL;
}
@@ -1747,90 +1297,42 @@ rcu_torture_cleanup(void)
{
int i;
- mutex_lock(&fullstop_mutex);
rcutorture_record_test_transition();
- if (fullstop == FULLSTOP_SHUTDOWN) {
- pr_warn(/* but going down anyway, so... */
- "Concurrent 'rmmod rcutorture' and shutdown illegal!\n");
- mutex_unlock(&fullstop_mutex);
- schedule_timeout_uninterruptible(10);
+ if (torture_cleanup()) {
if (cur_ops->cb_barrier != NULL)
cur_ops->cb_barrier();
return;
}
- fullstop = FULLSTOP_RMMOD;
- mutex_unlock(&fullstop_mutex);
- unregister_reboot_notifier(&rcutorture_shutdown_nb);
- rcu_torture_barrier_cleanup();
- rcu_torture_stall_cleanup();
- if (stutter_task) {
- VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task");
- kthread_stop(stutter_task);
- }
- stutter_task = NULL;
- if (shuffler_task) {
- VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task");
- kthread_stop(shuffler_task);
- free_cpumask_var(shuffle_tmp_mask);
- }
- shuffler_task = NULL;
- if (writer_task) {
- VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task");
- kthread_stop(writer_task);
- }
- writer_task = NULL;
+ rcu_torture_barrier_cleanup();
+ torture_stop_kthread(rcu_torture_stall, stall_task);
+ torture_stop_kthread(rcu_torture_writer, writer_task);
if (reader_tasks) {
- for (i = 0; i < nrealreaders; i++) {
- if (reader_tasks[i]) {
- VERBOSE_PRINTK_STRING(
- "Stopping rcu_torture_reader task");
- kthread_stop(reader_tasks[i]);
- }
- reader_tasks[i] = NULL;
- }
+ for (i = 0; i < nrealreaders; i++)
+ torture_stop_kthread(rcu_torture_reader,
+ reader_tasks[i]);
kfree(reader_tasks);
- reader_tasks = NULL;
}
rcu_torture_current = NULL;
if (fakewriter_tasks) {
for (i = 0; i < nfakewriters; i++) {
- if (fakewriter_tasks[i]) {
- VERBOSE_PRINTK_STRING(
- "Stopping rcu_torture_fakewriter task");
- kthread_stop(fakewriter_tasks[i]);
- }
- fakewriter_tasks[i] = NULL;
+ torture_stop_kthread(rcu_torture_fakewriter,
+ fakewriter_tasks[i]);
}
kfree(fakewriter_tasks);
fakewriter_tasks = NULL;
}
- if (stats_task) {
- VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task");
- kthread_stop(stats_task);
- }
- stats_task = NULL;
-
- if (fqs_task) {
- VERBOSE_PRINTK_STRING("Stopping rcu_torture_fqs task");
- kthread_stop(fqs_task);
- }
- fqs_task = NULL;
+ torture_stop_kthread(rcu_torture_stats, stats_task);
+ torture_stop_kthread(rcu_torture_fqs, fqs_task);
if ((test_boost == 1 && cur_ops->can_boost) ||
test_boost == 2) {
unregister_cpu_notifier(&rcutorture_cpu_nb);
for_each_possible_cpu(i)
rcutorture_booster_cleanup(i);
}
- if (shutdown_task != NULL) {
- VERBOSE_PRINTK_STRING("Stopping rcu_torture_shutdown task");
- kthread_stop(shutdown_task);
- }
- shutdown_task = NULL;
- rcu_torture_onoff_cleanup();
/* Wait for all RCU callbacks to fire. */
@@ -1841,8 +1343,7 @@ rcu_torture_cleanup(void)
if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error)
rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE");
- else if (n_online_successes != n_online_attempts ||
- n_offline_successes != n_offline_attempts)
+ else if (torture_onoff_failures())
rcu_torture_print_module_parms(cur_ops,
"End of test: RCU_HOTPLUG");
else
@@ -1911,12 +1412,11 @@ rcu_torture_init(void)
int i;
int cpu;
int firsterr = 0;
- int retval;
static struct rcu_torture_ops *torture_ops[] = {
- &rcu_ops, &rcu_bh_ops, &srcu_ops, &sched_ops,
+ &rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &sched_ops,
};
- mutex_lock(&fullstop_mutex);
+ torture_init_begin(torture_type, verbose, &rcutorture_runnable);
/* Process args and tell the world that the torturer is on the job. */
for (i = 0; i < ARRAY_SIZE(torture_ops); i++) {
@@ -1931,7 +1431,7 @@ rcu_torture_init(void)
for (i = 0; i < ARRAY_SIZE(torture_ops); i++)
pr_alert(" %s", torture_ops[i]->name);
pr_alert("\n");
- mutex_unlock(&fullstop_mutex);
+ torture_init_end();
return -EINVAL;
}
if (cur_ops->fqs == NULL && fqs_duration != 0) {
@@ -1946,7 +1446,6 @@ rcu_torture_init(void)
else
nrealreaders = 2 * num_online_cpus();
rcu_torture_print_module_parms(cur_ops, "Start of test");
- fullstop = FULLSTOP_DONTSTOP;
/* Set up the freelist. */
@@ -1982,108 +1481,61 @@ rcu_torture_init(void)
/* Start up the kthreads. */
- VERBOSE_PRINTK_STRING("Creating rcu_torture_writer task");
- writer_task = kthread_create(rcu_torture_writer, NULL,
- "rcu_torture_writer");
- if (IS_ERR(writer_task)) {
- firsterr = PTR_ERR(writer_task);
- VERBOSE_PRINTK_ERRSTRING("Failed to create writer");
- writer_task = NULL;
+ firsterr = torture_create_kthread(rcu_torture_writer, NULL,
+ writer_task);
+ if (firsterr)
goto unwind;
- }
- wake_up_process(writer_task);
fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]),
GFP_KERNEL);
if (fakewriter_tasks == NULL) {
- VERBOSE_PRINTK_ERRSTRING("out of memory");
+ VERBOSE_TOROUT_ERRSTRING("out of memory");
firsterr = -ENOMEM;
goto unwind;
}
for (i = 0; i < nfakewriters; i++) {
- VERBOSE_PRINTK_STRING("Creating rcu_torture_fakewriter task");
- fakewriter_tasks[i] = kthread_run(rcu_torture_fakewriter, NULL,
- "rcu_torture_fakewriter");
- if (IS_ERR(fakewriter_tasks[i])) {
- firsterr = PTR_ERR(fakewriter_tasks[i]);
- VERBOSE_PRINTK_ERRSTRING("Failed to create fakewriter");
- fakewriter_tasks[i] = NULL;
+ firsterr = torture_create_kthread(rcu_torture_fakewriter,
+ NULL, fakewriter_tasks[i]);
+ if (firsterr)
goto unwind;
- }
}
reader_tasks = kzalloc(nrealreaders * sizeof(reader_tasks[0]),
GFP_KERNEL);
if (reader_tasks == NULL) {
- VERBOSE_PRINTK_ERRSTRING("out of memory");
+ VERBOSE_TOROUT_ERRSTRING("out of memory");
firsterr = -ENOMEM;
goto unwind;
}
for (i = 0; i < nrealreaders; i++) {
- VERBOSE_PRINTK_STRING("Creating rcu_torture_reader task");
- reader_tasks[i] = kthread_run(rcu_torture_reader, NULL,
- "rcu_torture_reader");
- if (IS_ERR(reader_tasks[i])) {
- firsterr = PTR_ERR(reader_tasks[i]);
- VERBOSE_PRINTK_ERRSTRING("Failed to create reader");
- reader_tasks[i] = NULL;
+ firsterr = torture_create_kthread(rcu_torture_reader, NULL,
+ reader_tasks[i]);
+ if (firsterr)
goto unwind;
- }
}
if (stat_interval > 0) {
- VERBOSE_PRINTK_STRING("Creating rcu_torture_stats task");
- stats_task = kthread_run(rcu_torture_stats, NULL,
- "rcu_torture_stats");
- if (IS_ERR(stats_task)) {
- firsterr = PTR_ERR(stats_task);
- VERBOSE_PRINTK_ERRSTRING("Failed to create stats");
- stats_task = NULL;
+ firsterr = torture_create_kthread(rcu_torture_stats, NULL,
+ stats_task);
+ if (firsterr)
goto unwind;
- }
}
if (test_no_idle_hz) {
- rcu_idle_cpu = num_online_cpus() - 1;
-
- if (!alloc_cpumask_var(&shuffle_tmp_mask, GFP_KERNEL)) {
- firsterr = -ENOMEM;
- VERBOSE_PRINTK_ERRSTRING("Failed to alloc mask");
- goto unwind;
- }
-
- /* Create the shuffler thread */
- shuffler_task = kthread_run(rcu_torture_shuffle, NULL,
- "rcu_torture_shuffle");
- if (IS_ERR(shuffler_task)) {
- free_cpumask_var(shuffle_tmp_mask);
- firsterr = PTR_ERR(shuffler_task);
- VERBOSE_PRINTK_ERRSTRING("Failed to create shuffler");
- shuffler_task = NULL;
+ firsterr = torture_shuffle_init(shuffle_interval * HZ);
+ if (firsterr)
goto unwind;
- }
}
if (stutter < 0)
stutter = 0;
if (stutter) {
- /* Create the stutter thread */
- stutter_task = kthread_run(rcu_torture_stutter, NULL,
- "rcu_torture_stutter");
- if (IS_ERR(stutter_task)) {
- firsterr = PTR_ERR(stutter_task);
- VERBOSE_PRINTK_ERRSTRING("Failed to create stutter");
- stutter_task = NULL;
+ firsterr = torture_stutter_init(stutter * HZ);
+ if (firsterr)
goto unwind;
- }
}
if (fqs_duration < 0)
fqs_duration = 0;
if (fqs_duration) {
- /* Create the stutter thread */
- fqs_task = kthread_run(rcu_torture_fqs, NULL,
- "rcu_torture_fqs");
- if (IS_ERR(fqs_task)) {
- firsterr = PTR_ERR(fqs_task);
- VERBOSE_PRINTK_ERRSTRING("Failed to create fqs");
- fqs_task = NULL;
+ /* Create the fqs thread */
+ torture_create_kthread(rcu_torture_fqs, NULL, fqs_task);
+ if (firsterr)
goto unwind;
- }
}
if (test_boost_interval < 1)
test_boost_interval = 1;
@@ -2097,49 +1549,31 @@ rcu_torture_init(void)
for_each_possible_cpu(i) {
if (cpu_is_offline(i))
continue; /* Heuristic: CPU can go offline. */
- retval = rcutorture_booster_init(i);
- if (retval < 0) {
- firsterr = retval;
+ firsterr = rcutorture_booster_init(i);
+ if (firsterr)
goto unwind;
- }
}
}
- if (shutdown_secs > 0) {
- shutdown_time = jiffies + shutdown_secs * HZ;
- shutdown_task = kthread_create(rcu_torture_shutdown, NULL,
- "rcu_torture_shutdown");
- if (IS_ERR(shutdown_task)) {
- firsterr = PTR_ERR(shutdown_task);
- VERBOSE_PRINTK_ERRSTRING("Failed to create shutdown");
- shutdown_task = NULL;
- goto unwind;
- }
- wake_up_process(shutdown_task);
- }
- i = rcu_torture_onoff_init();
- if (i != 0) {
- firsterr = i;
+ firsterr = torture_shutdown_init(shutdown_secs, rcu_torture_cleanup);
+ if (firsterr)
goto unwind;
- }
- register_reboot_notifier(&rcutorture_shutdown_nb);
- i = rcu_torture_stall_init();
- if (i != 0) {
- firsterr = i;
+ firsterr = torture_onoff_init(onoff_holdoff * HZ, onoff_interval * HZ);
+ if (firsterr)
goto unwind;
- }
- retval = rcu_torture_barrier_init();
- if (retval != 0) {
- firsterr = retval;
+ firsterr = rcu_torture_stall_init();
+ if (firsterr)
+ goto unwind;
+ firsterr = rcu_torture_barrier_init();
+ if (firsterr)
goto unwind;
- }
if (object_debug)
rcu_test_debug_objects();
rcutorture_record_test_transition();
- mutex_unlock(&fullstop_mutex);
+ torture_init_end();
return 0;
unwind:
- mutex_unlock(&fullstop_mutex);
+ torture_init_end();
rcu_torture_cleanup();
return firsterr;
}
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
index 3318d828438..c639556f3fa 100644
--- a/kernel/rcu/srcu.c
+++ b/kernel/rcu/srcu.c
@@ -12,8 +12,8 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
*
* Copyright (C) IBM Corporation, 2006
* Copyright (C) Fujitsu, 2012
@@ -36,8 +36,6 @@
#include <linux/delay.h>
#include <linux/srcu.h>
-#include <trace/events/rcu.h>
-
#include "rcu.h"
/*
@@ -398,7 +396,7 @@ void call_srcu(struct srcu_struct *sp, struct rcu_head *head,
rcu_batch_queue(&sp->batch_queue, head);
if (!sp->running) {
sp->running = true;
- schedule_delayed_work(&sp->work, 0);
+ queue_delayed_work(system_power_efficient_wq, &sp->work, 0);
}
spin_unlock_irqrestore(&sp->queue_lock, flags);
}
@@ -674,7 +672,8 @@ static void srcu_reschedule(struct srcu_struct *sp)
}
if (pending)
- schedule_delayed_work(&sp->work, SRCU_INTERVAL);
+ queue_delayed_work(system_power_efficient_wq,
+ &sp->work, SRCU_INTERVAL);
}
/*
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index 1254f312d02..d9efcc13008 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -12,8 +12,8 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
*
* Copyright IBM Corporation, 2008
*
@@ -37,10 +37,6 @@
#include <linux/prefetch.h>
#include <linux/ftrace_event.h>
-#ifdef CONFIG_RCU_TRACE
-#include <trace/events/rcu.h>
-#endif /* #else #ifdef CONFIG_RCU_TRACE */
-
#include "rcu.h"
/* Forward declarations for tiny_plugin.h. */
diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h
index 280d06cae35..43152852056 100644
--- a/kernel/rcu/tiny_plugin.h
+++ b/kernel/rcu/tiny_plugin.h
@@ -14,8 +14,8 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
*
* Copyright (c) 2010 Linaro
*
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index b3d116cd072..0c47e300210 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -12,8 +12,8 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
*
* Copyright IBM Corporation, 2008
*
@@ -58,8 +58,6 @@
#include <linux/suspend.h>
#include "tree.h"
-#include <trace/events/rcu.h>
-
#include "rcu.h"
MODULE_ALIAS("rcutree");
@@ -837,7 +835,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
* to the next. Only do this for the primary flavor of RCU.
*/
if (rdp->rsp == rcu_state &&
- ULONG_CMP_GE(ACCESS_ONCE(jiffies), rdp->rsp->jiffies_resched)) {
+ ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) {
rdp->rsp->jiffies_resched += 5;
resched_cpu(rdp->cpu);
}
@@ -847,7 +845,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
static void record_gp_stall_check_time(struct rcu_state *rsp)
{
- unsigned long j = ACCESS_ONCE(jiffies);
+ unsigned long j = jiffies;
unsigned long j1;
rsp->gp_start = j;
@@ -1005,7 +1003,7 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
if (rcu_cpu_stall_suppress || !rcu_gp_in_progress(rsp))
return;
- j = ACCESS_ONCE(jiffies);
+ j = jiffies;
/*
* Lots of memory barriers to reject false positives.
@@ -1423,13 +1421,14 @@ static int rcu_gp_init(struct rcu_state *rsp)
/* Advance to a new grace period and initialize state. */
record_gp_stall_check_time(rsp);
- smp_wmb(); /* Record GP times before starting GP. */
- rsp->gpnum++;
+ /* Record GP times before starting GP, hence smp_store_release(). */
+ smp_store_release(&rsp->gpnum, rsp->gpnum + 1);
trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start"));
raw_spin_unlock_irq(&rnp->lock);
/* Exclude any concurrent CPU-hotplug operations. */
mutex_lock(&rsp->onoff_mutex);
+ smp_mb__after_unlock_lock(); /* ->gpnum increment before GP! */
/*
* Set the quiescent-state-needed bits in all the rcu_node
@@ -1557,10 +1556,11 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
}
rnp = rcu_get_root(rsp);
raw_spin_lock_irq(&rnp->lock);
- smp_mb__after_unlock_lock();
+ smp_mb__after_unlock_lock(); /* Order GP before ->completed update. */
rcu_nocb_gp_set(rnp, nocb);
- rsp->completed = rsp->gpnum; /* Declare grace period done. */
+ /* Declare grace period done. */
+ ACCESS_ONCE(rsp->completed) = rsp->gpnum;
trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end"));
rsp->fqs_state = RCU_GP_IDLE;
rdp = this_cpu_ptr(rsp->rda);
@@ -2304,7 +2304,7 @@ static void force_quiescent_state(struct rcu_state *rsp)
if (rnp_old != NULL)
raw_spin_unlock(&rnp_old->fqslock);
if (ret) {
- rsp->n_force_qs_lh++;
+ ACCESS_ONCE(rsp->n_force_qs_lh)++;
return;
}
rnp_old = rnp;
@@ -2316,7 +2316,7 @@ static void force_quiescent_state(struct rcu_state *rsp)
smp_mb__after_unlock_lock();
raw_spin_unlock(&rnp_old->fqslock);
if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
- rsp->n_force_qs_lh++;
+ ACCESS_ONCE(rsp->n_force_qs_lh)++;
raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
return; /* Someone beat us to it. */
}
@@ -2639,6 +2639,58 @@ void synchronize_rcu_bh(void)
}
EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
+/**
+ * get_state_synchronize_rcu - Snapshot current RCU state
+ *
+ * Returns a cookie that is used by a later call to cond_synchronize_rcu()
+ * to determine whether or not a full grace period has elapsed in the
+ * meantime.
+ */
+unsigned long get_state_synchronize_rcu(void)
+{
+ /*
+ * Any prior manipulation of RCU-protected data must happen
+ * before the load from ->gpnum.
+ */
+ smp_mb(); /* ^^^ */
+
+ /*
+ * Make sure this load happens before the purportedly
+ * time-consuming work between get_state_synchronize_rcu()
+ * and cond_synchronize_rcu().
+ */
+ return smp_load_acquire(&rcu_state->gpnum);
+}
+EXPORT_SYMBOL_GPL(get_state_synchronize_rcu);
+
+/**
+ * cond_synchronize_rcu - Conditionally wait for an RCU grace period
+ *
+ * @oldstate: return value from earlier call to get_state_synchronize_rcu()
+ *
+ * If a full RCU grace period has elapsed since the earlier call to
+ * get_state_synchronize_rcu(), just return. Otherwise, invoke
+ * synchronize_rcu() to wait for a full grace period.
+ *
+ * Yes, this function does not take counter wrap into account. But
+ * counter wrap is harmless. If the counter wraps, we have waited for
+ * more than 2 billion grace periods (and way more on a 64-bit system!),
+ * so waiting for one additional grace period should be just fine.
+ */
+void cond_synchronize_rcu(unsigned long oldstate)
+{
+ unsigned long newstate;
+
+ /*
+ * Ensure that this load happens before any RCU-destructive
+ * actions the caller might carry out after we return.
+ */
+ newstate = smp_load_acquire(&rcu_state->completed);
+ if (ULONG_CMP_GE(oldstate, newstate))
+ synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(cond_synchronize_rcu);
+
static int synchronize_sched_expedited_cpu_stop(void *data)
{
/*
@@ -2880,7 +2932,7 @@ static int rcu_pending(int cpu)
* non-NULL, store an indication of whether all callbacks are lazy.
* (If there are no callbacks, all of them are deemed to be lazy.)
*/
-static int rcu_cpu_has_callbacks(int cpu, bool *all_lazy)
+static int __maybe_unused rcu_cpu_has_callbacks(int cpu, bool *all_lazy)
{
bool al = true;
bool hc = false;
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 8c19873f1ac..75dc3c39a02 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -13,8 +13,8 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
*
* Copyright IBM Corporation, 2008
*
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 6e2ef4b2b92..962d1d58992 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -14,8 +14,8 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
*
* Copyright Red Hat, 2009
* Copyright IBM Corporation, 2009
@@ -1586,11 +1586,13 @@ static void rcu_prepare_kthreads(int cpu)
* Because we not have RCU_FAST_NO_HZ, just check whether this CPU needs
* any flavor of RCU.
*/
+#ifndef CONFIG_RCU_NOCB_CPU_ALL
int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
{
*delta_jiffies = ULONG_MAX;
return rcu_cpu_has_callbacks(cpu, NULL);
}
+#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
/*
* Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
@@ -1656,7 +1658,7 @@ extern int tick_nohz_active;
* only if it has been awhile since the last time we did so. Afterwards,
* if there are any callbacks ready for immediate invocation, return true.
*/
-static bool rcu_try_advance_all_cbs(void)
+static bool __maybe_unused rcu_try_advance_all_cbs(void)
{
bool cbs_ready = false;
struct rcu_data *rdp;
@@ -1696,6 +1698,7 @@ static bool rcu_try_advance_all_cbs(void)
*
* The caller must have disabled interrupts.
*/
+#ifndef CONFIG_RCU_NOCB_CPU_ALL
int rcu_needs_cpu(int cpu, unsigned long *dj)
{
struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
@@ -1726,6 +1729,7 @@ int rcu_needs_cpu(int cpu, unsigned long *dj)
}
return 0;
}
+#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
/*
* Prepare a CPU for idle from an RCU perspective. The first major task
@@ -1739,6 +1743,7 @@ int rcu_needs_cpu(int cpu, unsigned long *dj)
*/
static void rcu_prepare_for_idle(int cpu)
{
+#ifndef CONFIG_RCU_NOCB_CPU_ALL
struct rcu_data *rdp;
struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
struct rcu_node *rnp;
@@ -1790,6 +1795,7 @@ static void rcu_prepare_for_idle(int cpu)
rcu_accelerate_cbs(rsp, rnp, rdp);
raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
}
+#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
}
/*
@@ -1799,11 +1805,12 @@ static void rcu_prepare_for_idle(int cpu)
*/
static void rcu_cleanup_after_idle(int cpu)
{
-
+#ifndef CONFIG_RCU_NOCB_CPU_ALL
if (rcu_is_nocb_cpu(cpu))
return;
if (rcu_try_advance_all_cbs())
invoke_rcu_core();
+#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
}
/*
@@ -2101,6 +2108,7 @@ static void rcu_init_one_nocb(struct rcu_node *rnp)
init_waitqueue_head(&rnp->nocb_gp_wq[1]);
}
+#ifndef CONFIG_RCU_NOCB_CPU_ALL
/* Is the specified CPU a no-CPUs CPU? */
bool rcu_is_nocb_cpu(int cpu)
{
@@ -2108,6 +2116,7 @@ bool rcu_is_nocb_cpu(int cpu)
return cpumask_test_cpu(cpu, rcu_nocb_mask);
return false;
}
+#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
/*
* Enqueue the specified string of rcu_head structures onto the specified
@@ -2893,7 +2902,7 @@ static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp)
* CPU unless the grace period has extended for too long.
*
* This code relies on the fact that all NO_HZ_FULL CPUs are also
- * CONFIG_RCU_NOCB_CPUs.
+ * CONFIG_RCU_NOCB_CPU CPUs.
*/
static bool rcu_nohz_full_cpu(struct rcu_state *rsp)
{
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c
index 4def475336d..5cdc62e1bee 100644
--- a/kernel/rcu/tree_trace.c
+++ b/kernel/rcu/tree_trace.c
@@ -12,8 +12,8 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
*
* Copyright IBM Corporation, 2008
*
@@ -273,7 +273,7 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n",
rsp->n_force_qs, rsp->n_force_qs_ngp,
rsp->n_force_qs - rsp->n_force_qs_ngp,
- rsp->n_force_qs_lh, rsp->qlen_lazy, rsp->qlen);
+ ACCESS_ONCE(rsp->n_force_qs_lh), rsp->qlen_lazy, rsp->qlen);
for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < rcu_num_nodes; rnp++) {
if (rnp->level != level) {
seq_puts(m, "\n");
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index c54609faf23..4c0a9b0af46 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -12,8 +12,8 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
*
* Copyright IBM Corporation, 2001
*
@@ -49,7 +49,6 @@
#include <linux/module.h>
#define CREATE_TRACE_POINTS
-#include <trace/events/rcu.h>
#include "rcu.h"
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 9a95c8c2af2..ab32b7b0db5 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -13,7 +13,7 @@ endif
obj-y += core.o proc.o clock.o cputime.o
obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
-obj-y += wait.o completion.o
+obj-y += wait.o completion.o idle.o
obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
obj-$(CONFIG_SCHEDSTATS) += stats.o
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index 4a073539c58..e73efba9830 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -203,7 +203,7 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice)
struct autogroup *ag;
int err;
- if (nice < -20 || nice > 19)
+ if (nice < MIN_NICE || nice > MAX_NICE)
return -EINVAL;
err = security_task_setnice(current, nice);
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index 43c2bcc3576..b30a2924ef1 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -301,14 +301,14 @@ u64 sched_clock_cpu(int cpu)
if (unlikely(!sched_clock_running))
return 0ull;
- preempt_disable();
+ preempt_disable_notrace();
scd = cpu_sdc(cpu);
if (cpu != smp_processor_id())
clock = sched_clock_remote(scd);
else
clock = sched_clock_local(scd);
- preempt_enable();
+ preempt_enable_notrace();
return clock;
}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b46131ef6aa..ae365aaa818 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1745,8 +1745,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
p->numa_scan_period = sysctl_numa_balancing_scan_delay;
p->numa_work.next = &p->numa_work;
- p->numa_faults = NULL;
- p->numa_faults_buffer = NULL;
+ p->numa_faults_memory = NULL;
+ p->numa_faults_buffer_memory = NULL;
+ p->last_task_numa_placement = 0;
+ p->last_sum_exec_runtime = 0;
INIT_LIST_HEAD(&p->numa_entry);
p->numa_group = NULL;
@@ -1952,7 +1954,7 @@ static int dl_overflow(struct task_struct *p, int policy,
{
struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
- u64 period = attr->sched_period;
+ u64 period = attr->sched_period ?: attr->sched_deadline;
u64 runtime = attr->sched_runtime;
u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
int cpus, err = -1;
@@ -2149,8 +2151,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
if (mm)
mmdrop(mm);
if (unlikely(prev_state == TASK_DEAD)) {
- task_numa_free(prev);
-
if (prev->sched_class->task_dead)
prev->sched_class->task_dead(prev);
@@ -2167,13 +2167,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
#ifdef CONFIG_SMP
-/* assumes rq->lock is held */
-static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
-{
- if (prev->sched_class->pre_schedule)
- prev->sched_class->pre_schedule(rq, prev);
-}
-
/* rq->lock is NOT held, but preemption is disabled */
static inline void post_schedule(struct rq *rq)
{
@@ -2191,10 +2184,6 @@ static inline void post_schedule(struct rq *rq)
#else
-static inline void pre_schedule(struct rq *rq, struct task_struct *p)
-{
-}
-
static inline void post_schedule(struct rq *rq)
{
}
@@ -2510,8 +2499,13 @@ void __kprobes preempt_count_add(int val)
DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
PREEMPT_MASK - 10);
#endif
- if (preempt_count() == val)
- trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
+ if (preempt_count() == val) {
+ unsigned long ip = get_parent_ip(CALLER_ADDR1);
+#ifdef CONFIG_DEBUG_PREEMPT
+ current->preempt_disable_ip = ip;
+#endif
+ trace_preempt_off(CALLER_ADDR0, ip);
+ }
}
EXPORT_SYMBOL(preempt_count_add);
@@ -2554,6 +2548,13 @@ static noinline void __schedule_bug(struct task_struct *prev)
print_modules();
if (irqs_disabled())
print_irqtrace_events(prev);
+#ifdef CONFIG_DEBUG_PREEMPT
+ if (in_atomic_preempt_off()) {
+ pr_err("Preemption disabled at:");
+ print_ip_sym(current->preempt_disable_ip);
+ pr_cont("\n");
+ }
+#endif
dump_stack();
add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
}
@@ -2577,36 +2578,34 @@ static inline void schedule_debug(struct task_struct *prev)
schedstat_inc(this_rq(), sched_count);
}
-static void put_prev_task(struct rq *rq, struct task_struct *prev)
-{
- if (prev->on_rq || rq->skip_clock_update < 0)
- update_rq_clock(rq);
- prev->sched_class->put_prev_task(rq, prev);
-}
-
/*
* Pick up the highest-prio task:
*/
static inline struct task_struct *
-pick_next_task(struct rq *rq)
+pick_next_task(struct rq *rq, struct task_struct *prev)
{
- const struct sched_class *class;
+ const struct sched_class *class = &fair_sched_class;
struct task_struct *p;
/*
* Optimization: we know that if all tasks are in
* the fair class we can call that function directly:
*/
- if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
- p = fair_sched_class.pick_next_task(rq);
- if (likely(p))
+ if (likely(prev->sched_class == class &&
+ rq->nr_running == rq->cfs.h_nr_running)) {
+ p = fair_sched_class.pick_next_task(rq, prev);
+ if (likely(p && p != RETRY_TASK))
return p;
}
+again:
for_each_class(class) {
- p = class->pick_next_task(rq);
- if (p)
+ p = class->pick_next_task(rq, prev);
+ if (p) {
+ if (unlikely(p == RETRY_TASK))
+ goto again;
return p;
+ }
}
BUG(); /* the idle class will always have a runnable task */
@@ -2700,13 +2699,10 @@ need_resched:
switch_count = &prev->nvcsw;
}
- pre_schedule(rq, prev);
-
- if (unlikely(!rq->nr_running))
- idle_balance(cpu, rq);
+ if (prev->on_rq || rq->skip_clock_update < 0)
+ update_rq_clock(rq);
- put_prev_task(rq, prev);
- next = pick_next_task(rq);
+ next = pick_next_task(rq, prev);
clear_tsk_need_resched(prev);
clear_preempt_need_resched();
rq->skip_clock_update = 0;
@@ -2908,7 +2904,8 @@ EXPORT_SYMBOL(sleep_on_timeout);
* This function changes the 'effective' priority of a task. It does
* not touch ->normal_prio like __setscheduler().
*
- * Used by the rt_mutex code to implement priority inheritance logic.
+ * Used by the rt_mutex code to implement priority inheritance
+ * logic. Call site only calls if the priority of the task changed.
*/
void rt_mutex_setprio(struct task_struct *p, int prio)
{
@@ -2998,7 +2995,7 @@ void set_user_nice(struct task_struct *p, long nice)
unsigned long flags;
struct rq *rq;
- if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
+ if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
return;
/*
* We have to be careful, if called from sys_setpriority(),
@@ -3076,11 +3073,11 @@ SYSCALL_DEFINE1(nice, int, increment)
if (increment > 40)
increment = 40;
- nice = TASK_NICE(current) + increment;
- if (nice < -20)
- nice = -20;
- if (nice > 19)
- nice = 19;
+ nice = task_nice(current) + increment;
+ if (nice < MIN_NICE)
+ nice = MIN_NICE;
+ if (nice > MAX_NICE)
+ nice = MAX_NICE;
if (increment < 0 && !can_nice(current, nice))
return -EPERM;
@@ -3109,18 +3106,6 @@ int task_prio(const struct task_struct *p)
}
/**
- * task_nice - return the nice value of a given task.
- * @p: the task in question.
- *
- * Return: The nice value [ -20 ... 0 ... 19 ].
- */
-int task_nice(const struct task_struct *p)
-{
- return TASK_NICE(p);
-}
-EXPORT_SYMBOL(task_nice);
-
-/**
* idle_cpu - is a given cpu idle currently?
* @cpu: the processor in question.
*
@@ -3189,9 +3174,8 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
dl_se->dl_new = 1;
}
-/* Actually do priority change: must hold pi & rq lock. */
-static void __setscheduler(struct rq *rq, struct task_struct *p,
- const struct sched_attr *attr)
+static void __setscheduler_params(struct task_struct *p,
+ const struct sched_attr *attr)
{
int policy = attr->sched_policy;
@@ -3211,9 +3195,21 @@ static void __setscheduler(struct rq *rq, struct task_struct *p,
* getparam()/getattr() don't report silly values for !rt tasks.
*/
p->rt_priority = attr->sched_priority;
-
p->normal_prio = normal_prio(p);
- p->prio = rt_mutex_getprio(p);
+ set_load_weight(p);
+}
+
+/* Actually do priority change: must hold pi & rq lock. */
+static void __setscheduler(struct rq *rq, struct task_struct *p,
+ const struct sched_attr *attr)
+{
+ __setscheduler_params(p, attr);
+
+ /*
+ * If we get here, there was no pi waiters boosting the
+ * task. It is safe to use the normal prio.
+ */
+ p->prio = normal_prio(p);
if (dl_prio(p->prio))
p->sched_class = &dl_sched_class;
@@ -3221,8 +3217,6 @@ static void __setscheduler(struct rq *rq, struct task_struct *p,
p->sched_class = &rt_sched_class;
else
p->sched_class = &fair_sched_class;
-
- set_load_weight(p);
}
static void
@@ -3275,6 +3269,8 @@ static int __sched_setscheduler(struct task_struct *p,
const struct sched_attr *attr,
bool user)
{
+ int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
+ MAX_RT_PRIO - 1 - attr->sched_priority;
int retval, oldprio, oldpolicy = -1, on_rq, running;
int policy = attr->sched_policy;
unsigned long flags;
@@ -3319,7 +3315,7 @@ recheck:
*/
if (user && !capable(CAP_SYS_NICE)) {
if (fair_policy(policy)) {
- if (attr->sched_nice < TASK_NICE(p) &&
+ if (attr->sched_nice < task_nice(p) &&
!can_nice(p, attr->sched_nice))
return -EPERM;
}
@@ -3338,12 +3334,21 @@ recheck:
return -EPERM;
}
+ /*
+ * Can't set/change SCHED_DEADLINE policy at all for now
+ * (safest behavior); in the future we would like to allow
+ * unprivileged DL tasks to increase their relative deadline
+ * or reduce their runtime (both ways reducing utilization)
+ */
+ if (dl_policy(policy))
+ return -EPERM;
+
/*
* Treat SCHED_IDLE as nice 20. Only allow a switch to
* SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
*/
if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
- if (!can_nice(p, TASK_NICE(p)))
+ if (!can_nice(p, task_nice(p)))
return -EPERM;
}
@@ -3380,16 +3385,18 @@ recheck:
}
/*
- * If not changing anything there's no need to proceed further:
+ * If not changing anything there's no need to proceed further,
+ * but store a possible modification of reset_on_fork.
*/
if (unlikely(policy == p->policy)) {
- if (fair_policy(policy) && attr->sched_nice != TASK_NICE(p))
+ if (fair_policy(policy) && attr->sched_nice != task_nice(p))
goto change;
if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
goto change;
if (dl_policy(policy))
goto change;
+ p->sched_reset_on_fork = reset_on_fork;
task_rq_unlock(rq, p, &flags);
return 0;
}
@@ -3443,6 +3450,24 @@ change:
return -EBUSY;
}
+ p->sched_reset_on_fork = reset_on_fork;
+ oldprio = p->prio;
+
+ /*
+ * Special case for priority boosted tasks.
+ *
+ * If the new priority is lower or equal (user space view)
+ * than the current (boosted) priority, we just store the new
+ * normal parameters and do not touch the scheduler class and
+ * the runqueue. This will be done when the task deboost
+ * itself.
+ */
+ if (rt_mutex_check_prio(p, newprio)) {
+ __setscheduler_params(p, attr);
+ task_rq_unlock(rq, p, &flags);
+ return 0;
+ }
+
on_rq = p->on_rq;
running = task_current(rq, p);
if (on_rq)
@@ -3450,16 +3475,18 @@ change:
if (running)
p->sched_class->put_prev_task(rq, p);
- p->sched_reset_on_fork = reset_on_fork;
-
- oldprio = p->prio;
prev_class = p->sched_class;
__setscheduler(rq, p, attr);
if (running)
p->sched_class->set_curr_task(rq);
- if (on_rq)
- enqueue_task(rq, p, 0);
+ if (on_rq) {
+ /*
+ * We enqueue to tail when the priority of a task is
+ * increased (user space view).
+ */
+ enqueue_task(rq, p, oldprio <= p->prio ? ENQUEUE_HEAD : 0);
+ }
check_class_changed(rq, p, prev_class, oldprio);
task_rq_unlock(rq, p, &flags);
@@ -3615,7 +3642,7 @@ static int sched_copy_attr(struct sched_attr __user *uattr,
* XXX: do we want to be lenient like existing syscalls; or do we want
* to be strict and return an error on out-of-bounds values?
*/
- attr->sched_nice = clamp(attr->sched_nice, -20, 19);
+ attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);
out:
return ret;
@@ -3661,13 +3688,14 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
* @pid: the pid in question.
* @uattr: structure containing the extended parameters.
*/
-SYSCALL_DEFINE2(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr)
+SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
+ unsigned int, flags)
{
struct sched_attr attr;
struct task_struct *p;
int retval;
- if (!uattr || pid < 0)
+ if (!uattr || pid < 0 || flags)
return -EINVAL;
if (sched_copy_attr(uattr, &attr))
@@ -3786,7 +3814,7 @@ static int sched_read_attr(struct sched_attr __user *uattr,
attr->size = usize;
}
- ret = copy_to_user(uattr, attr, usize);
+ ret = copy_to_user(uattr, attr, attr->size);
if (ret)
return -EFAULT;
@@ -3804,8 +3832,8 @@ err_size:
* @uattr: structure containing the extended parameters.
* @size: sizeof(attr) for fwd/bwd comp.
*/
-SYSCALL_DEFINE3(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
- unsigned int, size)
+SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
+ unsigned int, size, unsigned int, flags)
{
struct sched_attr attr = {
.size = sizeof(struct sched_attr),
@@ -3814,7 +3842,7 @@ SYSCALL_DEFINE3(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
int retval;
if (!uattr || pid < 0 || size > PAGE_SIZE ||
- size < SCHED_ATTR_SIZE_VER0)
+ size < SCHED_ATTR_SIZE_VER0 || flags)
return -EINVAL;
rcu_read_lock();
@@ -3835,7 +3863,7 @@ SYSCALL_DEFINE3(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
else if (task_has_rt_policy(p))
attr.sched_priority = p->rt_priority;
else
- attr.sched_nice = TASK_NICE(p);
+ attr.sched_nice = task_nice(p);
rcu_read_unlock();
@@ -4473,6 +4501,7 @@ void init_idle(struct task_struct *idle, int cpu)
rcu_read_unlock();
rq->curr = rq->idle = idle;
+ idle->on_rq = 1;
#if defined(CONFIG_SMP)
idle->on_cpu = 1;
#endif
@@ -4711,6 +4740,22 @@ static void calc_load_migrate(struct rq *rq)
atomic_long_add(delta, &calc_load_tasks);
}
+static void put_prev_task_fake(struct rq *rq, struct task_struct *prev)
+{
+}
+
+static const struct sched_class fake_sched_class = {
+ .put_prev_task = put_prev_task_fake,
+};
+
+static struct task_struct fake_task = {
+ /*
+ * Avoid pull_{rt,dl}_task()
+ */
+ .prio = MAX_PRIO + 1,
+ .sched_class = &fake_sched_class,
+};
+
/*
* Migrate all tasks from the rq, sleeping tasks will be migrated by
* try_to_wake_up()->select_task_rq().
@@ -4751,7 +4796,7 @@ static void migrate_tasks(unsigned int dead_cpu)
if (rq->nr_running == 1)
break;
- next = pick_next_task(rq);
+ next = pick_next_task(rq, &fake_task);
BUG_ON(!next);
next->sched_class->put_prev_task(rq, next);
@@ -4841,7 +4886,7 @@ set_table_entry(struct ctl_table *entry,
static struct ctl_table *
sd_alloc_ctl_domain_table(struct sched_domain *sd)
{
- struct ctl_table *table = sd_alloc_ctl_entry(13);
+ struct ctl_table *table = sd_alloc_ctl_entry(14);
if (table == NULL)
return NULL;
@@ -4869,9 +4914,12 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
sizeof(int), 0644, proc_dointvec_minmax, false);
set_table_entry(&table[10], "flags", &sd->flags,
sizeof(int), 0644, proc_dointvec_minmax, false);
- set_table_entry(&table[11], "name", sd->name,
+ set_table_entry(&table[11], "max_newidle_lb_cost",
+ &sd->max_newidle_lb_cost,
+ sizeof(long), 0644, proc_doulongvec_minmax, false);
+ set_table_entry(&table[12], "name", sd->name,
CORENAME_MAX_SIZE, 0444, proc_dostring, false);
- /* &table[12] is terminator */
+ /* &table[13] is terminator */
return table;
}
@@ -6848,7 +6896,6 @@ void __init sched_init(void)
rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
#ifdef CONFIG_RT_GROUP_SCHED
- INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
#endif
@@ -6937,7 +6984,8 @@ void __might_sleep(const char *file, int line, int preempt_offset)
static unsigned long prev_jiffy; /* ratelimiting */
rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
- if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
+ if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
+ !is_idle_task(current)) ||
system_state != SYSTEM_RUNNING || oops_in_progress)
return;
if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
@@ -6955,6 +7003,13 @@ void __might_sleep(const char *file, int line, int preempt_offset)
debug_show_held_locks(current);
if (irqs_disabled())
print_irqtrace_events(current);
+#ifdef CONFIG_DEBUG_PREEMPT
+ if (!preempt_count_equals(preempt_offset)) {
+ pr_err("Preemption disabled at:");
+ print_ip_sym(current->preempt_disable_ip);
+ pr_cont("\n");
+ }
+#endif
dump_stack();
}
EXPORT_SYMBOL(__might_sleep);
@@ -7008,7 +7063,7 @@ void normalize_rt_tasks(void)
* Renice negative nice level userspace
* tasks back to 0:
*/
- if (TASK_NICE(p) < 0 && p->mm)
+ if (task_nice(p) < 0 && p->mm)
set_user_nice(p, 0);
continue;
}
@@ -7422,6 +7477,7 @@ static int sched_dl_global_constraints(void)
u64 period = global_rt_period();
u64 new_bw = to_ratio(period, runtime);
int cpu, ret = 0;
+ unsigned long flags;
/*
* Here we want to check the bandwidth not being set to some
@@ -7435,10 +7491,10 @@ static int sched_dl_global_constraints(void)
for_each_possible_cpu(cpu) {
struct dl_bw *dl_b = dl_bw_of(cpu);
- raw_spin_lock(&dl_b->lock);
+ raw_spin_lock_irqsave(&dl_b->lock, flags);
if (new_bw < dl_b->total_bw)
ret = -EBUSY;
- raw_spin_unlock(&dl_b->lock);
+ raw_spin_unlock_irqrestore(&dl_b->lock, flags);
if (ret)
break;
@@ -7451,6 +7507,7 @@ static void sched_dl_do_global(void)
{
u64 new_bw = -1;
int cpu;
+ unsigned long flags;
def_dl_bandwidth.dl_period = global_rt_period();
def_dl_bandwidth.dl_runtime = global_rt_runtime();
@@ -7464,9 +7521,9 @@ static void sched_dl_do_global(void)
for_each_possible_cpu(cpu) {
struct dl_bw *dl_b = dl_bw_of(cpu);
- raw_spin_lock(&dl_b->lock);
+ raw_spin_lock_irqsave(&dl_b->lock, flags);
dl_b->bw = new_bw;
- raw_spin_unlock(&dl_b->lock);
+ raw_spin_unlock_irqrestore(&dl_b->lock, flags);
}
}
@@ -7475,7 +7532,8 @@ static int sched_rt_global_validate(void)
if (sysctl_sched_rt_period <= 0)
return -EINVAL;
- if (sysctl_sched_rt_runtime > sysctl_sched_rt_period)
+ if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
+ (sysctl_sched_rt_runtime > sysctl_sched_rt_period))
return -EINVAL;
return 0;
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 045fc74e3f0..5b9bb42b2d4 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -70,7 +70,7 @@ static void cpudl_heapify(struct cpudl *cp, int idx)
static void cpudl_change_key(struct cpudl *cp, int idx, u64 new_dl)
{
- WARN_ON(idx > num_present_cpus() || idx == IDX_INVALID);
+ WARN_ON(idx == IDX_INVALID || !cpu_present(idx));
if (dl_time_before(new_dl, cp->elements[idx].dl)) {
cp->elements[idx].dl = new_dl;
@@ -117,7 +117,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
}
out:
- WARN_ON(best_cpu > num_present_cpus() && best_cpu != -1);
+ WARN_ON(best_cpu != -1 && !cpu_present(best_cpu));
return best_cpu;
}
@@ -137,7 +137,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
int old_idx, new_cpu;
unsigned long flags;
- WARN_ON(cpu > num_present_cpus());
+ WARN_ON(!cpu_present(cpu));
raw_spin_lock_irqsave(&cp->lock, flags);
old_idx = cp->cpu_to_idx[cpu];
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 99947919e30..58624a65f12 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -142,7 +142,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime,
p->utimescaled += cputime_scaled;
account_group_user_time(p, cputime);
- index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
+ index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
/* Add user time to cpustat. */
task_group_account_field(p, index, (__force u64) cputime);
@@ -169,7 +169,7 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
p->gtime += cputime;
/* Add guest time to cpustat. */
- if (TASK_NICE(p) > 0) {
+ if (task_nice(p) > 0) {
cpustat[CPUTIME_NICE] += (__force u64) cputime;
cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime;
} else {
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 0dd5e0971a0..27ef4092552 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -121,7 +121,7 @@ static inline void dl_clear_overload(struct rq *rq)
static void update_dl_migration(struct dl_rq *dl_rq)
{
- if (dl_rq->dl_nr_migratory && dl_rq->dl_nr_total > 1) {
+ if (dl_rq->dl_nr_migratory && dl_rq->dl_nr_running > 1) {
if (!dl_rq->overloaded) {
dl_set_overload(rq_of_dl_rq(dl_rq));
dl_rq->overloaded = 1;
@@ -135,9 +135,7 @@ static void update_dl_migration(struct dl_rq *dl_rq)
static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
{
struct task_struct *p = dl_task_of(dl_se);
- dl_rq = &rq_of_dl_rq(dl_rq)->dl;
- dl_rq->dl_nr_total++;
if (p->nr_cpus_allowed > 1)
dl_rq->dl_nr_migratory++;
@@ -147,9 +145,7 @@ static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
{
struct task_struct *p = dl_task_of(dl_se);
- dl_rq = &rq_of_dl_rq(dl_rq)->dl;
- dl_rq->dl_nr_total--;
if (p->nr_cpus_allowed > 1)
dl_rq->dl_nr_migratory--;
@@ -214,6 +210,16 @@ static inline int has_pushable_dl_tasks(struct rq *rq)
static int push_dl_task(struct rq *rq);
+static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev)
+{
+ return dl_task(prev);
+}
+
+static inline void set_post_schedule(struct rq *rq)
+{
+ rq->post_schedule = has_pushable_dl_tasks(rq);
+}
+
#else
static inline
@@ -236,6 +242,19 @@ void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
{
}
+static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev)
+{
+ return false;
+}
+
+static inline int pull_dl_task(struct rq *rq)
+{
+ return 0;
+}
+
+static inline void set_post_schedule(struct rq *rq)
+{
+}
#endif /* CONFIG_SMP */
static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags);
@@ -566,6 +585,8 @@ int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se)
return 1;
}
+extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
+
/*
* Update the current task's runtime statistics (provided it is still
* a -deadline task and has not been removed from the dl_rq).
@@ -588,8 +609,8 @@ static void update_curr_dl(struct rq *rq)
* approach need further study.
*/
delta_exec = rq_clock_task(rq) - curr->se.exec_start;
- if (unlikely((s64)delta_exec < 0))
- delta_exec = 0;
+ if (unlikely((s64)delta_exec <= 0))
+ return;
schedstat_set(curr->se.statistics.exec_max,
max(curr->se.statistics.exec_max, delta_exec));
@@ -629,11 +650,13 @@ static void update_curr_dl(struct rq *rq)
struct rt_rq *rt_rq = &rq->rt;
raw_spin_lock(&rt_rq->rt_runtime_lock);
- rt_rq->rt_time += delta_exec;
/*
* We'll let actual RT tasks worry about the overflow here, we
- * have our own CBS to keep us inline -- see above.
+ * have our own CBS to keep us inline; only account when RT
+ * bandwidth is relevant.
*/
+ if (sched_rt_bandwidth_account(rt_rq))
+ rt_rq->rt_time += delta_exec;
raw_spin_unlock(&rt_rq->rt_runtime_lock);
}
}
@@ -717,6 +740,7 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
WARN_ON(!dl_prio(prio));
dl_rq->dl_nr_running++;
+ inc_nr_running(rq_of_dl_rq(dl_rq));
inc_dl_deadline(dl_rq, deadline);
inc_dl_migration(dl_se, dl_rq);
@@ -730,6 +754,7 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
WARN_ON(!dl_prio(prio));
WARN_ON(!dl_rq->dl_nr_running);
dl_rq->dl_nr_running--;
+ dec_nr_running(rq_of_dl_rq(dl_rq));
dec_dl_deadline(dl_rq, dl_se->deadline);
dec_dl_migration(dl_se, dl_rq);
@@ -836,8 +861,6 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
enqueue_pushable_dl_task(rq, p);
-
- inc_nr_running(rq);
}
static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
@@ -850,8 +873,6 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
{
update_curr_dl(rq);
__dequeue_task_dl(rq, p, flags);
-
- dec_nr_running(rq);
}
/*
@@ -944,6 +965,8 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
resched_task(rq->curr);
}
+static int pull_dl_task(struct rq *this_rq);
+
#endif /* CONFIG_SMP */
/*
@@ -990,7 +1013,7 @@ static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
return rb_entry(left, struct sched_dl_entity, rb_node);
}
-struct task_struct *pick_next_task_dl(struct rq *rq)
+struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev)
{
struct sched_dl_entity *dl_se;
struct task_struct *p;
@@ -998,9 +1021,20 @@ struct task_struct *pick_next_task_dl(struct rq *rq)
dl_rq = &rq->dl;
+ if (need_pull_dl_task(rq, prev))
+ pull_dl_task(rq);
+ /*
+ * When prev is DL, we may throttle it in put_prev_task().
+ * So, we update time before we check for dl_nr_running.
+ */
+ if (prev->sched_class == &dl_sched_class)
+ update_curr_dl(rq);
+
if (unlikely(!dl_rq->dl_nr_running))
return NULL;
+ put_prev_task(rq, prev);
+
dl_se = pick_next_dl_entity(rq, dl_rq);
BUG_ON(!dl_se);
@@ -1015,9 +1049,7 @@ struct task_struct *pick_next_task_dl(struct rq *rq)
start_hrtick_dl(rq, p);
#endif
-#ifdef CONFIG_SMP
- rq->post_schedule = has_pushable_dl_tasks(rq);
-#endif /* CONFIG_SMP */
+ set_post_schedule(rq);
return p;
}
@@ -1426,13 +1458,6 @@ skip:
return ret;
}
-static void pre_schedule_dl(struct rq *rq, struct task_struct *prev)
-{
- /* Try to pull other tasks here */
- if (dl_task(prev))
- pull_dl_task(rq);
-}
-
static void post_schedule_dl(struct rq *rq)
{
push_dl_tasks(rq);
@@ -1560,7 +1585,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
if (unlikely(p->dl.dl_throttled))
return;
- if (p->on_rq || rq->curr != p) {
+ if (p->on_rq && rq->curr != p) {
#ifdef CONFIG_SMP
if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p))
/* Only reschedule if pushing failed */
@@ -1625,7 +1650,6 @@ const struct sched_class dl_sched_class = {
.set_cpus_allowed = set_cpus_allowed_dl,
.rq_online = rq_online_dl,
.rq_offline = rq_offline_dl,
- .pre_schedule = pre_schedule_dl,
.post_schedule = post_schedule_dl,
.task_woken = task_woken_dl,
#endif
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index dd52e7ffb10..f3344c31632 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -321,6 +321,7 @@ do { \
P(sched_goidle);
#ifdef CONFIG_SMP
P64(avg_idle);
+ P64(max_idle_balance_cost);
#endif
P(ttwu_count);
@@ -533,15 +534,15 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m)
unsigned long nr_faults = -1;
int cpu_current, home_node;
- if (p->numa_faults)
- nr_faults = p->numa_faults[2*node + i];
+ if (p->numa_faults_memory)
+ nr_faults = p->numa_faults_memory[2*node + i];
cpu_current = !i ? (task_node(p) == node) :
(pol && node_isset(node, pol->v.nodes));
home_node = (p->numa_preferred_nid == node);
- SEQ_printf(m, "numa_faults, %d, %d, %d, %d, %ld\n",
+ SEQ_printf(m, "numa_faults_memory, %d, %d, %d, %d, %ld\n",
i, node, cpu_current, home_node, nr_faults);
}
}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 966cc2bfcb7..7e9bd0b1fa9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -322,13 +322,13 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
/* Do the two (enqueued) entities belong to the same group ? */
-static inline int
+static inline struct cfs_rq *
is_same_group(struct sched_entity *se, struct sched_entity *pse)
{
if (se->cfs_rq == pse->cfs_rq)
- return 1;
+ return se->cfs_rq;
- return 0;
+ return NULL;
}
static inline struct sched_entity *parent_entity(struct sched_entity *se)
@@ -336,17 +336,6 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
return se->parent;
}
-/* return depth at which a sched entity is present in the hierarchy */
-static inline int depth_se(struct sched_entity *se)
-{
- int depth = 0;
-
- for_each_sched_entity(se)
- depth++;
-
- return depth;
-}
-
static void
find_matching_se(struct sched_entity **se, struct sched_entity **pse)
{
@@ -360,8 +349,8 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
*/
/* First walk up until both entities are at same depth */
- se_depth = depth_se(*se);
- pse_depth = depth_se(*pse);
+ se_depth = (*se)->depth;
+ pse_depth = (*pse)->depth;
while (se_depth > pse_depth) {
se_depth--;
@@ -426,12 +415,6 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
#define for_each_leaf_cfs_rq(rq, cfs_rq) \
for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
-static inline int
-is_same_group(struct sched_entity *se, struct sched_entity *pse)
-{
- return 1;
-}
-
static inline struct sched_entity *parent_entity(struct sched_entity *se)
{
return NULL;
@@ -819,14 +802,6 @@ unsigned int sysctl_numa_balancing_scan_size = 256;
/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
unsigned int sysctl_numa_balancing_scan_delay = 1000;
-/*
- * After skipping a page migration on a shared page, skip N more numa page
- * migrations unconditionally. This reduces the number of NUMA migrations
- * in shared memory workloads, and has the effect of pulling tasks towards
- * where their memory lives, over pulling the memory towards the task.
- */
-unsigned int sysctl_numa_balancing_migrate_deferred = 16;
-
static unsigned int task_nr_scan_windows(struct task_struct *p)
{
unsigned long rss = 0;
@@ -893,10 +868,26 @@ struct numa_group {
struct list_head task_list;
struct rcu_head rcu;
+ nodemask_t active_nodes;
unsigned long total_faults;
+ /*
+ * Faults_cpu is used to decide whether memory should move
+ * towards the CPU. As a consequence, these stats are weighted
+ * more by CPU use than by memory faults.
+ */
+ unsigned long *faults_cpu;
unsigned long faults[0];
};
+/* Shared or private faults. */
+#define NR_NUMA_HINT_FAULT_TYPES 2
+
+/* Memory and CPU locality */
+#define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)
+
+/* Averaged statistics, and temporary buffers. */
+#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)
+
pid_t task_numa_group_id(struct task_struct *p)
{
return p->numa_group ? p->numa_group->gid : 0;
@@ -904,16 +895,16 @@ pid_t task_numa_group_id(struct task_struct *p)
static inline int task_faults_idx(int nid, int priv)
{
- return 2 * nid + priv;
+ return NR_NUMA_HINT_FAULT_TYPES * nid + priv;
}
static inline unsigned long task_faults(struct task_struct *p, int nid)
{
- if (!p->numa_faults)
+ if (!p->numa_faults_memory)
return 0;
- return p->numa_faults[task_faults_idx(nid, 0)] +
- p->numa_faults[task_faults_idx(nid, 1)];
+ return p->numa_faults_memory[task_faults_idx(nid, 0)] +
+ p->numa_faults_memory[task_faults_idx(nid, 1)];
}
static inline unsigned long group_faults(struct task_struct *p, int nid)
@@ -925,6 +916,12 @@ static inline unsigned long group_faults(struct task_struct *p, int nid)
p->numa_group->faults[task_faults_idx(nid, 1)];
}
+static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
+{
+ return group->faults_cpu[task_faults_idx(nid, 0)] +
+ group->faults_cpu[task_faults_idx(nid, 1)];
+}
+
/*
* These return the fraction of accesses done by a particular task, or
* task group, on a particular numa node. The group weight is given a
@@ -935,7 +932,7 @@ static inline unsigned long task_weight(struct task_struct *p, int nid)
{
unsigned long total_faults;
- if (!p->numa_faults)
+ if (!p->numa_faults_memory)
return 0;
total_faults = p->total_numa_faults;
@@ -954,6 +951,69 @@ static inline unsigned long group_weight(struct task_struct *p, int nid)
return 1000 * group_faults(p, nid) / p->numa_group->total_faults;
}
+bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
+ int src_nid, int dst_cpu)
+{
+ struct numa_group *ng = p->numa_group;
+ int dst_nid = cpu_to_node(dst_cpu);
+ int last_cpupid, this_cpupid;
+
+ this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
+
+ /*
+ * Multi-stage node selection is used in conjunction with a periodic
+ * migration fault to build a temporal task<->page relation. By using
+ * a two-stage filter we remove short/unlikely relations.
+ *
+ * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate
+ * a task's usage of a particular page (n_p) per total usage of this
+ * page (n_t) (in a given time-span) to a probability.
+ *
+ * Our periodic faults will sample this probability and getting the
+ * same result twice in a row, given these samples are fully
+ * independent, is then given by P(n)^2, provided our sample period
+ * is sufficiently short compared to the usage pattern.
+ *
+ * This quadric squishes small probabilities, making it less likely we
+ * act on an unlikely task<->page relation.
+ */
+ last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
+ if (!cpupid_pid_unset(last_cpupid) &&
+ cpupid_to_nid(last_cpupid) != dst_nid)
+ return false;
+
+ /* Always allow migrate on private faults */
+ if (cpupid_match_pid(p, last_cpupid))
+ return true;
+
+ /* A shared fault, but p->numa_group has not been set up yet. */
+ if (!ng)
+ return true;
+
+ /*
+ * Do not migrate if the destination is not a node that
+ * is actively used by this numa group.
+ */
+ if (!node_isset(dst_nid, ng->active_nodes))
+ return false;
+
+ /*
+ * Source is a node that is not actively used by this
+ * numa group, while the destination is. Migrate.
+ */
+ if (!node_isset(src_nid, ng->active_nodes))
+ return true;
+
+ /*
+ * Both source and destination are nodes in active
+ * use by this numa group. Maximize memory bandwidth
+ * by migrating from more heavily used groups, to less
+ * heavily used ones, spreading the load around.
+ * Use a 1/4 hysteresis to avoid spurious page movement.
+ */
+ return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4);
+}
+
static unsigned long weighted_cpuload(const int cpu);
static unsigned long source_load(int cpu, int type);
static unsigned long target_load(int cpu, int type);
@@ -1267,7 +1327,7 @@ static int task_numa_migrate(struct task_struct *p)
static void numa_migrate_preferred(struct task_struct *p)
{
/* This task has no NUMA fault statistics yet */
- if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
+ if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory))
return;
/* Periodically retry migrating the task to the preferred node */
@@ -1282,6 +1342,38 @@ static void numa_migrate_preferred(struct task_struct *p)
}
/*
+ * Find the nodes on which the workload is actively running. We do this by
+ * tracking the nodes from which NUMA hinting faults are triggered. This can
+ * be different from the set of nodes where the workload's memory is currently
+ * located.
+ *
+ * The bitmask is used to make smarter decisions on when to do NUMA page
+ * migrations, To prevent flip-flopping, and excessive page migrations, nodes
+ * are added when they cause over 6/16 of the maximum number of faults, but
+ * only removed when they drop below 3/16.
+ */
+static void update_numa_active_node_mask(struct numa_group *numa_group)
+{
+ unsigned long faults, max_faults = 0;
+ int nid;
+
+ for_each_online_node(nid) {
+ faults = group_faults_cpu(numa_group, nid);
+ if (faults > max_faults)
+ max_faults = faults;
+ }
+
+ for_each_online_node(nid) {
+ faults = group_faults_cpu(numa_group, nid);
+ if (!node_isset(nid, numa_group->active_nodes)) {
+ if (faults > max_faults * 6 / 16)
+ node_set(nid, numa_group->active_nodes);
+ } else if (faults < max_faults * 3 / 16)
+ node_clear(nid, numa_group->active_nodes);
+ }
+}
+
+/*
* When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
* increments. The more local the fault statistics are, the higher the scan
* period will be for the next scan window. If local/remote ratio is below
@@ -1355,11 +1447,41 @@ static void update_task_scan_period(struct task_struct *p,
memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
}
+/*
+ * Get the fraction of time the task has been running since the last
+ * NUMA placement cycle. The scheduler keeps similar statistics, but
+ * decays those on a 32ms period, which is orders of magnitude off
+ * from the dozens-of-seconds NUMA balancing period. Use the scheduler
+ * stats only if the task is so new there are no NUMA statistics yet.
+ */
+static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
+{
+ u64 runtime, delta, now;
+ /* Use the start of this time slice to avoid calculations. */
+ now = p->se.exec_start;
+ runtime = p->se.sum_exec_runtime;
+
+ if (p->last_task_numa_placement) {
+ delta = runtime - p->last_sum_exec_runtime;
+ *period = now - p->last_task_numa_placement;
+ } else {
+ delta = p->se.avg.runnable_avg_sum;
+ *period = p->se.avg.runnable_avg_period;
+ }
+
+ p->last_sum_exec_runtime = runtime;
+ p->last_task_numa_placement = now;
+
+ return delta;
+}
+
static void task_numa_placement(struct task_struct *p)
{
int seq, nid, max_nid = -1, max_group_nid = -1;
unsigned long max_faults = 0, max_group_faults = 0;
unsigned long fault_types[2] = { 0, 0 };
+ unsigned long total_faults;
+ u64 runtime, period;
spinlock_t *group_lock = NULL;
seq = ACCESS_ONCE(p->mm->numa_scan_seq);
@@ -1368,6 +1490,10 @@ static void task_numa_placement(struct task_struct *p)
p->numa_scan_seq = seq;
p->numa_scan_period_max = task_scan_max(p);
+ total_faults = p->numa_faults_locality[0] +
+ p->numa_faults_locality[1];
+ runtime = numa_get_avg_runtime(p, &period);
+
/* If the task is part of a group prevent parallel updates to group stats */
if (p->numa_group) {
group_lock = &p->numa_group->lock;
@@ -1379,24 +1505,37 @@ static void task_numa_placement(struct task_struct *p)
unsigned long faults = 0, group_faults = 0;
int priv, i;
- for (priv = 0; priv < 2; priv++) {
- long diff;
+ for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
+ long diff, f_diff, f_weight;
i = task_faults_idx(nid, priv);
- diff = -p->numa_faults[i];
/* Decay existing window, copy faults since last scan */
- p->numa_faults[i] >>= 1;
- p->numa_faults[i] += p->numa_faults_buffer[i];
- fault_types[priv] += p->numa_faults_buffer[i];
- p->numa_faults_buffer[i] = 0;
+ diff = p->numa_faults_buffer_memory[i] - p->numa_faults_memory[i] / 2;
+ fault_types[priv] += p->numa_faults_buffer_memory[i];
+ p->numa_faults_buffer_memory[i] = 0;
- faults += p->numa_faults[i];
- diff += p->numa_faults[i];
+ /*
+ * Normalize the faults_from, so all tasks in a group
+ * count according to CPU use, instead of by the raw
+ * number of faults. Tasks with little runtime have
+ * little over-all impact on throughput, and thus their
+ * faults are less important.
+ */
+ f_weight = div64_u64(runtime << 16, period + 1);
+ f_weight = (f_weight * p->numa_faults_buffer_cpu[i]) /
+ (total_faults + 1);
+ f_diff = f_weight - p->numa_faults_cpu[i] / 2;
+ p->numa_faults_buffer_cpu[i] = 0;
+
+ p->numa_faults_memory[i] += diff;
+ p->numa_faults_cpu[i] += f_diff;
+ faults += p->numa_faults_memory[i];
p->total_numa_faults += diff;
if (p->numa_group) {
/* safe because we can only change our own group */
p->numa_group->faults[i] += diff;
+ p->numa_group->faults_cpu[i] += f_diff;
p->numa_group->total_faults += diff;
group_faults += p->numa_group->faults[i];
}
@@ -1416,6 +1555,7 @@ static void task_numa_placement(struct task_struct *p)
update_task_scan_period(p, fault_types[0], fault_types[1]);
if (p->numa_group) {
+ update_numa_active_node_mask(p->numa_group);
/*
* If the preferred task and group nids are different,
* iterate over the nodes again to find the best place.
@@ -1465,7 +1605,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
if (unlikely(!p->numa_group)) {
unsigned int size = sizeof(struct numa_group) +
- 2*nr_node_ids*sizeof(unsigned long);
+ 4*nr_node_ids*sizeof(unsigned long);
grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
if (!grp)
@@ -1475,9 +1615,14 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
spin_lock_init(&grp->lock);
INIT_LIST_HEAD(&grp->task_list);
grp->gid = p->pid;
+ /* Second half of the array tracks nids where faults happen */
+ grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
+ nr_node_ids;
+
+ node_set(task_node(current), grp->active_nodes);
- for (i = 0; i < 2*nr_node_ids; i++)
- grp->faults[i] = p->numa_faults[i];
+ for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
+ grp->faults[i] = p->numa_faults_memory[i];
grp->total_faults = p->total_numa_faults;
@@ -1534,9 +1679,9 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
double_lock(&my_grp->lock, &grp->lock);
- for (i = 0; i < 2*nr_node_ids; i++) {
- my_grp->faults[i] -= p->numa_faults[i];
- grp->faults[i] += p->numa_faults[i];
+ for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
+ my_grp->faults[i] -= p->numa_faults_memory[i];
+ grp->faults[i] += p->numa_faults_memory[i];
}
my_grp->total_faults -= p->total_numa_faults;
grp->total_faults += p->total_numa_faults;
@@ -1562,12 +1707,12 @@ void task_numa_free(struct task_struct *p)
{
struct numa_group *grp = p->numa_group;
int i;
- void *numa_faults = p->numa_faults;
+ void *numa_faults = p->numa_faults_memory;
if (grp) {
spin_lock(&grp->lock);
- for (i = 0; i < 2*nr_node_ids; i++)
- grp->faults[i] -= p->numa_faults[i];
+ for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
+ grp->faults[i] -= p->numa_faults_memory[i];
grp->total_faults -= p->total_numa_faults;
list_del(&p->numa_entry);
@@ -1577,18 +1722,21 @@ void task_numa_free(struct task_struct *p)
put_numa_group(grp);
}
- p->numa_faults = NULL;
- p->numa_faults_buffer = NULL;
+ p->numa_faults_memory = NULL;
+ p->numa_faults_buffer_memory = NULL;
+ p->numa_faults_cpu= NULL;
+ p->numa_faults_buffer_cpu = NULL;
kfree(numa_faults);
}
/*
* Got a PROT_NONE fault for a page on @node.
*/
-void task_numa_fault(int last_cpupid, int node, int pages, int flags)
+void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
{
struct task_struct *p = current;
bool migrated = flags & TNF_MIGRATED;
+ int cpu_node = task_node(current);
int priv;
if (!numabalancing_enabled)
@@ -1603,16 +1751,24 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags)
return;
/* Allocate buffer to track faults on a per-node basis */
- if (unlikely(!p->numa_faults)) {
- int size = sizeof(*p->numa_faults) * 2 * nr_node_ids;
+ if (unlikely(!p->numa_faults_memory)) {
+ int size = sizeof(*p->numa_faults_memory) *
+ NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
- /* numa_faults and numa_faults_buffer share the allocation */
- p->numa_faults = kzalloc(size * 2, GFP_KERNEL|__GFP_NOWARN);
- if (!p->numa_faults)
+ p->numa_faults_memory = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
+ if (!p->numa_faults_memory)
return;
- BUG_ON(p->numa_faults_buffer);
- p->numa_faults_buffer = p->numa_faults + (2 * nr_node_ids);
+ BUG_ON(p->numa_faults_buffer_memory);
+ /*
+ * The averaged statistics, shared & private, memory & cpu,
+ * occupy the first half of the array. The second half of the
+ * array is for current counters, which are averaged into the
+ * first set by task_numa_placement.
+ */
+ p->numa_faults_cpu = p->numa_faults_memory + (2 * nr_node_ids);
+ p->numa_faults_buffer_memory = p->numa_faults_memory + (4 * nr_node_ids);
+ p->numa_faults_buffer_cpu = p->numa_faults_memory + (6 * nr_node_ids);
p->total_numa_faults = 0;
memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
}
@@ -1641,7 +1797,8 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags)
if (migrated)
p->numa_pages_migrated += pages;
- p->numa_faults_buffer[task_faults_idx(node, priv)] += pages;
+ p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages;
+ p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages;
p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages;
}
@@ -1757,6 +1914,8 @@ void task_numa_work(struct callback_head *work)
start = end;
if (pages <= 0)
goto out;
+
+ cond_resched();
} while (end != vma->vm_end);
}
@@ -2217,13 +2376,20 @@ static inline void __update_group_entity_contrib(struct sched_entity *se)
se->avg.load_avg_contrib >>= NICE_0_SHIFT;
}
}
-#else
+
+static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
+{
+ __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable);
+ __update_tg_runnable_avg(&rq->avg, &rq->cfs);
+}
+#else /* CONFIG_FAIR_GROUP_SCHED */
static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
int force_update) {}
static inline void __update_tg_runnable_avg(struct sched_avg *sa,
struct cfs_rq *cfs_rq) {}
static inline void __update_group_entity_contrib(struct sched_entity *se) {}
-#endif
+static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
+#endif /* CONFIG_FAIR_GROUP_SCHED */
static inline void __update_task_entity_contrib(struct sched_entity *se)
{
@@ -2321,12 +2487,6 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
__update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
}
-static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
-{
- __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable);
- __update_tg_runnable_avg(&rq->avg, &rq->cfs);
-}
-
/* Add the load generated by se into cfs_rq's child load-average */
static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
struct sched_entity *se,
@@ -2414,7 +2574,10 @@ void idle_exit_fair(struct rq *this_rq)
update_rq_runnable_avg(this_rq, 0);
}
-#else
+static int idle_balance(struct rq *this_rq);
+
+#else /* CONFIG_SMP */
+
static inline void update_entity_load_avg(struct sched_entity *se,
int update_cfs_rq) {}
static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
@@ -2426,7 +2589,13 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
int sleep) {}
static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
int force_update) {}
-#endif
+
+static inline int idle_balance(struct rq *rq)
+{
+ return 0;
+}
+
+#endif /* CONFIG_SMP */
static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
@@ -2576,10 +2745,10 @@ static void __clear_buddies_last(struct sched_entity *se)
{
for_each_sched_entity(se) {
struct cfs_rq *cfs_rq = cfs_rq_of(se);
- if (cfs_rq->last == se)
- cfs_rq->last = NULL;
- else
+ if (cfs_rq->last != se)
break;
+
+ cfs_rq->last = NULL;
}
}
@@ -2587,10 +2756,10 @@ static void __clear_buddies_next(struct sched_entity *se)
{
for_each_sched_entity(se) {
struct cfs_rq *cfs_rq = cfs_rq_of(se);
- if (cfs_rq->next == se)
- cfs_rq->next = NULL;
- else
+ if (cfs_rq->next != se)
break;
+
+ cfs_rq->next = NULL;
}
}
@@ -2598,10 +2767,10 @@ static void __clear_buddies_skip(struct sched_entity *se)
{
for_each_sched_entity(se) {
struct cfs_rq *cfs_rq = cfs_rq_of(se);
- if (cfs_rq->skip == se)
- cfs_rq->skip = NULL;
- else
+ if (cfs_rq->skip != se)
break;
+
+ cfs_rq->skip = NULL;
}
}
@@ -2744,17 +2913,36 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
* 3) pick the "last" process, for cache locality
* 4) do not run the "skip" process, if something else is available
*/
-static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
+static struct sched_entity *
+pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
- struct sched_entity *se = __pick_first_entity(cfs_rq);
- struct sched_entity *left = se;
+ struct sched_entity *left = __pick_first_entity(cfs_rq);
+ struct sched_entity *se;
+
+ /*
+ * If curr is set we have to see if its left of the leftmost entity
+ * still in the tree, provided there was anything in the tree at all.
+ */
+ if (!left || (curr && entity_before(curr, left)))
+ left = curr;
+
+ se = left; /* ideally we run the leftmost entity */
/*
* Avoid running the skip buddy, if running something else can
* be done without getting too unfair.
*/
if (cfs_rq->skip == se) {
- struct sched_entity *second = __pick_next_entity(se);
+ struct sched_entity *second;
+
+ if (se == curr) {
+ second = __pick_first_entity(cfs_rq);
+ } else {
+ second = __pick_next_entity(se);
+ if (!second || (curr && entity_before(curr, second)))
+ second = curr;
+ }
+
if (second && wakeup_preempt_entity(second, left) < 1)
se = second;
}
@@ -2776,7 +2964,7 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
return se;
}
-static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
+static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
{
@@ -3431,22 +3619,23 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
}
/* conditionally throttle active cfs_rq's from put_prev_entity() */
-static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
if (!cfs_bandwidth_used())
- return;
+ return false;
if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
- return;
+ return false;
/*
* it's possible for a throttled entity to be forced into a running
* state (e.g. set_curr_task), in this case we're finished.
*/
if (cfs_rq_throttled(cfs_rq))
- return;
+ return true;
throttle_cfs_rq(cfs_rq);
+ return true;
}
static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
@@ -3556,7 +3745,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
}
static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
-static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
+static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
@@ -4211,13 +4400,14 @@ done:
}
/*
- * sched_balance_self: balance the current task (running on cpu) in domains
- * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
- * SD_BALANCE_EXEC.
+ * select_task_rq_fair: Select target runqueue for the waking task in domains
+ * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
+ * SD_BALANCE_FORK, or SD_BALANCE_EXEC.
*
- * Balance, ie. select the least loaded group.
+ * Balances load by selecting the idlest cpu in the idlest group, or under
+ * certain conditions an idle sibling cpu if the domain has SD_WAKE_AFFINE set.
*
- * Returns the target CPU number, or the same CPU if no balancing is needed.
+ * Returns the target cpu number.
*
* preempt must be disabled.
*/
@@ -4492,26 +4682,124 @@ preempt:
set_last_buddy(se);
}
-static struct task_struct *pick_next_task_fair(struct rq *rq)
+static struct task_struct *
+pick_next_task_fair(struct rq *rq, struct task_struct *prev)
{
- struct task_struct *p;
struct cfs_rq *cfs_rq = &rq->cfs;
struct sched_entity *se;
+ struct task_struct *p;
+ int new_tasks;
+again:
+#ifdef CONFIG_FAIR_GROUP_SCHED
if (!cfs_rq->nr_running)
- return NULL;
+ goto idle;
+
+ if (prev->sched_class != &fair_sched_class)
+ goto simple;
+
+ /*
+ * Because of the set_next_buddy() in dequeue_task_fair() it is rather
+ * likely that a next task is from the same cgroup as the current.
+ *
+ * Therefore attempt to avoid putting and setting the entire cgroup
+ * hierarchy, only change the part that actually changes.
+ */
do {
- se = pick_next_entity(cfs_rq);
+ struct sched_entity *curr = cfs_rq->curr;
+
+ /*
+ * Since we got here without doing put_prev_entity() we also
+ * have to consider cfs_rq->curr. If it is still a runnable
+ * entity, update_curr() will update its vruntime, otherwise
+ * forget we've ever seen it.
+ */
+ if (curr && curr->on_rq)
+ update_curr(cfs_rq);
+ else
+ curr = NULL;
+
+ /*
+ * This call to check_cfs_rq_runtime() will do the throttle and
+ * dequeue its entity in the parent(s). Therefore the 'simple'
+ * nr_running test will indeed be correct.
+ */
+ if (unlikely(check_cfs_rq_runtime(cfs_rq)))
+ goto simple;
+
+ se = pick_next_entity(cfs_rq, curr);
+ cfs_rq = group_cfs_rq(se);
+ } while (cfs_rq);
+
+ p = task_of(se);
+
+ /*
+ * Since we haven't yet done put_prev_entity and if the selected task
+ * is a different task than we started out with, try and touch the
+ * least amount of cfs_rqs.
+ */
+ if (prev != p) {
+ struct sched_entity *pse = &prev->se;
+
+ while (!(cfs_rq = is_same_group(se, pse))) {
+ int se_depth = se->depth;
+ int pse_depth = pse->depth;
+
+ if (se_depth <= pse_depth) {
+ put_prev_entity(cfs_rq_of(pse), pse);
+ pse = parent_entity(pse);
+ }
+ if (se_depth >= pse_depth) {
+ set_next_entity(cfs_rq_of(se), se);
+ se = parent_entity(se);
+ }
+ }
+
+ put_prev_entity(cfs_rq, pse);
+ set_next_entity(cfs_rq, se);
+ }
+
+ if (hrtick_enabled(rq))
+ hrtick_start_fair(rq, p);
+
+ return p;
+simple:
+ cfs_rq = &rq->cfs;
+#endif
+
+ if (!cfs_rq->nr_running)
+ goto idle;
+
+ put_prev_task(rq, prev);
+
+ do {
+ se = pick_next_entity(cfs_rq, NULL);
set_next_entity(cfs_rq, se);
cfs_rq = group_cfs_rq(se);
} while (cfs_rq);
p = task_of(se);
+
if (hrtick_enabled(rq))
hrtick_start_fair(rq, p);
return p;
+
+idle:
+ new_tasks = idle_balance(rq);
+ /*
+ * Because idle_balance() releases (and re-acquires) rq->lock, it is
+ * possible for any higher priority task to appear. In that case we
+ * must re-start the pick_next_entity() loop.
+ */
+ if (new_tasks < 0)
+ return RETRY_TASK;
+
+ if (new_tasks > 0)
+ goto again;
+
+ return NULL;
}
/*
@@ -4749,7 +5037,7 @@ static void move_task(struct task_struct *p, struct lb_env *env)
* Is this task likely cache-hot:
*/
static int
-task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
+task_hot(struct task_struct *p, u64 now)
{
s64 delta;
@@ -4783,7 +5071,7 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
{
int src_nid, dst_nid;
- if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults ||
+ if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory ||
!(env->sd->flags & SD_NUMA)) {
return false;
}
@@ -4814,7 +5102,7 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
return false;
- if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
+ if (!p->numa_faults_memory || !(env->sd->flags & SD_NUMA))
return false;
src_nid = cpu_to_node(env->src_cpu);
@@ -4910,7 +5198,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
* 2) task is cache cold, or
* 3) too many balance attempts have failed.
*/
- tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd);
+ tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq));
if (!tsk_cache_hot)
tsk_cache_hot = migrate_degrades_locality(p, env);
@@ -5773,12 +6061,10 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
pwr_now /= SCHED_POWER_SCALE;
/* Amount of load we'd subtract */
- tmp = (busiest->load_per_task * SCHED_POWER_SCALE) /
- busiest->group_power;
- if (busiest->avg_load > tmp) {
+ if (busiest->avg_load > scaled_busy_load_per_task) {
pwr_move += busiest->group_power *
min(busiest->load_per_task,
- busiest->avg_load - tmp);
+ busiest->avg_load - scaled_busy_load_per_task);
}
/* Amount of load we'd add */
@@ -6357,17 +6643,23 @@ out:
* idle_balance is called by schedule() if this_cpu is about to become
* idle. Attempts to pull tasks from other CPUs.
*/
-void idle_balance(int this_cpu, struct rq *this_rq)
+static int idle_balance(struct rq *this_rq)
{
struct sched_domain *sd;
int pulled_task = 0;
unsigned long next_balance = jiffies + HZ;
u64 curr_cost = 0;
+ int this_cpu = this_rq->cpu;
+ idle_enter_fair(this_rq);
+ /*
+ * We must set idle_stamp _before_ calling idle_balance(), such that we
+ * measure the duration of idle_balance() as idle time.
+ */
this_rq->idle_stamp = rq_clock(this_rq);
if (this_rq->avg_idle < sysctl_sched_migration_cost)
- return;
+ goto out;
/*
* Drop the rq->lock, but keep IRQ/preempt disabled.
@@ -6405,15 +6697,22 @@ void idle_balance(int this_cpu, struct rq *this_rq)
interval = msecs_to_jiffies(sd->balance_interval);
if (time_after(next_balance, sd->last_balance + interval))
next_balance = sd->last_balance + interval;
- if (pulled_task) {
- this_rq->idle_stamp = 0;
+ if (pulled_task)
break;
- }
}
rcu_read_unlock();
raw_spin_lock(&this_rq->lock);
+ /*
+ * While browsing the domains, we released the rq lock.
+ * A task could have be enqueued in the meantime
+ */
+ if (this_rq->cfs.h_nr_running && !pulled_task) {
+ pulled_task = 1;
+ goto out;
+ }
+
if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
/*
* We are going idle. next_balance may be set based on
@@ -6424,6 +6723,20 @@ void idle_balance(int this_cpu, struct rq *this_rq)
if (curr_cost > this_rq->max_idle_balance_cost)
this_rq->max_idle_balance_cost = curr_cost;
+
+out:
+ /* Is there a task of a high priority class? */
+ if (this_rq->nr_running != this_rq->cfs.h_nr_running &&
+ (this_rq->dl.dl_nr_running ||
+ (this_rq->rt.rt_nr_running && !rt_rq_throttled(&this_rq->rt))))
+ pulled_task = -1;
+
+ if (pulled_task) {
+ idle_exit_fair(this_rq);
+ this_rq->idle_stamp = 0;
+ }
+
+ return pulled_task;
}
/*
@@ -6494,6 +6807,11 @@ out_unlock:
return 0;
}
+static inline int on_null_domain(struct rq *rq)
+{
+ return unlikely(!rcu_dereference_sched(rq->sd));
+}
+
#ifdef CONFIG_NO_HZ_COMMON
/*
* idle load balancing details
@@ -6548,8 +6866,13 @@ static void nohz_balancer_kick(void)
static inline void nohz_balance_exit_idle(int cpu)
{
if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
- cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
- atomic_dec(&nohz.nr_cpus);
+ /*
+ * Completely isolated CPUs don't ever set, so we must test.
+ */
+ if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) {
+ cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
+ atomic_dec(&nohz.nr_cpus);
+ }
clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
}
}
@@ -6603,6 +6926,12 @@ void nohz_balance_enter_idle(int cpu)
if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
return;
+ /*
+ * If we're a completely isolated CPU, we don't play.
+ */
+ if (on_null_domain(cpu_rq(cpu)))
+ return;
+
cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
atomic_inc(&nohz.nr_cpus);
set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
@@ -6865,11 +7194,6 @@ static void run_rebalance_domains(struct softirq_action *h)
nohz_idle_balance(this_rq, idle);
}
-static inline int on_null_domain(struct rq *rq)
-{
- return !rcu_dereference_sched(rq->sd);
-}
-
/*
* Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
*/
@@ -6999,15 +7323,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
struct cfs_rq *cfs_rq = cfs_rq_of(se);
/*
- * Ensure the task's vruntime is normalized, so that when its
+ * Ensure the task's vruntime is normalized, so that when it's
* switched back to the fair class the enqueue_entity(.flags=0) will
* do the right thing.
*
- * If it was on_rq, then the dequeue_entity(.flags=0) will already
- * have normalized the vruntime, if it was !on_rq, then only when
+ * If it's on_rq, then the dequeue_entity(.flags=0) will already
+ * have normalized the vruntime, if it's !on_rq, then only when
* the task is sleeping will it still have non-normalized vruntime.
*/
- if (!se->on_rq && p->state != TASK_RUNNING) {
+ if (!p->on_rq && p->state != TASK_RUNNING) {
/*
* Fix up our vruntime so that the current sleep doesn't
* cause 'unlimited' sleep bonus.
@@ -7034,7 +7358,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
*/
static void switched_to_fair(struct rq *rq, struct task_struct *p)
{
- if (!p->se.on_rq)
+ struct sched_entity *se = &p->se;
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ /*
+ * Since the real-depth could have been changed (only FAIR
+ * class maintain depth value), reset depth properly.
+ */
+ se->depth = se->parent ? se->parent->depth + 1 : 0;
+#endif
+ if (!se->on_rq)
return;
/*
@@ -7082,7 +7414,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
#ifdef CONFIG_FAIR_GROUP_SCHED
static void task_move_group_fair(struct task_struct *p, int on_rq)
{
+ struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq;
+
/*
* If the task was not on the rq at the time of this cgroup movement
* it must have been asleep, sleeping tasks keep their ->vruntime
@@ -7108,23 +7442,24 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
* To prevent boost or penalty in the new cfs_rq caused by delta
* min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
*/
- if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING))
+ if (!on_rq && (!se->sum_exec_runtime || p->state == TASK_WAKING))
on_rq = 1;
if (!on_rq)
- p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
+ se->vruntime -= cfs_rq_of(se)->min_vruntime;
set_task_rq(p, task_cpu(p));
+ se->depth = se->parent ? se->parent->depth + 1 : 0;
if (!on_rq) {
- cfs_rq = cfs_rq_of(&p->se);
- p->se.vruntime += cfs_rq->min_vruntime;
+ cfs_rq = cfs_rq_of(se);
+ se->vruntime += cfs_rq->min_vruntime;
#ifdef CONFIG_SMP
/*
* migrate_task_rq_fair() will have removed our previous
* contribution, but we must synchronize for ongoing future
* decay.
*/
- p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
- cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib;
+ se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
+ cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
#endif
}
}
@@ -7220,10 +7555,13 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
if (!se)
return;
- if (!parent)
+ if (!parent) {
se->cfs_rq = &rq->cfs;
- else
+ se->depth = 0;
+ } else {
se->cfs_rq = parent->my_q;
+ se->depth = parent->depth + 1;
+ }
se->my_q = cfs_rq;
/* guarantee group entities always have weight */
diff --git a/kernel/cpu/idle.c b/kernel/sched/idle.c
index 277f494c2a9..b7976a12717 100644
--- a/kernel/cpu/idle.c
+++ b/kernel/sched/idle.c
@@ -3,6 +3,7 @@
*/
#include <linux/sched.h>
#include <linux/cpu.h>
+#include <linux/cpuidle.h>
#include <linux/tick.h>
#include <linux/mm.h>
#include <linux/stackprotector.h>
@@ -95,8 +96,10 @@ static void cpu_idle_loop(void)
if (!current_clr_polling_and_test()) {
stop_critical_timings();
rcu_idle_enter();
- arch_cpu_idle();
- WARN_ON_ONCE(irqs_disabled());
+ if (cpuidle_idle_call())
+ arch_cpu_idle();
+ if (WARN_ON_ONCE(irqs_disabled()))
+ local_irq_enable();
rcu_idle_exit();
start_critical_timings();
} else {
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 516c3d9ceea..879f2b75266 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -13,18 +13,8 @@ select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
{
return task_cpu(p); /* IDLE tasks as never migrated */
}
-
-static void pre_schedule_idle(struct rq *rq, struct task_struct *prev)
-{
- idle_exit_fair(rq);
- rq_last_tick_reset(rq);
-}
-
-static void post_schedule_idle(struct rq *rq)
-{
- idle_enter_fair(rq);
-}
#endif /* CONFIG_SMP */
+
/*
* Idle tasks are unconditionally rescheduled:
*/
@@ -33,13 +23,12 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
resched_task(rq->idle);
}
-static struct task_struct *pick_next_task_idle(struct rq *rq)
+static struct task_struct *
+pick_next_task_idle(struct rq *rq, struct task_struct *prev)
{
+ put_prev_task(rq, prev);
+
schedstat_inc(rq, sched_goidle);
-#ifdef CONFIG_SMP
- /* Trigger the post schedule to do an idle_enter for CFS */
- rq->post_schedule = 1;
-#endif
return rq->idle;
}
@@ -58,6 +47,8 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
{
+ idle_exit_fair(rq);
+ rq_last_tick_reset(rq);
}
static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
@@ -101,8 +92,6 @@ const struct sched_class idle_sched_class = {
#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_idle,
- .pre_schedule = pre_schedule_idle,
- .post_schedule = post_schedule_idle,
#endif
.set_curr_task = set_curr_task_idle,
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index a2740b775b4..d8cdf161855 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -229,6 +229,14 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
#ifdef CONFIG_SMP
+static int pull_rt_task(struct rq *this_rq);
+
+static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
+{
+ /* Try to pull RT tasks here if we lower this rq's prio */
+ return rq->rt.highest_prio.curr > prev->prio;
+}
+
static inline int rt_overloaded(struct rq *rq)
{
return atomic_read(&rq->rd->rto_count);
@@ -315,6 +323,15 @@ static inline int has_pushable_tasks(struct rq *rq)
return !plist_head_empty(&rq->rt.pushable_tasks);
}
+static inline void set_post_schedule(struct rq *rq)
+{
+ /*
+ * We detect this state here so that we can avoid taking the RQ
+ * lock again later if there is no need to push
+ */
+ rq->post_schedule = has_pushable_tasks(rq);
+}
+
static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
{
plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
@@ -359,6 +376,19 @@ void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
}
+static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
+{
+ return false;
+}
+
+static inline int pull_rt_task(struct rq *this_rq)
+{
+ return 0;
+}
+
+static inline void set_post_schedule(struct rq *rq)
+{
+}
#endif /* CONFIG_SMP */
static inline int on_rt_rq(struct sched_rt_entity *rt_se)
@@ -440,11 +470,6 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
dequeue_rt_entity(rt_se);
}
-static inline int rt_rq_throttled(struct rt_rq *rt_rq)
-{
- return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
-}
-
static int rt_se_boosted(struct sched_rt_entity *rt_se)
{
struct rt_rq *rt_rq = group_rt_rq(rt_se);
@@ -515,11 +540,6 @@ static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
{
}
-static inline int rt_rq_throttled(struct rt_rq *rt_rq)
-{
- return rt_rq->rt_throttled;
-}
-
static inline const struct cpumask *sched_rt_period_mask(void)
{
return cpu_online_mask;
@@ -538,6 +558,14 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
#endif /* CONFIG_RT_GROUP_SCHED */
+bool sched_rt_bandwidth_account(struct rt_rq *rt_rq)
+{
+ struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
+
+ return (hrtimer_active(&rt_b->rt_period_timer) ||
+ rt_rq->rt_time < rt_b->rt_runtime);
+}
+
#ifdef CONFIG_SMP
/*
* We ran out of runtime, see if we can borrow some from our neighbours.
@@ -1310,15 +1338,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
{
struct sched_rt_entity *rt_se;
struct task_struct *p;
- struct rt_rq *rt_rq;
-
- rt_rq = &rq->rt;
-
- if (!rt_rq->rt_nr_running)
- return NULL;
-
- if (rt_rq_throttled(rt_rq))
- return NULL;
+ struct rt_rq *rt_rq = &rq->rt;
do {
rt_se = pick_next_rt_entity(rq, rt_rq);
@@ -1332,21 +1352,45 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
return p;
}
-static struct task_struct *pick_next_task_rt(struct rq *rq)
+static struct task_struct *
+pick_next_task_rt(struct rq *rq, struct task_struct *prev)
{
- struct task_struct *p = _pick_next_task_rt(rq);
+ struct task_struct *p;
+ struct rt_rq *rt_rq = &rq->rt;
+
+ if (need_pull_rt_task(rq, prev)) {
+ pull_rt_task(rq);
+ /*
+ * pull_rt_task() can drop (and re-acquire) rq->lock; this
+ * means a dl task can slip in, in which case we need to
+ * re-start task selection.
+ */
+ if (unlikely(rq->dl.dl_nr_running))
+ return RETRY_TASK;
+ }
+
+ /*
+ * We may dequeue prev's rt_rq in put_prev_task().
+ * So, we update time before rt_nr_running check.
+ */
+ if (prev->sched_class == &rt_sched_class)
+ update_curr_rt(rq);
+
+ if (!rt_rq->rt_nr_running)
+ return NULL;
+
+ if (rt_rq_throttled(rt_rq))
+ return NULL;
+
+ put_prev_task(rq, prev);
+
+ p = _pick_next_task_rt(rq);
/* The running task is never eligible for pushing */
if (p)
dequeue_pushable_task(rq, p);
-#ifdef CONFIG_SMP
- /*
- * We detect this state here so that we can avoid taking the RQ
- * lock again later if there is no need to push
- */
- rq->post_schedule = has_pushable_tasks(rq);
-#endif
+ set_post_schedule(rq);
return p;
}
@@ -1716,13 +1760,6 @@ skip:
return ret;
}
-static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
-{
- /* Try to pull RT tasks here if we lower this rq's prio */
- if (rq->rt.highest_prio.curr > prev->prio)
- pull_rt_task(rq);
-}
-
static void post_schedule_rt(struct rq *rq)
{
push_rt_tasks(rq);
@@ -1825,7 +1862,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
resched_task(rq->curr);
}
-void init_sched_rt_class(void)
+void __init init_sched_rt_class(void)
{
unsigned int i;
@@ -1999,7 +2036,6 @@ const struct sched_class rt_sched_class = {
.set_cpus_allowed = set_cpus_allowed_rt,
.rq_online = rq_online_rt,
.rq_offline = rq_offline_rt,
- .pre_schedule = pre_schedule_rt,
.post_schedule = post_schedule_rt,
.task_woken = task_woken_rt,
.switched_from = switched_from_rt,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c2119fd20f8..f2de7a17562 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -24,24 +24,6 @@ extern long calc_load_fold_active(struct rq *this_rq);
extern void update_cpu_load_active(struct rq *this_rq);
/*
- * Convert user-nice values [ -20 ... 0 ... 19 ]
- * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
- * and back.
- */
-#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
-#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
-#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
-
-/*
- * 'User priority' is the nice value converted to something we
- * can work with better when scaling various scheduler parameters,
- * it's a [ 0 ... 39 ] range.
- */
-#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
-#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
-#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
-
-/*
* Helpers for converting nanosecond timing to jiffy resolution
*/
#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
@@ -441,6 +423,18 @@ struct rt_rq {
#endif
};
+#ifdef CONFIG_RT_GROUP_SCHED
+static inline int rt_rq_throttled(struct rt_rq *rt_rq)
+{
+ return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
+}
+#else
+static inline int rt_rq_throttled(struct rt_rq *rt_rq)
+{
+ return rt_rq->rt_throttled;
+}
+#endif
+
/* Deadline class' related fields in a runqueue */
struct dl_rq {
/* runqueue is an rbtree, ordered by deadline */
@@ -462,7 +456,6 @@ struct dl_rq {
} earliest_dl;
unsigned long dl_nr_migratory;
- unsigned long dl_nr_total;
int overloaded;
/*
@@ -559,11 +552,9 @@ struct rq {
#ifdef CONFIG_FAIR_GROUP_SCHED
/* list of leaf cfs_rq on this cpu: */
struct list_head leaf_cfs_rq_list;
-#endif /* CONFIG_FAIR_GROUP_SCHED */
-#ifdef CONFIG_RT_GROUP_SCHED
- struct list_head leaf_rt_rq_list;
-#endif
+ struct sched_avg avg;
+#endif /* CONFIG_FAIR_GROUP_SCHED */
/*
* This is part of a global counter where only the total sum
@@ -652,8 +643,6 @@ struct rq {
#ifdef CONFIG_SMP
struct llist_head wake_list;
#endif
-
- struct sched_avg avg;
};
static inline int cpu_of(struct rq *rq)
@@ -1113,6 +1102,8 @@ static const u32 prio_to_wmult[40] = {
#define DEQUEUE_SLEEP 1
+#define RETRY_TASK ((void *)-1UL)
+
struct sched_class {
const struct sched_class *next;
@@ -1123,14 +1114,22 @@ struct sched_class {
void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags);
- struct task_struct * (*pick_next_task) (struct rq *rq);
+ /*
+ * It is the responsibility of the pick_next_task() method that will
+ * return the next task to call put_prev_task() on the @prev task or
+ * something equivalent.
+ *
+ * May return RETRY_TASK when it finds a higher prio class has runnable
+ * tasks.
+ */
+ struct task_struct * (*pick_next_task) (struct rq *rq,
+ struct task_struct *prev);
void (*put_prev_task) (struct rq *rq, struct task_struct *p);
#ifdef CONFIG_SMP
int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
void (*migrate_task_rq)(struct task_struct *p, int next_cpu);
- void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
void (*post_schedule) (struct rq *this_rq);
void (*task_waking) (struct task_struct *task);
void (*task_woken) (struct rq *this_rq, struct task_struct *task);
@@ -1160,6 +1159,11 @@ struct sched_class {
#endif
};
+static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
+{
+ prev->sched_class->put_prev_task(rq, prev);
+}
+
#define sched_class_highest (&stop_sched_class)
#define for_each_class(class) \
for (class = sched_class_highest; class; class = class->next)
@@ -1176,16 +1180,14 @@ extern const struct sched_class idle_sched_class;
extern void update_group_power(struct sched_domain *sd, int cpu);
extern void trigger_load_balance(struct rq *rq);
-extern void idle_balance(int this_cpu, struct rq *this_rq);
extern void idle_enter_fair(struct rq *this_rq);
extern void idle_exit_fair(struct rq *this_rq);
-#else /* CONFIG_SMP */
+#else
-static inline void idle_balance(int cpu, struct rq *rq)
-{
-}
+static inline void idle_enter_fair(struct rq *rq) { }
+static inline void idle_exit_fair(struct rq *rq) { }
#endif
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index fdb6bb0b335..d6ce65dde54 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -23,16 +23,19 @@ check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
/* we're never preempted */
}
-static struct task_struct *pick_next_task_stop(struct rq *rq)
+static struct task_struct *
+pick_next_task_stop(struct rq *rq, struct task_struct *prev)
{
struct task_struct *stop = rq->stop;
- if (stop && stop->on_rq) {
- stop->se.exec_start = rq_clock_task(rq);
- return stop;
- }
+ if (!stop || !stop->on_rq)
+ return NULL;
- return NULL;
+ put_prev_task(rq, prev);
+
+ stop->se.exec_start = rq_clock_task(rq);
+
+ return stop;
}
static void
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 84571e09c90..01fbae5b97b 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -293,7 +293,7 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
*/
smp_call_function_single(min(cpu1, cpu2),
&irq_cpu_stop_queue_work,
- &call_args, 0);
+ &call_args, 1);
lg_local_unlock(&stop_cpus_lock);
preempt_enable();
diff --git a/kernel/sys.c b/kernel/sys.c
index c0a58be780a..adaeab6f7a8 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -174,10 +174,10 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
/* normalize: avoid signed division (rounding problems) */
error = -ESRCH;
- if (niceval < -20)
- niceval = -20;
- if (niceval > 19)
- niceval = 19;
+ if (niceval < MIN_NICE)
+ niceval = MIN_NICE;
+ if (niceval > MAX_NICE)
+ niceval = MAX_NICE;
rcu_read_lock();
read_lock(&tasklist_lock);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 49e13e1f8fe..7754ff16f33 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -386,13 +386,6 @@ static struct ctl_table kern_table[] = {
.proc_handler = proc_dointvec,
},
{
- .procname = "numa_balancing_migrate_deferred",
- .data = &sysctl_numa_balancing_migrate_deferred,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
.procname = "numa_balancing",
.data = NULL, /* filled in by handler */
.maxlen = sizeof(unsigned int),
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index 0abb3646428..4d23dc4d813 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -116,20 +116,42 @@ static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt)
void __init sched_clock_register(u64 (*read)(void), int bits,
unsigned long rate)
{
+ u64 res, wrap, new_mask, new_epoch, cyc, ns;
+ u32 new_mult, new_shift;
+ ktime_t new_wrap_kt;
unsigned long r;
- u64 res, wrap;
char r_unit;
if (cd.rate > rate)
return;
WARN_ON(!irqs_disabled());
- read_sched_clock = read;
- sched_clock_mask = CLOCKSOURCE_MASK(bits);
- cd.rate = rate;
/* calculate the mult/shift to convert counter ticks to ns. */
- clocks_calc_mult_shift(&cd.mult, &cd.shift, rate, NSEC_PER_SEC, 3600);
+ clocks_calc_mult_shift(&new_mult, &new_shift, rate, NSEC_PER_SEC, 3600);
+
+ new_mask = CLOCKSOURCE_MASK(bits);
+
+ /* calculate how many ns until we wrap */
+ wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask);
+ new_wrap_kt = ns_to_ktime(wrap - (wrap >> 3));
+
+ /* update epoch for new counter and update epoch_ns from old counter*/
+ new_epoch = read();
+ cyc = read_sched_clock();
+ ns = cd.epoch_ns + cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask,
+ cd.mult, cd.shift);
+
+ raw_write_seqcount_begin(&cd.seq);
+ read_sched_clock = read;
+ sched_clock_mask = new_mask;
+ cd.rate = rate;
+ cd.wrap_kt = new_wrap_kt;
+ cd.mult = new_mult;
+ cd.shift = new_shift;
+ cd.epoch_cyc = new_epoch;
+ cd.epoch_ns = ns;
+ raw_write_seqcount_end(&cd.seq);
r = rate;
if (r >= 4000000) {
@@ -141,22 +163,12 @@ void __init sched_clock_register(u64 (*read)(void), int bits,
} else
r_unit = ' ';
- /* calculate how many ns until we wrap */
- wrap = clocks_calc_max_nsecs(cd.mult, cd.shift, 0, sched_clock_mask);
- cd.wrap_kt = ns_to_ktime(wrap - (wrap >> 3));
-
/* calculate the ns resolution of this counter */
- res = cyc_to_ns(1ULL, cd.mult, cd.shift);
+ res = cyc_to_ns(1ULL, new_mult, new_shift);
+
pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lluns\n",
bits, r, r_unit, res, wrap);
- update_sched_clock();
-
- /*
- * Ensure that sched_clock() starts off at 0ns
- */
- cd.epoch_ns = 0;
-
/* Enable IRQ time accounting if we have a fast enough sched_clock */
if (irqtime > 0 || (irqtime == -1 && rate >= 1000000))
enable_sched_clock_irqtime();
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 0aa4ce81bc1..5b40279ecd7 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1435,7 +1435,8 @@ void update_wall_time(void)
out:
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
if (clock_set)
- clock_was_set();
+ /* Have to call _delayed version, since in irq context*/
+ clock_was_set_delayed();
}
/**
diff --git a/kernel/torture.c b/kernel/torture.c
new file mode 100644
index 00000000000..acc9afc2f26
--- /dev/null
+++ b/kernel/torture.c
@@ -0,0 +1,719 @@
+/*
+ * Common functions for in-kernel torture tests.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, you can access it online at
+ * http://www.gnu.org/licenses/gpl-2.0.html.
+ *
+ * Copyright (C) IBM Corporation, 2014
+ *
+ * Author: Paul E. McKenney <paulmck@us.ibm.com>
+ * Based on kernel/rcu/torture.c.
+ */
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include <linux/err.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/interrupt.h>
+#include <linux/sched.h>
+#include <linux/atomic.h>
+#include <linux/bitops.h>
+#include <linux/completion.h>
+#include <linux/moduleparam.h>
+#include <linux/percpu.h>
+#include <linux/notifier.h>
+#include <linux/reboot.h>
+#include <linux/freezer.h>
+#include <linux/cpu.h>
+#include <linux/delay.h>
+#include <linux/stat.h>
+#include <linux/slab.h>
+#include <linux/trace_clock.h>
+#include <asm/byteorder.h>
+#include <linux/torture.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com>");
+
+static char *torture_type;
+static bool verbose;
+
+/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */
+#define FULLSTOP_DONTSTOP 0 /* Normal operation. */
+#define FULLSTOP_SHUTDOWN 1 /* System shutdown with torture running. */
+#define FULLSTOP_RMMOD 2 /* Normal rmmod of torture. */
+static int fullstop = FULLSTOP_RMMOD;
+static DEFINE_MUTEX(fullstop_mutex);
+static int *torture_runnable;
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+/*
+ * Variables for online-offline handling. Only present if CPU hotplug
+ * is enabled, otherwise does nothing.
+ */
+
+static struct task_struct *onoff_task;
+static long onoff_holdoff;
+static long onoff_interval;
+static long n_offline_attempts;
+static long n_offline_successes;
+static unsigned long sum_offline;
+static int min_offline = -1;
+static int max_offline;
+static long n_online_attempts;
+static long n_online_successes;
+static unsigned long sum_online;
+static int min_online = -1;
+static int max_online;
+
+/*
+ * Execute random CPU-hotplug operations at the interval specified
+ * by the onoff_interval.
+ */
+static int
+torture_onoff(void *arg)
+{
+ int cpu;
+ unsigned long delta;
+ int maxcpu = -1;
+ DEFINE_TORTURE_RANDOM(rand);
+ int ret;
+ unsigned long starttime;
+
+ VERBOSE_TOROUT_STRING("torture_onoff task started");
+ for_each_online_cpu(cpu)
+ maxcpu = cpu;
+ WARN_ON(maxcpu < 0);
+ if (onoff_holdoff > 0) {
+ VERBOSE_TOROUT_STRING("torture_onoff begin holdoff");
+ schedule_timeout_interruptible(onoff_holdoff);
+ VERBOSE_TOROUT_STRING("torture_onoff end holdoff");
+ }
+ while (!torture_must_stop()) {
+ cpu = (torture_random(&rand) >> 4) % (maxcpu + 1);
+ if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) {
+ if (verbose)
+ pr_alert("%s" TORTURE_FLAG
+ "torture_onoff task: offlining %d\n",
+ torture_type, cpu);
+ starttime = jiffies;
+ n_offline_attempts++;
+ ret = cpu_down(cpu);
+ if (ret) {
+ if (verbose)
+ pr_alert("%s" TORTURE_FLAG
+ "torture_onoff task: offline %d failed: errno %d\n",
+ torture_type, cpu, ret);
+ } else {
+ if (verbose)
+ pr_alert("%s" TORTURE_FLAG
+ "torture_onoff task: offlined %d\n",
+ torture_type, cpu);
+ n_offline_successes++;
+ delta = jiffies - starttime;
+ sum_offline += delta;
+ if (min_offline < 0) {
+ min_offline = delta;
+ max_offline = delta;
+ }
+ if (min_offline > delta)
+ min_offline = delta;
+ if (max_offline < delta)
+ max_offline = delta;
+ }
+ } else if (cpu_is_hotpluggable(cpu)) {
+ if (verbose)
+ pr_alert("%s" TORTURE_FLAG
+ "torture_onoff task: onlining %d\n",
+ torture_type, cpu);
+ starttime = jiffies;
+ n_online_attempts++;
+ ret = cpu_up(cpu);
+ if (ret) {
+ if (verbose)
+ pr_alert("%s" TORTURE_FLAG
+ "torture_onoff task: online %d failed: errno %d\n",
+ torture_type, cpu, ret);
+ } else {
+ if (verbose)
+ pr_alert("%s" TORTURE_FLAG
+ "torture_onoff task: onlined %d\n",
+ torture_type, cpu);
+ n_online_successes++;
+ delta = jiffies - starttime;
+ sum_online += delta;
+ if (min_online < 0) {
+ min_online = delta;
+ max_online = delta;
+ }
+ if (min_online > delta)
+ min_online = delta;
+ if (max_online < delta)
+ max_online = delta;
+ }
+ }
+ schedule_timeout_interruptible(onoff_interval);
+ }
+ torture_kthread_stopping("torture_onoff");
+ return 0;
+}
+
+#endif /* #ifdef CONFIG_HOTPLUG_CPU */
+
+/*
+ * Initiate online-offline handling.
+ */
+int torture_onoff_init(long ooholdoff, long oointerval)
+{
+ int ret = 0;
+
+#ifdef CONFIG_HOTPLUG_CPU
+ onoff_holdoff = ooholdoff;
+ onoff_interval = oointerval;
+ if (onoff_interval <= 0)
+ return 0;
+ ret = torture_create_kthread(torture_onoff, NULL, onoff_task);
+#endif /* #ifdef CONFIG_HOTPLUG_CPU */
+ return ret;
+}
+EXPORT_SYMBOL_GPL(torture_onoff_init);
+
+/*
+ * Clean up after online/offline testing.
+ */
+static void torture_onoff_cleanup(void)
+{
+#ifdef CONFIG_HOTPLUG_CPU
+ if (onoff_task == NULL)
+ return;
+ VERBOSE_TOROUT_STRING("Stopping torture_onoff task");
+ kthread_stop(onoff_task);
+ onoff_task = NULL;
+#endif /* #ifdef CONFIG_HOTPLUG_CPU */
+}
+EXPORT_SYMBOL_GPL(torture_onoff_cleanup);
+
+/*
+ * Print online/offline testing statistics.
+ */
+char *torture_onoff_stats(char *page)
+{
+#ifdef CONFIG_HOTPLUG_CPU
+ page += sprintf(page,
+ "onoff: %ld/%ld:%ld/%ld %d,%d:%d,%d %lu:%lu (HZ=%d) ",
+ n_online_successes, n_online_attempts,
+ n_offline_successes, n_offline_attempts,
+ min_online, max_online,
+ min_offline, max_offline,
+ sum_online, sum_offline, HZ);
+#endif /* #ifdef CONFIG_HOTPLUG_CPU */
+ return page;
+}
+EXPORT_SYMBOL_GPL(torture_onoff_stats);
+
+/*
+ * Were all the online/offline operations successful?
+ */
+bool torture_onoff_failures(void)
+{
+#ifdef CONFIG_HOTPLUG_CPU
+ return n_online_successes != n_online_attempts ||
+ n_offline_successes != n_offline_attempts;
+#else /* #ifdef CONFIG_HOTPLUG_CPU */
+ return false;
+#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
+}
+EXPORT_SYMBOL_GPL(torture_onoff_failures);
+
+#define TORTURE_RANDOM_MULT 39916801 /* prime */
+#define TORTURE_RANDOM_ADD 479001701 /* prime */
+#define TORTURE_RANDOM_REFRESH 10000
+
+/*
+ * Crude but fast random-number generator. Uses a linear congruential
+ * generator, with occasional help from cpu_clock().
+ */
+unsigned long
+torture_random(struct torture_random_state *trsp)
+{
+ if (--trsp->trs_count < 0) {
+ trsp->trs_state += (unsigned long)local_clock();
+ trsp->trs_count = TORTURE_RANDOM_REFRESH;
+ }
+ trsp->trs_state = trsp->trs_state * TORTURE_RANDOM_MULT +
+ TORTURE_RANDOM_ADD;
+ return swahw32(trsp->trs_state);
+}
+EXPORT_SYMBOL_GPL(torture_random);
+
+/*
+ * Variables for shuffling. The idea is to ensure that each CPU stays
+ * idle for an extended period to test interactions with dyntick idle,
+ * as well as interactions with any per-CPU varibles.
+ */
+struct shuffle_task {
+ struct list_head st_l;
+ struct task_struct *st_t;
+};
+
+static long shuffle_interval; /* In jiffies. */
+static struct task_struct *shuffler_task;
+static cpumask_var_t shuffle_tmp_mask;
+static int shuffle_idle_cpu; /* Force all torture tasks off this CPU */
+static struct list_head shuffle_task_list = LIST_HEAD_INIT(shuffle_task_list);
+static DEFINE_MUTEX(shuffle_task_mutex);
+
+/*
+ * Register a task to be shuffled. If there is no memory, just splat
+ * and don't bother registering.
+ */
+void torture_shuffle_task_register(struct task_struct *tp)
+{
+ struct shuffle_task *stp;
+
+ if (WARN_ON_ONCE(tp == NULL))
+ return;
+ stp = kmalloc(sizeof(*stp), GFP_KERNEL);
+ if (WARN_ON_ONCE(stp == NULL))
+ return;
+ stp->st_t = tp;
+ mutex_lock(&shuffle_task_mutex);
+ list_add(&stp->st_l, &shuffle_task_list);
+ mutex_unlock(&shuffle_task_mutex);
+}
+EXPORT_SYMBOL_GPL(torture_shuffle_task_register);
+
+/*
+ * Unregister all tasks, for example, at the end of the torture run.
+ */
+static void torture_shuffle_task_unregister_all(void)
+{
+ struct shuffle_task *stp;
+ struct shuffle_task *p;
+
+ mutex_lock(&shuffle_task_mutex);
+ list_for_each_entry_safe(stp, p, &shuffle_task_list, st_l) {
+ list_del(&stp->st_l);
+ kfree(stp);
+ }
+ mutex_unlock(&shuffle_task_mutex);
+}
+
+/* Shuffle tasks such that we allow shuffle_idle_cpu to become idle.
+ * A special case is when shuffle_idle_cpu = -1, in which case we allow
+ * the tasks to run on all CPUs.
+ */
+static void torture_shuffle_tasks(void)
+{
+ struct shuffle_task *stp;
+
+ cpumask_setall(shuffle_tmp_mask);
+ get_online_cpus();
+
+ /* No point in shuffling if there is only one online CPU (ex: UP) */
+ if (num_online_cpus() == 1) {
+ put_online_cpus();
+ return;
+ }
+
+ /* Advance to the next CPU. Upon overflow, don't idle any CPUs. */
+ shuffle_idle_cpu = cpumask_next(shuffle_idle_cpu, shuffle_tmp_mask);
+ if (shuffle_idle_cpu >= nr_cpu_ids)
+ shuffle_idle_cpu = -1;
+ if (shuffle_idle_cpu != -1) {
+ cpumask_clear_cpu(shuffle_idle_cpu, shuffle_tmp_mask);
+ if (cpumask_empty(shuffle_tmp_mask)) {
+ put_online_cpus();
+ return;
+ }
+ }
+
+ mutex_lock(&shuffle_task_mutex);
+ list_for_each_entry(stp, &shuffle_task_list, st_l)
+ set_cpus_allowed_ptr(stp->st_t, shuffle_tmp_mask);
+ mutex_unlock(&shuffle_task_mutex);
+
+ put_online_cpus();
+}
+
+/* Shuffle tasks across CPUs, with the intent of allowing each CPU in the
+ * system to become idle at a time and cut off its timer ticks. This is meant
+ * to test the support for such tickless idle CPU in RCU.
+ */
+static int torture_shuffle(void *arg)
+{
+ VERBOSE_TOROUT_STRING("torture_shuffle task started");
+ do {
+ schedule_timeout_interruptible(shuffle_interval);
+ torture_shuffle_tasks();
+ torture_shutdown_absorb("torture_shuffle");
+ } while (!torture_must_stop());
+ torture_kthread_stopping("torture_shuffle");
+ return 0;
+}
+
+/*
+ * Start the shuffler, with shuffint in jiffies.
+ */
+int torture_shuffle_init(long shuffint)
+{
+ shuffle_interval = shuffint;
+
+ shuffle_idle_cpu = -1;
+
+ if (!alloc_cpumask_var(&shuffle_tmp_mask, GFP_KERNEL)) {
+ VERBOSE_TOROUT_ERRSTRING("Failed to alloc mask");
+ return -ENOMEM;
+ }
+
+ /* Create the shuffler thread */
+ return torture_create_kthread(torture_shuffle, NULL, shuffler_task);
+}
+EXPORT_SYMBOL_GPL(torture_shuffle_init);
+
+/*
+ * Stop the shuffling.
+ */
+static void torture_shuffle_cleanup(void)
+{
+ torture_shuffle_task_unregister_all();
+ if (shuffler_task) {
+ VERBOSE_TOROUT_STRING("Stopping torture_shuffle task");
+ kthread_stop(shuffler_task);
+ free_cpumask_var(shuffle_tmp_mask);
+ }
+ shuffler_task = NULL;
+}
+EXPORT_SYMBOL_GPL(torture_shuffle_cleanup);
+
+/*
+ * Variables for auto-shutdown. This allows "lights out" torture runs
+ * to be fully scripted.
+ */
+static int shutdown_secs; /* desired test duration in seconds. */
+static struct task_struct *shutdown_task;
+static unsigned long shutdown_time; /* jiffies to system shutdown. */
+static void (*torture_shutdown_hook)(void);
+
+/*
+ * Absorb kthreads into a kernel function that won't return, so that
+ * they won't ever access module text or data again.
+ */
+void torture_shutdown_absorb(const char *title)
+{
+ while (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) {
+ pr_notice("torture thread %s parking due to system shutdown\n",
+ title);
+ schedule_timeout_uninterruptible(MAX_SCHEDULE_TIMEOUT);
+ }
+}
+EXPORT_SYMBOL_GPL(torture_shutdown_absorb);
+
+/*
+ * Cause the torture test to shutdown the system after the test has
+ * run for the time specified by the shutdown_secs parameter.
+ */
+static int torture_shutdown(void *arg)
+{
+ long delta;
+ unsigned long jiffies_snap;
+
+ VERBOSE_TOROUT_STRING("torture_shutdown task started");
+ jiffies_snap = jiffies;
+ while (ULONG_CMP_LT(jiffies_snap, shutdown_time) &&
+ !torture_must_stop()) {
+ delta = shutdown_time - jiffies_snap;
+ if (verbose)
+ pr_alert("%s" TORTURE_FLAG
+ "torture_shutdown task: %lu jiffies remaining\n",
+ torture_type, delta);
+ schedule_timeout_interruptible(delta);
+ jiffies_snap = jiffies;
+ }
+ if (torture_must_stop()) {
+ torture_kthread_stopping("torture_shutdown");
+ return 0;
+ }
+
+ /* OK, shut down the system. */
+
+ VERBOSE_TOROUT_STRING("torture_shutdown task shutting down system");
+ shutdown_task = NULL; /* Avoid self-kill deadlock. */
+ if (torture_shutdown_hook)
+ torture_shutdown_hook();
+ else
+ VERBOSE_TOROUT_STRING("No torture_shutdown_hook(), skipping.");
+ kernel_power_off(); /* Shut down the system. */
+ return 0;
+}
+
+/*
+ * Start up the shutdown task.
+ */
+int torture_shutdown_init(int ssecs, void (*cleanup)(void))
+{
+ int ret = 0;
+
+ shutdown_secs = ssecs;
+ torture_shutdown_hook = cleanup;
+ if (shutdown_secs > 0) {
+ shutdown_time = jiffies + shutdown_secs * HZ;
+ ret = torture_create_kthread(torture_shutdown, NULL,
+ shutdown_task);
+ }
+ return ret;
+}
+EXPORT_SYMBOL_GPL(torture_shutdown_init);
+
+/*
+ * Detect and respond to a system shutdown.
+ */
+static int torture_shutdown_notify(struct notifier_block *unused1,
+ unsigned long unused2, void *unused3)
+{
+ mutex_lock(&fullstop_mutex);
+ if (ACCESS_ONCE(fullstop) == FULLSTOP_DONTSTOP) {
+ VERBOSE_TOROUT_STRING("Unscheduled system shutdown detected");
+ ACCESS_ONCE(fullstop) = FULLSTOP_SHUTDOWN;
+ } else {
+ pr_warn("Concurrent rmmod and shutdown illegal!\n");
+ }
+ mutex_unlock(&fullstop_mutex);
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block torture_shutdown_nb = {
+ .notifier_call = torture_shutdown_notify,
+};
+
+/*
+ * Shut down the shutdown task. Say what??? Heh! This can happen if
+ * the torture module gets an rmmod before the shutdown time arrives. ;-)
+ */
+static void torture_shutdown_cleanup(void)
+{
+ unregister_reboot_notifier(&torture_shutdown_nb);
+ if (shutdown_task != NULL) {
+ VERBOSE_TOROUT_STRING("Stopping torture_shutdown task");
+ kthread_stop(shutdown_task);
+ }
+ shutdown_task = NULL;
+}
+
+/*
+ * Variables for stuttering, which means to periodically pause and
+ * restart testing in order to catch bugs that appear when load is
+ * suddenly applied to or removed from the system.
+ */
+static struct task_struct *stutter_task;
+static int stutter_pause_test;
+static int stutter;
+
+/*
+ * Block until the stutter interval ends. This must be called periodically
+ * by all running kthreads that need to be subject to stuttering.
+ */
+void stutter_wait(const char *title)
+{
+ while (ACCESS_ONCE(stutter_pause_test) ||
+ (torture_runnable && !ACCESS_ONCE(*torture_runnable))) {
+ if (stutter_pause_test)
+ schedule_timeout_interruptible(1);
+ else
+ schedule_timeout_interruptible(round_jiffies_relative(HZ));
+ torture_shutdown_absorb(title);
+ }
+}
+EXPORT_SYMBOL_GPL(stutter_wait);
+
+/*
+ * Cause the torture test to "stutter", starting and stopping all
+ * threads periodically.
+ */
+static int torture_stutter(void *arg)
+{
+ VERBOSE_TOROUT_STRING("torture_stutter task started");
+ do {
+ if (!torture_must_stop()) {
+ schedule_timeout_interruptible(stutter);
+ ACCESS_ONCE(stutter_pause_test) = 1;
+ }
+ if (!torture_must_stop())
+ schedule_timeout_interruptible(stutter);
+ ACCESS_ONCE(stutter_pause_test) = 0;
+ torture_shutdown_absorb("torture_stutter");
+ } while (!torture_must_stop());
+ torture_kthread_stopping("torture_stutter");
+ return 0;
+}
+
+/*
+ * Initialize and kick off the torture_stutter kthread.
+ */
+int torture_stutter_init(int s)
+{
+ int ret;
+
+ stutter = s;
+ ret = torture_create_kthread(torture_stutter, NULL, stutter_task);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(torture_stutter_init);
+
+/*
+ * Cleanup after the torture_stutter kthread.
+ */
+static void torture_stutter_cleanup(void)
+{
+ if (!stutter_task)
+ return;
+ VERBOSE_TOROUT_STRING("Stopping torture_stutter task");
+ kthread_stop(stutter_task);
+ stutter_task = NULL;
+}
+
+/*
+ * Initialize torture module. Please note that this is -not- invoked via
+ * the usual module_init() mechanism, but rather by an explicit call from
+ * the client torture module. This call must be paired with a later
+ * torture_init_end().
+ *
+ * The runnable parameter points to a flag that controls whether or not
+ * the test is currently runnable. If there is no such flag, pass in NULL.
+ */
+void __init torture_init_begin(char *ttype, bool v, int *runnable)
+{
+ mutex_lock(&fullstop_mutex);
+ torture_type = ttype;
+ verbose = v;
+ torture_runnable = runnable;
+ fullstop = FULLSTOP_DONTSTOP;
+
+}
+EXPORT_SYMBOL_GPL(torture_init_begin);
+
+/*
+ * Tell the torture module that initialization is complete.
+ */
+void __init torture_init_end(void)
+{
+ mutex_unlock(&fullstop_mutex);
+ register_reboot_notifier(&torture_shutdown_nb);
+}
+EXPORT_SYMBOL_GPL(torture_init_end);
+
+/*
+ * Clean up torture module. Please note that this is -not- invoked via
+ * the usual module_exit() mechanism, but rather by an explicit call from
+ * the client torture module. Returns true if a race with system shutdown
+ * is detected, otherwise, all kthreads started by functions in this file
+ * will be shut down.
+ *
+ * This must be called before the caller starts shutting down its own
+ * kthreads.
+ */
+bool torture_cleanup(void)
+{
+ mutex_lock(&fullstop_mutex);
+ if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) {
+ pr_warn("Concurrent rmmod and shutdown illegal!\n");
+ mutex_unlock(&fullstop_mutex);
+ schedule_timeout_uninterruptible(10);
+ return true;
+ }
+ ACCESS_ONCE(fullstop) = FULLSTOP_RMMOD;
+ mutex_unlock(&fullstop_mutex);
+ torture_shutdown_cleanup();
+ torture_shuffle_cleanup();
+ torture_stutter_cleanup();
+ torture_onoff_cleanup();
+ return false;
+}
+EXPORT_SYMBOL_GPL(torture_cleanup);
+
+/*
+ * Is it time for the current torture test to stop?
+ */
+bool torture_must_stop(void)
+{
+ return torture_must_stop_irq() || kthread_should_stop();
+}
+EXPORT_SYMBOL_GPL(torture_must_stop);
+
+/*
+ * Is it time for the current torture test to stop? This is the irq-safe
+ * version, hence no check for kthread_should_stop().
+ */
+bool torture_must_stop_irq(void)
+{
+ return ACCESS_ONCE(fullstop) != FULLSTOP_DONTSTOP;
+}
+EXPORT_SYMBOL_GPL(torture_must_stop_irq);
+
+/*
+ * Each kthread must wait for kthread_should_stop() before returning from
+ * its top-level function, otherwise segfaults ensue. This function
+ * prints a "stopping" message and waits for kthread_should_stop(), and
+ * should be called from all torture kthreads immediately prior to
+ * returning.
+ */
+void torture_kthread_stopping(char *title)
+{
+ if (verbose)
+ VERBOSE_TOROUT_STRING(title);
+ while (!kthread_should_stop()) {
+ torture_shutdown_absorb(title);
+ schedule_timeout_uninterruptible(1);
+ }
+}
+EXPORT_SYMBOL_GPL(torture_kthread_stopping);
+
+/*
+ * Create a generic torture kthread that is immediately runnable. If you
+ * need the kthread to be stopped so that you can do something to it before
+ * it starts, you will need to open-code your own.
+ */
+int _torture_create_kthread(int (*fn)(void *arg), void *arg, char *s, char *m,
+ char *f, struct task_struct **tp)
+{
+ int ret = 0;
+
+ VERBOSE_TOROUT_STRING(m);
+ *tp = kthread_run(fn, arg, s);
+ if (IS_ERR(*tp)) {
+ ret = PTR_ERR(*tp);
+ VERBOSE_TOROUT_ERRSTRING(f);
+ *tp = NULL;
+ }
+ torture_shuffle_task_register(*tp);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(_torture_create_kthread);
+
+/*
+ * Stop a generic kthread, emitting a message.
+ */
+void _torture_stop_kthread(char *m, struct task_struct **tp)
+{
+ if (*tp == NULL)
+ return;
+ VERBOSE_TOROUT_STRING(m);
+ kthread_stop(*tp);
+ *tp = NULL;
+}
+EXPORT_SYMBOL_GPL(_torture_stop_kthread);
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index a5457d577b9..0434ff1b808 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -40,8 +40,8 @@ static int write_iteration = 50;
module_param(write_iteration, uint, 0644);
MODULE_PARM_DESC(write_iteration, "# of writes between timestamp readings");
-static int producer_nice = 19;
-static int consumer_nice = 19;
+static int producer_nice = MAX_NICE;
+static int consumer_nice = MAX_NICE;
static int producer_fifo = -1;
static int consumer_fifo = -1;
@@ -308,7 +308,7 @@ static void ring_buffer_producer(void)
/* Let the user know that the test is running at low priority */
if (producer_fifo < 0 && consumer_fifo < 0 &&
- producer_nice == 19 && consumer_nice == 19)
+ producer_nice == MAX_NICE && consumer_nice == MAX_NICE)
trace_printk("WARNING!!! This test is running at lowest priority.\n");
trace_printk("Time: %lld (usecs)\n", time);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 815c878f409..24c1f238255 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1600,15 +1600,31 @@ void trace_buffer_unlock_commit(struct ring_buffer *buffer,
}
EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit);
+static struct ring_buffer *temp_buffer;
+
struct ring_buffer_event *
trace_event_buffer_lock_reserve(struct ring_buffer **current_rb,
struct ftrace_event_file *ftrace_file,
int type, unsigned long len,
unsigned long flags, int pc)
{
+ struct ring_buffer_event *entry;
+
*current_rb = ftrace_file->tr->trace_buffer.buffer;
- return trace_buffer_lock_reserve(*current_rb,
+ entry = trace_buffer_lock_reserve(*current_rb,
type, len, flags, pc);
+ /*
+ * If tracing is off, but we have triggers enabled
+ * we still need to look at the event data. Use the temp_buffer
+ * to store the trace event for the tigger to use. It's recusive
+ * safe and will not be recorded anywhere.
+ */
+ if (!entry && ftrace_file->flags & FTRACE_EVENT_FL_TRIGGER_COND) {
+ *current_rb = temp_buffer;
+ entry = trace_buffer_lock_reserve(*current_rb,
+ type, len, flags, pc);
+ }
+ return entry;
}
EXPORT_SYMBOL_GPL(trace_event_buffer_lock_reserve);
@@ -6494,11 +6510,16 @@ __init static int tracer_alloc_buffers(void)
raw_spin_lock_init(&global_trace.start_lock);
+ /* Used for event triggers */
+ temp_buffer = ring_buffer_alloc(PAGE_SIZE, RB_FL_OVERWRITE);
+ if (!temp_buffer)
+ goto out_free_cpumask;
+
/* TODO: make the number of buffers hot pluggable with CPUS */
if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) {
printk(KERN_ERR "tracer: failed to allocate ring buffer!\n");
WARN_ON(1);
- goto out_free_cpumask;
+ goto out_free_temp_buffer;
}
if (global_trace.buffer_disabled)
@@ -6540,6 +6561,8 @@ __init static int tracer_alloc_buffers(void)
return 0;
+out_free_temp_buffer:
+ ring_buffer_free(temp_buffer);
out_free_cpumask:
free_percpu(global_trace.trace_buffer.data);
#ifdef CONFIG_TRACER_MAX_TRACE
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index e854f420e03..c894614de14 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -31,9 +31,25 @@ static int perf_trace_event_perm(struct ftrace_event_call *tp_event,
}
/* The ftrace function trace is allowed only for root. */
- if (ftrace_event_is_function(tp_event) &&
- perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
- return -EPERM;
+ if (ftrace_event_is_function(tp_event)) {
+ if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ /*
+ * We don't allow user space callchains for function trace
+ * event, due to issues with page faults while tracing page
+ * fault handler and its overall trickiness nature.
+ */
+ if (!p_event->attr.exclude_callchain_user)
+ return -EINVAL;
+
+ /*
+ * Same reason to disable user stack dump as for user space
+ * callchains above.
+ */
+ if (p_event->attr.sample_type & PERF_SAMPLE_STACK_USER)
+ return -EINVAL;
+ }
/* No tracing, just counting, so no obvious leak */
if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW))
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index e71ffd4eccb..7b16d40bd64 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -27,12 +27,6 @@
DEFINE_MUTEX(event_mutex);
-DEFINE_MUTEX(event_storage_mutex);
-EXPORT_SYMBOL_GPL(event_storage_mutex);
-
-char event_storage[EVENT_STORAGE_SIZE];
-EXPORT_SYMBOL_GPL(event_storage);
-
LIST_HEAD(ftrace_events);
static LIST_HEAD(ftrace_common_fields);
@@ -1777,6 +1771,16 @@ static void trace_module_add_events(struct module *mod)
{
struct ftrace_event_call **call, **start, **end;
+ if (!mod->num_trace_events)
+ return;
+
+ /* Don't add infrastructure for mods without tracepoints */
+ if (trace_module_has_bad_taint(mod)) {
+ pr_err("%s: module has bad taint, not creating trace events\n",
+ mod->name);
+ return;
+ }
+
start = mod->trace_events;
end = mod->trace_events + mod->num_trace_events;
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 7c3e3e72e2b..ee0a5098ac4 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -95,15 +95,12 @@ static void __always_unused ____ftrace_check_##name(void) \
#undef __array
#define __array(type, item, len) \
do { \
+ char *type_str = #type"["__stringify(len)"]"; \
BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \
- mutex_lock(&event_storage_mutex); \
- snprintf(event_storage, sizeof(event_storage), \
- "%s[%d]", #type, len); \
- ret = trace_define_field(event_call, event_storage, #item, \
+ ret = trace_define_field(event_call, type_str, #item, \
offsetof(typeof(field), item), \
sizeof(field.item), \
is_signed_type(type), filter_type); \
- mutex_unlock(&event_storage_mutex); \
if (ret) \
return ret; \
} while (0);
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 29f26540e9c..031cc5655a5 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -631,6 +631,11 @@ void tracepoint_iter_reset(struct tracepoint_iter *iter)
EXPORT_SYMBOL_GPL(tracepoint_iter_reset);
#ifdef CONFIG_MODULES
+bool trace_module_has_bad_taint(struct module *mod)
+{
+ return mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP));
+}
+
static int tracepoint_module_coming(struct module *mod)
{
struct tp_module *tp_mod, *iter;
@@ -641,7 +646,7 @@ static int tracepoint_module_coming(struct module *mod)
* module headers (for forced load), to make sure we don't cause a crash.
* Staging and out-of-tree GPL modules are fine.
*/
- if (mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP)))
+ if (trace_module_has_bad_taint(mod))
return 0;
mutex_lock(&tracepoints_mutex);
tp_mod = kmalloc(sizeof(struct tp_module), GFP_KERNEL);
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 240fb62cf39..dd06439b9c8 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -225,7 +225,7 @@ static u32 map_id_up(struct uid_gid_map *map, u32 id)
*
* When there is no mapping defined for the user-namespace uid
* pair INVALID_UID is returned. Callers are expected to test
- * for and handle handle INVALID_UID being returned. INVALID_UID
+ * for and handle INVALID_UID being returned. INVALID_UID
* may be tested for using uid_valid().
*/
kuid_t make_kuid(struct user_namespace *ns, uid_t uid)
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 82ef9f3b747..3fa5b8f3aae 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1851,6 +1851,12 @@ static void destroy_worker(struct worker *worker)
if (worker->flags & WORKER_IDLE)
pool->nr_idle--;
+ /*
+ * Once WORKER_DIE is set, the kworker may destroy itself at any
+ * point. Pin to ensure the task stays until we're done with it.
+ */
+ get_task_struct(worker->task);
+
list_del_init(&worker->entry);
worker->flags |= WORKER_DIE;
@@ -1859,6 +1865,7 @@ static void destroy_worker(struct worker *worker)
spin_unlock_irq(&pool->lock);
kthread_stop(worker->task);
+ put_task_struct(worker->task);
kfree(worker);
spin_lock_irq(&pool->lock);
@@ -3218,7 +3225,7 @@ static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr,
return -ENOMEM;
if (sscanf(buf, "%d", &attrs->nice) == 1 &&
- attrs->nice >= -20 && attrs->nice <= 19)
+ attrs->nice >= MIN_NICE && attrs->nice <= MAX_NICE)
ret = apply_workqueue_attrs(wq, attrs);
else
ret = -EINVAL;