about summary refs log tree commit diff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r-- kernel/audit.c 2
-rw-r--r-- kernel/audit_tree.c 1
-rw-r--r-- kernel/auditfilter.c 3
-rw-r--r-- kernel/cgroup.c 31
-rw-r--r-- kernel/context_tracking.c 41
-rw-r--r-- kernel/cpu.c 55
-rw-r--r-- kernel/cpu/idle.c 19
-rw-r--r-- kernel/events/core.c 473
-rw-r--r-- kernel/events/hw_breakpoint.c 6
-rw-r--r-- kernel/events/internal.h 4
-rw-r--r-- kernel/exit.c 2
-rw-r--r-- kernel/irq/irqdomain.c 9
-rw-r--r-- kernel/kmod.c 5
-rw-r--r-- kernel/kprobes.c 30
-rw-r--r-- kernel/module.c 21
-rw-r--r-- kernel/printk.c 91
-rw-r--r-- kernel/ptrace.c 20
-rw-r--r-- kernel/range.c 19
-rw-r--r-- kernel/rcutree.c 21
-rw-r--r-- kernel/rcutree.h 2
-rw-r--r-- kernel/rcutree_plugin.h 4
-rw-r--r-- kernel/sched/core.c 23
-rw-r--r-- kernel/sched/cputime.c 6
-rw-r--r-- kernel/softirq.c 13
-rw-r--r-- kernel/sys.c 29
-rw-r--r-- kernel/time/Kconfig 5
-rw-r--r-- kernel/time/ntp.c 1
-rw-r--r-- kernel/time/tick-broadcast.c 29
-rw-r--r-- kernel/time/tick-sched.c 5
-rw-r--r-- kernel/time/timekeeping.c 8
-rw-r--r-- kernel/timer.c 2
-rw-r--r-- kernel/trace/ftrace.c 18
-rw-r--r-- kernel/trace/ring_buffer.c 3
-rw-r--r-- kernel/trace/trace.c 27
-rw-r--r-- kernel/trace/trace.h 2
-rw-r--r-- kernel/trace/trace_events.c 4
-rw-r--r-- kernel/trace/trace_events_filter.c 4
-rw-r--r-- kernel/trace/trace_kprobe.c 53
-rw-r--r-- kernel/trace/trace_selftest.c 2
-rw-r--r-- kernel/workqueue.c 19
40 files changed, 667 insertions, 445 deletions
diff --git a/kernel/audit.c b/kernel/audit.c
index 21c7fa615bd..91e53d04b6a 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1056,7 +1056,7 @@ static inline void audit_get_stamp(struct audit_context *ctx,
static void wait_for_auditd(unsigned long sleep_time)
{
DECLARE_WAITQUEUE(wait, current);
- set_current_state(TASK_INTERRUPTIBLE);
+ set_current_state(TASK_UNINTERRUPTIBLE);
add_wait_queue(&audit_backlog_wait, &wait);
if (audit_backlog_limit &&
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index a291aa23fb3..43c307dc945 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -658,6 +658,7 @@ int audit_add_tree_rule(struct audit_krule *rule)
struct vfsmount *mnt;
int err;
+ rule->tree = NULL;
list_for_each_entry(tree, &tree_list, list) {
if (!strcmp(seed->pathname, tree->pathname)) {
put_tree(seed);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 83a2970295d..6bd4a90d199 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1021,9 +1021,6 @@ static void audit_log_rule_change(char *action, struct audit_krule *rule, int re
* @seq: netlink audit message sequence (serial) number
* @data: payload data
* @datasz: size of payload data
- * @loginuid: loginuid of sender
- * @sessionid: sessionid for netlink audit message
- * @sid: SE Linux Security ID of sender
*/
int audit_receive_filter(int type, int pid, int seq, void *data, size_t datasz)
{
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 2a9926275f8..a7c9e6ddb97 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1686,11 +1686,14 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
*/
cgroup_drop_root(opts.new_root);
- if (((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) &&
- root->flags != opts.flags) {
- pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n");
- ret = -EINVAL;
- goto drop_new_super;
+ if (root->flags != opts.flags) {
+ if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) {
+ pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n");
+ ret = -EINVAL;
+ goto drop_new_super;
+ } else {
+ pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n");
+ }
}
/* no subsys rebinding, so refcounts don't change */
@@ -2699,13 +2702,14 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
goto out;
}
+ cfe->type = (void *)cft;
+ cfe->dentry = dentry;
+ dentry->d_fsdata = cfe;
+ simple_xattrs_init(&cfe->xattrs);
+
mode = cgroup_file_mode(cft);
error = cgroup_create_file(dentry, mode | S_IFREG, cgrp->root->sb);
if (!error) {
- cfe->type = (void *)cft;
- cfe->dentry = dentry;
- dentry->d_fsdata = cfe;
- simple_xattrs_init(&cfe->xattrs);
list_add_tail(&cfe->node, &parent->files);
cfe = NULL;
}
@@ -2953,11 +2957,8 @@ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
WARN_ON_ONCE(!rcu_read_lock_held());
/* if first iteration, pretend we just visited @cgroup */
- if (!pos) {
- if (list_empty(&cgroup->children))
- return NULL;
+ if (!pos)
pos = cgroup;
- }
/* visit the first child if exists */
next = list_first_or_null_rcu(&pos->children, struct cgroup, sibling);
@@ -2965,14 +2966,14 @@ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos,
return next;
/* no child, visit my or the closest ancestor's next sibling */
- do {
+ while (pos != cgroup) {
next = list_entry_rcu(pos->sibling.next, struct cgroup,
sibling);
if (&next->sibling != &pos->parent->children)
return next;
pos = pos->parent;
- } while (pos != cgroup);
+ }
return NULL;
}
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 65349f07b87..383f8231e43 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -15,7 +15,6 @@
*/
#include <linux/context_tracking.h>
-#include <linux/kvm_host.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/hardirq.h>
@@ -71,6 +70,46 @@ void user_enter(void)
local_irq_restore(flags);
}
+#ifdef CONFIG_PREEMPT
+/**
+ * preempt_schedule_context - preempt_schedule called by tracing
+ *
+ * The tracing infrastructure uses preempt_enable_notrace to prevent
+ * recursion and tracing preempt enabling caused by the tracing
+ * infrastructure itself. But as tracing can happen in areas coming
+ * from userspace or just about to enter userspace, a preempt enable
+ * can occur before user_exit() is called. This will cause the scheduler
+ * to be called when the system is still in usermode.
+ *
+ * To prevent this, the preempt_enable_notrace will use this function
+ * instead of preempt_schedule() to exit user context if needed before
+ * calling the scheduler.
+ */
+void __sched notrace preempt_schedule_context(void)
+{
+ struct thread_info *ti = current_thread_info();
+ enum ctx_state prev_ctx;
+
+ if (likely(ti->preempt_count || irqs_disabled()))
+ return;
+
+ /*
+ * Need to disable preemption in case user_exit() is traced
+ * and the tracer calls preempt_enable_notrace() causing
+ * an infinite recursion.
+ */
+ preempt_disable_notrace();
+ prev_ctx = exception_enter();
+ preempt_enable_no_resched_notrace();
+
+ preempt_schedule();
+
+ preempt_disable_notrace();
+ exception_exit(prev_ctx);
+ preempt_enable_notrace();
+}
+EXPORT_SYMBOL_GPL(preempt_schedule_context);
+#endif /* CONFIG_PREEMPT */
/**
* user_exit - Inform the context tracking that the CPU is
diff --git a/kernel/cpu.c b/kernel/cpu.c
index b5e4ab2d427..198a38883e6 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -133,6 +133,27 @@ static void cpu_hotplug_done(void)
mutex_unlock(&cpu_hotplug.lock);
}
+/*
+ * Wait for currently running CPU hotplug operations to complete (if any) and
+ * disable future CPU hotplug (from sysfs). The 'cpu_add_remove_lock' protects
+ * the 'cpu_hotplug_disabled' flag. The same lock is also acquired by the
+ * hotplug path before performing hotplug operations. So acquiring that lock
+ * guarantees mutual exclusion from any currently running hotplug operations.
+ */
+void cpu_hotplug_disable(void)
+{
+ cpu_maps_update_begin();
+ cpu_hotplug_disabled = 1;
+ cpu_maps_update_done();
+}
+
+void cpu_hotplug_enable(void)
+{
+ cpu_maps_update_begin();
+ cpu_hotplug_disabled = 0;
+ cpu_maps_update_done();
+}
+
#else /* #if CONFIG_HOTPLUG_CPU */
static void cpu_hotplug_begin(void) {}
static void cpu_hotplug_done(void) {}
@@ -541,36 +562,6 @@ static int __init alloc_frozen_cpus(void)
core_initcall(alloc_frozen_cpus);
/*
- * Prevent regular CPU hotplug from racing with the freezer, by disabling CPU
- * hotplug when tasks are about to be frozen. Also, don't allow the freezer
- * to continue until any currently running CPU hotplug operation gets
- * completed.
- * To modify the 'cpu_hotplug_disabled' flag, we need to acquire the
- * 'cpu_add_remove_lock'. And this same lock is also taken by the regular
- * CPU hotplug path and released only after it is complete. Thus, we
- * (and hence the freezer) will block here until any currently running CPU
- * hotplug operation gets completed.
- */
-void cpu_hotplug_disable_before_freeze(void)
-{
- cpu_maps_update_begin();
- cpu_hotplug_disabled = 1;
- cpu_maps_update_done();
-}
-
-
-/*
- * When tasks have been thawed, re-enable regular CPU hotplug (which had been
- * disabled while beginning to freeze tasks).
- */
-void cpu_hotplug_enable_after_thaw(void)
-{
- cpu_maps_update_begin();
- cpu_hotplug_disabled = 0;
- cpu_maps_update_done();
-}
-
-/*
* When callbacks for CPU hotplug notifications are being executed, we must
* ensure that the state of the system with respect to the tasks being frozen
* or not, as reported by the notification, remains unchanged *throughout the
@@ -589,12 +580,12 @@ cpu_hotplug_pm_callback(struct notifier_block *nb,
case PM_SUSPEND_PREPARE:
case PM_HIBERNATION_PREPARE:
- cpu_hotplug_disable_before_freeze();
+ cpu_hotplug_disable();
break;
case PM_POST_SUSPEND:
case PM_POST_HIBERNATION:
- cpu_hotplug_enable_after_thaw();
+ cpu_hotplug_enable();
break;
default:
diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c
index 8b86c0c68ed..e695c0a0bcb 100644
--- a/kernel/cpu/idle.c
+++ b/kernel/cpu/idle.c
@@ -5,6 +5,7 @@
#include <linux/cpu.h>
#include <linux/tick.h>
#include <linux/mm.h>
+#include <linux/stackprotector.h>
#include <asm/tlb.h>
@@ -40,11 +41,13 @@ __setup("hlt", cpu_idle_nopoll_setup);
static inline int cpu_idle_poll(void)
{
+ rcu_idle_enter();
trace_cpu_idle_rcuidle(0, smp_processor_id());
local_irq_enable();
while (!need_resched())
cpu_relax();
trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
+ rcu_idle_exit();
return 1;
}
@@ -56,6 +59,7 @@ void __weak arch_cpu_idle_dead(void) { }
void __weak arch_cpu_idle(void)
{
cpu_idle_force_poll = 1;
+ local_irq_enable();
}
/*
@@ -110,6 +114,21 @@ static void cpu_idle_loop(void)
void cpu_startup_entry(enum cpuhp_state state)
{
+ /*
+ * This #ifdef needs to die, but it's too late in the cycle to
+ * make this generic (arm and sh have never invoked the canary
+ * init for the non boot cpus!). Will be fixed in 3.11
+ */
+#ifdef CONFIG_X86
+ /*
+ * If we're the non-boot CPU, nothing set the stack canary up
+ * for us. The boot CPU already has it initialized but no harm
+ * in doing it again. This is a good place for updating it, as
+ * we wont ever return from this function (so the invalid
+ * canaries already on the stack wont ever trigger).
+ */
+ boot_init_stack_canary();
+#endif
current_set_polling();
arch_cpu_idle_prepare();
cpu_idle_loop();
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 6b41c1899a8..b391907d535 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -196,9 +196,6 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
static void update_context_time(struct perf_event_context *ctx);
static u64 perf_event_time(struct perf_event *event);
-static void ring_buffer_attach(struct perf_event *event,
- struct ring_buffer *rb);
-
void __weak perf_event_print_debug(void) { }
extern __weak const char *perf_pmu_name(void)
@@ -2918,6 +2915,7 @@ static void free_event_rcu(struct rcu_head *head)
}
static void ring_buffer_put(struct ring_buffer *rb);
+static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb);
static void free_event(struct perf_event *event)
{
@@ -2942,15 +2940,30 @@ static void free_event(struct perf_event *event)
if (has_branch_stack(event)) {
static_key_slow_dec_deferred(&perf_sched_events);
/* is system-wide event */
- if (!(event->attach_state & PERF_ATTACH_TASK))
+ if (!(event->attach_state & PERF_ATTACH_TASK)) {
atomic_dec(&per_cpu(perf_branch_stack_events,
event->cpu));
+ }
}
}
if (event->rb) {
- ring_buffer_put(event->rb);
- event->rb = NULL;
+ struct ring_buffer *rb;
+
+ /*
+ * Can happen when we close an event with re-directed output.
+ *
+ * Since we have a 0 refcount, perf_mmap_close() will skip
+ * over us; possibly making our ring_buffer_put() the last.
+ */
+ mutex_lock(&event->mmap_mutex);
+ rb = event->rb;
+ if (rb) {
+ rcu_assign_pointer(event->rb, NULL);
+ ring_buffer_detach(event, rb);
+ ring_buffer_put(rb); /* could be last */
+ }
+ mutex_unlock(&event->mmap_mutex);
}
if (is_cgroup_event(event))
@@ -3188,30 +3201,13 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
unsigned int events = POLL_HUP;
/*
- * Race between perf_event_set_output() and perf_poll(): perf_poll()
- * grabs the rb reference but perf_event_set_output() overrides it.
- * Here is the timeline for two threads T1, T2:
- * t0: T1, rb = rcu_dereference(event->rb)
- * t1: T2, old_rb = event->rb
- * t2: T2, event->rb = new rb
- * t3: T2, ring_buffer_detach(old_rb)
- * t4: T1, ring_buffer_attach(rb1)
- * t5: T1, poll_wait(event->waitq)
- *
- * To avoid this problem, we grab mmap_mutex in perf_poll()
- * thereby ensuring that the assignment of the new ring buffer
- * and the detachment of the old buffer appear atomic to perf_poll()
+ * Pin the event->rb by taking event->mmap_mutex; otherwise
+ * perf_event_set_output() can swizzle our rb and make us miss wakeups.
*/
mutex_lock(&event->mmap_mutex);
-
- rcu_read_lock();
- rb = rcu_dereference(event->rb);
- if (rb) {
- ring_buffer_attach(event, rb);
+ rb = event->rb;
+ if (rb)
events = atomic_xchg(&rb->poll, 0);
- }
- rcu_read_unlock();
-
mutex_unlock(&event->mmap_mutex);
poll_wait(file, &event->waitq, wait);
@@ -3521,16 +3517,12 @@ static void ring_buffer_attach(struct perf_event *event,
return;
spin_lock_irqsave(&rb->event_lock, flags);
- if (!list_empty(&event->rb_entry))
- goto unlock;
-
- list_add(&event->rb_entry, &rb->event_list);
-unlock:
+ if (list_empty(&event->rb_entry))
+ list_add(&event->rb_entry, &rb->event_list);
spin_unlock_irqrestore(&rb->event_lock, flags);
}
-static void ring_buffer_detach(struct perf_event *event,
- struct ring_buffer *rb)
+static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb)
{
unsigned long flags;
@@ -3549,13 +3541,10 @@ static void ring_buffer_wakeup(struct perf_event *event)
rcu_read_lock();
rb = rcu_dereference(event->rb);
- if (!rb)
- goto unlock;
-
- list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
- wake_up_all(&event->waitq);
-
-unlock:
+ if (rb) {
+ list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
+ wake_up_all(&event->waitq);
+ }
rcu_read_unlock();
}
@@ -3584,18 +3573,10 @@ static struct ring_buffer *ring_buffer_get(struct perf_event *event)
static void ring_buffer_put(struct ring_buffer *rb)
{
- struct perf_event *event, *n;
- unsigned long flags;
-
if (!atomic_dec_and_test(&rb->refcount))
return;
- spin_lock_irqsave(&rb->event_lock, flags);
- list_for_each_entry_safe(event, n, &rb->event_list, rb_entry) {
- list_del_init(&event->rb_entry);
- wake_up_all(&event->waitq);
- }
- spin_unlock_irqrestore(&rb->event_lock, flags);
+ WARN_ON_ONCE(!list_empty(&rb->event_list));
call_rcu(&rb->rcu_head, rb_free_rcu);
}
@@ -3605,26 +3586,100 @@ static void perf_mmap_open(struct vm_area_struct *vma)
struct perf_event *event = vma->vm_file->private_data;
atomic_inc(&event->mmap_count);
+ atomic_inc(&event->rb->mmap_count);
}
+/*
+ * A buffer can be mmap()ed multiple times; either directly through the same
+ * event, or through other events by use of perf_event_set_output().
+ *
+ * In order to undo the VM accounting done by perf_mmap() we need to destroy
+ * the buffer here, where we still have a VM context. This means we need
+ * to detach all events redirecting to us.
+ */
static void perf_mmap_close(struct vm_area_struct *vma)
{
struct perf_event *event = vma->vm_file->private_data;
- if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
- unsigned long size = perf_data_size(event->rb);
- struct user_struct *user = event->mmap_user;
- struct ring_buffer *rb = event->rb;
+ struct ring_buffer *rb = event->rb;
+ struct user_struct *mmap_user = rb->mmap_user;
+ int mmap_locked = rb->mmap_locked;
+ unsigned long size = perf_data_size(rb);
+
+ atomic_dec(&rb->mmap_count);
+
+ if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
+ return;
+
+ /* Detach current event from the buffer. */
+ rcu_assign_pointer(event->rb, NULL);
+ ring_buffer_detach(event, rb);
+ mutex_unlock(&event->mmap_mutex);
- atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
- vma->vm_mm->pinned_vm -= event->mmap_locked;
- rcu_assign_pointer(event->rb, NULL);
- ring_buffer_detach(event, rb);
+ /* If there's still other mmap()s of this buffer, we're done. */
+ if (atomic_read(&rb->mmap_count)) {
+ ring_buffer_put(rb); /* can't be last */
+ return;
+ }
+
+ /*
+ * No other mmap()s, detach from all other events that might redirect
+ * into the now unreachable buffer. Somewhat complicated by the
+ * fact that rb::event_lock otherwise nests inside mmap_mutex.
+ */
+again:
+ rcu_read_lock();
+ list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
+ if (!atomic_long_inc_not_zero(&event->refcount)) {
+ /*
+ * This event is en-route to free_event() which will
+ * detach it and remove it from the list.
+ */
+ continue;
+ }
+ rcu_read_unlock();
+
+ mutex_lock(&event->mmap_mutex);
+ /*
+ * Check we didn't race with perf_event_set_output() which can
+ * swizzle the rb from under us while we were waiting to
+ * acquire mmap_mutex.
+ *
+ * If we find a different rb; ignore this event, a next
+ * iteration will no longer find it on the list. We have to
+ * still restart the iteration to make sure we're not now
+ * iterating the wrong list.
+ */
+ if (event->rb == rb) {
+ rcu_assign_pointer(event->rb, NULL);
+ ring_buffer_detach(event, rb);
+ ring_buffer_put(rb); /* can't be last, we still have one */
+ }
mutex_unlock(&event->mmap_mutex);
+ put_event(event);
- ring_buffer_put(rb);
- free_uid(user);
+ /*
+ * Restart the iteration; either we're on the wrong list or
+ * destroyed its integrity by doing a deletion.
+ */
+ goto again;
}
+ rcu_read_unlock();
+
+ /*
+ * It could be there's still a few 0-ref events on the list; they'll
+ * get cleaned up by free_event() -- they'll also still have their
+ * ref on the rb and will free it whenever they are done with it.
+ *
+ * Aside from that, this buffer is 'fully' detached and unmapped,
+ * undo the VM accounting.
+ */
+
+ atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
+ vma->vm_mm->pinned_vm -= mmap_locked;
+ free_uid(mmap_user);
+
+ ring_buffer_put(rb); /* could be last */
}
static const struct vm_operations_struct perf_mmap_vmops = {
@@ -3674,12 +3729,24 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
return -EINVAL;
WARN_ON_ONCE(event->ctx->parent_ctx);
+again:
mutex_lock(&event->mmap_mutex);
if (event->rb) {
- if (event->rb->nr_pages == nr_pages)
- atomic_inc(&event->rb->refcount);
- else
+ if (event->rb->nr_pages != nr_pages) {
ret = -EINVAL;
+ goto unlock;
+ }
+
+ if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
+ /*
+ * Raced against perf_mmap_close() through
+ * perf_event_set_output(). Try again, hope for better
+ * luck.
+ */
+ mutex_unlock(&event->mmap_mutex);
+ goto again;
+ }
+
goto unlock;
}
@@ -3720,12 +3787,16 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
ret = -ENOMEM;
goto unlock;
}
- rcu_assign_pointer(event->rb, rb);
+
+ atomic_set(&rb->mmap_count, 1);
+ rb->mmap_locked = extra;
+ rb->mmap_user = get_current_user();
atomic_long_add(user_extra, &user->locked_vm);
- event->mmap_locked = extra;
- event->mmap_user = get_current_user();
- vma->vm_mm->pinned_vm += event->mmap_locked;
+ vma->vm_mm->pinned_vm += extra;
+
+ ring_buffer_attach(event, rb);
+ rcu_assign_pointer(event->rb, rb);
perf_event_update_userpage(event);
@@ -3734,7 +3805,11 @@ unlock:
atomic_inc(&event->mmap_count);
mutex_unlock(&event->mmap_mutex);
- vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
+ /*
+ * Since pinned accounting is per vm we cannot allow fork() to copy our
+ * vma.
+ */
+ vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
vma->vm_ops = &perf_mmap_vmops;
return ret;
@@ -4394,6 +4469,64 @@ perf_event_read_event(struct perf_event *event,
perf_output_end(&handle);
}
+typedef int (perf_event_aux_match_cb)(struct perf_event *event, void *data);
+typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data);
+
+static void
+perf_event_aux_ctx(struct perf_event_context *ctx,
+ perf_event_aux_match_cb match,
+ perf_event_aux_output_cb output,
+ void *data)
+{
+ struct perf_event *event;
+
+ list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
+ if (event->state < PERF_EVENT_STATE_INACTIVE)
+ continue;
+ if (!event_filter_match(event))
+ continue;
+ if (match(event, data))
+ output(event, data);
+ }
+}
+
+static void
+perf_event_aux(perf_event_aux_match_cb match,
+ perf_event_aux_output_cb output,
+ void *data,
+ struct perf_event_context *task_ctx)
+{
+ struct perf_cpu_context *cpuctx;
+ struct perf_event_context *ctx;
+ struct pmu *pmu;
+ int ctxn;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(pmu, &pmus, entry) {
+ cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
+ if (cpuctx->unique_pmu != pmu)
+ goto next;
+ perf_event_aux_ctx(&cpuctx->ctx, match, output, data);
+ if (task_ctx)
+ goto next;
+ ctxn = pmu->task_ctx_nr;
+ if (ctxn < 0)
+ goto next;
+ ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
+ if (ctx)
+ perf_event_aux_ctx(ctx, match, output, data);
+next:
+ put_cpu_ptr(pmu->pmu_cpu_context);
+ }
+
+ if (task_ctx) {
+ preempt_disable();
+ perf_event_aux_ctx(task_ctx, match, output, data);
+ preempt_enable();
+ }
+ rcu_read_unlock();
+}
+
/*
* task tracking -- fork/exit
*
@@ -4416,8 +4549,9 @@ struct perf_task_event {
};
static void perf_event_task_output(struct perf_event *event,
- struct perf_task_event *task_event)
+ void *data)
{
+ struct perf_task_event *task_event = data;
struct perf_output_handle handle;
struct perf_sample_data sample;
struct task_struct *task = task_event->task;
@@ -4445,62 +4579,11 @@ out:
task_event->event_id.header.size = size;
}
-static int perf_event_task_match(struct perf_event *event)
-{
- if (event->state < PERF_EVENT_STATE_INACTIVE)
- return 0;
-
- if (!event_filter_match(event))
- return 0;
-
- if (event->attr.comm || event->attr.mmap ||
- event->attr.mmap_data || event->attr.task)
- return 1;
-
- return 0;
-}
-
-static void perf_event_task_ctx(struct perf_event_context *ctx,
- struct perf_task_event *task_event)
+static int perf_event_task_match(struct perf_event *event,
+ void *data __maybe_unused)
{
- struct perf_event *event;
-
- list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
- if (perf_event_task_match(event))
- perf_event_task_output(event, task_event);
- }
-}
-
-static void perf_event_task_event(struct perf_task_event *task_event)
-{
- struct perf_cpu_context *cpuctx;
- struct perf_event_context *ctx;
- struct pmu *pmu;
- int ctxn;
-
- rcu_read_lock();
- list_for_each_entry_rcu(pmu, &pmus, entry) {
- cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
- if (cpuctx->unique_pmu != pmu)
- goto next;
- perf_event_task_ctx(&cpuctx->ctx, task_event);
-
- ctx = task_event->task_ctx;
- if (!ctx) {
- ctxn = pmu->task_ctx_nr;
- if (ctxn < 0)
- goto next;
- ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
- if (ctx)
- perf_event_task_ctx(ctx, task_event);
- }
-next:
- put_cpu_ptr(pmu->pmu_cpu_context);
- }
- if (task_event->task_ctx)
- perf_event_task_ctx(task_event->task_ctx, task_event);
-
- rcu_read_unlock();
+ return event->attr.comm || event->attr.mmap ||
+ event->attr.mmap_data || event->attr.task;
}
static void perf_event_task(struct task_struct *task,
@@ -4531,7 +4614,10 @@ static void perf_event_task(struct task_struct *task,
},
};
- perf_event_task_event(&task_event);
+ perf_event_aux(perf_event_task_match,
+ perf_event_task_output,
+ &task_event,
+ task_ctx);
}
void perf_event_fork(struct task_struct *task)
@@ -4557,8 +4643,9 @@ struct perf_comm_event {
};
static void perf_event_comm_output(struct perf_event *event,
- struct perf_comm_event *comm_event)
+ void *data)
{
+ struct perf_comm_event *comm_event = data;
struct perf_output_handle handle;
struct perf_sample_data sample;
int size = comm_event->event_id.header.size;
@@ -4585,39 +4672,16 @@ out:
comm_event->event_id.header.size = size;
}
-static int perf_event_comm_match(struct perf_event *event)
+static int perf_event_comm_match(struct perf_event *event,
+ void *data __maybe_unused)
{
- if (event->state < PERF_EVENT_STATE_INACTIVE)
- return 0;
-
- if (!event_filter_match(event))
- return 0;
-
- if (event->attr.comm)
- return 1;
-
- return 0;
-}
-
-static void perf_event_comm_ctx(struct perf_event_context *ctx,
- struct perf_comm_event *comm_event)
-{
- struct perf_event *event;
-
- list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
- if (perf_event_comm_match(event))
- perf_event_comm_output(event, comm_event);
- }
+ return event->attr.comm;
}
static void perf_event_comm_event(struct perf_comm_event *comm_event)
{
- struct perf_cpu_context *cpuctx;
- struct perf_event_context *ctx;
char comm[TASK_COMM_LEN];
unsigned int size;
- struct pmu *pmu;
- int ctxn;
memset(comm, 0, sizeof(comm));
strlcpy(comm, comm_event->task->comm, sizeof(comm));
@@ -4627,24 +4691,11 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
comm_event->comm_size = size;
comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
- rcu_read_lock();
- list_for_each_entry_rcu(pmu, &pmus, entry) {
- cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
- if (cpuctx->unique_pmu != pmu)
- goto next;
- perf_event_comm_ctx(&cpuctx->ctx, comm_event);
- ctxn = pmu->task_ctx_nr;
- if (ctxn < 0)
- goto next;
-
- ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
- if (ctx)
- perf_event_comm_ctx(ctx, comm_event);
-next:
- put_cpu_ptr(pmu->pmu_cpu_context);
- }
- rcu_read_unlock();
+ perf_event_aux(perf_event_comm_match,
+ perf_event_comm_output,
+ comm_event,
+ NULL);
}
void perf_event_comm(struct task_struct *task)
@@ -4706,8 +4757,9 @@ struct perf_mmap_event {
};
static void perf_event_mmap_output(struct perf_event *event,
- struct perf_mmap_event *mmap_event)
+ void *data)
{
+ struct perf_mmap_event *mmap_event = data;
struct perf_output_handle handle;
struct perf_sample_data sample;
int size = mmap_event->event_id.header.size;
@@ -4734,46 +4786,24 @@ out:
}
static int perf_event_mmap_match(struct perf_event *event,
- struct perf_mmap_event *mmap_event,
- int executable)
+ void *data)
{
- if (event->state < PERF_EVENT_STATE_INACTIVE)
- return 0;
-
- if (!event_filter_match(event))
- return 0;
-
- if ((!executable && event->attr.mmap_data) ||
- (executable && event->attr.mmap))
- return 1;
-
- return 0;
-}
-
-static void perf_event_mmap_ctx(struct perf_event_context *ctx,
- struct perf_mmap_event *mmap_event,
- int executable)
-{
- struct perf_event *event;
+ struct perf_mmap_event *mmap_event = data;
+ struct vm_area_struct *vma = mmap_event->vma;
+ int executable = vma->vm_flags & VM_EXEC;
- list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
- if (perf_event_mmap_match(event, mmap_event, executable))
- perf_event_mmap_output(event, mmap_event);
- }
+ return (!executable && event->attr.mmap_data) ||
+ (executable && event->attr.mmap);
}
static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
{
- struct perf_cpu_context *cpuctx;
- struct perf_event_context *ctx;
struct vm_area_struct *vma = mmap_event->vma;
struct file *file = vma->vm_file;
unsigned int size;
char tmp[16];
char *buf = NULL;
const char *name;
- struct pmu *pmu;
- int ctxn;
memset(tmp, 0, sizeof(tmp));
@@ -4829,27 +4859,10 @@ got_name:
mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
- rcu_read_lock();
- list_for_each_entry_rcu(pmu, &pmus, entry) {
- cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
- if (cpuctx->unique_pmu != pmu)
- goto next;
- perf_event_mmap_ctx(&cpuctx->ctx, mmap_event,
- vma->vm_flags & VM_EXEC);
-
- ctxn = pmu->task_ctx_nr;
- if (ctxn < 0)
- goto next;
-
- ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
- if (ctx) {
- perf_event_mmap_ctx(ctx, mmap_event,
- vma->vm_flags & VM_EXEC);
- }
-next:
- put_cpu_ptr(pmu->pmu_cpu_context);
- }
- rcu_read_unlock();
+ perf_event_aux(perf_event_mmap_match,
+ perf_event_mmap_output,
+ mmap_event,
+ NULL);
kfree(buf);
}
@@ -6474,6 +6487,8 @@ set:
if (atomic_read(&event->mmap_count))
goto unlock;
+ old_rb = event->rb;
+
if (output_event) {
/* get the rb we want to redirect to */
rb = ring_buffer_get(output_event);
@@ -6481,16 +6496,28 @@ set:
goto unlock;
}
- old_rb = event->rb;
- rcu_assign_pointer(event->rb, rb);
if (old_rb)
ring_buffer_detach(event, old_rb);
+
+ if (rb)
+ ring_buffer_attach(event, rb);
+
+ rcu_assign_pointer(event->rb, rb);
+
+ if (old_rb) {
+ ring_buffer_put(old_rb);
+ /*
+ * Since we detached before setting the new rb, so that we
+ * could attach the new rb, we could have missed a wakeup.
+ * Provide it now.
+ */
+ wake_up_all(&event->waitq);
+ }
+
ret = 0;
unlock:
mutex_unlock(&event->mmap_mutex);
- if (old_rb)
- ring_buffer_put(old_rb);
out:
return ret;
}
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index a64f8aeb5c1..20185ea64aa 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -120,7 +120,7 @@ static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type)
list_for_each_entry(iter, &bp_task_head, hw.bp_list) {
if (iter->hw.bp_target == tsk &&
find_slot_idx(iter) == type &&
- cpu == iter->cpu)
+ (iter->cpu < 0 || cpu == iter->cpu))
count += hw_breakpoint_weight(iter);
}
@@ -149,7 +149,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
return;
}
- for_each_online_cpu(cpu) {
+ for_each_possible_cpu(cpu) {
unsigned int nr;
nr = per_cpu(nr_cpu_bp_pinned[type], cpu);
@@ -235,7 +235,7 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
if (cpu >= 0) {
toggle_bp_task_slot(bp, cpu, enable, type, weight);
} else {
- for_each_online_cpu(cpu)
+ for_each_possible_cpu(cpu)
toggle_bp_task_slot(bp, cpu, enable, type, weight);
}
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index eb675c4d59d..ca6599723be 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -31,6 +31,10 @@ struct ring_buffer {
spinlock_t event_lock;
struct list_head event_list;
+ atomic_t mmap_count;
+ unsigned long mmap_locked;
+ struct user_struct *mmap_user;
+
struct perf_event_mmap_page *user_page;
void *data_pages[0];
};
diff --git a/kernel/exit.c b/kernel/exit.c
index af2eb3cbd49..7bb73f9d09d 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -649,7 +649,6 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
* jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
*/
forget_original_parent(tsk);
- exit_task_namespaces(tsk);
write_lock_irq(&tasklist_lock);
if (group_dead)
@@ -795,6 +794,7 @@ void do_exit(long code)
exit_shm(tsk);
exit_files(tsk);
exit_fs(tsk);
+ exit_task_namespaces(tsk);
exit_task_work(tsk);
check_stack_usage();
exit_thread();
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 5a83dde8ca0..54a4d522323 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -143,7 +143,10 @@ static unsigned int irq_domain_legacy_revmap(struct irq_domain *domain,
* irq_domain_add_simple() - Allocate and register a simple irq_domain.
* @of_node: pointer to interrupt controller's device tree node.
* @size: total number of irqs in mapping
- * @first_irq: first number of irq block assigned to the domain
+ * @first_irq: first number of irq block assigned to the domain,
+ * pass zero to assign irqs on-the-fly. This will result in a
+ * linear IRQ domain so it is important to use irq_create_mapping()
+ * for each used IRQ, especially when SPARSE_IRQ is enabled.
* @ops: map/unmap domain callbacks
* @host_data: Controller private data pointer
*
@@ -191,6 +194,7 @@ struct irq_domain *irq_domain_add_simple(struct device_node *of_node,
/* A linear domain is the default */
return irq_domain_add_linear(of_node, size, ops, host_data);
}
+EXPORT_SYMBOL_GPL(irq_domain_add_simple);
/**
* irq_domain_add_legacy() - Allocate and register a legacy revmap irq_domain.
@@ -397,11 +401,12 @@ static void irq_domain_disassociate_many(struct irq_domain *domain,
while (count--) {
int irq = irq_base + count;
struct irq_data *irq_data = irq_get_irq_data(irq);
- irq_hw_number_t hwirq = irq_data->hwirq;
+ irq_hw_number_t hwirq;
if (WARN_ON(!irq_data || irq_data->domain != domain))
continue;
+ hwirq = irq_data->hwirq;
irq_set_status_flags(irq, IRQ_NOREQUEST);
/* remove chip and handler */
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 1296e72e416..8241906c4b6 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -569,6 +569,11 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
int retval = 0;
helper_lock();
+ if (!sub_info->path) {
+ retval = -EINVAL;
+ goto out;
+ }
+
if (sub_info->path[0] == '\0')
goto out;
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 3fed7f0cbcd..bddf3b201a4 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -467,6 +467,7 @@ static struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr)
/* Optimization staging list, protected by kprobe_mutex */
static LIST_HEAD(optimizing_list);
static LIST_HEAD(unoptimizing_list);
+static LIST_HEAD(freeing_list);
static void kprobe_optimizer(struct work_struct *work);
static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer);
@@ -504,7 +505,7 @@ static __kprobes void do_optimize_kprobes(void)
* Unoptimize (replace a jump with a breakpoint and remove the breakpoint
* if need) kprobes listed on unoptimizing_list.
*/
-static __kprobes void do_unoptimize_kprobes(struct list_head *free_list)
+static __kprobes void do_unoptimize_kprobes(void)
{
struct optimized_kprobe *op, *tmp;
@@ -515,9 +516,9 @@ static __kprobes void do_unoptimize_kprobes(struct list_head *free_list)
/* Ditto to do_optimize_kprobes */
get_online_cpus();
mutex_lock(&text_mutex);
- arch_unoptimize_kprobes(&unoptimizing_list, free_list);
+ arch_unoptimize_kprobes(&unoptimizing_list, &freeing_list);
/* Loop free_list for disarming */
- list_for_each_entry_safe(op, tmp, free_list, list) {
+ list_for_each_entry_safe(op, tmp, &freeing_list, list) {
/* Disarm probes if marked disabled */
if (kprobe_disabled(&op->kp))
arch_disarm_kprobe(&op->kp);
@@ -536,11 +537,11 @@ static __kprobes void do_unoptimize_kprobes(struct list_head *free_list)
}
/* Reclaim all kprobes on the free_list */
-static __kprobes void do_free_cleaned_kprobes(struct list_head *free_list)
+static __kprobes void do_free_cleaned_kprobes(void)
{
struct optimized_kprobe *op, *tmp;
- list_for_each_entry_safe(op, tmp, free_list, list) {
+ list_for_each_entry_safe(op, tmp, &freeing_list, list) {
BUG_ON(!kprobe_unused(&op->kp));
list_del_init(&op->list);
free_aggr_kprobe(&op->kp);
@@ -556,8 +557,6 @@ static __kprobes void kick_kprobe_optimizer(void)
/* Kprobe jump optimizer */
static __kprobes void kprobe_optimizer(struct work_struct *work)
{
- LIST_HEAD(free_list);
-
mutex_lock(&kprobe_mutex);
/* Lock modules while optimizing kprobes */
mutex_lock(&module_mutex);
@@ -566,7 +565,7 @@ static __kprobes void kprobe_optimizer(struct work_struct *work)
* Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed)
* kprobes before waiting for quiesence period.
*/
- do_unoptimize_kprobes(&free_list);
+ do_unoptimize_kprobes();
/*
* Step 2: Wait for quiesence period to ensure all running interrupts
@@ -581,7 +580,7 @@ static __kprobes void kprobe_optimizer(struct work_struct *work)
do_optimize_kprobes();
/* Step 4: Free cleaned kprobes after quiesence period */
- do_free_cleaned_kprobes(&free_list);
+ do_free_cleaned_kprobes();
mutex_unlock(&module_mutex);
mutex_unlock(&kprobe_mutex);
@@ -723,8 +722,19 @@ static void __kprobes kill_optimized_kprobe(struct kprobe *p)
if (!list_empty(&op->list))
/* Dequeue from the (un)optimization queue */
list_del_init(&op->list);
-
op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
+
+ if (kprobe_unused(p)) {
+ /* Enqueue if it is unused */
+ list_add(&op->list, &freeing_list);
+ /*
+ * Remove unused probes from the hash list. After waiting
+ * for synchronization, this probe is reclaimed.
+ * (reclaiming is done by do_free_cleaned_kprobes().)
+ */
+ hlist_del_rcu(&op->kp.hlist);
+ }
+
/* Don't touch the code, because it is already freed. */
arch_remove_optimized_kprobe(op);
}
diff --git a/kernel/module.c b/kernel/module.c
index b049939177f..cab4bce49c2 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2431,10 +2431,10 @@ static void kmemleak_load_module(const struct module *mod,
kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL);
for (i = 1; i < info->hdr->e_shnum; i++) {
- const char *name = info->secstrings + info->sechdrs[i].sh_name;
- if (!(info->sechdrs[i].sh_flags & SHF_ALLOC))
- continue;
- if (!strstarts(name, ".data") && !strstarts(name, ".bss"))
+ /* Scan all writable sections that are not executable */
+ if (!(info->sechdrs[i].sh_flags & SHF_ALLOC) ||
+ !(info->sechdrs[i].sh_flags & SHF_WRITE) ||
+ (info->sechdrs[i].sh_flags & SHF_EXECINSTR))
continue;
kmemleak_scan_area((void *)info->sechdrs[i].sh_addr,
@@ -2769,24 +2769,11 @@ static void find_module_sections(struct module *mod, struct load_info *info)
mod->trace_events = section_objs(info, "_ftrace_events",
sizeof(*mod->trace_events),
&mod->num_trace_events);
- /*
- * This section contains pointers to allocated objects in the trace
- * code and not scanning it leads to false positives.
- */
- kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) *
- mod->num_trace_events, GFP_KERNEL);
#endif
#ifdef CONFIG_TRACING
mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt",
sizeof(*mod->trace_bprintk_fmt_start),
&mod->num_trace_bprintk_fmt);
- /*
- * This section contains pointers to allocated objects in the trace
- * code and not scanning it leads to false positives.
- */
- kmemleak_scan_area(mod->trace_bprintk_fmt_start,
- sizeof(*mod->trace_bprintk_fmt_start) *
- mod->num_trace_bprintk_fmt, GFP_KERNEL);
#endif
#ifdef CONFIG_FTRACE_MCOUNT_RECORD
/* sechdrs[0].sh_size is always zero */
diff --git a/kernel/printk.c b/kernel/printk.c
index fa36e149442..8212c1aef12 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -363,6 +363,53 @@ static void log_store(int facility, int level,
log_next_seq++;
}
+#ifdef CONFIG_SECURITY_DMESG_RESTRICT
+int dmesg_restrict = 1;
+#else
+int dmesg_restrict;
+#endif
+
+static int syslog_action_restricted(int type)
+{
+ if (dmesg_restrict)
+ return 1;
+ /*
+ * Unless restricted, we allow "read all" and "get buffer size"
+ * for everybody.
+ */
+ return type != SYSLOG_ACTION_READ_ALL &&
+ type != SYSLOG_ACTION_SIZE_BUFFER;
+}
+
+static int check_syslog_permissions(int type, bool from_file)
+{
+ /*
+ * If this is from /proc/kmsg and we've already opened it, then we've
+ * already done the capabilities checks at open time.
+ */
+ if (from_file && type != SYSLOG_ACTION_OPEN)
+ return 0;
+
+ if (syslog_action_restricted(type)) {
+ if (capable(CAP_SYSLOG))
+ return 0;
+ /*
+ * For historical reasons, accept CAP_SYS_ADMIN too, with
+ * a warning.
+ */
+ if (capable(CAP_SYS_ADMIN)) {
+ pr_warn_once("%s (%d): Attempt to access syslog with "
+ "CAP_SYS_ADMIN but no CAP_SYSLOG "
+ "(deprecated).\n",
+ current->comm, task_pid_nr(current));
+ return 0;
+ }
+ return -EPERM;
+ }
+ return security_syslog(type);
+}
+
+
/* /dev/kmsg - userspace message inject/listen interface */
struct devkmsg_user {
u64 seq;
@@ -620,7 +667,8 @@ static int devkmsg_open(struct inode *inode, struct file *file)
if ((file->f_flags & O_ACCMODE) == O_WRONLY)
return 0;
- err = security_syslog(SYSLOG_ACTION_READ_ALL);
+ err = check_syslog_permissions(SYSLOG_ACTION_READ_ALL,
+ SYSLOG_FROM_READER);
if (err)
return err;
@@ -813,45 +861,6 @@ static inline void boot_delay_msec(int level)
}
#endif
-#ifdef CONFIG_SECURITY_DMESG_RESTRICT
-int dmesg_restrict = 1;
-#else
-int dmesg_restrict;
-#endif
-
-static int syslog_action_restricted(int type)
-{
- if (dmesg_restrict)
- return 1;
- /* Unless restricted, we allow "read all" and "get buffer size" for everybody */
- return type != SYSLOG_ACTION_READ_ALL && type != SYSLOG_ACTION_SIZE_BUFFER;
-}
-
-static int check_syslog_permissions(int type, bool from_file)
-{
- /*
- * If this is from /proc/kmsg and we've already opened it, then we've
- * already done the capabilities checks at open time.
- */
- if (from_file && type != SYSLOG_ACTION_OPEN)
- return 0;
-
- if (syslog_action_restricted(type)) {
- if (capable(CAP_SYSLOG))
- return 0;
- /* For historical reasons, accept CAP_SYS_ADMIN too, with a warning */
- if (capable(CAP_SYS_ADMIN)) {
- printk_once(KERN_WARNING "%s (%d): "
- "Attempt to access syslog with CAP_SYS_ADMIN "
- "but no CAP_SYSLOG (deprecated).\n",
- current->comm, task_pid_nr(current));
- return 0;
- }
- return -EPERM;
- }
- return 0;
-}
-
#if defined(CONFIG_PRINTK_TIME)
static bool printk_time = 1;
#else
@@ -1249,7 +1258,7 @@ out:
SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
{
- return do_syslog(type, buf, len, SYSLOG_FROM_CALL);
+ return do_syslog(type, buf, len, SYSLOG_FROM_READER);
}
/*
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index aed981a3f69..335a7ae697f 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -665,20 +665,22 @@ static int ptrace_peek_siginfo(struct task_struct *child,
if (unlikely(is_compat_task())) {
compat_siginfo_t __user *uinfo = compat_ptr(data);
- ret = copy_siginfo_to_user32(uinfo, &info);
- ret |= __put_user(info.si_code, &uinfo->si_code);
+ if (copy_siginfo_to_user32(uinfo, &info) ||
+ __put_user(info.si_code, &uinfo->si_code)) {
+ ret = -EFAULT;
+ break;
+ }
+
} else
#endif
{
siginfo_t __user *uinfo = (siginfo_t __user *) data;
- ret = copy_siginfo_to_user(uinfo, &info);
- ret |= __put_user(info.si_code, &uinfo->si_code);
- }
-
- if (ret) {
- ret = -EFAULT;
- break;
+ if (copy_siginfo_to_user(uinfo, &info) ||
+ __put_user(info.si_code, &uinfo->si_code)) {
+ ret = -EFAULT;
+ break;
+ }
}
data += sizeof(siginfo_t);
diff --git a/kernel/range.c b/kernel/range.c
index 071b0ab455c..322ea8e93e4 100644
--- a/kernel/range.c
+++ b/kernel/range.c
@@ -4,7 +4,7 @@
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/sort.h>
-
+#include <linux/string.h>
#include <linux/range.h>
int add_range(struct range *range, int az, int nr_range, u64 start, u64 end)
@@ -32,9 +32,8 @@ int add_range_with_merge(struct range *range, int az, int nr_range,
if (start >= end)
return nr_range;
- /* Try to merge it with old one: */
+ /* get new start/end: */
for (i = 0; i < nr_range; i++) {
- u64 final_start, final_end;
u64 common_start, common_end;
if (!range[i].end)
@@ -45,12 +44,16 @@ int add_range_with_merge(struct range *range, int az, int nr_range,
if (common_start > common_end)
continue;
- final_start = min(range[i].start, start);
- final_end = max(range[i].end, end);
+ /* new start/end, will add it back at last */
+ start = min(range[i].start, start);
+ end = max(range[i].end, end);
- range[i].start = final_start;
- range[i].end = final_end;
- return nr_range;
+ memmove(&range[i], &range[i + 1],
+ (nr_range - (i + 1)) * sizeof(range[i]));
+ range[nr_range - 1].start = 0;
+ range[nr_range - 1].end = 0;
+ nr_range--;
+ i--;
}
/* Need to add it: */
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 16ea6792501..35380019f0f 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -1451,9 +1451,9 @@ static int rcu_gp_init(struct rcu_state *rsp)
rnp->grphi, rnp->qsmask);
raw_spin_unlock_irq(&rnp->lock);
#ifdef CONFIG_PROVE_RCU_DELAY
- if ((prandom_u32() % (rcu_num_nodes * 8)) == 0 &&
+ if ((prandom_u32() % (rcu_num_nodes + 1)) == 0 &&
system_state == SYSTEM_RUNNING)
- schedule_timeout_uninterruptible(2);
+ udelay(200);
#endif /* #ifdef CONFIG_PROVE_RCU_DELAY */
cond_resched();
}
@@ -1613,6 +1613,14 @@ static int __noreturn rcu_gp_kthread(void *arg)
}
}
+static void rsp_wakeup(struct irq_work *work)
+{
+ struct rcu_state *rsp = container_of(work, struct rcu_state, wakeup_work);
+
+ /* Wake up rcu_gp_kthread() to start the grace period. */
+ wake_up(&rsp->gp_wq);
+}
+
/*
* Start a new RCU grace period if warranted, re-initializing the hierarchy
* in preparation for detecting the next grace period. The caller must hold
@@ -1637,8 +1645,12 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
}
rsp->gp_flags = RCU_GP_FLAG_INIT;
- /* Wake up rcu_gp_kthread() to start the grace period. */
- wake_up(&rsp->gp_wq);
+ /*
+ * We can't do wakeups while holding the rnp->lock, as that
+ * could cause possible deadlocks with the rq->lock. Defer
+ * the wakeup to interrupt context.
+ */
+ irq_work_queue(&rsp->wakeup_work);
}
/*
@@ -3235,6 +3247,7 @@ static void __init rcu_init_one(struct rcu_state *rsp,
rsp->rda = rda;
init_waitqueue_head(&rsp->gp_wq);
+ init_irq_work(&rsp->wakeup_work, rsp_wakeup);
rnp = rsp->level[rcu_num_lvls - 1];
for_each_possible_cpu(i) {
while (i > rnp->grphi)
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index da77a8f57ff..4df503470e4 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -27,6 +27,7 @@
#include <linux/threads.h>
#include <linux/cpumask.h>
#include <linux/seqlock.h>
+#include <linux/irq_work.h>
/*
* Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and
@@ -442,6 +443,7 @@ struct rcu_state {
char *name; /* Name of structure. */
char abbr; /* Abbreviated name. */
struct list_head flavors; /* List of RCU flavors. */
+ struct irq_work wakeup_work; /* Postponed wakeups */
};
/* Values for rcu_state structure's gp_flags field. */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 170814dc418..3db5a375d8d 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -88,7 +88,7 @@ static void __init rcu_bootup_announce_oddness(void)
#ifdef CONFIG_RCU_NOCB_CPU
#ifndef CONFIG_RCU_NOCB_CPU_NONE
if (!have_rcu_nocb_mask) {
- alloc_bootmem_cpumask_var(&rcu_nocb_mask);
+ zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL);
have_rcu_nocb_mask = true;
}
#ifdef CONFIG_RCU_NOCB_CPU_ZERO
@@ -1667,7 +1667,7 @@ int rcu_needs_cpu(int cpu, unsigned long *dj)
rdtp->last_accelerate = jiffies;
/* Request timer delay depending on laziness, and round. */
- if (rdtp->all_lazy) {
+ if (!rdtp->all_lazy) {
*dj = round_up(rcu_idle_gp_delay + jiffies,
rcu_idle_gp_delay) - jiffies;
} else {
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 58453b8272f..e8b335016c5 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -633,7 +633,19 @@ void wake_up_nohz_cpu(int cpu)
static inline bool got_nohz_idle_kick(void)
{
int cpu = smp_processor_id();
- return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
+
+ if (!test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)))
+ return false;
+
+ if (idle_cpu(cpu) && !need_resched())
+ return true;
+
+ /*
+ * We can't run Idle Load Balance on this CPU at this time, so we
+ * cancel it and clear NOHZ_BALANCE_KICK
+ */
+ clear_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
+ return false;
}
#else /* CONFIG_NO_HZ_COMMON */
@@ -1393,8 +1405,9 @@ static void sched_ttwu_pending(void)
void scheduler_ipi(void)
{
- if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()
- && !tick_nohz_full_cpu(smp_processor_id()))
+ if (llist_empty(&this_rq()->wake_list)
+ && !tick_nohz_full_cpu(smp_processor_id())
+ && !got_nohz_idle_kick())
return;
/*
@@ -1417,7 +1430,7 @@ void scheduler_ipi(void)
/*
* Check if someone kicked us for doing the nohz idle load balance.
*/
- if (unlikely(got_nohz_idle_kick() && !need_resched())) {
+ if (unlikely(got_nohz_idle_kick())) {
this_rq()->idle_balance = 1;
raise_softirq_irqoff(SCHED_SOFTIRQ);
}
@@ -4745,7 +4758,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
*/
idle->sched_class = &idle_sched_class;
ftrace_graph_init_idle_task(idle, cpu);
- vtime_init_idle(idle);
+ vtime_init_idle(idle, cpu);
#if defined(CONFIG_SMP)
sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
#endif
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index cc2dc3eea8a..b5ccba22603 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -747,17 +747,17 @@ void arch_vtime_task_switch(struct task_struct *prev)
write_seqlock(&current->vtime_seqlock);
current->vtime_snap_whence = VTIME_SYS;
- current->vtime_snap = sched_clock();
+ current->vtime_snap = sched_clock_cpu(smp_processor_id());
write_sequnlock(&current->vtime_seqlock);
}
-void vtime_init_idle(struct task_struct *t)
+void vtime_init_idle(struct task_struct *t, int cpu)
{
unsigned long flags;
write_seqlock_irqsave(&t->vtime_seqlock, flags);
t->vtime_snap_whence = VTIME_SYS;
- t->vtime_snap = sched_clock();
+ t->vtime_snap = sched_clock_cpu(cpu);
write_sequnlock_irqrestore(&t->vtime_seqlock, flags);
}
diff --git a/kernel/softirq.c b/kernel/softirq.c
index b5197dcb0da..3d6833f125d 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -195,8 +195,12 @@ void local_bh_enable_ip(unsigned long ip)
EXPORT_SYMBOL(local_bh_enable_ip);
/*
- * We restart softirq processing for at most 2 ms,
- * and if need_resched() is not set.
+ * We restart softirq processing for at most MAX_SOFTIRQ_RESTART times,
+ * but break the loop if need_resched() is set or after 2 ms.
+ * The MAX_SOFTIRQ_TIME provides a nice upper bound in most cases, but in
+ * certain cases, such as stop_machine(), jiffies may cease to
+ * increment and so we need the MAX_SOFTIRQ_RESTART limit as
+ * well to make sure we eventually return from this method.
*
* These limits have been established via experimentation.
* The two things to balance is latency against fairness -
@@ -204,6 +208,7 @@ EXPORT_SYMBOL(local_bh_enable_ip);
* should not be able to lock up the box.
*/
#define MAX_SOFTIRQ_TIME msecs_to_jiffies(2)
+#define MAX_SOFTIRQ_RESTART 10
asmlinkage void __do_softirq(void)
{
@@ -212,6 +217,7 @@ asmlinkage void __do_softirq(void)
unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
int cpu;
unsigned long old_flags = current->flags;
+ int max_restart = MAX_SOFTIRQ_RESTART;
/*
* Mask out PF_MEMALLOC s current task context is borrowed for the
@@ -265,7 +271,8 @@ restart:
pending = local_softirq_pending();
if (pending) {
- if (time_before(jiffies, end) && !need_resched())
+ if (time_before(jiffies, end) && !need_resched() &&
+ --max_restart)
goto restart;
wakeup_softirqd();
diff --git a/kernel/sys.c b/kernel/sys.c
index b95d3c72ba2..2bbd9a73b54 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -362,6 +362,29 @@ int unregister_reboot_notifier(struct notifier_block *nb)
}
EXPORT_SYMBOL(unregister_reboot_notifier);
+/* Add backwards compatibility for stable trees. */
+#ifndef PF_NO_SETAFFINITY
+#define PF_NO_SETAFFINITY PF_THREAD_BOUND
+#endif
+
+static void migrate_to_reboot_cpu(void)
+{
+ /* The boot cpu is always logical cpu 0 */
+ int cpu = 0;
+
+ cpu_hotplug_disable();
+
+ /* Make certain the cpu I'm about to reboot on is online */
+ if (!cpu_online(cpu))
+ cpu = cpumask_first(cpu_online_mask);
+
+ /* Prevent races with other tasks migrating this task */
+ current->flags |= PF_NO_SETAFFINITY;
+
+ /* Make certain I only run on the appropriate processor */
+ set_cpus_allowed_ptr(current, cpumask_of(cpu));
+}
+
/**
* kernel_restart - reboot the system
* @cmd: pointer to buffer containing command to execute for restart
@@ -373,7 +396,7 @@ EXPORT_SYMBOL(unregister_reboot_notifier);
void kernel_restart(char *cmd)
{
kernel_restart_prepare(cmd);
- disable_nonboot_cpus();
+ migrate_to_reboot_cpu();
syscore_shutdown();
if (!cmd)
printk(KERN_EMERG "Restarting system.\n");
@@ -400,7 +423,7 @@ static void kernel_shutdown_prepare(enum system_states state)
void kernel_halt(void)
{
kernel_shutdown_prepare(SYSTEM_HALT);
- disable_nonboot_cpus();
+ migrate_to_reboot_cpu();
syscore_shutdown();
printk(KERN_EMERG "System halted.\n");
kmsg_dump(KMSG_DUMP_HALT);
@@ -419,7 +442,7 @@ void kernel_power_off(void)
kernel_shutdown_prepare(SYSTEM_POWER_OFF);
if (pm_power_off_prepare)
pm_power_off_prepare();
- disable_nonboot_cpus();
+ migrate_to_reboot_cpu();
syscore_shutdown();
printk(KERN_EMERG "Power down.\n");
kmsg_dump(KMSG_DUMP_POWEROFF);
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index e4c07b0692b..70f27e89012 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -12,11 +12,6 @@ config CLOCKSOURCE_WATCHDOG
config ARCH_CLOCKSOURCE_DATA
bool
-# Platforms has a persistent clock
-config ALWAYS_USE_PERSISTENT_CLOCK
- bool
- default n
-
# Timekeeping vsyscall support
config GENERIC_TIME_VSYSCALL
bool
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 12ff13a838c..8f5b3b98577 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -874,7 +874,6 @@ static void hardpps_update_phase(long error)
void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
{
struct pps_normtime pts_norm, freq_norm;
- unsigned long flags;
pts_norm = pps_normalize_ts(*phase_ts);
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 206bbfb34e0..20d6fba7065 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -511,6 +511,12 @@ again:
}
}
+ /*
+ * Remove the current cpu from the pending mask. The event is
+ * delivered immediately in tick_do_broadcast() !
+ */
+ cpumask_clear_cpu(smp_processor_id(), tick_broadcast_pending_mask);
+
/* Take care of enforced broadcast requests */
cpumask_or(tmpmask, tmpmask, tick_broadcast_force_mask);
cpumask_clear(tick_broadcast_force_mask);
@@ -575,8 +581,8 @@ void tick_broadcast_oneshot_control(unsigned long reason)
raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) {
- WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask));
if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) {
+ WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask));
clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN);
/*
* We only reprogram the broadcast timer if we
@@ -593,8 +599,6 @@ void tick_broadcast_oneshot_control(unsigned long reason)
} else {
if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) {
clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
- if (dev->next_event.tv64 == KTIME_MAX)
- goto out;
/*
* The cpu which was handling the broadcast
* timer marked this cpu in the broadcast
@@ -609,6 +613,11 @@ void tick_broadcast_oneshot_control(unsigned long reason)
goto out;
/*
+ * Bail out if there is no next event.
+ */
+ if (dev->next_event.tv64 == KTIME_MAX)
+ goto out;
+ /*
* If the pending bit is not set, then we are
* either the CPU handling the broadcast
* interrupt or we got woken by something else.
@@ -692,10 +701,6 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
bc->event_handler = tick_handle_oneshot_broadcast;
- /* Take the do_timer update */
- if (!tick_nohz_full_cpu(cpu))
- tick_do_timer_cpu = cpu;
-
/*
* We must be careful here. There might be other CPUs
* waiting for periodic broadcast. We need to set the
@@ -786,11 +791,11 @@ bool tick_broadcast_oneshot_available(void)
void __init tick_broadcast_init(void)
{
- alloc_cpumask_var(&tick_broadcast_mask, GFP_NOWAIT);
- alloc_cpumask_var(&tmpmask, GFP_NOWAIT);
+ zalloc_cpumask_var(&tick_broadcast_mask, GFP_NOWAIT);
+ zalloc_cpumask_var(&tmpmask, GFP_NOWAIT);
#ifdef CONFIG_TICK_ONESHOT
- alloc_cpumask_var(&tick_broadcast_oneshot_mask, GFP_NOWAIT);
- alloc_cpumask_var(&tick_broadcast_pending_mask, GFP_NOWAIT);
- alloc_cpumask_var(&tick_broadcast_force_mask, GFP_NOWAIT);
+ zalloc_cpumask_var(&tick_broadcast_oneshot_mask, GFP_NOWAIT);
+ zalloc_cpumask_var(&tick_broadcast_pending_mask, GFP_NOWAIT);
+ zalloc_cpumask_var(&tick_broadcast_force_mask, GFP_NOWAIT);
#endif
}
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index bc67d4245e1..0cf1c145318 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -306,7 +306,7 @@ static int __cpuinit tick_nohz_cpu_down_callback(struct notifier_block *nfb,
* we can't safely shutdown that CPU.
*/
if (have_nohz_full_mask && tick_do_timer_cpu == cpu)
- return -EINVAL;
+ return NOTIFY_BAD;
break;
}
return NOTIFY_OK;
@@ -717,6 +717,7 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
if (unlikely(!cpu_online(cpu))) {
if (cpu == tick_do_timer_cpu)
tick_do_timer_cpu = TICK_DO_TIMER_NONE;
+ return false;
}
if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
@@ -1168,7 +1169,7 @@ void tick_cancel_sched_timer(int cpu)
hrtimer_cancel(&ts->sched_timer);
# endif
- ts->nohz_mode = NOHZ_MODE_INACTIVE;
+ memset(ts, 0, sizeof(*ts));
}
#endif
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 98cd470bbe4..baeeb5c87cf 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -975,6 +975,14 @@ static int timekeeping_suspend(void)
read_persistent_clock(&timekeeping_suspend_time);
+ /*
+ * On some systems the persistent_clock can not be detected at
+ * timekeeping_init by its return value, so if we see a valid
+ * value returned, update the persistent_clock_exist flag.
+ */
+ if (timekeeping_suspend_time.tv_sec || timekeeping_suspend_time.tv_nsec)
+ persistent_clock_exist = true;
+
raw_spin_lock_irqsave(&timekeeper_lock, flags);
write_seqcount_begin(&timekeeper_seq);
timekeeping_forward_now(tk);
diff --git a/kernel/timer.c b/kernel/timer.c
index a860bba3441..15ffdb3f194 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1539,12 +1539,12 @@ static int __cpuinit init_timers_cpu(int cpu)
boot_done = 1;
base = &boot_tvec_bases;
}
+ spin_lock_init(&base->lock);
tvec_base_done[cpu] = 1;
} else {
base = per_cpu(tvec_bases, cpu);
}
- spin_lock_init(&base->lock);
for (j = 0; j < TVN_SIZE; j++) {
INIT_LIST_HEAD(base->tv5.vec + j);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index b549b0f5b97..6c508ff33c6 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -120,22 +120,22 @@ static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip);
/*
* Traverse the ftrace_global_list, invoking all entries. The reason that we
- * can use rcu_dereference_raw() is that elements removed from this list
+ * can use rcu_dereference_raw_notrace() is that elements removed from this list
* are simply leaked, so there is no need to interact with a grace-period
- * mechanism. The rcu_dereference_raw() calls are needed to handle
+ * mechanism. The rcu_dereference_raw_notrace() calls are needed to handle
* concurrent insertions into the ftrace_global_list.
*
* Silly Alpha and silly pointer-speculation compiler optimizations!
*/
#define do_for_each_ftrace_op(op, list) \
- op = rcu_dereference_raw(list); \
+ op = rcu_dereference_raw_notrace(list); \
do
/*
* Optimized for just a single item in the list (as that is the normal case).
*/
#define while_for_each_ftrace_op(op) \
- while (likely(op = rcu_dereference_raw((op)->next)) && \
+ while (likely(op = rcu_dereference_raw_notrace((op)->next)) && \
unlikely((op) != &ftrace_list_end))
static inline void ftrace_ops_init(struct ftrace_ops *ops)
@@ -779,7 +779,7 @@ ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip)
if (hlist_empty(hhd))
return NULL;
- hlist_for_each_entry_rcu(rec, hhd, node) {
+ hlist_for_each_entry_rcu_notrace(rec, hhd, node) {
if (rec->ip == ip)
return rec;
}
@@ -1165,7 +1165,7 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
hhd = &hash->buckets[key];
- hlist_for_each_entry_rcu(entry, hhd, hlist) {
+ hlist_for_each_entry_rcu_notrace(entry, hhd, hlist) {
if (entry->ip == ip)
return entry;
}
@@ -1422,8 +1422,8 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip)
struct ftrace_hash *notrace_hash;
int ret;
- filter_hash = rcu_dereference_raw(ops->filter_hash);
- notrace_hash = rcu_dereference_raw(ops->notrace_hash);
+ filter_hash = rcu_dereference_raw_notrace(ops->filter_hash);
+ notrace_hash = rcu_dereference_raw_notrace(ops->notrace_hash);
if ((ftrace_hash_empty(filter_hash) ||
ftrace_lookup_ip(filter_hash, ip)) &&
@@ -2920,7 +2920,7 @@ static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip,
* on the hash. rcu_read_lock is too dangerous here.
*/
preempt_disable_notrace();
- hlist_for_each_entry_rcu(entry, hhd, node) {
+ hlist_for_each_entry_rcu_notrace(entry, hhd, node) {
if (entry->ip == ip)
entry->ops->func(ip, parent_ip, &entry->data);
}
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index b59aea2c48c..e444ff88f0a 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -620,6 +620,9 @@ int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu,
if (cpu == RING_BUFFER_ALL_CPUS)
work = &buffer->irq_work;
else {
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ return -EINVAL;
+
cpu_buffer = buffer->buffers[cpu];
work = &cpu_buffer->irq_work;
}
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index ae6fa2d1cdf..e71a8be4a6e 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -652,8 +652,6 @@ static struct {
ARCH_TRACE_CLOCKS
};
-int trace_clock_id;
-
/*
* trace_parser_get_init - gets the buffer for trace parser
*/
@@ -843,7 +841,15 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
memcpy(max_data->comm, tsk->comm, TASK_COMM_LEN);
max_data->pid = tsk->pid;
- max_data->uid = task_uid(tsk);
+ /*
+ * If tsk == current, then use current_uid(), as that does not use
+ * RCU. The irq tracer can be called out of RCU scope.
+ */
+ if (tsk == current)
+ max_data->uid = current_uid();
+ else
+ max_data->uid = task_uid(tsk);
+
max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO;
max_data->policy = tsk->policy;
max_data->rt_priority = tsk->rt_priority;
@@ -2818,7 +2824,7 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
iter->iter_flags |= TRACE_FILE_ANNOTATE;
/* Output in nanoseconds only if we are using a clock in nanoseconds. */
- if (trace_clocks[trace_clock_id].in_ns)
+ if (trace_clocks[tr->clock_id].in_ns)
iter->iter_flags |= TRACE_FILE_TIME_IN_NS;
/* stop the trace while dumping if we are not opening "snapshot" */
@@ -3817,7 +3823,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
iter->iter_flags |= TRACE_FILE_LAT_FMT;
/* Output in nanoseconds only if we are using a clock in nanoseconds. */
- if (trace_clocks[trace_clock_id].in_ns)
+ if (trace_clocks[tr->clock_id].in_ns)
iter->iter_flags |= TRACE_FILE_TIME_IN_NS;
iter->cpu_file = tc->cpu;
@@ -5087,7 +5093,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
cnt = ring_buffer_bytes_cpu(trace_buf->buffer, cpu);
trace_seq_printf(s, "bytes: %ld\n", cnt);
- if (trace_clocks[trace_clock_id].in_ns) {
+ if (trace_clocks[tr->clock_id].in_ns) {
/* local or global for trace_clock */
t = ns2usecs(ring_buffer_oldest_event_ts(trace_buf->buffer, cpu));
usec_rem = do_div(t, USEC_PER_SEC);
@@ -6216,10 +6222,15 @@ __init static int tracer_alloc_buffers(void)
trace_init_cmdlines();
- register_tracer(&nop_trace);
-
+ /*
+ * register_tracer() might reference current_trace, so it
+ * needs to be set before we register anything. This is
+ * just a bootstrap of current_trace anyway.
+ */
global_trace.current_trace = &nop_trace;
+ register_tracer(&nop_trace);
+
/* All seems OK, enable tracing */
tracing_disabled = 0;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 711ca7d3e7f..20572ed88c5 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -700,8 +700,6 @@ enum print_line_t print_trace_line(struct trace_iterator *iter);
extern unsigned long trace_flags;
-extern int trace_clock_id;
-
/* Standard output formatting function used for function return traces */
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 7a0cf68027c..27963e2bf4b 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2072,8 +2072,10 @@ event_enable_func(struct ftrace_hash *hash,
out_reg:
/* Don't let event modules unload while probe registered */
ret = try_module_get(file->event_call->mod);
- if (!ret)
+ if (!ret) {
+ ret = -EBUSY;
goto out_free;
+ }
ret = __ftrace_event_enable_disable(file, 1, 1);
if (ret < 0)
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index a6361178de5..e1b653f7e1c 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -750,7 +750,11 @@ static int filter_set_pred(struct event_filter *filter,
static void __free_preds(struct event_filter *filter)
{
+ int i;
+
if (filter->preds) {
+ for (i = 0; i < filter->n_preds; i++)
+ kfree(filter->preds[i].ops);
kfree(filter->preds);
filter->preds = NULL;
}
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 636d45fe69b..9f46e98ba8f 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -35,7 +35,7 @@ struct trace_probe {
const char *symbol; /* symbol name */
struct ftrace_event_class class;
struct ftrace_event_call call;
- struct ftrace_event_file **files;
+ struct ftrace_event_file * __rcu *files;
ssize_t size; /* trace entry size */
unsigned int nr_args;
struct probe_arg args[];
@@ -185,9 +185,14 @@ static struct trace_probe *find_trace_probe(const char *event,
static int trace_probe_nr_files(struct trace_probe *tp)
{
- struct ftrace_event_file **file = tp->files;
+ struct ftrace_event_file **file;
int ret = 0;
+ /*
+ * Since all updaters of tp->files are protected by probe_enable_lock,
+ * we don't need to take rcu_read_lock() here.
+ */
+ file = rcu_dereference_raw(tp->files);
if (file)
while (*(file++))
ret++;
@@ -209,9 +214,10 @@ enable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file)
mutex_lock(&probe_enable_lock);
if (file) {
- struct ftrace_event_file **new, **old = tp->files;
+ struct ftrace_event_file **new, **old;
int n = trace_probe_nr_files(tp);
+ old = rcu_dereference_raw(tp->files);
/* 1 is for new one and 1 is for stopper */
new = kzalloc((n + 2) * sizeof(struct ftrace_event_file *),
GFP_KERNEL);
@@ -251,11 +257,17 @@ enable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file)
static int
trace_probe_file_index(struct trace_probe *tp, struct ftrace_event_file *file)
{
+ struct ftrace_event_file **files;
int i;
- if (tp->files) {
- for (i = 0; tp->files[i]; i++)
- if (tp->files[i] == file)
+ /*
+ * Since all updaters of tp->files are protected by probe_enable_lock,
+ * we don't need to take rcu_read_lock() here.
+ */
+ files = rcu_dereference_raw(tp->files);
+ if (files) {
+ for (i = 0; files[i]; i++)
+ if (files[i] == file)
return i;
}
@@ -274,10 +286,11 @@ disable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file)
mutex_lock(&probe_enable_lock);
if (file) {
- struct ftrace_event_file **new, **old = tp->files;
+ struct ftrace_event_file **new, **old;
int n = trace_probe_nr_files(tp);
int i, j;
+ old = rcu_dereference_raw(tp->files);
if (n == 0 || trace_probe_file_index(tp, file) < 0) {
ret = -EINVAL;
goto out_unlock;
@@ -872,9 +885,16 @@ __kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs,
static __kprobes void
kprobe_trace_func(struct trace_probe *tp, struct pt_regs *regs)
{
- struct ftrace_event_file **file = tp->files;
+ /*
+ * Note: preempt is already disabled around the kprobe handler.
+ * However, we still need an smp_read_barrier_depends() corresponding
+ * to smp_wmb() in rcu_assign_pointer() to access the pointer.
+ */
+ struct ftrace_event_file **file = rcu_dereference_raw(tp->files);
+
+ if (unlikely(!file))
+ return;
- /* Note: preempt is already disabled around the kprobe handler */
while (*file) {
__kprobe_trace_func(tp, regs, *file);
file++;
@@ -925,9 +945,16 @@ static __kprobes void
kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri,
struct pt_regs *regs)
{
- struct ftrace_event_file **file = tp->files;
+ /*
+ * Note: preempt is already disabled around the kprobe handler.
+ * However, we still need an smp_read_barrier_depends() corresponding
+ * to smp_wmb() in rcu_assign_pointer() to access the pointer.
+ */
+ struct ftrace_event_file **file = rcu_dereference_raw(tp->files);
+
+ if (unlikely(!file))
+ return;
- /* Note: preempt is already disabled around the kprobe handler */
while (*file) {
__kretprobe_trace_func(tp, ri, regs, *file);
file++;
@@ -935,7 +962,7 @@ kretprobe_trace_func(struct trace_probe *tp, struct kretprobe_instance *ri,
}
/* Event entry printers */
-enum print_line_t
+static enum print_line_t
print_kprobe_event(struct trace_iterator *iter, int flags,
struct trace_event *event)
{
@@ -971,7 +998,7 @@ partial:
return TRACE_TYPE_PARTIAL_LINE;
}
-enum print_line_t
+static enum print_line_t
print_kretprobe_event(struct trace_iterator *iter, int flags,
struct trace_event *event)
{
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 55e2cf66967..2901e3b8859 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -1159,7 +1159,7 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
/* stop the tracing. */
tracing_stop();
/* check the trace buffer */
- ret = trace_test_buffer(tr, &count);
+ ret = trace_test_buffer(&tr->trace_buffer, &count);
trace->reset(tr);
tracing_start();
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 4aa9f5bc6b2..ee8e29a2320 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -296,7 +296,7 @@ static DEFINE_HASHTABLE(unbound_pool_hash, UNBOUND_POOL_HASH_ORDER);
static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS];
struct workqueue_struct *system_wq __read_mostly;
-EXPORT_SYMBOL_GPL(system_wq);
+EXPORT_SYMBOL(system_wq);
struct workqueue_struct *system_highpri_wq __read_mostly;
EXPORT_SYMBOL_GPL(system_highpri_wq);
struct workqueue_struct *system_long_wq __read_mostly;
@@ -1411,7 +1411,7 @@ bool queue_work_on(int cpu, struct workqueue_struct *wq,
local_irq_restore(flags);
return ret;
}
-EXPORT_SYMBOL_GPL(queue_work_on);
+EXPORT_SYMBOL(queue_work_on);
void delayed_work_timer_fn(unsigned long __data)
{
@@ -1485,7 +1485,7 @@ bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
local_irq_restore(flags);
return ret;
}
-EXPORT_SYMBOL_GPL(queue_delayed_work_on);
+EXPORT_SYMBOL(queue_delayed_work_on);
/**
* mod_delayed_work_on - modify delay of or queue a delayed work on specific CPU
@@ -2059,6 +2059,7 @@ static bool manage_workers(struct worker *worker)
if (unlikely(!mutex_trylock(&pool->manager_mutex))) {
spin_unlock_irq(&pool->lock);
mutex_lock(&pool->manager_mutex);
+ spin_lock_irq(&pool->lock);
ret = true;
}
@@ -4311,6 +4312,12 @@ bool current_is_workqueue_rescuer(void)
* no synchronization around this function and the test result is
* unreliable and only useful as advisory hints or for debugging.
*
+ * If @cpu is WORK_CPU_UNBOUND, the test is performed on the local CPU.
+ * Note that both per-cpu and unbound workqueues may be associated with
+ * multiple pool_workqueues which have separate congested states. A
+ * workqueue being congested on one CPU doesn't mean the workqueue is also
+ * congested on other CPUs / NUMA nodes.
+ *
* RETURNS:
* %true if congested, %false otherwise.
*/
@@ -4321,6 +4328,9 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq)
rcu_read_lock_sched();
+ if (cpu == WORK_CPU_UNBOUND)
+ cpu = smp_processor_id();
+
if (!(wq->flags & WQ_UNBOUND))
pwq = per_cpu_ptr(wq->cpu_pwqs, cpu);
else
@@ -4895,7 +4905,8 @@ static void __init wq_numa_init(void)
BUG_ON(!tbl);
for_each_node(node)
- BUG_ON(!alloc_cpumask_var_node(&tbl[node], GFP_KERNEL, node));
+ BUG_ON(!alloc_cpumask_var_node(&tbl[node], GFP_KERNEL,
+ node_online(node) ? node : NUMA_NO_NODE));
for_each_possible_cpu(cpu) {
node = cpu_to_node(cpu);