aboutsummaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile3
-rw-r--r--kernel/acct.c7
-rw-r--r--kernel/audit.c526
-rw-r--r--kernel/audit.h156
-rw-r--r--kernel/auditfilter.c360
-rw-r--r--kernel/auditsc.c407
-rw-r--r--kernel/cgroup.c18
-rw-r--r--kernel/compat.c84
-rw-r--r--kernel/configs.c2
-rw-r--r--kernel/cpu/Makefile1
-rw-r--r--kernel/cpu/idle.c116
-rw-r--r--kernel/cpuset.c15
-rw-r--r--kernel/debug/debug_core.c2
-rw-r--r--kernel/events/core.c47
-rw-r--r--kernel/events/ring_buffer.c14
-rw-r--r--kernel/events/uprobes.c300
-rw-r--r--kernel/exit.c7
-rw-r--r--kernel/extable.c6
-rw-r--r--kernel/fork.c12
-rw-r--r--kernel/hrtimer.c30
-rw-r--r--kernel/irq/irqdomain.c20
-rw-r--r--kernel/irq/proc.c20
-rw-r--r--kernel/kallsyms.c26
-rw-r--r--kernel/kexec.c30
-rw-r--r--kernel/kmod.c98
-rw-r--r--kernel/kthread.c19
-rw-r--r--kernel/lockdep.c1
-rw-r--r--kernel/modsign_certificate.S13
-rw-r--r--kernel/module.c18
-rw-r--r--kernel/nsproxy.c6
-rw-r--r--kernel/panic.c6
-rw-r--r--kernel/params.c5
-rw-r--r--kernel/pid.c12
-rw-r--r--kernel/pid_namespace.c4
-rw-r--r--kernel/posix-cpu-timers.c76
-rw-r--r--kernel/posix-timers.c121
-rw-r--r--kernel/power/console.c116
-rw-r--r--kernel/power/poweroff.c2
-rw-r--r--kernel/power/suspend.c22
-rw-r--r--kernel/printk.c63
-rw-r--r--kernel/profile.c6
-rw-r--r--kernel/ptrace.c81
-rw-r--r--kernel/range.c3
-rw-r--r--kernel/rcutree.c276
-rw-r--r--kernel/rcutree.h43
-rw-r--r--kernel/rcutree_plugin.h622
-rw-r--r--kernel/rcutree_trace.c10
-rw-r--r--kernel/relay.c16
-rw-r--r--kernel/rwsem.c16
-rw-r--r--kernel/sched/Makefile1
-rw-r--r--kernel/sched/core.c352
-rw-r--r--kernel/sched/cpuacct.c296
-rw-r--r--kernel/sched/cpuacct.h17
-rw-r--r--kernel/sched/cputime.c238
-rw-r--r--kernel/sched/fair.c158
-rw-r--r--kernel/sched/idle_task.c17
-rw-r--r--kernel/sched/sched.h244
-rw-r--r--kernel/sched/stats.c7
-rw-r--r--kernel/seccomp.c2
-rw-r--r--kernel/semaphore.c8
-rw-r--r--kernel/signal.c11
-rw-r--r--kernel/smp.c91
-rw-r--r--kernel/softirq.c25
-rw-r--r--kernel/sys.c235
-rw-r--r--kernel/sys_ni.c4
-rw-r--r--kernel/sysctl_binary.c4
-rw-r--r--kernel/time.c11
-rw-r--r--kernel/time/Kconfig80
-rw-r--r--kernel/time/ntp.c105
-rw-r--r--kernel/time/ntp_internal.h12
-rw-r--r--kernel/time/tick-broadcast.c242
-rw-r--r--kernel/time/tick-common.c7
-rw-r--r--kernel/time/tick-internal.h5
-rw-r--r--kernel/time/tick-sched.c300
-rw-r--r--kernel/time/timekeeping.c396
-rw-r--r--kernel/time/timer_list.c104
-rw-r--r--kernel/timer.c159
-rw-r--r--kernel/trace/blktrace.c1
-rw-r--r--kernel/trace/trace.h5
-rw-r--r--kernel/trace/trace_uprobe.c203
-rw-r--r--kernel/uid16.c55
-rw-r--r--kernel/user.c2
-rw-r--r--kernel/user_namespace.c2
-rw-r--r--kernel/utsname.c2
-rw-r--r--kernel/watchdog.c5
-rw-r--r--kernel/workqueue.c79
-rw-r--r--kernel/workqueue_internal.h12
87 files changed, 4633 insertions, 2728 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index bbde5f1a448..271fd3119af 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -24,6 +24,7 @@ endif
obj-y += sched/
obj-y += power/
+obj-y += cpu/
obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o
obj-$(CONFIG_FREEZER) += freezer.o
@@ -175,7 +176,7 @@ signing_key.priv signing_key.x509: x509.genkey
openssl req -new -nodes -utf8 -$(CONFIG_MODULE_SIG_HASH) -days 36500 \
-batch -x509 -config x509.genkey \
-outform DER -out signing_key.x509 \
- -keyout signing_key.priv
+ -keyout signing_key.priv 2>&1
@echo "###"
@echo "### Key pair generated."
@echo "###"
diff --git a/kernel/acct.c b/kernel/acct.c
index b9bd7f098ee..8d6e145138b 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -540,6 +540,12 @@ static void do_acct_process(struct bsd_acct_struct *acct,
ac.ac_swaps = encode_comp_t(0);
/*
+ * Get freeze protection. If the fs is frozen, just skip the write
+ * as we could deadlock the system otherwise.
+ */
+ if (!file_start_write_trylock(file))
+ goto out;
+ /*
* Kernel segment override to datasegment and write it
* to the accounting file.
*/
@@ -554,6 +560,7 @@ static void do_acct_process(struct bsd_acct_struct *acct,
sizeof(acct_t), &file->f_pos);
current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim;
set_fs(fs);
+ file_end_write(file);
out:
revert_creds(orig_cred);
}
diff --git a/kernel/audit.c b/kernel/audit.c
index 9816a1b96cf..21c7fa615bd 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -49,6 +49,8 @@
#include <linux/slab.h>
#include <linux/err.h>
#include <linux/kthread.h>
+#include <linux/kernel.h>
+#include <linux/syscalls.h>
#include <linux/audit.h>
@@ -58,7 +60,7 @@
#ifdef CONFIG_SECURITY
#include <linux/security.h>
#endif
-#include <linux/netlink.h>
+#include <net/netlink.h>
#include <linux/freezer.h>
#include <linux/tty.h>
#include <linux/pid_namespace.h>
@@ -265,7 +267,6 @@ void audit_log_lost(const char *message)
}
static int audit_log_config_change(char *function_name, int new, int old,
- kuid_t loginuid, u32 sessionid, u32 sid,
int allow_changes)
{
struct audit_buffer *ab;
@@ -274,29 +275,17 @@ static int audit_log_config_change(char *function_name, int new, int old,
ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
if (unlikely(!ab))
return rc;
- audit_log_format(ab, "%s=%d old=%d auid=%u ses=%u", function_name, new,
- old, from_kuid(&init_user_ns, loginuid), sessionid);
- if (sid) {
- char *ctx = NULL;
- u32 len;
-
- rc = security_secid_to_secctx(sid, &ctx, &len);
- if (rc) {
- audit_log_format(ab, " sid=%u", sid);
- allow_changes = 0; /* Something weird, deny request */
- } else {
- audit_log_format(ab, " subj=%s", ctx);
- security_release_secctx(ctx, len);
- }
- }
+ audit_log_format(ab, "%s=%d old=%d", function_name, new, old);
+ audit_log_session_info(ab);
+ rc = audit_log_task_context(ab);
+ if (rc)
+ allow_changes = 0; /* Something weird, deny request */
audit_log_format(ab, " res=%d", allow_changes);
audit_log_end(ab);
return rc;
}
-static int audit_do_config_change(char *function_name, int *to_change,
- int new, kuid_t loginuid, u32 sessionid,
- u32 sid)
+static int audit_do_config_change(char *function_name, int *to_change, int new)
{
int allow_changes, rc = 0, old = *to_change;
@@ -307,8 +296,7 @@ static int audit_do_config_change(char *function_name, int *to_change,
allow_changes = 1;
if (audit_enabled != AUDIT_OFF) {
- rc = audit_log_config_change(function_name, new, old, loginuid,
- sessionid, sid, allow_changes);
+ rc = audit_log_config_change(function_name, new, old, allow_changes);
if (rc)
allow_changes = 0;
}
@@ -322,44 +310,37 @@ static int audit_do_config_change(char *function_name, int *to_change,
return rc;
}
-static int audit_set_rate_limit(int limit, kuid_t loginuid, u32 sessionid,
- u32 sid)
+static int audit_set_rate_limit(int limit)
{
- return audit_do_config_change("audit_rate_limit", &audit_rate_limit,
- limit, loginuid, sessionid, sid);
+ return audit_do_config_change("audit_rate_limit", &audit_rate_limit, limit);
}
-static int audit_set_backlog_limit(int limit, kuid_t loginuid, u32 sessionid,
- u32 sid)
+static int audit_set_backlog_limit(int limit)
{
- return audit_do_config_change("audit_backlog_limit", &audit_backlog_limit,
- limit, loginuid, sessionid, sid);
+ return audit_do_config_change("audit_backlog_limit", &audit_backlog_limit, limit);
}
-static int audit_set_enabled(int state, kuid_t loginuid, u32 sessionid, u32 sid)
+static int audit_set_enabled(int state)
{
int rc;
if (state < AUDIT_OFF || state > AUDIT_LOCKED)
return -EINVAL;
- rc = audit_do_config_change("audit_enabled", &audit_enabled, state,
- loginuid, sessionid, sid);
-
+ rc = audit_do_config_change("audit_enabled", &audit_enabled, state);
if (!rc)
audit_ever_enabled |= !!state;
return rc;
}
-static int audit_set_failure(int state, kuid_t loginuid, u32 sessionid, u32 sid)
+static int audit_set_failure(int state)
{
if (state != AUDIT_FAIL_SILENT
&& state != AUDIT_FAIL_PRINTK
&& state != AUDIT_FAIL_PANIC)
return -EINVAL;
- return audit_do_config_change("audit_failure", &audit_failure, state,
- loginuid, sessionid, sid);
+ return audit_do_config_change("audit_failure", &audit_failure, state);
}
/*
@@ -417,34 +398,53 @@ static void kauditd_send_skb(struct sk_buff *skb)
consume_skb(skb);
}
-static int kauditd_thread(void *dummy)
+/*
+ * flush_hold_queue - empty the hold queue if auditd appears
+ *
+ * If auditd just started, drain the queue of messages already
+ * sent to syslog/printk. Remember loss here is ok. We already
+ * called audit_log_lost() if it didn't go out normally. so the
+ * race between the skb_dequeue and the next check for audit_pid
+ * doesn't matter.
+ *
+ * If you ever find kauditd to be too slow we can get a perf win
+ * by doing our own locking and keeping better track if there
+ * are messages in this queue. I don't see the need now, but
+ * in 5 years when I want to play with this again I'll see this
+ * note and still have no friggin idea what i'm thinking today.
+ */
+static void flush_hold_queue(void)
{
struct sk_buff *skb;
+ if (!audit_default || !audit_pid)
+ return;
+
+ skb = skb_dequeue(&audit_skb_hold_queue);
+ if (likely(!skb))
+ return;
+
+ while (skb && audit_pid) {
+ kauditd_send_skb(skb);
+ skb = skb_dequeue(&audit_skb_hold_queue);
+ }
+
+ /*
+ * if auditd just disappeared but we
+ * dequeued an skb we need to drop ref
+ */
+ if (skb)
+ consume_skb(skb);
+}
+
+static int kauditd_thread(void *dummy)
+{
set_freezable();
while (!kthread_should_stop()) {
- /*
- * if auditd just started drain the queue of messages already
- * sent to syslog/printk. remember loss here is ok. we already
- * called audit_log_lost() if it didn't go out normally. so the
- * race between the skb_dequeue and the next check for audit_pid
- * doesn't matter.
- *
- * if you ever find kauditd to be too slow we can get a perf win
- * by doing our own locking and keeping better track if there
- * are messages in this queue. I don't see the need now, but
- * in 5 years when I want to play with this again I'll see this
- * note and still have no friggin idea what i'm thinking today.
- */
- if (audit_default && audit_pid) {
- skb = skb_dequeue(&audit_skb_hold_queue);
- if (unlikely(skb)) {
- while (skb && audit_pid) {
- kauditd_send_skb(skb);
- skb = skb_dequeue(&audit_skb_hold_queue);
- }
- }
- }
+ struct sk_buff *skb;
+ DECLARE_WAITQUEUE(wait, current);
+
+ flush_hold_queue();
skb = skb_dequeue(&audit_skb_queue);
wake_up(&audit_backlog_wait);
@@ -453,19 +453,18 @@ static int kauditd_thread(void *dummy)
kauditd_send_skb(skb);
else
audit_printk_skb(skb);
- } else {
- DECLARE_WAITQUEUE(wait, current);
- set_current_state(TASK_INTERRUPTIBLE);
- add_wait_queue(&kauditd_wait, &wait);
-
- if (!skb_queue_len(&audit_skb_queue)) {
- try_to_freeze();
- schedule();
- }
+ continue;
+ }
+ set_current_state(TASK_INTERRUPTIBLE);
+ add_wait_queue(&kauditd_wait, &wait);
- __set_current_state(TASK_RUNNING);
- remove_wait_queue(&kauditd_wait, &wait);
+ if (!skb_queue_len(&audit_skb_queue)) {
+ try_to_freeze();
+ schedule();
}
+
+ __set_current_state(TASK_RUNNING);
+ remove_wait_queue(&kauditd_wait, &wait);
}
return 0;
}
@@ -579,13 +578,14 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
return -EPERM;
switch (msg_type) {
- case AUDIT_GET:
case AUDIT_LIST:
- case AUDIT_LIST_RULES:
- case AUDIT_SET:
case AUDIT_ADD:
- case AUDIT_ADD_RULE:
case AUDIT_DEL:
+ return -EOPNOTSUPP;
+ case AUDIT_GET:
+ case AUDIT_SET:
+ case AUDIT_LIST_RULES:
+ case AUDIT_ADD_RULE:
case AUDIT_DEL_RULE:
case AUDIT_SIGNAL_INFO:
case AUDIT_TTY_GET:
@@ -608,12 +608,10 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
return err;
}
-static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type,
- kuid_t auid, u32 ses, u32 sid)
+static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type)
{
int rc = 0;
- char *ctx = NULL;
- u32 len;
+ uid_t uid = from_kuid(&init_user_ns, current_uid());
if (!audit_enabled) {
*ab = NULL;
@@ -623,33 +621,21 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type,
*ab = audit_log_start(NULL, GFP_KERNEL, msg_type);
if (unlikely(!*ab))
return rc;
- audit_log_format(*ab, "pid=%d uid=%u auid=%u ses=%u",
- task_tgid_vnr(current),
- from_kuid(&init_user_ns, current_uid()),
- from_kuid(&init_user_ns, auid), ses);
- if (sid) {
- rc = security_secid_to_secctx(sid, &ctx, &len);
- if (rc)
- audit_log_format(*ab, " ssid=%u", sid);
- else {
- audit_log_format(*ab, " subj=%s", ctx);
- security_release_secctx(ctx, len);
- }
- }
+ audit_log_format(*ab, "pid=%d uid=%u", task_tgid_vnr(current), uid);
+ audit_log_session_info(*ab);
+ audit_log_task_context(*ab);
return rc;
}
static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
{
- u32 seq, sid;
+ u32 seq;
void *data;
struct audit_status *status_get, status_set;
int err;
struct audit_buffer *ab;
u16 msg_type = nlh->nlmsg_type;
- kuid_t loginuid; /* loginuid of sender */
- u32 sessionid;
struct audit_sig_info *sig_data;
char *ctx = NULL;
u32 len;
@@ -668,9 +654,6 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
return err;
}
}
- loginuid = audit_get_loginuid(current);
- sessionid = audit_get_sessionid(current);
- security_task_getsecid(current, &sid);
seq = nlh->nlmsg_seq;
data = nlmsg_data(nlh);
@@ -691,14 +674,12 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
return -EINVAL;
status_get = (struct audit_status *)data;
if (status_get->mask & AUDIT_STATUS_ENABLED) {
- err = audit_set_enabled(status_get->enabled,
- loginuid, sessionid, sid);
+ err = audit_set_enabled(status_get->enabled);
if (err < 0)
return err;
}
if (status_get->mask & AUDIT_STATUS_FAILURE) {
- err = audit_set_failure(status_get->failure,
- loginuid, sessionid, sid);
+ err = audit_set_failure(status_get->failure);
if (err < 0)
return err;
}
@@ -706,22 +687,17 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
int new_pid = status_get->pid;
if (audit_enabled != AUDIT_OFF)
- audit_log_config_change("audit_pid", new_pid,
- audit_pid, loginuid,
- sessionid, sid, 1);
-
+ audit_log_config_change("audit_pid", new_pid, audit_pid, 1);
audit_pid = new_pid;
audit_nlk_portid = NETLINK_CB(skb).portid;
}
if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) {
- err = audit_set_rate_limit(status_get->rate_limit,
- loginuid, sessionid, sid);
+ err = audit_set_rate_limit(status_get->rate_limit);
if (err < 0)
return err;
}
if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT)
- err = audit_set_backlog_limit(status_get->backlog_limit,
- loginuid, sessionid, sid);
+ err = audit_set_backlog_limit(status_get->backlog_limit);
break;
case AUDIT_USER:
case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG:
@@ -729,25 +705,22 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
if (!audit_enabled && msg_type != AUDIT_USER_AVC)
return 0;
- err = audit_filter_user();
+ err = audit_filter_user(msg_type);
if (err == 1) {
err = 0;
if (msg_type == AUDIT_USER_TTY) {
- err = tty_audit_push_task(current, loginuid,
- sessionid);
+ err = tty_audit_push_current();
if (err)
break;
}
- audit_log_common_recv_msg(&ab, msg_type,
- loginuid, sessionid, sid);
-
+ audit_log_common_recv_msg(&ab, msg_type);
if (msg_type != AUDIT_USER_TTY)
audit_log_format(ab, " msg='%.1024s'",
(char *)data);
else {
int size;
- audit_log_format(ab, " msg=");
+ audit_log_format(ab, " data=");
size = nlmsg_len(nlh);
if (size > 0 &&
((unsigned char *)data)[size - 1] == '\0')
@@ -758,50 +731,24 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
audit_log_end(ab);
}
break;
- case AUDIT_ADD:
- case AUDIT_DEL:
- if (nlmsg_len(nlh) < sizeof(struct audit_rule))
- return -EINVAL;
- if (audit_enabled == AUDIT_LOCKED) {
- audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE,
- loginuid, sessionid, sid);
-
- audit_log_format(ab, " audit_enabled=%d res=0",
- audit_enabled);
- audit_log_end(ab);
- return -EPERM;
- }
- /* fallthrough */
- case AUDIT_LIST:
- err = audit_receive_filter(msg_type, NETLINK_CB(skb).portid,
- seq, data, nlmsg_len(nlh),
- loginuid, sessionid, sid);
- break;
case AUDIT_ADD_RULE:
case AUDIT_DEL_RULE:
if (nlmsg_len(nlh) < sizeof(struct audit_rule_data))
return -EINVAL;
if (audit_enabled == AUDIT_LOCKED) {
- audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE,
- loginuid, sessionid, sid);
-
- audit_log_format(ab, " audit_enabled=%d res=0",
- audit_enabled);
+ audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE);
+ audit_log_format(ab, " audit_enabled=%d res=0", audit_enabled);
audit_log_end(ab);
return -EPERM;
}
/* fallthrough */
case AUDIT_LIST_RULES:
err = audit_receive_filter(msg_type, NETLINK_CB(skb).portid,
- seq, data, nlmsg_len(nlh),
- loginuid, sessionid, sid);
+ seq, data, nlmsg_len(nlh));
break;
case AUDIT_TRIM:
audit_trim_trees();
-
- audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE,
- loginuid, sessionid, sid);
-
+ audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE);
audit_log_format(ab, " op=trim res=1");
audit_log_end(ab);
break;
@@ -831,8 +778,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
/* OK, here comes... */
err = audit_tag_tree(old, new);
- audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE,
- loginuid, sessionid, sid);
+ audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE);
audit_log_format(ab, " op=make_equiv old=");
audit_log_untrustedstring(ab, old);
@@ -871,27 +817,30 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
struct audit_tty_status s;
struct task_struct *tsk = current;
- spin_lock_irq(&tsk->sighand->siglock);
+ spin_lock(&tsk->sighand->siglock);
s.enabled = tsk->signal->audit_tty != 0;
- spin_unlock_irq(&tsk->sighand->siglock);
+ s.log_passwd = tsk->signal->audit_tty_log_passwd;
+ spin_unlock(&tsk->sighand->siglock);
audit_send_reply(NETLINK_CB(skb).portid, seq,
AUDIT_TTY_GET, 0, 0, &s, sizeof(s));
break;
}
case AUDIT_TTY_SET: {
- struct audit_tty_status *s;
+ struct audit_tty_status s;
struct task_struct *tsk = current;
- if (nlh->nlmsg_len < sizeof(struct audit_tty_status))
- return -EINVAL;
- s = data;
- if (s->enabled != 0 && s->enabled != 1)
+ memset(&s, 0, sizeof(s));
+ /* guard against past and future API changes */
+ memcpy(&s, data, min(sizeof(s), (size_t)nlh->nlmsg_len));
+ if ((s.enabled != 0 && s.enabled != 1) ||
+ (s.log_passwd != 0 && s.log_passwd != 1))
return -EINVAL;
- spin_lock_irq(&tsk->sighand->siglock);
- tsk->signal->audit_tty = s->enabled != 0;
- spin_unlock_irq(&tsk->sighand->siglock);
+ spin_lock(&tsk->sighand->siglock);
+ tsk->signal->audit_tty = s.enabled;
+ tsk->signal->audit_tty_log_passwd = s.log_passwd;
+ spin_unlock(&tsk->sighand->siglock);
break;
}
default:
@@ -910,7 +859,7 @@ static void audit_receive_skb(struct sk_buff *skb)
{
struct nlmsghdr *nlh;
/*
- * len MUST be signed for NLMSG_NEXT to be able to dec it below 0
+ * len MUST be signed for nlmsg_next to be able to dec it below 0
* if the nlmsg_len was not aligned
*/
int len;
@@ -919,13 +868,13 @@ static void audit_receive_skb(struct sk_buff *skb)
nlh = nlmsg_hdr(skb);
len = skb->len;
- while (NLMSG_OK(nlh, len)) {
+ while (nlmsg_ok(nlh, len)) {
err = audit_receive_msg(skb, nlh);
/* if err or if this message says it wants a response */
if (err || (nlh->nlmsg_flags & NLM_F_ACK))
netlink_ack(skb, nlh, err);
- nlh = NLMSG_NEXT(nlh, len);
+ nlh = nlmsg_next(nlh, &len);
}
}
@@ -1434,6 +1383,14 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
kfree(pathname);
}
+void audit_log_session_info(struct audit_buffer *ab)
+{
+ u32 sessionid = audit_get_sessionid(current);
+ uid_t auid = from_kuid(&init_user_ns, audit_get_loginuid(current));
+
+ audit_log_format(ab, " auid=%u ses=%u\n", auid, sessionid);
+}
+
void audit_log_key(struct audit_buffer *ab, char *key)
{
audit_log_format(ab, " key=");
@@ -1443,6 +1400,224 @@ void audit_log_key(struct audit_buffer *ab, char *key)
audit_log_format(ab, "(null)");
}
+void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap)
+{
+ int i;
+
+ audit_log_format(ab, " %s=", prefix);
+ CAP_FOR_EACH_U32(i) {
+ audit_log_format(ab, "%08x",
+ cap->cap[(_KERNEL_CAPABILITY_U32S-1) - i]);
+ }
+}
+
+void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name)
+{
+ kernel_cap_t *perm = &name->fcap.permitted;
+ kernel_cap_t *inh = &name->fcap.inheritable;
+ int log = 0;
+
+ if (!cap_isclear(*perm)) {
+ audit_log_cap(ab, "cap_fp", perm);
+ log = 1;
+ }
+ if (!cap_isclear(*inh)) {
+ audit_log_cap(ab, "cap_fi", inh);
+ log = 1;
+ }
+
+ if (log)
+ audit_log_format(ab, " cap_fe=%d cap_fver=%x",
+ name->fcap.fE, name->fcap_ver);
+}
+
+static inline int audit_copy_fcaps(struct audit_names *name,
+ const struct dentry *dentry)
+{
+ struct cpu_vfs_cap_data caps;
+ int rc;
+
+ if (!dentry)
+ return 0;
+
+ rc = get_vfs_caps_from_disk(dentry, &caps);
+ if (rc)
+ return rc;
+
+ name->fcap.permitted = caps.permitted;
+ name->fcap.inheritable = caps.inheritable;
+ name->fcap.fE = !!(caps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE);
+ name->fcap_ver = (caps.magic_etc & VFS_CAP_REVISION_MASK) >>
+ VFS_CAP_REVISION_SHIFT;
+
+ return 0;
+}
+
+/* Copy inode data into an audit_names. */
+void audit_copy_inode(struct audit_names *name, const struct dentry *dentry,
+ const struct inode *inode)
+{
+ name->ino = inode->i_ino;
+ name->dev = inode->i_sb->s_dev;
+ name->mode = inode->i_mode;
+ name->uid = inode->i_uid;
+ name->gid = inode->i_gid;
+ name->rdev = inode->i_rdev;
+ security_inode_getsecid(inode, &name->osid);
+ audit_copy_fcaps(name, dentry);
+}
+
+/**
+ * audit_log_name - produce AUDIT_PATH record from struct audit_names
+ * @context: audit_context for the task
+ * @n: audit_names structure with reportable details
+ * @path: optional path to report instead of audit_names->name
+ * @record_num: record number to report when handling a list of names
+ * @call_panic: optional pointer to int that will be updated if secid fails
+ */
+void audit_log_name(struct audit_context *context, struct audit_names *n,
+ struct path *path, int record_num, int *call_panic)
+{
+ struct audit_buffer *ab;
+ ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH);
+ if (!ab)
+ return;
+
+ audit_log_format(ab, "item=%d", record_num);
+
+ if (path)
+ audit_log_d_path(ab, " name=", path);
+ else if (n->name) {
+ switch (n->name_len) {
+ case AUDIT_NAME_FULL:
+ /* log the full path */
+ audit_log_format(ab, " name=");
+ audit_log_untrustedstring(ab, n->name->name);
+ break;
+ case 0:
+ /* name was specified as a relative path and the
+ * directory component is the cwd */
+ audit_log_d_path(ab, " name=", &context->pwd);
+ break;
+ default:
+ /* log the name's directory component */
+ audit_log_format(ab, " name=");
+ audit_log_n_untrustedstring(ab, n->name->name,
+ n->name_len);
+ }
+ } else
+ audit_log_format(ab, " name=(null)");
+
+ if (n->ino != (unsigned long)-1) {
+ audit_log_format(ab, " inode=%lu"
+ " dev=%02x:%02x mode=%#ho"
+ " ouid=%u ogid=%u rdev=%02x:%02x",
+ n->ino,
+ MAJOR(n->dev),
+ MINOR(n->dev),
+ n->mode,
+ from_kuid(&init_user_ns, n->uid),
+ from_kgid(&init_user_ns, n->gid),
+ MAJOR(n->rdev),
+ MINOR(n->rdev));
+ }
+ if (n->osid != 0) {
+ char *ctx = NULL;
+ u32 len;
+ if (security_secid_to_secctx(
+ n->osid, &ctx, &len)) {
+ audit_log_format(ab, " osid=%u", n->osid);
+ if (call_panic)
+ *call_panic = 2;
+ } else {
+ audit_log_format(ab, " obj=%s", ctx);
+ security_release_secctx(ctx, len);
+ }
+ }
+
+ audit_log_fcaps(ab, n);
+ audit_log_end(ab);
+}
+
+int audit_log_task_context(struct audit_buffer *ab)
+{
+ char *ctx = NULL;
+ unsigned len;
+ int error;
+ u32 sid;
+
+ security_task_getsecid(current, &sid);
+ if (!sid)
+ return 0;
+
+ error = security_secid_to_secctx(sid, &ctx, &len);
+ if (error) {
+ if (error != -EINVAL)
+ goto error_path;
+ return 0;
+ }
+
+ audit_log_format(ab, " subj=%s", ctx);
+ security_release_secctx(ctx, len);
+ return 0;
+
+error_path:
+ audit_panic("error in audit_log_task_context");
+ return error;
+}
+EXPORT_SYMBOL(audit_log_task_context);
+
+void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
+{
+ const struct cred *cred;
+ char name[sizeof(tsk->comm)];
+ struct mm_struct *mm = tsk->mm;
+ char *tty;
+
+ if (!ab)
+ return;
+
+ /* tsk == current */
+ cred = current_cred();
+
+ spin_lock_irq(&tsk->sighand->siglock);
+ if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name)
+ tty = tsk->signal->tty->name;
+ else
+ tty = "(none)";
+ spin_unlock_irq(&tsk->sighand->siglock);
+
+ audit_log_format(ab,
+ " ppid=%ld pid=%d auid=%u uid=%u gid=%u"
+ " euid=%u suid=%u fsuid=%u"
+ " egid=%u sgid=%u fsgid=%u ses=%u tty=%s",
+ sys_getppid(),
+ tsk->pid,
+ from_kuid(&init_user_ns, audit_get_loginuid(tsk)),
+ from_kuid(&init_user_ns, cred->uid),
+ from_kgid(&init_user_ns, cred->gid),
+ from_kuid(&init_user_ns, cred->euid),
+ from_kuid(&init_user_ns, cred->suid),
+ from_kuid(&init_user_ns, cred->fsuid),
+ from_kgid(&init_user_ns, cred->egid),
+ from_kgid(&init_user_ns, cred->sgid),
+ from_kgid(&init_user_ns, cred->fsgid),
+ audit_get_sessionid(tsk), tty);
+
+ get_task_comm(name, tsk);
+ audit_log_format(ab, " comm=");
+ audit_log_untrustedstring(ab, name);
+
+ if (mm) {
+ down_read(&mm->mmap_sem);
+ if (mm->exe_file)
+ audit_log_d_path(ab, " exe=", &mm->exe_file->f_path);
+ up_read(&mm->mmap_sem);
+ }
+ audit_log_task_context(ab);
+}
+EXPORT_SYMBOL(audit_log_task_info);
+
/**
* audit_log_link_denied - report a link restriction denial
* @operation: specific link opreation
@@ -1451,19 +1626,28 @@ void audit_log_key(struct audit_buffer *ab, char *key)
void audit_log_link_denied(const char *operation, struct path *link)
{
struct audit_buffer *ab;
+ struct audit_names *name;
+
+ name = kzalloc(sizeof(*name), GFP_NOFS);
+ if (!name)
+ return;
+ /* Generate AUDIT_ANOM_LINK with subject, operation, outcome. */
ab = audit_log_start(current->audit_context, GFP_KERNEL,
AUDIT_ANOM_LINK);
if (!ab)
- return;
- audit_log_format(ab, "op=%s action=denied", operation);
- audit_log_format(ab, " pid=%d comm=", current->pid);
- audit_log_untrustedstring(ab, current->comm);
- audit_log_d_path(ab, " path=", link);
- audit_log_format(ab, " dev=");
- audit_log_untrustedstring(ab, link->dentry->d_inode->i_sb->s_id);
- audit_log_format(ab, " ino=%lu", link->dentry->d_inode->i_ino);
+ goto out;
+ audit_log_format(ab, "op=%s", operation);
+ audit_log_task_info(ab, current);
+ audit_log_format(ab, " res=0");
audit_log_end(ab);
+
+ /* Generate AUDIT_PATH record with object. */
+ name->type = AUDIT_TYPE_NORMAL;
+ audit_copy_inode(name, link->dentry, link->dentry->d_inode);
+ audit_log_name(current->audit_context, name, link, 0, NULL);
+out:
+ kfree(name);
}
/**
@@ -1483,7 +1667,7 @@ void audit_log_end(struct audit_buffer *ab)
audit_log_lost("rate limit exceeded");
} else {
struct nlmsghdr *nlh = nlmsg_hdr(ab->skb);
- nlh->nlmsg_len = ab->skb->len - NLMSG_SPACE(0);
+ nlh->nlmsg_len = ab->skb->len - NLMSG_HDRLEN;
if (audit_pid) {
skb_queue_tail(&audit_skb_queue, ab->skb);
diff --git a/kernel/audit.h b/kernel/audit.h
index 11468d99dad..1c95131ef76 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -22,6 +22,7 @@
#include <linux/fs.h>
#include <linux/audit.h>
#include <linux/skbuff.h>
+#include <uapi/linux/mqueue.h>
/* 0 = no checking
1 = put_count checking
@@ -29,6 +30,11 @@
*/
#define AUDIT_DEBUG 0
+/* AUDIT_NAMES is the number of slots we reserve in the audit_context
+ * for saving names from getname(). If we get more names we will allocate
+ * a name dynamically and also add those to the list anchored by names_list. */
+#define AUDIT_NAMES 5
+
/* At task start time, the audit_state is set in the audit_context using
a per-task filter. At syscall entry, the audit_state is augmented by
the syscall filter. */
@@ -59,8 +65,158 @@ struct audit_entry {
struct audit_krule rule;
};
+struct audit_cap_data {
+ kernel_cap_t permitted;
+ kernel_cap_t inheritable;
+ union {
+ unsigned int fE; /* effective bit of file cap */
+ kernel_cap_t effective; /* effective set of process */
+ };
+};
+
+/* When fs/namei.c:getname() is called, we store the pointer in name and
+ * we don't let putname() free it (instead we free all of the saved
+ * pointers at syscall exit time).
+ *
+ * Further, in fs/namei.c:path_lookup() we store the inode and device.
+ */
+struct audit_names {
+ struct list_head list; /* audit_context->names_list */
+
+ struct filename *name;
+ int name_len; /* number of chars to log */
+ bool name_put; /* call __putname()? */
+
+ unsigned long ino;
+ dev_t dev;
+ umode_t mode;
+ kuid_t uid;
+ kgid_t gid;
+ dev_t rdev;
+ u32 osid;
+ struct audit_cap_data fcap;
+ unsigned int fcap_ver;
+ unsigned char type; /* record type */
+ /*
+ * This was an allocated audit_names and not from the array of
+ * names allocated in the task audit context. Thus this name
+ * should be freed on syscall exit.
+ */
+ bool should_free;
+};
+
+/* The per-task audit context. */
+struct audit_context {
+ int dummy; /* must be the first element */
+ int in_syscall; /* 1 if task is in a syscall */
+ enum audit_state state, current_state;
+ unsigned int serial; /* serial number for record */
+ int major; /* syscall number */
+ struct timespec ctime; /* time of syscall entry */
+ unsigned long argv[4]; /* syscall arguments */
+ long return_code;/* syscall return code */
+ u64 prio;
+ int return_valid; /* return code is valid */
+ /*
+ * The names_list is the list of all audit_names collected during this
+ * syscall. The first AUDIT_NAMES entries in the names_list will
+ * actually be from the preallocated_names array for performance
+ * reasons. Except during allocation they should never be referenced
+ * through the preallocated_names array and should only be found/used
+ * by running the names_list.
+ */
+ struct audit_names preallocated_names[AUDIT_NAMES];
+ int name_count; /* total records in names_list */
+ struct list_head names_list; /* struct audit_names->list anchor */
+ char *filterkey; /* key for rule that triggered record */
+ struct path pwd;
+ struct audit_aux_data *aux;
+ struct audit_aux_data *aux_pids;
+ struct sockaddr_storage *sockaddr;
+ size_t sockaddr_len;
+ /* Save things to print about task_struct */
+ pid_t pid, ppid;
+ kuid_t uid, euid, suid, fsuid;
+ kgid_t gid, egid, sgid, fsgid;
+ unsigned long personality;
+ int arch;
+
+ pid_t target_pid;
+ kuid_t target_auid;
+ kuid_t target_uid;
+ unsigned int target_sessionid;
+ u32 target_sid;
+ char target_comm[TASK_COMM_LEN];
+
+ struct audit_tree_refs *trees, *first_trees;
+ struct list_head killed_trees;
+ int tree_count;
+
+ int type;
+ union {
+ struct {
+ int nargs;
+ long args[6];
+ } socketcall;
+ struct {
+ kuid_t uid;
+ kgid_t gid;
+ umode_t mode;
+ u32 osid;
+ int has_perm;
+ uid_t perm_uid;
+ gid_t perm_gid;
+ umode_t perm_mode;
+ unsigned long qbytes;
+ } ipc;
+ struct {
+ mqd_t mqdes;
+ struct mq_attr mqstat;
+ } mq_getsetattr;
+ struct {
+ mqd_t mqdes;
+ int sigev_signo;
+ } mq_notify;
+ struct {
+ mqd_t mqdes;
+ size_t msg_len;
+ unsigned int msg_prio;
+ struct timespec abs_timeout;
+ } mq_sendrecv;
+ struct {
+ int oflag;
+ umode_t mode;
+ struct mq_attr attr;
+ } mq_open;
+ struct {
+ pid_t pid;
+ struct audit_cap_data cap;
+ } capset;
+ struct {
+ int fd;
+ int flags;
+ } mmap;
+ };
+ int fds[2];
+
+#if AUDIT_DEBUG
+ int put_count;
+ int ino_count;
+#endif
+};
+
extern int audit_ever_enabled;
+extern void audit_copy_inode(struct audit_names *name,
+ const struct dentry *dentry,
+ const struct inode *inode);
+extern void audit_log_cap(struct audit_buffer *ab, char *prefix,
+ kernel_cap_t *cap);
+extern void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name);
+extern void audit_log_name(struct audit_context *context,
+ struct audit_names *n, struct path *path,
+ int record_num, int *call_panic);
+
extern int audit_pid;
#define AUDIT_INODE_BUCKETS 32
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 267436826c3..83a2970295d 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -310,121 +310,83 @@ static u32 audit_to_op(u32 op)
return n;
}
-
-/* Translate struct audit_rule to kernel's rule respresentation.
- * Exists for backward compatibility with userspace. */
-static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
+/* check if an audit field is valid */
+static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
{
- struct audit_entry *entry;
- int err = 0;
- int i;
-
- entry = audit_to_entry_common(rule);
- if (IS_ERR(entry))
- goto exit_nofree;
-
- for (i = 0; i < rule->field_count; i++) {
- struct audit_field *f = &entry->rule.fields[i];
- u32 n;
-
- n = rule->fields[i] & (AUDIT_NEGATE|AUDIT_OPERATORS);
-
- /* Support for legacy operators where
- * AUDIT_NEGATE bit signifies != and otherwise assumes == */
- if (n & AUDIT_NEGATE)
- f->op = Audit_not_equal;
- else if (!n)
- f->op = Audit_equal;
- else
- f->op = audit_to_op(n);
-
- entry->rule.vers_ops = (n & AUDIT_OPERATORS) ? 2 : 1;
-
- f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS);
- f->val = rule->values[i];
- f->uid = INVALID_UID;
- f->gid = INVALID_GID;
-
- err = -EINVAL;
- if (f->op == Audit_bad)
- goto exit_free;
-
- switch(f->type) {
- default:
- goto exit_free;
- case AUDIT_UID:
- case AUDIT_EUID:
- case AUDIT_SUID:
- case AUDIT_FSUID:
- case AUDIT_LOGINUID:
- /* bit ops not implemented for uid comparisons */
- if (f->op == Audit_bitmask || f->op == Audit_bittest)
- goto exit_free;
-
- f->uid = make_kuid(current_user_ns(), f->val);
- if (!uid_valid(f->uid))
- goto exit_free;
- break;
- case AUDIT_GID:
- case AUDIT_EGID:
- case AUDIT_SGID:
- case AUDIT_FSGID:
- /* bit ops not implemented for gid comparisons */
- if (f->op == Audit_bitmask || f->op == Audit_bittest)
- goto exit_free;
-
- f->gid = make_kgid(current_user_ns(), f->val);
- if (!gid_valid(f->gid))
- goto exit_free;
- break;
- case AUDIT_PID:
- case AUDIT_PERS:
- case AUDIT_MSGTYPE:
- case AUDIT_PPID:
- case AUDIT_DEVMAJOR:
- case AUDIT_DEVMINOR:
- case AUDIT_EXIT:
- case AUDIT_SUCCESS:
- /* bit ops are only useful on syscall args */
- if (f->op == Audit_bitmask || f->op == Audit_bittest)
- goto exit_free;
- break;
- case AUDIT_ARG0:
- case AUDIT_ARG1:
- case AUDIT_ARG2:
- case AUDIT_ARG3:
- break;
- /* arch is only allowed to be = or != */
- case AUDIT_ARCH:
- if (f->op != Audit_not_equal && f->op != Audit_equal)
- goto exit_free;
- entry->rule.arch_f = f;
- break;
- case AUDIT_PERM:
- if (f->val & ~15)
- goto exit_free;
- break;
- case AUDIT_FILETYPE:
- if (f->val & ~S_IFMT)
- goto exit_free;
- break;
- case AUDIT_INODE:
- err = audit_to_inode(&entry->rule, f);
- if (err)
- goto exit_free;
- break;
- }
- }
-
- if (entry->rule.inode_f && entry->rule.inode_f->op == Audit_not_equal)
- entry->rule.inode_f = NULL;
-
-exit_nofree:
- return entry;
+ switch(f->type) {
+ case AUDIT_MSGTYPE:
+ if (entry->rule.listnr != AUDIT_FILTER_TYPE &&
+ entry->rule.listnr != AUDIT_FILTER_USER)
+ return -EINVAL;
+ break;
+ };
-exit_free:
- audit_free_rule(entry);
- return ERR_PTR(err);
+ switch(f->type) {
+ default:
+ return -EINVAL;
+ case AUDIT_UID:
+ case AUDIT_EUID:
+ case AUDIT_SUID:
+ case AUDIT_FSUID:
+ case AUDIT_LOGINUID:
+ case AUDIT_OBJ_UID:
+ case AUDIT_GID:
+ case AUDIT_EGID:
+ case AUDIT_SGID:
+ case AUDIT_FSGID:
+ case AUDIT_OBJ_GID:
+ case AUDIT_PID:
+ case AUDIT_PERS:
+ case AUDIT_MSGTYPE:
+ case AUDIT_PPID:
+ case AUDIT_DEVMAJOR:
+ case AUDIT_DEVMINOR:
+ case AUDIT_EXIT:
+ case AUDIT_SUCCESS:
+ /* bit ops are only useful on syscall args */
+ if (f->op == Audit_bitmask || f->op == Audit_bittest)
+ return -EINVAL;
+ break;
+ case AUDIT_ARG0:
+ case AUDIT_ARG1:
+ case AUDIT_ARG2:
+ case AUDIT_ARG3:
+ case AUDIT_SUBJ_USER:
+ case AUDIT_SUBJ_ROLE:
+ case AUDIT_SUBJ_TYPE:
+ case AUDIT_SUBJ_SEN:
+ case AUDIT_SUBJ_CLR:
+ case AUDIT_OBJ_USER:
+ case AUDIT_OBJ_ROLE:
+ case AUDIT_OBJ_TYPE:
+ case AUDIT_OBJ_LEV_LOW:
+ case AUDIT_OBJ_LEV_HIGH:
+ case AUDIT_WATCH:
+ case AUDIT_DIR:
+ case AUDIT_FILTERKEY:
+ break;
+ case AUDIT_LOGINUID_SET:
+ if ((f->val != 0) && (f->val != 1))
+ return -EINVAL;
+ /* FALL THROUGH */
+ case AUDIT_ARCH:
+ if (f->op != Audit_not_equal && f->op != Audit_equal)
+ return -EINVAL;
+ break;
+ case AUDIT_PERM:
+ if (f->val & ~15)
+ return -EINVAL;
+ break;
+ case AUDIT_FILETYPE:
+ if (f->val & ~S_IFMT)
+ return -EINVAL;
+ break;
+ case AUDIT_FIELD_COMPARE:
+ if (f->val > AUDIT_MAX_FIELD_COMPARE)
+ return -EINVAL;
+ break;
+ };
+ return 0;
}
/* Translate struct audit_rule_data to kernel's rule respresentation. */
@@ -459,17 +421,25 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
f->gid = INVALID_GID;
f->lsm_str = NULL;
f->lsm_rule = NULL;
- switch(f->type) {
+
+ /* Support legacy tests for a valid loginuid */
+ if ((f->type == AUDIT_LOGINUID) && (f->val == 4294967295)) {
+ f->type = AUDIT_LOGINUID_SET;
+ f->val = 0;
+ }
+
+ err = audit_field_valid(entry, f);
+ if (err)
+ goto exit_free;
+
+ err = -EINVAL;
+ switch (f->type) {
+ case AUDIT_LOGINUID:
case AUDIT_UID:
case AUDIT_EUID:
case AUDIT_SUID:
case AUDIT_FSUID:
- case AUDIT_LOGINUID:
case AUDIT_OBJ_UID:
- /* bit ops not implemented for uid comparisons */
- if (f->op == Audit_bitmask || f->op == Audit_bittest)
- goto exit_free;
-
f->uid = make_kuid(current_user_ns(), f->val);
if (!uid_valid(f->uid))
goto exit_free;
@@ -479,27 +449,10 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
case AUDIT_SGID:
case AUDIT_FSGID:
case AUDIT_OBJ_GID:
- /* bit ops not implemented for gid comparisons */
- if (f->op == Audit_bitmask || f->op == Audit_bittest)
- goto exit_free;
-
f->gid = make_kgid(current_user_ns(), f->val);
if (!gid_valid(f->gid))
goto exit_free;
break;
- case AUDIT_PID:
- case AUDIT_PERS:
- case AUDIT_MSGTYPE:
- case AUDIT_PPID:
- case AUDIT_DEVMAJOR:
- case AUDIT_DEVMINOR:
- case AUDIT_EXIT:
- case AUDIT_SUCCESS:
- case AUDIT_ARG0:
- case AUDIT_ARG1:
- case AUDIT_ARG2:
- case AUDIT_ARG3:
- break;
case AUDIT_ARCH:
entry->rule.arch_f = f;
break;
@@ -570,20 +523,6 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
entry->rule.buflen += f->val;
entry->rule.filterkey = str;
break;
- case AUDIT_PERM:
- if (f->val & ~15)
- goto exit_free;
- break;
- case AUDIT_FILETYPE:
- if (f->val & ~S_IFMT)
- goto exit_free;
- break;
- case AUDIT_FIELD_COMPARE:
- if (f->val > AUDIT_MAX_FIELD_COMPARE)
- goto exit_free;
- break;
- default:
- goto exit_free;
}
}
@@ -613,36 +552,6 @@ static inline size_t audit_pack_string(void **bufp, const char *str)
return len;
}
-/* Translate kernel rule respresentation to struct audit_rule.
- * Exists for backward compatibility with userspace. */
-static struct audit_rule *audit_krule_to_rule(struct audit_krule *krule)
-{
- struct audit_rule *rule;
- int i;
-
- rule = kzalloc(sizeof(*rule), GFP_KERNEL);
- if (unlikely(!rule))
- return NULL;
-
- rule->flags = krule->flags | krule->listnr;
- rule->action = krule->action;
- rule->field_count = krule->field_count;
- for (i = 0; i < rule->field_count; i++) {
- rule->values[i] = krule->fields[i].val;
- rule->fields[i] = krule->fields[i].type;
-
- if (krule->vers_ops == 1) {
- if (krule->fields[i].op == Audit_not_equal)
- rule->fields[i] |= AUDIT_NEGATE;
- } else {
- rule->fields[i] |= audit_ops[krule->fields[i].op];
- }
- }
- for (i = 0; i < AUDIT_BITMASK_SIZE; i++) rule->mask[i] = krule->mask[i];
-
- return rule;
-}
-
/* Translate kernel rule respresentation to struct audit_rule_data. */
static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
{
@@ -1055,35 +964,6 @@ out:
return ret;
}
-/* List rules using struct audit_rule. Exists for backward
- * compatibility with userspace. */
-static void audit_list(int pid, int seq, struct sk_buff_head *q)
-{
- struct sk_buff *skb;
- struct audit_krule *r;
- int i;
-
- /* This is a blocking read, so use audit_filter_mutex instead of rcu
- * iterator to sync with list writers. */
- for (i=0; i<AUDIT_NR_FILTERS; i++) {
- list_for_each_entry(r, &audit_rules_list[i], list) {
- struct audit_rule *rule;
-
- rule = audit_krule_to_rule(r);
- if (unlikely(!rule))
- break;
- skb = audit_make_reply(pid, seq, AUDIT_LIST, 0, 1,
- rule, sizeof(*rule));
- if (skb)
- skb_queue_tail(q, skb);
- kfree(rule);
- }
- }
- skb = audit_make_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0);
- if (skb)
- skb_queue_tail(q, skb);
-}
-
/* List rules using struct audit_rule_data. */
static void audit_list_rules(int pid, int seq, struct sk_buff_head *q)
{
@@ -1113,11 +993,11 @@ static void audit_list_rules(int pid, int seq, struct sk_buff_head *q)
}
/* Log rule additions and removals */
-static void audit_log_rule_change(kuid_t loginuid, u32 sessionid, u32 sid,
- char *action, struct audit_krule *rule,
- int res)
+static void audit_log_rule_change(char *action, struct audit_krule *rule, int res)
{
struct audit_buffer *ab;
+ uid_t loginuid = from_kuid(&init_user_ns, audit_get_loginuid(current));
+ u32 sessionid = audit_get_sessionid(current);
if (!audit_enabled)
return;
@@ -1125,18 +1005,8 @@ static void audit_log_rule_change(kuid_t loginuid, u32 sessionid, u32 sid,
ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
if (!ab)
return;
- audit_log_format(ab, "auid=%u ses=%u",
- from_kuid(&init_user_ns, loginuid), sessionid);
- if (sid) {
- char *ctx = NULL;
- u32 len;
- if (security_secid_to_secctx(sid, &ctx, &len))
- audit_log_format(ab, " ssid=%u", sid);
- else {
- audit_log_format(ab, " subj=%s", ctx);
- security_release_secctx(ctx, len);
- }
- }
+ audit_log_format(ab, "auid=%u ses=%u" ,loginuid, sessionid);
+ audit_log_task_context(ab);
audit_log_format(ab, " op=");
audit_log_string(ab, action);
audit_log_key(ab, rule->filterkey);
@@ -1155,8 +1025,7 @@ static void audit_log_rule_change(kuid_t loginuid, u32 sessionid, u32 sid,
* @sessionid: sessionid for netlink audit message
* @sid: SE Linux Security ID of sender
*/
-int audit_receive_filter(int type, int pid, int seq, void *data,
- size_t datasz, kuid_t loginuid, u32 sessionid, u32 sid)
+int audit_receive_filter(int type, int pid, int seq, void *data, size_t datasz)
{
struct task_struct *tsk;
struct audit_netlink_list *dest;
@@ -1164,7 +1033,6 @@ int audit_receive_filter(int type, int pid, int seq, void *data,
struct audit_entry *entry;
switch (type) {
- case AUDIT_LIST:
case AUDIT_LIST_RULES:
/* We can't just spew out the rules here because we might fill
* the available socket buffer space and deadlock waiting for
@@ -1179,10 +1047,7 @@ int audit_receive_filter(int type, int pid, int seq, void *data,
skb_queue_head_init(&dest->q);
mutex_lock(&audit_filter_mutex);
- if (type == AUDIT_LIST)
- audit_list(pid, seq, &dest->q);
- else
- audit_list_rules(pid, seq, &dest->q);
+ audit_list_rules(pid, seq, &dest->q);
mutex_unlock(&audit_filter_mutex);
tsk = kthread_run(audit_send_list, dest, "audit_send_list");
@@ -1192,35 +1057,23 @@ int audit_receive_filter(int type, int pid, int seq, void *data,
err = PTR_ERR(tsk);
}
break;
- case AUDIT_ADD:
case AUDIT_ADD_RULE:
- if (type == AUDIT_ADD)
- entry = audit_rule_to_entry(data);
- else
- entry = audit_data_to_entry(data, datasz);
+ entry = audit_data_to_entry(data, datasz);
if (IS_ERR(entry))
return PTR_ERR(entry);
err = audit_add_rule(entry);
- audit_log_rule_change(loginuid, sessionid, sid, "add rule",
- &entry->rule, !err);
-
+ audit_log_rule_change("add rule", &entry->rule, !err);
if (err)
audit_free_rule(entry);
break;
- case AUDIT_DEL:
case AUDIT_DEL_RULE:
- if (type == AUDIT_DEL)
- entry = audit_rule_to_entry(data);
- else
- entry = audit_data_to_entry(data, datasz);
+ entry = audit_data_to_entry(data, datasz);
if (IS_ERR(entry))
return PTR_ERR(entry);
err = audit_del_rule(entry);
- audit_log_rule_change(loginuid, sessionid, sid, "remove rule",
- &entry->rule, !err);
-
+ audit_log_rule_change("remove rule", &entry->rule, !err);
audit_free_rule(entry);
break;
default:
@@ -1358,7 +1211,7 @@ int audit_compare_dname_path(const char *dname, const char *path, int parentlen)
return strncmp(p, dname, dlen);
}
-static int audit_filter_user_rules(struct audit_krule *rule,
+static int audit_filter_user_rules(struct audit_krule *rule, int type,
enum audit_state *state)
{
int i;
@@ -1382,6 +1235,13 @@ static int audit_filter_user_rules(struct audit_krule *rule,
result = audit_uid_comparator(audit_get_loginuid(current),
f->op, f->uid);
break;
+ case AUDIT_LOGINUID_SET:
+ result = audit_comparator(audit_loginuid_set(current),
+ f->op, f->val);
+ break;
+ case AUDIT_MSGTYPE:
+ result = audit_comparator(type, f->op, f->val);
+ break;
case AUDIT_SUBJ_USER:
case AUDIT_SUBJ_ROLE:
case AUDIT_SUBJ_TYPE:
@@ -1408,7 +1268,7 @@ static int audit_filter_user_rules(struct audit_krule *rule,
return 1;
}
-int audit_filter_user(void)
+int audit_filter_user(int type)
{
enum audit_state state = AUDIT_DISABLED;
struct audit_entry *e;
@@ -1416,7 +1276,7 @@ int audit_filter_user(void)
rcu_read_lock();
list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) {
- if (audit_filter_user_rules(&e->rule, &state)) {
+ if (audit_filter_user_rules(&e->rule, type, &state)) {
if (state == AUDIT_DISABLED)
ret = 0;
break;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index c68229411a7..3c8a601324a 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -76,11 +76,6 @@
#define AUDITSC_SUCCESS 1
#define AUDITSC_FAILURE 2
-/* AUDIT_NAMES is the number of slots we reserve in the audit_context
- * for saving names from getname(). If we get more names we will allocate
- * a name dynamically and also add those to the list anchored by names_list. */
-#define AUDIT_NAMES 5
-
/* no execve audit message should be longer than this (userspace limits) */
#define MAX_EXECVE_AUDIT_LEN 7500
@@ -90,44 +85,6 @@ int audit_n_rules;
/* determines whether we collect data for signals sent */
int audit_signals;
-struct audit_cap_data {
- kernel_cap_t permitted;
- kernel_cap_t inheritable;
- union {
- unsigned int fE; /* effective bit of a file capability */
- kernel_cap_t effective; /* effective set of a process */
- };
-};
-
-/* When fs/namei.c:getname() is called, we store the pointer in name and
- * we don't let putname() free it (instead we free all of the saved
- * pointers at syscall exit time).
- *
- * Further, in fs/namei.c:path_lookup() we store the inode and device.
- */
-struct audit_names {
- struct list_head list; /* audit_context->names_list */
- struct filename *name;
- unsigned long ino;
- dev_t dev;
- umode_t mode;
- kuid_t uid;
- kgid_t gid;
- dev_t rdev;
- u32 osid;
- struct audit_cap_data fcap;
- unsigned int fcap_ver;
- int name_len; /* number of name's characters to log */
- unsigned char type; /* record type */
- bool name_put; /* call __putname() for this name */
- /*
- * This was an allocated audit_names and not from the array of
- * names allocated in the task audit context. Thus this name
- * should be freed on syscall exit
- */
- bool should_free;
-};
-
struct audit_aux_data {
struct audit_aux_data *next;
int type;
@@ -175,106 +132,6 @@ struct audit_tree_refs {
struct audit_chunk *c[31];
};
-/* The per-task audit context. */
-struct audit_context {
- int dummy; /* must be the first element */
- int in_syscall; /* 1 if task is in a syscall */
- enum audit_state state, current_state;
- unsigned int serial; /* serial number for record */
- int major; /* syscall number */
- struct timespec ctime; /* time of syscall entry */
- unsigned long argv[4]; /* syscall arguments */
- long return_code;/* syscall return code */
- u64 prio;
- int return_valid; /* return code is valid */
- /*
- * The names_list is the list of all audit_names collected during this
- * syscall. The first AUDIT_NAMES entries in the names_list will
- * actually be from the preallocated_names array for performance
- * reasons. Except during allocation they should never be referenced
- * through the preallocated_names array and should only be found/used
- * by running the names_list.
- */
- struct audit_names preallocated_names[AUDIT_NAMES];
- int name_count; /* total records in names_list */
- struct list_head names_list; /* anchor for struct audit_names->list */
- char * filterkey; /* key for rule that triggered record */
- struct path pwd;
- struct audit_aux_data *aux;
- struct audit_aux_data *aux_pids;
- struct sockaddr_storage *sockaddr;
- size_t sockaddr_len;
- /* Save things to print about task_struct */
- pid_t pid, ppid;
- kuid_t uid, euid, suid, fsuid;
- kgid_t gid, egid, sgid, fsgid;
- unsigned long personality;
- int arch;
-
- pid_t target_pid;
- kuid_t target_auid;
- kuid_t target_uid;
- unsigned int target_sessionid;
- u32 target_sid;
- char target_comm[TASK_COMM_LEN];
-
- struct audit_tree_refs *trees, *first_trees;
- struct list_head killed_trees;
- int tree_count;
-
- int type;
- union {
- struct {
- int nargs;
- long args[6];
- } socketcall;
- struct {
- kuid_t uid;
- kgid_t gid;
- umode_t mode;
- u32 osid;
- int has_perm;
- uid_t perm_uid;
- gid_t perm_gid;
- umode_t perm_mode;
- unsigned long qbytes;
- } ipc;
- struct {
- mqd_t mqdes;
- struct mq_attr mqstat;
- } mq_getsetattr;
- struct {
- mqd_t mqdes;
- int sigev_signo;
- } mq_notify;
- struct {
- mqd_t mqdes;
- size_t msg_len;
- unsigned int msg_prio;
- struct timespec abs_timeout;
- } mq_sendrecv;
- struct {
- int oflag;
- umode_t mode;
- struct mq_attr attr;
- } mq_open;
- struct {
- pid_t pid;
- struct audit_cap_data cap;
- } capset;
- struct {
- int fd;
- int flags;
- } mmap;
- };
- int fds[2];
-
-#if AUDIT_DEBUG
- int put_count;
- int ino_count;
-#endif
-};
-
static inline int open_arg(int flags, int mask)
{
int n = ACC_MODE(flags);
@@ -633,9 +490,23 @@ static int audit_filter_rules(struct task_struct *tsk,
break;
case AUDIT_GID:
result = audit_gid_comparator(cred->gid, f->op, f->gid);
+ if (f->op == Audit_equal) {
+ if (!result)
+ result = in_group_p(f->gid);
+ } else if (f->op == Audit_not_equal) {
+ if (result)
+ result = !in_group_p(f->gid);
+ }
break;
case AUDIT_EGID:
result = audit_gid_comparator(cred->egid, f->op, f->gid);
+ if (f->op == Audit_equal) {
+ if (!result)
+ result = in_egroup_p(f->gid);
+ } else if (f->op == Audit_not_equal) {
+ if (result)
+ result = !in_egroup_p(f->gid);
+ }
break;
case AUDIT_SGID:
result = audit_gid_comparator(cred->sgid, f->op, f->gid);
@@ -742,6 +613,9 @@ static int audit_filter_rules(struct task_struct *tsk,
if (ctx)
result = audit_uid_comparator(tsk->loginuid, f->op, f->uid);
break;
+ case AUDIT_LOGINUID_SET:
+ result = audit_comparator(audit_loginuid_set(tsk), f->op, f->val);
+ break;
case AUDIT_SUBJ_USER:
case AUDIT_SUBJ_ROLE:
case AUDIT_SUBJ_TYPE:
@@ -987,6 +861,8 @@ static inline void audit_free_names(struct audit_context *context)
#if AUDIT_DEBUG == 2
if (context->put_count + context->ino_count != context->name_count) {
+ int i = 0;
+
printk(KERN_ERR "%s:%d(:%d): major=%d in_syscall=%d"
" name_count=%d put_count=%d"
" ino_count=%d [NOT freeing]\n",
@@ -995,7 +871,7 @@ static inline void audit_free_names(struct audit_context *context)
context->name_count, context->put_count,
context->ino_count);
list_for_each_entry(n, &context->names_list, list) {
- printk(KERN_ERR "names[%d] = %p = %s\n", i,
+ printk(KERN_ERR "names[%d] = %p = %s\n", i++,
n->name, n->name->name ?: "(null)");
}
dump_stack();
@@ -1010,7 +886,7 @@ static inline void audit_free_names(struct audit_context *context)
list_for_each_entry_safe(n, next, &context->names_list, list) {
list_del(&n->list);
if (n->name && n->name_put)
- __putname(n->name);
+ final_putname(n->name);
if (n->should_free)
kfree(n);
}
@@ -1093,88 +969,6 @@ static inline void audit_free_context(struct audit_context *context)
kfree(context);
}
-void audit_log_task_context(struct audit_buffer *ab)
-{
- char *ctx = NULL;
- unsigned len;
- int error;
- u32 sid;
-
- security_task_getsecid(current, &sid);
- if (!sid)
- return;
-
- error = security_secid_to_secctx(sid, &ctx, &len);
- if (error) {
- if (error != -EINVAL)
- goto error_path;
- return;
- }
-
- audit_log_format(ab, " subj=%s", ctx);
- security_release_secctx(ctx, len);
- return;
-
-error_path:
- audit_panic("error in audit_log_task_context");
- return;
-}
-
-EXPORT_SYMBOL(audit_log_task_context);
-
-void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
-{
- const struct cred *cred;
- char name[sizeof(tsk->comm)];
- struct mm_struct *mm = tsk->mm;
- char *tty;
-
- if (!ab)
- return;
-
- /* tsk == current */
- cred = current_cred();
-
- spin_lock_irq(&tsk->sighand->siglock);
- if (tsk->signal && tsk->signal->tty)
- tty = tsk->signal->tty->name;
- else
- tty = "(none)";
- spin_unlock_irq(&tsk->sighand->siglock);
-
-
- audit_log_format(ab,
- " ppid=%ld pid=%d auid=%u uid=%u gid=%u"
- " euid=%u suid=%u fsuid=%u"
- " egid=%u sgid=%u fsgid=%u ses=%u tty=%s",
- sys_getppid(),
- tsk->pid,
- from_kuid(&init_user_ns, tsk->loginuid),
- from_kuid(&init_user_ns, cred->uid),
- from_kgid(&init_user_ns, cred->gid),
- from_kuid(&init_user_ns, cred->euid),
- from_kuid(&init_user_ns, cred->suid),
- from_kuid(&init_user_ns, cred->fsuid),
- from_kgid(&init_user_ns, cred->egid),
- from_kgid(&init_user_ns, cred->sgid),
- from_kgid(&init_user_ns, cred->fsgid),
- tsk->sessionid, tty);
-
- get_task_comm(name, tsk);
- audit_log_format(ab, " comm=");
- audit_log_untrustedstring(ab, name);
-
- if (mm) {
- down_read(&mm->mmap_sem);
- if (mm->exe_file)
- audit_log_d_path(ab, " exe=", &mm->exe_file->f_path);
- up_read(&mm->mmap_sem);
- }
- audit_log_task_context(ab);
-}
-
-EXPORT_SYMBOL(audit_log_task_info);
-
static int audit_log_pid_context(struct audit_context *context, pid_t pid,
kuid_t auid, kuid_t uid, unsigned int sessionid,
u32 sid, char *comm)
@@ -1191,12 +985,14 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid,
audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid,
from_kuid(&init_user_ns, auid),
from_kuid(&init_user_ns, uid), sessionid);
- if (security_secid_to_secctx(sid, &ctx, &len)) {
- audit_log_format(ab, " obj=(none)");
- rc = 1;
- } else {
- audit_log_format(ab, " obj=%s", ctx);
- security_release_secctx(ctx, len);
+ if (sid) {
+ if (security_secid_to_secctx(sid, &ctx, &len)) {
+ audit_log_format(ab, " obj=(none)");
+ rc = 1;
+ } else {
+ audit_log_format(ab, " obj=%s", ctx);
+ security_release_secctx(ctx, len);
+ }
}
audit_log_format(ab, " ocomm=");
audit_log_untrustedstring(ab, comm);
@@ -1390,35 +1186,6 @@ static void audit_log_execve_info(struct audit_context *context,
kfree(buf);
}
-static void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap)
-{
- int i;
-
- audit_log_format(ab, " %s=", prefix);
- CAP_FOR_EACH_U32(i) {
- audit_log_format(ab, "%08x", cap->cap[(_KERNEL_CAPABILITY_U32S-1) - i]);
- }
-}
-
-static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name)
-{
- kernel_cap_t *perm = &name->fcap.permitted;
- kernel_cap_t *inh = &name->fcap.inheritable;
- int log = 0;
-
- if (!cap_isclear(*perm)) {
- audit_log_cap(ab, "cap_fp", perm);
- log = 1;
- }
- if (!cap_isclear(*inh)) {
- audit_log_cap(ab, "cap_fi", inh);
- log = 1;
- }
-
- if (log)
- audit_log_format(ab, " cap_fe=%d cap_fver=%x", name->fcap.fE, name->fcap_ver);
-}
-
static void show_special(struct audit_context *context, int *call_panic)
{
struct audit_buffer *ab;
@@ -1516,68 +1283,6 @@ static void show_special(struct audit_context *context, int *call_panic)
audit_log_end(ab);
}
-static void audit_log_name(struct audit_context *context, struct audit_names *n,
- int record_num, int *call_panic)
-{
- struct audit_buffer *ab;
- ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH);
- if (!ab)
- return; /* audit_panic has been called */
-
- audit_log_format(ab, "item=%d", record_num);
-
- if (n->name) {
- switch (n->name_len) {
- case AUDIT_NAME_FULL:
- /* log the full path */
- audit_log_format(ab, " name=");
- audit_log_untrustedstring(ab, n->name->name);
- break;
- case 0:
- /* name was specified as a relative path and the
- * directory component is the cwd */
- audit_log_d_path(ab, " name=", &context->pwd);
- break;
- default:
- /* log the name's directory component */
- audit_log_format(ab, " name=");
- audit_log_n_untrustedstring(ab, n->name->name,
- n->name_len);
- }
- } else
- audit_log_format(ab, " name=(null)");
-
- if (n->ino != (unsigned long)-1) {
- audit_log_format(ab, " inode=%lu"
- " dev=%02x:%02x mode=%#ho"
- " ouid=%u ogid=%u rdev=%02x:%02x",
- n->ino,
- MAJOR(n->dev),
- MINOR(n->dev),
- n->mode,
- from_kuid(&init_user_ns, n->uid),
- from_kgid(&init_user_ns, n->gid),
- MAJOR(n->rdev),
- MINOR(n->rdev));
- }
- if (n->osid != 0) {
- char *ctx = NULL;
- u32 len;
- if (security_secid_to_secctx(
- n->osid, &ctx, &len)) {
- audit_log_format(ab, " osid=%u", n->osid);
- *call_panic = 2;
- } else {
- audit_log_format(ab, " obj=%s", ctx);
- security_release_secctx(ctx, len);
- }
- }
-
- audit_log_fcaps(ab, n);
-
- audit_log_end(ab);
-}
-
static void audit_log_exit(struct audit_context *context, struct task_struct *tsk)
{
int i, call_panic = 0;
@@ -1695,7 +1400,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
i = 0;
list_for_each_entry(n, &context->names_list, list)
- audit_log_name(context, n, i++, &call_panic);
+ audit_log_name(context, n, NULL, i++, &call_panic);
/* Send end of event record to help user space know we are finished */
ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE);
@@ -2030,18 +1735,18 @@ void audit_putname(struct filename *name)
BUG_ON(!context);
if (!context->in_syscall) {
#if AUDIT_DEBUG == 2
- printk(KERN_ERR "%s:%d(:%d): __putname(%p)\n",
+ printk(KERN_ERR "%s:%d(:%d): final_putname(%p)\n",
__FILE__, __LINE__, context->serial, name);
if (context->name_count) {
struct audit_names *n;
- int i;
+ int i = 0;
list_for_each_entry(n, &context->names_list, list)
- printk(KERN_ERR "name[%d] = %p = %s\n", i,
+ printk(KERN_ERR "name[%d] = %p = %s\n", i++,
n->name, n->name->name ?: "(null)");
}
#endif
- __putname(name);
+ final_putname(name);
}
#if AUDIT_DEBUG
else {
@@ -2060,41 +1765,6 @@ void audit_putname(struct filename *name)
#endif
}
-static inline int audit_copy_fcaps(struct audit_names *name, const struct dentry *dentry)
-{
- struct cpu_vfs_cap_data caps;
- int rc;
-
- if (!dentry)
- return 0;
-
- rc = get_vfs_caps_from_disk(dentry, &caps);
- if (rc)
- return rc;
-
- name->fcap.permitted = caps.permitted;
- name->fcap.inheritable = caps.inheritable;
- name->fcap.fE = !!(caps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE);
- name->fcap_ver = (caps.magic_etc & VFS_CAP_REVISION_MASK) >> VFS_CAP_REVISION_SHIFT;
-
- return 0;
-}
-
-
-/* Copy inode data into an audit_names. */
-static void audit_copy_inode(struct audit_names *name, const struct dentry *dentry,
- const struct inode *inode)
-{
- name->ino = inode->i_ino;
- name->dev = inode->i_sb->s_dev;
- name->mode = inode->i_mode;
- name->uid = inode->i_uid;
- name->gid = inode->i_gid;
- name->rdev = inode->i_rdev;
- security_inode_getsecid(inode, &name->osid);
- audit_copy_fcaps(name, dentry);
-}
-
/**
* __audit_inode - store the inode and device from a lookup
* @name: name being audited
@@ -2303,7 +1973,7 @@ int audit_set_loginuid(kuid_t loginuid)
unsigned int sessionid;
#ifdef CONFIG_AUDIT_LOGINUID_IMMUTABLE
- if (uid_valid(task->loginuid))
+ if (audit_loginuid_set(task))
return -EPERM;
#else /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */
if (!capable(CAP_AUDIT_CONTROL))
@@ -2471,17 +2141,20 @@ int __audit_bprm(struct linux_binprm *bprm)
/**
* audit_socketcall - record audit data for sys_socketcall
- * @nargs: number of args
+ * @nargs: number of args, which should not be more than AUDITSC_ARGS.
* @args: args array
*
*/
-void __audit_socketcall(int nargs, unsigned long *args)
+int __audit_socketcall(int nargs, unsigned long *args)
{
struct audit_context *context = current->audit_context;
+ if (nargs <= 0 || nargs > AUDITSC_ARGS || !args)
+ return -EINVAL;
context->type = AUDIT_SOCKETCALL;
context->socketcall.nargs = nargs;
memcpy(context->socketcall.args, args, nargs * sizeof(unsigned long));
+ return 0;
}
/**
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index eeb7e49946b..2a9926275f8 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4380,7 +4380,6 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
* need to invoke fork callbacks here. */
BUG_ON(!list_empty(&init_task.tasks));
- ss->active = 1;
BUG_ON(online_css(ss, dummytop));
mutex_unlock(&cgroup_mutex);
@@ -4485,7 +4484,6 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
}
write_unlock(&css_set_lock);
- ss->active = 1;
ret = online_css(ss, dummytop);
if (ret)
goto err_unload;
@@ -4526,7 +4524,6 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
mutex_lock(&cgroup_mutex);
offline_css(ss, dummytop);
- ss->active = 0;
if (ss->use_id)
idr_destroy(&ss->idr);
@@ -4681,7 +4678,7 @@ out:
*/
/* TODO: Use a proper seq_file iterator */
-static int proc_cgroup_show(struct seq_file *m, void *v)
+int proc_cgroup_show(struct seq_file *m, void *v)
{
struct pid *pid;
struct task_struct *tsk;
@@ -4733,19 +4730,6 @@ out:
return retval;
}
-static int cgroup_open(struct inode *inode, struct file *file)
-{
- struct pid *pid = PROC_I(inode)->pid;
- return single_open(file, proc_cgroup_show, pid);
-}
-
-const struct file_operations proc_cgroup_operations = {
- .open = cgroup_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
/* Display information about each subsystem and each hierarchy */
static int proc_cgroupstats_show(struct seq_file *m, void *v)
{
diff --git a/kernel/compat.c b/kernel/compat.c
index 19971d8c729..0a09e481b70 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -516,25 +516,6 @@ int put_compat_rusage(const struct rusage *r, struct compat_rusage __user *ru)
return 0;
}
-asmlinkage long compat_sys_getrusage(int who, struct compat_rusage __user *ru)
-{
- struct rusage r;
- int ret;
- mm_segment_t old_fs = get_fs();
-
- set_fs(KERNEL_DS);
- ret = sys_getrusage(who, (struct rusage __user *) &r);
- set_fs(old_fs);
-
- if (ret)
- return ret;
-
- if (put_compat_rusage(&r, ru))
- return -EFAULT;
-
- return 0;
-}
-
COMPAT_SYSCALL_DEFINE4(wait4,
compat_pid_t, pid,
compat_uint_t __user *, stat_addr,
@@ -1138,71 +1119,6 @@ asmlinkage long compat_sys_migrate_pages(compat_pid_t pid,
}
#endif
-struct compat_sysinfo {
- s32 uptime;
- u32 loads[3];
- u32 totalram;
- u32 freeram;
- u32 sharedram;
- u32 bufferram;
- u32 totalswap;
- u32 freeswap;
- u16 procs;
- u16 pad;
- u32 totalhigh;
- u32 freehigh;
- u32 mem_unit;
- char _f[20-2*sizeof(u32)-sizeof(int)];
-};
-
-asmlinkage long
-compat_sys_sysinfo(struct compat_sysinfo __user *info)
-{
- struct sysinfo s;
-
- do_sysinfo(&s);
-
- /* Check to see if any memory value is too large for 32-bit and scale
- * down if needed
- */
- if ((s.totalram >> 32) || (s.totalswap >> 32)) {
- int bitcount = 0;
-
- while (s.mem_unit < PAGE_SIZE) {
- s.mem_unit <<= 1;
- bitcount++;
- }
-
- s.totalram >>= bitcount;
- s.freeram >>= bitcount;
- s.sharedram >>= bitcount;
- s.bufferram >>= bitcount;
- s.totalswap >>= bitcount;
- s.freeswap >>= bitcount;
- s.totalhigh >>= bitcount;
- s.freehigh >>= bitcount;
- }
-
- if (!access_ok(VERIFY_WRITE, info, sizeof(struct compat_sysinfo)) ||
- __put_user (s.uptime, &info->uptime) ||
- __put_user (s.loads[0], &info->loads[0]) ||
- __put_user (s.loads[1], &info->loads[1]) ||
- __put_user (s.loads[2], &info->loads[2]) ||
- __put_user (s.totalram, &info->totalram) ||
- __put_user (s.freeram, &info->freeram) ||
- __put_user (s.sharedram, &info->sharedram) ||
- __put_user (s.bufferram, &info->bufferram) ||
- __put_user (s.totalswap, &info->totalswap) ||
- __put_user (s.freeswap, &info->freeswap) ||
- __put_user (s.procs, &info->procs) ||
- __put_user (s.totalhigh, &info->totalhigh) ||
- __put_user (s.freehigh, &info->freehigh) ||
- __put_user (s.mem_unit, &info->mem_unit))
- return -EFAULT;
-
- return 0;
-}
-
COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval,
compat_pid_t, pid,
struct compat_timespec __user *, interval)
diff --git a/kernel/configs.c b/kernel/configs.c
index 42e8fa075ee..c18b1f1ae51 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -79,7 +79,7 @@ static int __init ikconfig_init(void)
if (!entry)
return -ENOMEM;
- entry->size = kernel_config_data_size;
+ proc_set_size(entry, kernel_config_data_size);
return 0;
}
diff --git a/kernel/cpu/Makefile b/kernel/cpu/Makefile
new file mode 100644
index 00000000000..59ab052ef7a
--- /dev/null
+++ b/kernel/cpu/Makefile
@@ -0,0 +1 @@
+obj-y = idle.o
diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c
new file mode 100644
index 00000000000..8b86c0c68ed
--- /dev/null
+++ b/kernel/cpu/idle.c
@@ -0,0 +1,116 @@
+/*
+ * Generic entry point for the idle threads
+ */
+#include <linux/sched.h>
+#include <linux/cpu.h>
+#include <linux/tick.h>
+#include <linux/mm.h>
+
+#include <asm/tlb.h>
+
+#include <trace/events/power.h>
+
+static int __read_mostly cpu_idle_force_poll;
+
+void cpu_idle_poll_ctrl(bool enable)
+{
+ if (enable) {
+ cpu_idle_force_poll++;
+ } else {
+ cpu_idle_force_poll--;
+ WARN_ON_ONCE(cpu_idle_force_poll < 0);
+ }
+}
+
+#ifdef CONFIG_GENERIC_IDLE_POLL_SETUP
+static int __init cpu_idle_poll_setup(char *__unused)
+{
+ cpu_idle_force_poll = 1;
+ return 1;
+}
+__setup("nohlt", cpu_idle_poll_setup);
+
+static int __init cpu_idle_nopoll_setup(char *__unused)
+{
+ cpu_idle_force_poll = 0;
+ return 1;
+}
+__setup("hlt", cpu_idle_nopoll_setup);
+#endif
+
+static inline int cpu_idle_poll(void)
+{
+ trace_cpu_idle_rcuidle(0, smp_processor_id());
+ local_irq_enable();
+ while (!need_resched())
+ cpu_relax();
+ trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
+ return 1;
+}
+
+/* Weak implementations for optional arch specific functions */
+void __weak arch_cpu_idle_prepare(void) { }
+void __weak arch_cpu_idle_enter(void) { }
+void __weak arch_cpu_idle_exit(void) { }
+void __weak arch_cpu_idle_dead(void) { }
+void __weak arch_cpu_idle(void)
+{
+ cpu_idle_force_poll = 1;
+}
+
+/*
+ * Generic idle loop implementation
+ */
+static void cpu_idle_loop(void)
+{
+ while (1) {
+ tick_nohz_idle_enter();
+
+ while (!need_resched()) {
+ check_pgt_cache();
+ rmb();
+
+ if (cpu_is_offline(smp_processor_id()))
+ arch_cpu_idle_dead();
+
+ local_irq_disable();
+ arch_cpu_idle_enter();
+
+ /*
+ * In poll mode we reenable interrupts and spin.
+ *
+ * Also if we detected in the wakeup from idle
+ * path that the tick broadcast device expired
+ * for us, we don't want to go deep idle as we
+ * know that the IPI is going to arrive right
+ * away
+ */
+ if (cpu_idle_force_poll || tick_check_broadcast_expired()) {
+ cpu_idle_poll();
+ } else {
+ current_clr_polling();
+ if (!need_resched()) {
+ stop_critical_timings();
+ rcu_idle_enter();
+ arch_cpu_idle();
+ WARN_ON_ONCE(irqs_disabled());
+ rcu_idle_exit();
+ start_critical_timings();
+ } else {
+ local_irq_enable();
+ }
+ current_set_polling();
+ }
+ arch_cpu_idle_exit();
+ }
+ tick_nohz_idle_exit();
+ schedule_preempt_disabled();
+ }
+}
+
+void cpu_startup_entry(enum cpuhp_state state)
+{
+ current_set_polling();
+ arch_cpu_idle_prepare();
+ cpu_idle_loop();
+}
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 12331120767..64b3f791bbe 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2609,7 +2609,7 @@ void __cpuset_memory_pressure_bump(void)
* and we take cpuset_mutex, keeping cpuset_attach() from changing it
* anyway.
*/
-static int proc_cpuset_show(struct seq_file *m, void *unused_v)
+int proc_cpuset_show(struct seq_file *m, void *unused_v)
{
struct pid *pid;
struct task_struct *tsk;
@@ -2643,19 +2643,6 @@ out_free:
out:
return retval;
}
-
-static int cpuset_open(struct inode *inode, struct file *file)
-{
- struct pid *pid = PROC_I(inode)->pid;
- return single_open(file, proc_cpuset_show, pid);
-}
-
-const struct file_operations proc_cpuset_operations = {
- .open = cpuset_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
#endif /* CONFIG_PROC_PID_CPUSET */
/* Display task mems_allowed in /proc/<pid>/status file. */
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index c26278fd485..0506d447aed 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -775,7 +775,7 @@ static void sysrq_handle_dbg(int key)
static struct sysrq_key_op sysrq_dbg_op = {
.handler = sysrq_handle_dbg,
- .help_msg = "debug(G)",
+ .help_msg = "debug(g)",
.action_msg = "DEBUG",
};
#endif
diff --git a/kernel/events/core.c b/kernel/events/core.c
index dce6e13cf9d..6b41c1899a8 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -18,6 +18,7 @@
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/hash.h>
+#include <linux/tick.h>
#include <linux/sysfs.h>
#include <linux/dcache.h>
#include <linux/percpu.h>
@@ -37,6 +38,7 @@
#include <linux/ftrace_event.h>
#include <linux/hw_breakpoint.h>
#include <linux/mm_types.h>
+#include <linux/cgroup.h>
#include "internal.h"
@@ -234,6 +236,20 @@ static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
#ifdef CONFIG_CGROUP_PERF
/*
+ * perf_cgroup_info keeps track of time_enabled for a cgroup.
+ * This is a per-cpu dynamically allocated data structure.
+ */
+struct perf_cgroup_info {
+ u64 time;
+ u64 timestamp;
+};
+
+struct perf_cgroup {
+ struct cgroup_subsys_state css;
+ struct perf_cgroup_info __percpu *info;
+};
+
+/*
* Must ensure cgroup is pinned (css_get) before calling
* this function. In other words, we cannot call this function
* if there is no cgroup event for the current CPU context.
@@ -670,8 +686,12 @@ static void perf_pmu_rotate_start(struct pmu *pmu)
WARN_ON(!irqs_disabled());
- if (list_empty(&cpuctx->rotation_list))
+ if (list_empty(&cpuctx->rotation_list)) {
+ int was_empty = list_empty(head);
list_add(&cpuctx->rotation_list, head);
+ if (was_empty)
+ tick_nohz_full_kick();
+ }
}
static void get_ctx(struct perf_event_context *ctx)
@@ -976,9 +996,15 @@ static void perf_event__header_size(struct perf_event *event)
if (sample_type & PERF_SAMPLE_PERIOD)
size += sizeof(data->period);
+ if (sample_type & PERF_SAMPLE_WEIGHT)
+ size += sizeof(data->weight);
+
if (sample_type & PERF_SAMPLE_READ)
size += event->read_size;
+ if (sample_type & PERF_SAMPLE_DATA_SRC)
+ size += sizeof(data->data_src.val);
+
event->header_size = size;
}
@@ -2570,6 +2596,16 @@ done:
list_del_init(&cpuctx->rotation_list);
}
+#ifdef CONFIG_NO_HZ_FULL
+bool perf_event_can_stop_tick(void)
+{
+ if (list_empty(&__get_cpu_var(rotation_list)))
+ return true;
+ else
+ return false;
+}
+#endif
+
void perf_event_task_tick(void)
{
struct list_head *head = &__get_cpu_var(rotation_list);
@@ -4193,6 +4229,12 @@ void perf_output_sample(struct perf_output_handle *handle,
perf_output_sample_ustack(handle,
data->stack_user_size,
data->regs_user.regs);
+
+ if (sample_type & PERF_SAMPLE_WEIGHT)
+ perf_output_put(handle, data->weight);
+
+ if (sample_type & PERF_SAMPLE_DATA_SRC)
+ perf_output_put(handle, data->data_src.val);
}
void perf_prepare_sample(struct perf_event_header *header,
@@ -4782,6 +4824,9 @@ got_name:
mmap_event->file_name = name;
mmap_event->file_size = size;
+ if (!(vma->vm_flags & VM_EXEC))
+ mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA;
+
mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
rcu_read_lock();
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 97fddb09762..cd55144270b 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -326,11 +326,16 @@ void rb_free(struct ring_buffer *rb)
}
#else
+static int data_page_nr(struct ring_buffer *rb)
+{
+ return rb->nr_pages << page_order(rb);
+}
struct page *
perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
{
- if (pgoff > (1UL << page_order(rb)))
+ /* The '>' counts in the user page. */
+ if (pgoff > data_page_nr(rb))
return NULL;
return vmalloc_to_page((void *)rb->user_page + pgoff * PAGE_SIZE);
@@ -350,10 +355,11 @@ static void rb_free_work(struct work_struct *work)
int i, nr;
rb = container_of(work, struct ring_buffer, work);
- nr = 1 << page_order(rb);
+ nr = data_page_nr(rb);
base = rb->user_page;
- for (i = 0; i < nr + 1; i++)
+ /* The '<=' counts in the user page. */
+ for (i = 0; i <= nr; i++)
perf_mmap_unmark_page(base + (i * PAGE_SIZE));
vfree(base);
@@ -387,7 +393,7 @@ struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
rb->user_page = all_buf;
rb->data_pages[0] = all_buf + PAGE_SIZE;
rb->page_order = ilog2(nr_pages);
- rb->nr_pages = 1;
+ rb->nr_pages = !!nr_pages;
ring_buffer_init(rb, watermark, flags);
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index a567c8c7ef3..f3569747d62 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -75,6 +75,15 @@ struct uprobe {
struct arch_uprobe arch;
};
+struct return_instance {
+ struct uprobe *uprobe;
+ unsigned long func;
+ unsigned long orig_ret_vaddr; /* original return address */
+ bool chained; /* true, if instance is nested */
+
+ struct return_instance *next; /* keep as stack */
+};
+
/*
* valid_vma: Verify if the specified vma is an executable vma
* Relax restrictions while unregistering: vm_flags might have
@@ -173,10 +182,31 @@ bool __weak is_swbp_insn(uprobe_opcode_t *insn)
return *insn == UPROBE_SWBP_INSN;
}
-static void copy_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *opcode)
+/**
+ * is_trap_insn - check if instruction is breakpoint instruction.
+ * @insn: instruction to be checked.
+ * Default implementation of is_trap_insn
+ * Returns true if @insn is a breakpoint instruction.
+ *
+ * This function is needed for the case where an architecture has multiple
+ * trap instructions (like powerpc).
+ */
+bool __weak is_trap_insn(uprobe_opcode_t *insn)
+{
+ return is_swbp_insn(insn);
+}
+
+static void copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len)
{
void *kaddr = kmap_atomic(page);
- memcpy(opcode, kaddr + (vaddr & ~PAGE_MASK), UPROBE_SWBP_INSN_SIZE);
+ memcpy(dst, kaddr + (vaddr & ~PAGE_MASK), len);
+ kunmap_atomic(kaddr);
+}
+
+static void copy_to_page(struct page *page, unsigned long vaddr, const void *src, int len)
+{
+ void *kaddr = kmap_atomic(page);
+ memcpy(kaddr + (vaddr & ~PAGE_MASK), src, len);
kunmap_atomic(kaddr);
}
@@ -185,7 +215,16 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t
uprobe_opcode_t old_opcode;
bool is_swbp;
- copy_opcode(page, vaddr, &old_opcode);
+ /*
+ * Note: We only check if the old_opcode is UPROBE_SWBP_INSN here.
+ * We do not check if it is any other 'trap variant' which could
+ * be conditional trap instruction such as the one powerpc supports.
+ *
+ * The logic is that we do not care if the underlying instruction
+ * is a trap variant; uprobes always wins over any other (gdb)
+ * breakpoint.
+ */
+ copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE);
is_swbp = is_swbp_insn(&old_opcode);
if (is_swbp_insn(new_opcode)) {
@@ -204,7 +243,7 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t
* Expect the breakpoint instruction to be the smallest size instruction for
* the architecture. If an arch has variable length instruction and the
* breakpoint instruction is not of the smallest length instruction
- * supported by that architecture then we need to modify is_swbp_at_addr and
+ * supported by that architecture then we need to modify is_trap_at_addr and
* write_opcode accordingly. This would never be a problem for archs that
* have fixed length instructions.
*/
@@ -225,7 +264,6 @@ static int write_opcode(struct mm_struct *mm, unsigned long vaddr,
uprobe_opcode_t opcode)
{
struct page *old_page, *new_page;
- void *vaddr_old, *vaddr_new;
struct vm_area_struct *vma;
int ret;
@@ -246,15 +284,8 @@ retry:
__SetPageUptodate(new_page);
- /* copy the page now that we've got it stable */
- vaddr_old = kmap_atomic(old_page);
- vaddr_new = kmap_atomic(new_page);
-
- memcpy(vaddr_new, vaddr_old, PAGE_SIZE);
- memcpy(vaddr_new + (vaddr & ~PAGE_MASK), &opcode, UPROBE_SWBP_INSN_SIZE);
-
- kunmap_atomic(vaddr_new);
- kunmap_atomic(vaddr_old);
+ copy_highpage(new_page, old_page);
+ copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
ret = anon_vma_prepare(vma);
if (ret)
@@ -477,30 +508,18 @@ __copy_insn(struct address_space *mapping, struct file *filp, char *insn,
unsigned long nbytes, loff_t offset)
{
struct page *page;
- void *vaddr;
- unsigned long off;
- pgoff_t idx;
-
- if (!filp)
- return -EINVAL;
if (!mapping->a_ops->readpage)
return -EIO;
-
- idx = offset >> PAGE_CACHE_SHIFT;
- off = offset & ~PAGE_MASK;
-
/*
* Ensure that the page that has the original instruction is
* populated and in page-cache.
*/
- page = read_mapping_page(mapping, idx, filp);
+ page = read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT, filp);
if (IS_ERR(page))
return PTR_ERR(page);
- vaddr = kmap_atomic(page);
- memcpy(insn, vaddr + off, nbytes);
- kunmap_atomic(vaddr);
+ copy_from_page(page, offset, insn, nbytes);
page_cache_release(page);
return 0;
@@ -550,7 +569,7 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
goto out;
ret = -ENOTSUPP;
- if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn))
+ if (is_trap_insn((uprobe_opcode_t *)uprobe->arch.insn))
goto out;
ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr);
@@ -758,7 +777,7 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new)
down_write(&mm->mmap_sem);
vma = find_vma(mm, info->vaddr);
if (!vma || !valid_vma(vma, is_register) ||
- vma->vm_file->f_mapping->host != uprobe->inode)
+ file_inode(vma->vm_file) != uprobe->inode)
goto unlock;
if (vma->vm_start > info->vaddr ||
@@ -828,6 +847,10 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *
struct uprobe *uprobe;
int ret;
+ /* Uprobe must have at least one set consumer */
+ if (!uc->handler && !uc->ret_handler)
+ return -EINVAL;
+
/* Racy, just to catch the obvious mistakes */
if (offset > i_size_read(inode))
return -EINVAL;
@@ -917,7 +940,7 @@ static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm)
loff_t offset;
if (!valid_vma(vma, false) ||
- vma->vm_file->f_mapping->host != uprobe->inode)
+ file_inode(vma->vm_file) != uprobe->inode)
continue;
offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
@@ -1010,7 +1033,7 @@ int uprobe_mmap(struct vm_area_struct *vma)
if (no_uprobe_events() || !valid_vma(vma, true))
return 0;
- inode = vma->vm_file->f_mapping->host;
+ inode = file_inode(vma->vm_file);
if (!inode)
return 0;
@@ -1041,7 +1064,7 @@ vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long e
struct inode *inode;
struct rb_node *n;
- inode = vma->vm_file->f_mapping->host;
+ inode = file_inode(vma->vm_file);
min = vaddr_to_offset(vma, start);
max = min + (end - start) - 1;
@@ -1114,6 +1137,7 @@ static struct xol_area *get_xol_area(void)
{
struct mm_struct *mm = current->mm;
struct xol_area *area;
+ uprobe_opcode_t insn = UPROBE_SWBP_INSN;
area = mm->uprobes_state.xol_area;
if (area)
@@ -1131,7 +1155,12 @@ static struct xol_area *get_xol_area(void)
if (!area->page)
goto free_bitmap;
+ /* allocate first slot of task's xol_area for the return probes */
+ set_bit(0, area->bitmap);
+ copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE);
+ atomic_set(&area->slot_count, 1);
init_waitqueue_head(&area->wq);
+
if (!xol_add_vma(area))
return area;
@@ -1216,9 +1245,7 @@ static unsigned long xol_take_insn_slot(struct xol_area *area)
static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
{
struct xol_area *area;
- unsigned long offset;
unsigned long xol_vaddr;
- void *vaddr;
area = get_xol_area();
if (!area)
@@ -1229,10 +1256,7 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
return 0;
/* Initialize the slot */
- offset = xol_vaddr & ~PAGE_MASK;
- vaddr = kmap_atomic(area->page);
- memcpy(vaddr + offset, uprobe->arch.insn, MAX_UINSN_BYTES);
- kunmap_atomic(vaddr);
+ copy_to_page(area->page, xol_vaddr, uprobe->arch.insn, MAX_UINSN_BYTES);
/*
* We probably need flush_icache_user_range() but it needs vma.
* This should work on supported architectures too.
@@ -1298,6 +1322,7 @@ unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs)
void uprobe_free_utask(struct task_struct *t)
{
struct uprobe_task *utask = t->utask;
+ struct return_instance *ri, *tmp;
if (!utask)
return;
@@ -1305,6 +1330,15 @@ void uprobe_free_utask(struct task_struct *t)
if (utask->active_uprobe)
put_uprobe(utask->active_uprobe);
+ ri = utask->return_instances;
+ while (ri) {
+ tmp = ri;
+ ri = ri->next;
+
+ put_uprobe(tmp->uprobe);
+ kfree(tmp);
+ }
+
xol_free_insn_slot(t);
kfree(utask);
t->utask = NULL;
@@ -1333,6 +1367,93 @@ static struct uprobe_task *get_utask(void)
return current->utask;
}
+/*
+ * Current area->vaddr notion assume the trampoline address is always
+ * equal area->vaddr.
+ *
+ * Returns -1 in case the xol_area is not allocated.
+ */
+static unsigned long get_trampoline_vaddr(void)
+{
+ struct xol_area *area;
+ unsigned long trampoline_vaddr = -1;
+
+ area = current->mm->uprobes_state.xol_area;
+ smp_read_barrier_depends();
+ if (area)
+ trampoline_vaddr = area->vaddr;
+
+ return trampoline_vaddr;
+}
+
+static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
+{
+ struct return_instance *ri;
+ struct uprobe_task *utask;
+ unsigned long orig_ret_vaddr, trampoline_vaddr;
+ bool chained = false;
+
+ if (!get_xol_area())
+ return;
+
+ utask = get_utask();
+ if (!utask)
+ return;
+
+ if (utask->depth >= MAX_URETPROBE_DEPTH) {
+ printk_ratelimited(KERN_INFO "uprobe: omit uretprobe due to"
+ " nestedness limit pid/tgid=%d/%d\n",
+ current->pid, current->tgid);
+ return;
+ }
+
+ ri = kzalloc(sizeof(struct return_instance), GFP_KERNEL);
+ if (!ri)
+ goto fail;
+
+ trampoline_vaddr = get_trampoline_vaddr();
+ orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs);
+ if (orig_ret_vaddr == -1)
+ goto fail;
+
+ /*
+ * We don't want to keep trampoline address in stack, rather keep the
+ * original return address of first caller thru all the consequent
+ * instances. This also makes breakpoint unwrapping easier.
+ */
+ if (orig_ret_vaddr == trampoline_vaddr) {
+ if (!utask->return_instances) {
+ /*
+ * This situation is not possible. Likely we have an
+ * attack from user-space.
+ */
+ pr_warn("uprobe: unable to set uretprobe pid/tgid=%d/%d\n",
+ current->pid, current->tgid);
+ goto fail;
+ }
+
+ chained = true;
+ orig_ret_vaddr = utask->return_instances->orig_ret_vaddr;
+ }
+
+ atomic_inc(&uprobe->ref);
+ ri->uprobe = uprobe;
+ ri->func = instruction_pointer(regs);
+ ri->orig_ret_vaddr = orig_ret_vaddr;
+ ri->chained = chained;
+
+ utask->depth++;
+
+ /* add instance to the stack */
+ ri->next = utask->return_instances;
+ utask->return_instances = ri;
+
+ return;
+
+ fail:
+ kfree(ri);
+}
+
/* Prepare to single-step probed instruction out of line. */
static int
pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr)
@@ -1431,7 +1552,7 @@ static void mmf_recalc_uprobes(struct mm_struct *mm)
clear_bit(MMF_HAS_UPROBES, &mm->flags);
}
-static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr)
+static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
{
struct page *page;
uprobe_opcode_t opcode;
@@ -1449,10 +1570,11 @@ static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr)
if (result < 0)
return result;
- copy_opcode(page, vaddr, &opcode);
+ copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
put_page(page);
out:
- return is_swbp_insn(&opcode);
+ /* This needs to return true for any variant of the trap insn */
+ return is_trap_insn(&opcode);
}
static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
@@ -1465,14 +1587,14 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
vma = find_vma(mm, bp_vaddr);
if (vma && vma->vm_start <= bp_vaddr) {
if (valid_vma(vma, false)) {
- struct inode *inode = vma->vm_file->f_mapping->host;
+ struct inode *inode = file_inode(vma->vm_file);
loff_t offset = vaddr_to_offset(vma, bp_vaddr);
uprobe = find_uprobe(inode, offset);
}
if (!uprobe)
- *is_swbp = is_swbp_at_addr(mm, bp_vaddr);
+ *is_swbp = is_trap_at_addr(mm, bp_vaddr);
} else {
*is_swbp = -EFAULT;
}
@@ -1488,16 +1610,27 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
{
struct uprobe_consumer *uc;
int remove = UPROBE_HANDLER_REMOVE;
+ bool need_prep = false; /* prepare return uprobe, when needed */
down_read(&uprobe->register_rwsem);
for (uc = uprobe->consumers; uc; uc = uc->next) {
- int rc = uc->handler(uc, regs);
+ int rc = 0;
+
+ if (uc->handler) {
+ rc = uc->handler(uc, regs);
+ WARN(rc & ~UPROBE_HANDLER_MASK,
+ "bad rc=0x%x from %pf()\n", rc, uc->handler);
+ }
+
+ if (uc->ret_handler)
+ need_prep = true;
- WARN(rc & ~UPROBE_HANDLER_MASK,
- "bad rc=0x%x from %pf()\n", rc, uc->handler);
remove &= rc;
}
+ if (need_prep && !remove)
+ prepare_uretprobe(uprobe, regs); /* put bp at return */
+
if (remove && uprobe->consumers) {
WARN_ON(!uprobe_is_active(uprobe));
unapply_uprobe(uprobe, current->mm);
@@ -1505,6 +1638,64 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
up_read(&uprobe->register_rwsem);
}
+static void
+handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs)
+{
+ struct uprobe *uprobe = ri->uprobe;
+ struct uprobe_consumer *uc;
+
+ down_read(&uprobe->register_rwsem);
+ for (uc = uprobe->consumers; uc; uc = uc->next) {
+ if (uc->ret_handler)
+ uc->ret_handler(uc, ri->func, regs);
+ }
+ up_read(&uprobe->register_rwsem);
+}
+
+static bool handle_trampoline(struct pt_regs *regs)
+{
+ struct uprobe_task *utask;
+ struct return_instance *ri, *tmp;
+ bool chained;
+
+ utask = current->utask;
+ if (!utask)
+ return false;
+
+ ri = utask->return_instances;
+ if (!ri)
+ return false;
+
+ /*
+ * TODO: we should throw out return_instance's invalidated by
+ * longjmp(), currently we assume that the probed function always
+ * returns.
+ */
+ instruction_pointer_set(regs, ri->orig_ret_vaddr);
+
+ for (;;) {
+ handle_uretprobe_chain(ri, regs);
+
+ chained = ri->chained;
+ put_uprobe(ri->uprobe);
+
+ tmp = ri;
+ ri = ri->next;
+ kfree(tmp);
+
+ if (!chained)
+ break;
+
+ utask->depth--;
+
+ BUG_ON(!ri);
+ }
+
+ utask->return_instances = ri;
+
+ return true;
+}
+
/*
* Run handler and ask thread to singlestep.
* Ensure all non-fatal signals cannot interrupt thread while it singlesteps.
@@ -1516,8 +1707,15 @@ static void handle_swbp(struct pt_regs *regs)
int uninitialized_var(is_swbp);
bp_vaddr = uprobe_get_swbp_addr(regs);
- uprobe = find_active_uprobe(bp_vaddr, &is_swbp);
+ if (bp_vaddr == get_trampoline_vaddr()) {
+ if (handle_trampoline(regs))
+ return;
+
+ pr_warn("uprobe: unable to handle uretprobe pid/tgid=%d/%d\n",
+ current->pid, current->tgid);
+ }
+ uprobe = find_active_uprobe(bp_vaddr, &is_swbp);
if (!uprobe) {
if (is_swbp > 0) {
/* No matching uprobe; signal SIGTRAP. */
@@ -1616,7 +1814,11 @@ void uprobe_notify_resume(struct pt_regs *regs)
*/
int uprobe_pre_sstep_notifier(struct pt_regs *regs)
{
- if (!current->mm || !test_bit(MMF_HAS_UPROBES, &current->mm->flags))
+ if (!current->mm)
+ return 0;
+
+ if (!test_bit(MMF_HAS_UPROBES, &current->mm->flags) &&
+ (!current->utask || !current->utask->return_instances))
return 0;
set_thread_flag(TIF_UPROBE);
diff --git a/kernel/exit.c b/kernel/exit.c
index 60bc027c61c..af2eb3cbd49 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -847,7 +847,7 @@ void do_exit(long code)
exit_io_context(tsk);
if (tsk->splice_pipe)
- __free_pipe_info(tsk->splice_pipe);
+ free_pipe_info(tsk->splice_pipe);
if (tsk->task_frag.page)
put_page(tsk->task_frag.page);
@@ -1629,9 +1629,6 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
}
put_pid(pid);
-
- /* avoid REGPARM breakage on x86: */
- asmlinkage_protect(5, ret, which, upid, infop, options, ru);
return ret;
}
@@ -1669,8 +1666,6 @@ SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
ret = do_wait(&wo);
put_pid(pid);
- /* avoid REGPARM breakage on x86: */
- asmlinkage_protect(4, ret, upid, stat_addr, options, ru);
return ret;
}
diff --git a/kernel/extable.c b/kernel/extable.c
index fe35a634bf7..67460b93b1a 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -41,10 +41,10 @@ u32 __initdata main_extable_sort_needed = 1;
/* Sort the kernel's built-in exception table */
void __init sort_main_extable(void)
{
- if (main_extable_sort_needed)
+ if (main_extable_sort_needed) {
+ pr_notice("Sorting __ex_table...\n");
sort_extable(__start___ex_table, __stop___ex_table);
- else
- pr_notice("__ex_table already sorted, skipping sort\n");
+ }
}
/* Given an address, look for it in the exception tables. */
diff --git a/kernel/fork.c b/kernel/fork.c
index 1766d324d5e..987b28a1f01 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -70,6 +70,7 @@
#include <linux/khugepaged.h>
#include <linux/signalfd.h>
#include <linux/uprobes.h>
+#include <linux/aio.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
@@ -1233,7 +1234,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->utime = p->stime = p->gtime = 0;
p->utimescaled = p->stimescaled = 0;
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
p->prev_cputime.utime = p->prev_cputime.stime = 0;
#endif
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
@@ -1303,6 +1304,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
p->memcg_batch.do_batch = 0;
p->memcg_batch.memcg = NULL;
#endif
+#ifdef CONFIG_BCACHE
+ p->sequential_io = 0;
+ p->sequential_io_avg = 0;
+#endif
/* Perform scheduler related setup. Assign this task to a CPU. */
sched_fork(p);
@@ -1677,10 +1682,7 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
int, tls_val)
#endif
{
- long ret = do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr);
- asmlinkage_protect(5, ret, clone_flags, newsp,
- parent_tidptr, child_tidptr, tls_val);
- return ret;
+ return do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr);
}
#endif
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 14be27feda4..fd4b13b131f 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -84,6 +84,12 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
.get_time = &ktime_get_boottime,
.resolution = KTIME_LOW_RES,
},
+ {
+ .index = HRTIMER_BASE_TAI,
+ .clockid = CLOCK_TAI,
+ .get_time = &ktime_get_clocktai,
+ .resolution = KTIME_LOW_RES,
+ },
}
};
@@ -91,6 +97,7 @@ static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
[CLOCK_REALTIME] = HRTIMER_BASE_REALTIME,
[CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC,
[CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME,
+ [CLOCK_TAI] = HRTIMER_BASE_TAI,
};
static inline int hrtimer_clockid_to_base(clockid_t clock_id)
@@ -107,8 +114,10 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
{
ktime_t xtim, mono, boot;
struct timespec xts, tom, slp;
+ s32 tai_offset;
get_xtime_and_monotonic_and_sleep_offset(&xts, &tom, &slp);
+ tai_offset = timekeeping_get_tai_offset();
xtim = timespec_to_ktime(xts);
mono = ktime_add(xtim, timespec_to_ktime(tom));
@@ -116,6 +125,8 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim;
base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono;
base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot;
+ base->clock_base[HRTIMER_BASE_TAI].softirq_time =
+ ktime_add(xtim, ktime_set(tai_offset, 0));
}
/*
@@ -161,7 +172,7 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
*/
static int hrtimer_get_target(int this_cpu, int pinned)
{
-#ifdef CONFIG_NO_HZ
+#ifdef CONFIG_NO_HZ_COMMON
if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu))
return get_nohz_timer_target();
#endif
@@ -276,6 +287,10 @@ ktime_t ktime_add_ns(const ktime_t kt, u64 nsec)
} else {
unsigned long rem = do_div(nsec, NSEC_PER_SEC);
+ /* Make sure nsec fits into long */
+ if (unlikely(nsec > KTIME_SEC_MAX))
+ return (ktime_t){ .tv64 = KTIME_MAX };
+
tmp = ktime_set((long)nsec, rem);
}
@@ -652,8 +667,9 @@ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
{
ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset;
ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset;
+ ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset;
- return ktime_get_update_offsets(offs_real, offs_boot);
+ return ktime_get_update_offsets(offs_real, offs_boot, offs_tai);
}
/*
@@ -1011,7 +1027,8 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
* @timer: the timer to be added
* @tim: expiry time
* @delta_ns: "slack" range for the timer
- * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL)
+ * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or
+ * relative (HRTIMER_MODE_REL)
*
* Returns:
* 0 on success
@@ -1028,7 +1045,8 @@ EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
* hrtimer_start - (re)start an hrtimer on the current CPU
* @timer: the timer to be added
* @tim: expiry time
- * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL)
+ * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or
+ * relative (HRTIMER_MODE_REL)
*
* Returns:
* 0 on success
@@ -1107,7 +1125,7 @@ ktime_t hrtimer_get_remaining(const struct hrtimer *timer)
}
EXPORT_SYMBOL_GPL(hrtimer_get_remaining);
-#ifdef CONFIG_NO_HZ
+#ifdef CONFIG_NO_HZ_COMMON
/**
* hrtimer_get_next_event - get the time until next expiry event
*
@@ -1310,6 +1328,8 @@ retry:
expires = ktime_sub(hrtimer_get_expires(timer),
base->offset);
+ if (expires.tv64 < 0)
+ expires.tv64 = KTIME_MAX;
if (expires.tv64 < expires_next.tv64)
expires_next = expires;
break;
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 96f3a1d9c37..5a83dde8ca0 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -462,9 +462,23 @@ int irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base,
if (domain->ops->map) {
ret = domain->ops->map(domain, virq, hwirq);
if (ret != 0) {
- pr_err("irq-%i==>hwirq-0x%lx mapping failed: %d\n",
- virq, hwirq, ret);
- WARN_ON(1);
+ /*
+ * If map() returns -EPERM, this interrupt is protected
+ * by the firmware or some other service and shall not
+ * be mapped.
+ *
+ * Since on some platforms we blindly try to map everything
+ * we end up with a log full of backtraces.
+ *
+ * So instead, we silently fail on -EPERM, it is the
+ * responsibility of the PIC driver to display a relevant
+ * message if needed.
+ */
+ if (ret != -EPERM) {
+ pr_err("irq-%i==>hwirq-0x%lx mapping failed: %d\n",
+ virq, hwirq, ret);
+ WARN_ON(1);
+ }
irq_data->domain = NULL;
irq_data->hwirq = 0;
goto err_unmap;
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 397db02209e..19ed5c425c3 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -76,7 +76,7 @@ static int irq_affinity_list_proc_show(struct seq_file *m, void *v)
static ssize_t write_irq_affinity(int type, struct file *file,
const char __user *buffer, size_t count, loff_t *pos)
{
- unsigned int irq = (int)(long)PDE(file_inode(file))->data;
+ unsigned int irq = (int)(long)PDE_DATA(file_inode(file));
cpumask_var_t new_value;
int err;
@@ -131,17 +131,17 @@ static ssize_t irq_affinity_list_proc_write(struct file *file,
static int irq_affinity_proc_open(struct inode *inode, struct file *file)
{
- return single_open(file, irq_affinity_proc_show, PDE(inode)->data);
+ return single_open(file, irq_affinity_proc_show, PDE_DATA(inode));
}
static int irq_affinity_list_proc_open(struct inode *inode, struct file *file)
{
- return single_open(file, irq_affinity_list_proc_show, PDE(inode)->data);
+ return single_open(file, irq_affinity_list_proc_show, PDE_DATA(inode));
}
static int irq_affinity_hint_proc_open(struct inode *inode, struct file *file)
{
- return single_open(file, irq_affinity_hint_proc_show, PDE(inode)->data);
+ return single_open(file, irq_affinity_hint_proc_show, PDE_DATA(inode));
}
static const struct file_operations irq_affinity_proc_fops = {
@@ -212,7 +212,7 @@ out:
static int default_affinity_open(struct inode *inode, struct file *file)
{
- return single_open(file, default_affinity_show, PDE(inode)->data);
+ return single_open(file, default_affinity_show, PDE_DATA(inode));
}
static const struct file_operations default_affinity_proc_fops = {
@@ -233,7 +233,7 @@ static int irq_node_proc_show(struct seq_file *m, void *v)
static int irq_node_proc_open(struct inode *inode, struct file *file)
{
- return single_open(file, irq_node_proc_show, PDE(inode)->data);
+ return single_open(file, irq_node_proc_show, PDE_DATA(inode));
}
static const struct file_operations irq_node_proc_fops = {
@@ -256,7 +256,7 @@ static int irq_spurious_proc_show(struct seq_file *m, void *v)
static int irq_spurious_proc_open(struct inode *inode, struct file *file)
{
- return single_open(file, irq_spurious_proc_show, PDE(inode)->data);
+ return single_open(file, irq_spurious_proc_show, PDE_DATA(inode));
}
static const struct file_operations irq_spurious_proc_fops = {
@@ -366,11 +366,7 @@ void unregister_irq_proc(unsigned int irq, struct irq_desc *desc)
void unregister_handler_proc(unsigned int irq, struct irqaction *action)
{
- if (action->dir) {
- struct irq_desc *desc = irq_to_desc(irq);
-
- remove_proc_entry(action->dir->name, desc->dir);
- }
+ proc_remove(action->dir);
}
static void register_default_affinity_proc(void)
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 2169feeba52..3127ad52cdb 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -84,9 +84,11 @@ static int is_ksym_addr(unsigned long addr)
/*
* Expand a compressed symbol data into the resulting uncompressed string,
+ * if uncompressed string is too long (>= maxlen), it will be truncated,
* given the offset to where the symbol is in the compressed stream.
*/
-static unsigned int kallsyms_expand_symbol(unsigned int off, char *result)
+static unsigned int kallsyms_expand_symbol(unsigned int off,
+ char *result, size_t maxlen)
{
int len, skipped_first = 0;
const u8 *tptr, *data;
@@ -113,15 +115,20 @@ static unsigned int kallsyms_expand_symbol(unsigned int off, char *result)
while (*tptr) {
if (skipped_first) {
+ if (maxlen <= 1)
+ goto tail;
*result = *tptr;
result++;
+ maxlen--;
} else
skipped_first = 1;
tptr++;
}
}
- *result = '\0';
+tail:
+ if (maxlen)
+ *result = '\0';
/* Return to offset to the next symbol. */
return off;
@@ -176,7 +183,7 @@ unsigned long kallsyms_lookup_name(const char *name)
unsigned int off;
for (i = 0, off = 0; i < kallsyms_num_syms; i++) {
- off = kallsyms_expand_symbol(off, namebuf);
+ off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf));
if (strcmp(namebuf, name) == 0)
return kallsyms_addresses[i];
@@ -195,7 +202,7 @@ int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
int ret;
for (i = 0, off = 0; i < kallsyms_num_syms; i++) {
- off = kallsyms_expand_symbol(off, namebuf);
+ off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf));
ret = fn(data, namebuf, NULL, kallsyms_addresses[i]);
if (ret != 0)
return ret;
@@ -294,7 +301,8 @@ const char *kallsyms_lookup(unsigned long addr,
pos = get_symbol_pos(addr, symbolsize, offset);
/* Grab name */
- kallsyms_expand_symbol(get_symbol_offset(pos), namebuf);
+ kallsyms_expand_symbol(get_symbol_offset(pos),
+ namebuf, KSYM_NAME_LEN);
if (modname)
*modname = NULL;
return namebuf;
@@ -315,7 +323,8 @@ int lookup_symbol_name(unsigned long addr, char *symname)
pos = get_symbol_pos(addr, NULL, NULL);
/* Grab name */
- kallsyms_expand_symbol(get_symbol_offset(pos), symname);
+ kallsyms_expand_symbol(get_symbol_offset(pos),
+ symname, KSYM_NAME_LEN);
return 0;
}
/* See if it's in a module. */
@@ -333,7 +342,8 @@ int lookup_symbol_attrs(unsigned long addr, unsigned long *size,
pos = get_symbol_pos(addr, size, offset);
/* Grab name */
- kallsyms_expand_symbol(get_symbol_offset(pos), name);
+ kallsyms_expand_symbol(get_symbol_offset(pos),
+ name, KSYM_NAME_LEN);
modname[0] = '\0';
return 0;
}
@@ -463,7 +473,7 @@ static unsigned long get_ksymbol_core(struct kallsym_iter *iter)
iter->type = kallsyms_get_symbol_type(off);
- off = kallsyms_expand_symbol(off, iter->name);
+ off = kallsyms_expand_symbol(off, iter->name, ARRAY_SIZE(iter->name));
return off - iter->nameoff;
}
diff --git a/kernel/kexec.c b/kernel/kexec.c
index b574920cbd4..59f7b55ba74 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -786,7 +786,7 @@ static int kimage_load_normal_segment(struct kimage *image,
struct kexec_segment *segment)
{
unsigned long maddr;
- unsigned long ubytes, mbytes;
+ size_t ubytes, mbytes;
int result;
unsigned char __user *buf;
@@ -819,13 +819,9 @@ static int kimage_load_normal_segment(struct kimage *image,
/* Start with a clear page */
clear_page(ptr);
ptr += maddr & ~PAGE_MASK;
- mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
- if (mchunk > mbytes)
- mchunk = mbytes;
-
- uchunk = mchunk;
- if (uchunk > ubytes)
- uchunk = ubytes;
+ mchunk = min_t(size_t, mbytes,
+ PAGE_SIZE - (maddr & ~PAGE_MASK));
+ uchunk = min(ubytes, mchunk);
result = copy_from_user(ptr, buf, uchunk);
kunmap(page);
@@ -850,7 +846,7 @@ static int kimage_load_crash_segment(struct kimage *image,
* We do things a page at a time for the sake of kmap.
*/
unsigned long maddr;
- unsigned long ubytes, mbytes;
+ size_t ubytes, mbytes;
int result;
unsigned char __user *buf;
@@ -871,13 +867,10 @@ static int kimage_load_crash_segment(struct kimage *image,
}
ptr = kmap(page);
ptr += maddr & ~PAGE_MASK;
- mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
- if (mchunk > mbytes)
- mchunk = mbytes;
-
- uchunk = mchunk;
- if (uchunk > ubytes) {
- uchunk = ubytes;
+ mchunk = min_t(size_t, mbytes,
+ PAGE_SIZE - (maddr & ~PAGE_MASK));
+ uchunk = min(ubytes, mchunk);
+ if (mchunk > uchunk) {
/* Zero the trailing part of the page */
memset(ptr + uchunk, 0, mchunk - uchunk);
}
@@ -1540,14 +1533,13 @@ void vmcoreinfo_append_str(const char *fmt, ...)
{
va_list args;
char buf[0x50];
- int r;
+ size_t r;
va_start(args, fmt);
r = vsnprintf(buf, sizeof(buf), fmt, args);
va_end(args);
- if (r + vmcoreinfo_size > vmcoreinfo_max_size)
- r = vmcoreinfo_max_size - vmcoreinfo_size;
+ r = min(r, vmcoreinfo_max_size - vmcoreinfo_size);
memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r);
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 56dd34976d7..1296e72e416 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -77,6 +77,7 @@ static void free_modprobe_argv(struct subprocess_info *info)
static int call_modprobe(char *module_name, int wait)
{
+ struct subprocess_info *info;
static char *envp[] = {
"HOME=/",
"TERM=linux",
@@ -98,8 +99,15 @@ static int call_modprobe(char *module_name, int wait)
argv[3] = module_name; /* check free_modprobe_argv() */
argv[4] = NULL;
- return call_usermodehelper_fns(modprobe_path, argv, envp,
- wait | UMH_KILLABLE, NULL, free_modprobe_argv, NULL);
+ info = call_usermodehelper_setup(modprobe_path, argv, envp, GFP_KERNEL,
+ NULL, free_modprobe_argv, NULL);
+ if (!info)
+ goto free_module_name;
+
+ return call_usermodehelper_exec(info, wait | UMH_KILLABLE);
+
+free_module_name:
+ kfree(module_name);
free_argv:
kfree(argv);
out:
@@ -502,14 +510,28 @@ static void helper_unlock(void)
* @argv: arg vector for process
* @envp: environment for process
* @gfp_mask: gfp mask for memory allocation
+ * @cleanup: a cleanup function
+ * @init: an init function
+ * @data: arbitrary context sensitive data
*
* Returns either %NULL on allocation failure, or a subprocess_info
* structure. This should be passed to call_usermodehelper_exec to
* exec the process and free the structure.
+ *
+ * The init function is used to customize the helper process prior to
+ * exec. A non-zero return code causes the process to error out, exit,
+ * and return the failure to the calling process
+ *
+ * The cleanup function is just before ethe subprocess_info is about to
+ * be freed. This can be used for freeing the argv and envp. The
+ * Function must be runnable in either a process context or the
+ * context in which call_usermodehelper_exec is called.
*/
-static
struct subprocess_info *call_usermodehelper_setup(char *path, char **argv,
- char **envp, gfp_t gfp_mask)
+ char **envp, gfp_t gfp_mask,
+ int (*init)(struct subprocess_info *info, struct cred *new),
+ void (*cleanup)(struct subprocess_info *info),
+ void *data)
{
struct subprocess_info *sub_info;
sub_info = kzalloc(sizeof(struct subprocess_info), gfp_mask);
@@ -520,50 +542,27 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv,
sub_info->path = path;
sub_info->argv = argv;
sub_info->envp = envp;
+
+ sub_info->cleanup = cleanup;
+ sub_info->init = init;
+ sub_info->data = data;
out:
return sub_info;
}
-
-/**
- * call_usermodehelper_setfns - set a cleanup/init function
- * @info: a subprocess_info returned by call_usermodehelper_setup
- * @cleanup: a cleanup function
- * @init: an init function
- * @data: arbitrary context sensitive data
- *
- * The init function is used to customize the helper process prior to
- * exec. A non-zero return code causes the process to error out, exit,
- * and return the failure to the calling process
- *
- * The cleanup function is just before ethe subprocess_info is about to
- * be freed. This can be used for freeing the argv and envp. The
- * Function must be runnable in either a process context or the
- * context in which call_usermodehelper_exec is called.
- */
-static
-void call_usermodehelper_setfns(struct subprocess_info *info,
- int (*init)(struct subprocess_info *info, struct cred *new),
- void (*cleanup)(struct subprocess_info *info),
- void *data)
-{
- info->cleanup = cleanup;
- info->init = init;
- info->data = data;
-}
+EXPORT_SYMBOL(call_usermodehelper_setup);
/**
* call_usermodehelper_exec - start a usermode application
* @sub_info: information about the subprocessa
* @wait: wait for the application to finish and return status.
- * when -1 don't wait at all, but you get no useful error back when
- * the program couldn't be exec'ed. This makes it safe to call
+ * when UMH_NO_WAIT don't wait at all, but you get no useful error back
+ * when the program couldn't be exec'ed. This makes it safe to call
* from interrupt context.
*
* Runs a user-space application. The application is started
* asynchronously if wait is not set, and runs as a child of keventd.
* (ie. it runs with full root capabilities).
*/
-static
int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
{
DECLARE_COMPLETION_ONSTACK(done);
@@ -615,31 +614,34 @@ unlock:
helper_unlock();
return retval;
}
+EXPORT_SYMBOL(call_usermodehelper_exec);
-/*
- * call_usermodehelper_fns() will not run the caller-provided cleanup function
- * if a memory allocation failure is experienced. So the caller might need to
- * check the call_usermodehelper_fns() return value: if it is -ENOMEM, perform
- * the necessaary cleanup within the caller.
+/**
+ * call_usermodehelper() - prepare and start a usermode application
+ * @path: path to usermode executable
+ * @argv: arg vector for process
+ * @envp: environment for process
+ * @wait: wait for the application to finish and return status.
+ * when UMH_NO_WAIT don't wait at all, but you get no useful error back
+ * when the program couldn't be exec'ed. This makes it safe to call
+ * from interrupt context.
+ *
+ * This function is the equivalent to use call_usermodehelper_setup() and
+ * call_usermodehelper_exec().
*/
-int call_usermodehelper_fns(
- char *path, char **argv, char **envp, int wait,
- int (*init)(struct subprocess_info *info, struct cred *new),
- void (*cleanup)(struct subprocess_info *), void *data)
+int call_usermodehelper(char *path, char **argv, char **envp, int wait)
{
struct subprocess_info *info;
gfp_t gfp_mask = (wait == UMH_NO_WAIT) ? GFP_ATOMIC : GFP_KERNEL;
- info = call_usermodehelper_setup(path, argv, envp, gfp_mask);
-
+ info = call_usermodehelper_setup(path, argv, envp, gfp_mask,
+ NULL, NULL, NULL);
if (info == NULL)
return -ENOMEM;
- call_usermodehelper_setfns(info, init, cleanup, data);
-
return call_usermodehelper_exec(info, wait);
}
-EXPORT_SYMBOL(call_usermodehelper_fns);
+EXPORT_SYMBOL(call_usermodehelper);
static int proc_cap_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 16d8ddd268b..760e86df8c2 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -17,6 +17,7 @@
#include <linux/slab.h>
#include <linux/freezer.h>
#include <linux/ptrace.h>
+#include <linux/uaccess.h>
#include <trace/events/sched.h>
static DEFINE_SPINLOCK(kthread_create_lock);
@@ -135,6 +136,24 @@ void *kthread_data(struct task_struct *task)
return to_kthread(task)->data;
}
+/**
+ * probe_kthread_data - speculative version of kthread_data()
+ * @task: possible kthread task in question
+ *
+ * @task could be a kthread task. Return the data value specified when it
+ * was created if accessible. If @task isn't a kthread task or its data is
+ * inaccessible for any reason, %NULL is returned. This function requires
+ * that @task itself is safe to dereference.
+ */
+void *probe_kthread_data(struct task_struct *task)
+{
+ struct kthread *kthread = to_kthread(task);
+ void *data = NULL;
+
+ probe_kernel_read(&data, &kthread->data, sizeof(data));
+ return data;
+}
+
static void __kthread_parkme(struct kthread *self)
{
__set_current_state(TASK_PARKED);
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 6a3bccba7e7..1f3186b37fd 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -2998,6 +2998,7 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
EXPORT_SYMBOL_GPL(lockdep_init_map);
struct lock_class_key __lockdep_no_validate__;
+EXPORT_SYMBOL_GPL(__lockdep_no_validate__);
static int
print_lock_nested_lock_not_held(struct task_struct *curr,
diff --git a/kernel/modsign_certificate.S b/kernel/modsign_certificate.S
index 246b4c6e613..4a9a86d12c8 100644
--- a/kernel/modsign_certificate.S
+++ b/kernel/modsign_certificate.S
@@ -1,15 +1,8 @@
-/* SYMBOL_PREFIX defined on commandline from CONFIG_SYMBOL_PREFIX */
-#ifndef SYMBOL_PREFIX
-#define ASM_SYMBOL(sym) sym
-#else
-#define PASTE2(x,y) x##y
-#define PASTE(x,y) PASTE2(x,y)
-#define ASM_SYMBOL(sym) PASTE(SYMBOL_PREFIX, sym)
-#endif
+#include <linux/export.h>
#define GLOBAL(name) \
- .globl ASM_SYMBOL(name); \
- ASM_SYMBOL(name):
+ .globl VMLINUX_SYMBOL(name); \
+ VMLINUX_SYMBOL(name):
.section ".init.data","aw"
diff --git a/kernel/module.c b/kernel/module.c
index 0925c9a7197..b049939177f 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1209,10 +1209,11 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs,
/* Since this should be found in kernel (which can't be removed),
* no locking is necessary. */
- if (!find_symbol(MODULE_SYMBOL_PREFIX "module_layout", NULL,
+ if (!find_symbol(VMLINUX_SYMBOL_STR(module_layout), NULL,
&crc, true, false))
BUG();
- return check_version(sechdrs, versindex, "module_layout", mod, crc,
+ return check_version(sechdrs, versindex,
+ VMLINUX_SYMBOL_STR(module_layout), mod, crc,
NULL);
}
@@ -1861,12 +1862,12 @@ static void free_module(struct module *mod)
{
trace_module_free(mod);
- /* Delete from various lists */
- mutex_lock(&module_mutex);
- stop_machine(__unlink_module, mod, NULL);
- mutex_unlock(&module_mutex);
mod_sysfs_teardown(mod);
+ /* We leave it in list to prevent duplicate loads, but make sure
+ * that noone uses it while it's being deconstructed. */
+ mod->state = MODULE_STATE_UNFORMED;
+
/* Remove dynamic debug info */
ddebug_remove_module(mod->name);
@@ -1879,6 +1880,11 @@ static void free_module(struct module *mod)
/* Free any allocated parameters. */
destroy_params(mod->kp, mod->num_kp);
+ /* Now we can delete it from the lists */
+ mutex_lock(&module_mutex);
+ stop_machine(__unlink_module, mod, NULL);
+ mutex_unlock(&module_mutex);
+
/* This may be NULL, but that's OK */
unset_module_init_ro_nx(mod);
module_free(mod, mod->module_init);
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index afc0456f227..364ceab15f0 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -22,7 +22,7 @@
#include <linux/pid_namespace.h>
#include <net/net_namespace.h>
#include <linux/ipc_namespace.h>
-#include <linux/proc_fs.h>
+#include <linux/proc_ns.h>
#include <linux/file.h>
#include <linux/syscalls.h>
@@ -241,7 +241,7 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype)
const struct proc_ns_operations *ops;
struct task_struct *tsk = current;
struct nsproxy *new_nsproxy;
- struct proc_inode *ei;
+ struct proc_ns *ei;
struct file *file;
int err;
@@ -250,7 +250,7 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype)
return PTR_ERR(file);
err = -EINVAL;
- ei = PROC_I(file_inode(file));
+ ei = get_proc_ns(file_inode(file));
ops = ei->ns_ops;
if (nstype && (ops->type != nstype))
goto out;
diff --git a/kernel/panic.c b/kernel/panic.c
index 7c57cc9eee2..167ec097ce8 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -22,7 +22,6 @@
#include <linux/sysrq.h>
#include <linux/init.h>
#include <linux/nmi.h>
-#include <linux/dmi.h>
#define PANIC_TIMER_STEP 100
#define PANIC_BLINK_SPD 18
@@ -400,13 +399,8 @@ struct slowpath_args {
static void warn_slowpath_common(const char *file, int line, void *caller,
unsigned taint, struct slowpath_args *args)
{
- const char *board;
-
printk(KERN_WARNING "------------[ cut here ]------------\n");
printk(KERN_WARNING "WARNING: at %s:%d %pS()\n", file, line, caller);
- board = dmi_get_system_info(DMI_PRODUCT_NAME);
- if (board)
- printk(KERN_WARNING "Hardware name: %s\n", board);
if (args)
vprintk(args->fmt, args->args);
diff --git a/kernel/params.c b/kernel/params.c
index ed35345be53..53b958fcd63 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -613,10 +613,13 @@ static __modinit int add_sysfs_param(struct module_kobject *mk,
sizeof(*mk->mp) + sizeof(mk->mp->attrs[0]) * (num+1),
GFP_KERNEL);
if (!new) {
- kfree(mk->mp);
+ kfree(attrs);
err = -ENOMEM;
goto fail;
}
+ /* Despite looking like the typical realloc() bug, this is safe.
+ * We *want* the old 'attrs' to be freed either way, and we'll store
+ * the new one in the success case. */
attrs = krealloc(attrs, sizeof(new->grp.attrs[0])*(num+2), GFP_KERNEL);
if (!attrs) {
err = -ENOMEM;
diff --git a/kernel/pid.c b/kernel/pid.c
index 047dc626463..0db3e791a06 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -36,6 +36,7 @@
#include <linux/pid_namespace.h>
#include <linux/init_task.h>
#include <linux/syscalls.h>
+#include <linux/proc_ns.h>
#include <linux/proc_fs.h>
#define pid_hashfn(nr, ns) \
@@ -51,9 +52,6 @@ int pid_max = PID_MAX_DEFAULT;
int pid_max_min = RESERVED_PIDS + 1;
int pid_max_max = PID_MAX_LIMIT;
-#define BITS_PER_PAGE (PAGE_SIZE*8)
-#define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1)
-
static inline int mk_pid(struct pid_namespace *pid_ns,
struct pidmap *map, int off)
{
@@ -183,15 +181,19 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
break;
}
if (likely(atomic_read(&map->nr_free))) {
- do {
+ for ( ; ; ) {
if (!test_and_set_bit(offset, map->page)) {
atomic_dec(&map->nr_free);
set_last_pid(pid_ns, last, pid);
return pid;
}
offset = find_next_offset(map, offset);
+ if (offset >= BITS_PER_PAGE)
+ break;
pid = mk_pid(pid_ns, map, offset);
- } while (offset < BITS_PER_PAGE && pid < pid_max);
+ if (pid >= pid_max)
+ break;
+ }
}
if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) {
++map;
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index bea15bdf82b..6917e8edb48 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -15,12 +15,10 @@
#include <linux/err.h>
#include <linux/acct.h>
#include <linux/slab.h>
-#include <linux/proc_fs.h>
+#include <linux/proc_ns.h>
#include <linux/reboot.h>
#include <linux/export.h>
-#define BITS_PER_PAGE (PAGE_SIZE*8)
-
struct pid_cache {
int nr_ids;
char name[16];
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 8fd709c9bb5..42670e9b44e 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -10,6 +10,8 @@
#include <linux/kernel_stat.h>
#include <trace/events/timer.h>
#include <linux/random.h>
+#include <linux/tick.h>
+#include <linux/workqueue.h>
/*
* Called after updating RLIMIT_CPU to run cpu timer and update
@@ -153,6 +155,21 @@ static void bump_cpu_timer(struct k_itimer *timer,
}
}
+/**
+ * task_cputime_zero - Check a task_cputime struct for all zero fields.
+ *
+ * @cputime: The struct to compare.
+ *
+ * Checks @cputime to see if all fields are zero. Returns true if all fields
+ * are zero, false if any field is nonzero.
+ */
+static inline int task_cputime_zero(const struct task_cputime *cputime)
+{
+ if (!cputime->utime && !cputime->stime && !cputime->sum_exec_runtime)
+ return 1;
+ return 0;
+}
+
static inline cputime_t prof_ticks(struct task_struct *p)
{
cputime_t utime, stime;
@@ -636,6 +653,37 @@ static int cpu_timer_sample_group(const clockid_t which_clock,
return 0;
}
+#ifdef CONFIG_NO_HZ_FULL
+static void nohz_kick_work_fn(struct work_struct *work)
+{
+ tick_nohz_full_kick_all();
+}
+
+static DECLARE_WORK(nohz_kick_work, nohz_kick_work_fn);
+
+/*
+ * We need the IPIs to be sent from sane process context.
+ * The posix cpu timers are always set with irqs disabled.
+ */
+static void posix_cpu_timer_kick_nohz(void)
+{
+ schedule_work(&nohz_kick_work);
+}
+
+bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk)
+{
+ if (!task_cputime_zero(&tsk->cputime_expires))
+ return false;
+
+ if (tsk->signal->cputimer.running)
+ return false;
+
+ return true;
+}
+#else
+static inline void posix_cpu_timer_kick_nohz(void) { }
+#endif
+
/*
* Guts of sys_timer_settime for CPU timers.
* This is called with the timer locked and interrupts disabled.
@@ -794,6 +842,8 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
sample_to_timespec(timer->it_clock,
old_incr, &old->it_interval);
}
+ if (!ret)
+ posix_cpu_timer_kick_nohz();
return ret;
}
@@ -1008,21 +1058,6 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
}
}
-/**
- * task_cputime_zero - Check a task_cputime struct for all zero fields.
- *
- * @cputime: The struct to compare.
- *
- * Checks @cputime to see if all fields are zero. Returns true if all fields
- * are zero, false if any field is nonzero.
- */
-static inline int task_cputime_zero(const struct task_cputime *cputime)
-{
- if (!cputime->utime && !cputime->stime && !cputime->sum_exec_runtime)
- return 1;
- return 0;
-}
-
/*
* Check for any per-thread CPU timers that have fired and move them
* off the tsk->*_timers list onto the firing list. Per-thread timers
@@ -1336,6 +1371,13 @@ void run_posix_cpu_timers(struct task_struct *tsk)
cpu_timer_fire(timer);
spin_unlock(&timer->it_lock);
}
+
+ /*
+ * In case some timers were rescheduled after the queue got emptied,
+ * wake up full dynticks CPUs.
+ */
+ if (tsk->signal->cputimer.running)
+ posix_cpu_timer_kick_nohz();
}
/*
@@ -1366,7 +1408,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
}
if (!*newval)
- return;
+ goto out;
*newval += now.cpu;
}
@@ -1384,6 +1426,8 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
tsk->signal->cputime_expires.virt_exp = *newval;
break;
}
+out:
+ posix_cpu_timer_kick_nohz();
}
static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 6edbb2c55c2..424c2d4265c 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -40,38 +40,31 @@
#include <linux/list.h>
#include <linux/init.h>
#include <linux/compiler.h>
-#include <linux/idr.h>
+#include <linux/hash.h>
#include <linux/posix-clock.h>
#include <linux/posix-timers.h>
#include <linux/syscalls.h>
#include <linux/wait.h>
#include <linux/workqueue.h>
#include <linux/export.h>
+#include <linux/hashtable.h>
/*
- * Management arrays for POSIX timers. Timers are kept in slab memory
- * Timer ids are allocated by an external routine that keeps track of the
- * id and the timer. The external interface is:
- *
- * void *idr_find(struct idr *idp, int id); to find timer_id <id>
- * int idr_get_new(struct idr *idp, void *ptr); to get a new id and
- * related it to <ptr>
- * void idr_remove(struct idr *idp, int id); to release <id>
- * void idr_init(struct idr *idp); to initialize <idp>
- * which we supply.
- * The idr_get_new *may* call slab for more memory so it must not be
- * called under a spin lock. Likewise idr_remore may release memory
- * (but it may be ok to do this under a lock...).
- * idr_find is just a memory look up and is quite fast. A -1 return
- * indicates that the requested id does not exist.
+ * Management arrays for POSIX timers. Timers are now kept in static hash table
+ * with 512 entries.
+ * Timer ids are allocated by local routine, which selects proper hash head by
+ * key, constructed from current->signal address and per signal struct counter.
+ * This keeps timer ids unique per process, but now they can intersect between
+ * processes.
*/
/*
* Lets keep our timers in a slab cache :-)
*/
static struct kmem_cache *posix_timers_cache;
-static struct idr posix_timers_id;
-static DEFINE_SPINLOCK(idr_lock);
+
+static DEFINE_HASHTABLE(posix_timers_hashtable, 9);
+static DEFINE_SPINLOCK(hash_lock);
/*
* we assume that the new SIGEV_THREAD_ID shares no bits with the other
@@ -152,6 +145,56 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags);
__timr; \
})
+static int hash(struct signal_struct *sig, unsigned int nr)
+{
+ return hash_32(hash32_ptr(sig) ^ nr, HASH_BITS(posix_timers_hashtable));
+}
+
+static struct k_itimer *__posix_timers_find(struct hlist_head *head,
+ struct signal_struct *sig,
+ timer_t id)
+{
+ struct k_itimer *timer;
+
+ hlist_for_each_entry_rcu(timer, head, t_hash) {
+ if ((timer->it_signal == sig) && (timer->it_id == id))
+ return timer;
+ }
+ return NULL;
+}
+
+static struct k_itimer *posix_timer_by_id(timer_t id)
+{
+ struct signal_struct *sig = current->signal;
+ struct hlist_head *head = &posix_timers_hashtable[hash(sig, id)];
+
+ return __posix_timers_find(head, sig, id);
+}
+
+static int posix_timer_add(struct k_itimer *timer)
+{
+ struct signal_struct *sig = current->signal;
+ int first_free_id = sig->posix_timer_id;
+ struct hlist_head *head;
+ int ret = -ENOENT;
+
+ do {
+ spin_lock(&hash_lock);
+ head = &posix_timers_hashtable[hash(sig, sig->posix_timer_id)];
+ if (!__posix_timers_find(head, sig, sig->posix_timer_id)) {
+ hlist_add_head_rcu(&timer->t_hash, head);
+ ret = sig->posix_timer_id;
+ }
+ if (++sig->posix_timer_id < 0)
+ sig->posix_timer_id = 0;
+ if ((sig->posix_timer_id == first_free_id) && (ret == -ENOENT))
+ /* Loop over all possible ids completed */
+ ret = -EAGAIN;
+ spin_unlock(&hash_lock);
+ } while (ret == -ENOENT);
+ return ret;
+}
+
static inline void unlock_timer(struct k_itimer *timr, unsigned long flags)
{
spin_unlock_irqrestore(&timr->it_lock, flags);
@@ -221,6 +264,11 @@ static int posix_get_boottime(const clockid_t which_clock, struct timespec *tp)
return 0;
}
+static int posix_get_tai(clockid_t which_clock, struct timespec *tp)
+{
+ timekeeping_clocktai(tp);
+ return 0;
+}
/*
* Initialize everything, well, just everything in Posix clocks/timers ;)
@@ -261,6 +309,16 @@ static __init int init_posix_timers(void)
.clock_getres = posix_get_coarse_res,
.clock_get = posix_get_monotonic_coarse,
};
+ struct k_clock clock_tai = {
+ .clock_getres = hrtimer_get_res,
+ .clock_get = posix_get_tai,
+ .nsleep = common_nsleep,
+ .nsleep_restart = hrtimer_nanosleep_restart,
+ .timer_create = common_timer_create,
+ .timer_set = common_timer_set,
+ .timer_get = common_timer_get,
+ .timer_del = common_timer_del,
+ };
struct k_clock clock_boottime = {
.clock_getres = hrtimer_get_res,
.clock_get = posix_get_boottime,
@@ -278,11 +336,11 @@ static __init int init_posix_timers(void)
posix_timers_register_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse);
posix_timers_register_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse);
posix_timers_register_clock(CLOCK_BOOTTIME, &clock_boottime);
+ posix_timers_register_clock(CLOCK_TAI, &clock_tai);
posix_timers_cache = kmem_cache_create("posix_timers_cache",
sizeof (struct k_itimer), 0, SLAB_PANIC,
NULL);
- idr_init(&posix_timers_id);
return 0;
}
@@ -504,9 +562,9 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
{
if (it_id_set) {
unsigned long flags;
- spin_lock_irqsave(&idr_lock, flags);
- idr_remove(&posix_timers_id, tmr->it_id);
- spin_unlock_irqrestore(&idr_lock, flags);
+ spin_lock_irqsave(&hash_lock, flags);
+ hlist_del_rcu(&tmr->t_hash);
+ spin_unlock_irqrestore(&hash_lock, flags);
}
put_pid(tmr->it_pid);
sigqueue_free(tmr->sigq);
@@ -552,22 +610,11 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
return -EAGAIN;
spin_lock_init(&new_timer->it_lock);
-
- idr_preload(GFP_KERNEL);
- spin_lock_irq(&idr_lock);
- error = idr_alloc(&posix_timers_id, new_timer, 0, 0, GFP_NOWAIT);
- spin_unlock_irq(&idr_lock);
- idr_preload_end();
- if (error < 0) {
- /*
- * Weird looking, but we return EAGAIN if the IDR is
- * full (proper POSIX return value for this)
- */
- if (error == -ENOSPC)
- error = -EAGAIN;
+ new_timer_id = posix_timer_add(new_timer);
+ if (new_timer_id < 0) {
+ error = new_timer_id;
goto out;
}
- new_timer_id = error;
it_id_set = IT_ID_SET;
new_timer->it_id = (timer_t) new_timer_id;
@@ -645,7 +692,7 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags)
return NULL;
rcu_read_lock();
- timr = idr_find(&posix_timers_id, (int)timer_id);
+ timr = posix_timer_by_id(timer_id);
if (timr) {
spin_lock_irqsave(&timr->it_lock, *flags);
if (timr->it_signal == current->signal) {
diff --git a/kernel/power/console.c b/kernel/power/console.c
index b1dc456474b..463aa673675 100644
--- a/kernel/power/console.c
+++ b/kernel/power/console.c
@@ -4,6 +4,7 @@
* Originally from swsusp.
*/
+#include <linux/console.h>
#include <linux/vt_kern.h>
#include <linux/kbd_kern.h>
#include <linux/vt.h>
@@ -14,8 +15,120 @@
static int orig_fgconsole, orig_kmsg;
+static DEFINE_MUTEX(vt_switch_mutex);
+
+struct pm_vt_switch {
+ struct list_head head;
+ struct device *dev;
+ bool required;
+};
+
+static LIST_HEAD(pm_vt_switch_list);
+
+
+/**
+ * pm_vt_switch_required - indicate VT switch at suspend requirements
+ * @dev: device
+ * @required: if true, caller needs VT switch at suspend/resume time
+ *
+ * The different console drivers may or may not require VT switches across
+ * suspend/resume, depending on how they handle restoring video state and
+ * what may be running.
+ *
+ * Drivers can indicate support for switchless suspend/resume, which can
+ * save time and flicker, by using this routine and passing 'false' as
+ * the argument. If any loaded driver needs VT switching, or the
+ * no_console_suspend argument has been passed on the command line, VT
+ * switches will occur.
+ */
+void pm_vt_switch_required(struct device *dev, bool required)
+{
+ struct pm_vt_switch *entry, *tmp;
+
+ mutex_lock(&vt_switch_mutex);
+ list_for_each_entry(tmp, &pm_vt_switch_list, head) {
+ if (tmp->dev == dev) {
+ /* already registered, update requirement */
+ tmp->required = required;
+ goto out;
+ }
+ }
+
+ entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+ if (!entry)
+ goto out;
+
+ entry->required = required;
+ entry->dev = dev;
+
+ list_add(&entry->head, &pm_vt_switch_list);
+out:
+ mutex_unlock(&vt_switch_mutex);
+}
+EXPORT_SYMBOL(pm_vt_switch_required);
+
+/**
+ * pm_vt_switch_unregister - stop tracking a device's VT switching needs
+ * @dev: device
+ *
+ * Remove @dev from the vt switch list.
+ */
+void pm_vt_switch_unregister(struct device *dev)
+{
+ struct pm_vt_switch *tmp;
+
+ mutex_lock(&vt_switch_mutex);
+ list_for_each_entry(tmp, &pm_vt_switch_list, head) {
+ if (tmp->dev == dev) {
+ list_del(&tmp->head);
+ break;
+ }
+ }
+ mutex_unlock(&vt_switch_mutex);
+}
+EXPORT_SYMBOL(pm_vt_switch_unregister);
+
+/*
+ * There are three cases when a VT switch on suspend/resume are required:
+ * 1) no driver has indicated a requirement one way or another, so preserve
+ * the old behavior
+ * 2) console suspend is disabled, we want to see debug messages across
+ * suspend/resume
+ * 3) any registered driver indicates it needs a VT switch
+ *
+ * If none of these conditions is present, meaning we have at least one driver
+ * that doesn't need the switch, and none that do, we can avoid it to make
+ * resume look a little prettier (and suspend too, but that's usually hidden,
+ * e.g. when closing the lid on a laptop).
+ */
+static bool pm_vt_switch(void)
+{
+ struct pm_vt_switch *entry;
+ bool ret = true;
+
+ mutex_lock(&vt_switch_mutex);
+ if (list_empty(&pm_vt_switch_list))
+ goto out;
+
+ if (!console_suspend_enabled)
+ goto out;
+
+ list_for_each_entry(entry, &pm_vt_switch_list, head) {
+ if (entry->required)
+ goto out;
+ }
+
+ ret = false;
+out:
+ mutex_unlock(&vt_switch_mutex);
+ return ret;
+}
+
int pm_prepare_console(void)
{
+ if (!pm_vt_switch())
+ return 0;
+
orig_fgconsole = vt_move_to_console(SUSPEND_CONSOLE, 1);
if (orig_fgconsole < 0)
return 1;
@@ -26,6 +139,9 @@ int pm_prepare_console(void)
void pm_restore_console(void)
{
+ if (!pm_vt_switch())
+ return;
+
if (orig_fgconsole >= 0) {
vt_move_to_console(orig_fgconsole, 0);
vt_kmsg_redirect(orig_kmsg);
diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c
index 68197a4e8fc..7ef6866b521 100644
--- a/kernel/power/poweroff.c
+++ b/kernel/power/poweroff.c
@@ -32,7 +32,7 @@ static void handle_poweroff(int key)
static struct sysrq_key_op sysrq_poweroff_op = {
.handler = handle_poweroff,
- .help_msg = "powerOff",
+ .help_msg = "poweroff(o)",
.action_msg = "Power Off",
.enable_mask = SYSRQ_ENABLE_BOOT,
};
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index d4feda084a3..bef86d121eb 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -76,8 +76,20 @@ EXPORT_SYMBOL_GPL(suspend_set_ops);
bool valid_state(suspend_state_t state)
{
- if (state == PM_SUSPEND_FREEZE)
- return true;
+ if (state == PM_SUSPEND_FREEZE) {
+#ifdef CONFIG_PM_DEBUG
+ if (pm_test_level != TEST_NONE &&
+ pm_test_level != TEST_FREEZER &&
+ pm_test_level != TEST_DEVICES &&
+ pm_test_level != TEST_PLATFORM) {
+ printk(KERN_WARNING "Unsupported pm_test mode for "
+ "freeze state, please choose "
+ "none/freezer/devices/platform.\n");
+ return false;
+ }
+#endif
+ return true;
+ }
/*
* PM_SUSPEND_STANDBY and PM_SUSPEND_MEMORY states need lowlevel
* support and need to be valid to the lowlevel
@@ -184,6 +196,9 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
goto Platform_wake;
}
+ if (suspend_test(TEST_PLATFORM))
+ goto Platform_wake;
+
/*
* PM_SUSPEND_FREEZE equals
* frozen processes + suspended devices + idle processors.
@@ -195,9 +210,6 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
goto Platform_wake;
}
- if (suspend_test(TEST_PLATFORM))
- goto Platform_wake;
-
error = disable_nonboot_cpus();
if (error || suspend_test(TEST_CPUS))
goto Enable_cpus;
diff --git a/kernel/printk.c b/kernel/printk.c
index 376914e2869..fa36e149442 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -32,6 +32,7 @@
#include <linux/security.h>
#include <linux/bootmem.h>
#include <linux/memblock.h>
+#include <linux/aio.h>
#include <linux/syscalls.h>
#include <linux/kexec.h>
#include <linux/kdb.h>
@@ -43,6 +44,7 @@
#include <linux/rculist.h>
#include <linux/poll.h>
#include <linux/irq_work.h>
+#include <linux/utsname.h>
#include <asm/uaccess.h>
@@ -2849,4 +2851,65 @@ void kmsg_dump_rewind(struct kmsg_dumper *dumper)
raw_spin_unlock_irqrestore(&logbuf_lock, flags);
}
EXPORT_SYMBOL_GPL(kmsg_dump_rewind);
+
+static char dump_stack_arch_desc_str[128];
+
+/**
+ * dump_stack_set_arch_desc - set arch-specific str to show with task dumps
+ * @fmt: printf-style format string
+ * @...: arguments for the format string
+ *
+ * The configured string will be printed right after utsname during task
+ * dumps. Usually used to add arch-specific system identifiers. If an
+ * arch wants to make use of such an ID string, it should initialize this
+ * as soon as possible during boot.
+ */
+void __init dump_stack_set_arch_desc(const char *fmt, ...)
+{
+ va_list args;
+
+ va_start(args, fmt);
+ vsnprintf(dump_stack_arch_desc_str, sizeof(dump_stack_arch_desc_str),
+ fmt, args);
+ va_end(args);
+}
+
+/**
+ * dump_stack_print_info - print generic debug info for dump_stack()
+ * @log_lvl: log level
+ *
+ * Arch-specific dump_stack() implementations can use this function to
+ * print out the same debug information as the generic dump_stack().
+ */
+void dump_stack_print_info(const char *log_lvl)
+{
+ printk("%sCPU: %d PID: %d Comm: %.20s %s %s %.*s\n",
+ log_lvl, raw_smp_processor_id(), current->pid, current->comm,
+ print_tainted(), init_utsname()->release,
+ (int)strcspn(init_utsname()->version, " "),
+ init_utsname()->version);
+
+ if (dump_stack_arch_desc_str[0] != '\0')
+ printk("%sHardware name: %s\n",
+ log_lvl, dump_stack_arch_desc_str);
+
+ print_worker_info(log_lvl, current);
+}
+
+/**
+ * show_regs_print_info - print generic debug info for show_regs()
+ * @log_lvl: log level
+ *
+ * show_regs() implementations can use this function to print out generic
+ * debug information.
+ */
+void show_regs_print_info(const char *log_lvl)
+{
+ dump_stack_print_info(log_lvl);
+
+ printk("%stask: %p ti: %p task.ti: %p\n",
+ log_lvl, current, current_thread_info(),
+ task_thread_info(current));
+}
+
#endif
diff --git a/kernel/profile.c b/kernel/profile.c
index dc3384ee874..0bf40073766 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -462,10 +462,10 @@ static const struct file_operations prof_cpu_mask_proc_fops = {
.write = prof_cpu_mask_proc_write,
};
-void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir)
+void create_prof_cpu_mask(void)
{
/* create /proc/irq/prof_cpu_mask */
- proc_create("prof_cpu_mask", 0600, root_irq_dir, &prof_cpu_mask_proc_fops);
+ proc_create("irq/prof_cpu_mask", 0600, NULL, &prof_cpu_mask_proc_fops);
}
/*
@@ -600,7 +600,7 @@ int __ref create_proc_profile(void) /* false positive from hotcpu_notifier */
NULL, &proc_profile_operations);
if (!entry)
return 0;
- entry->size = (1+prof_len) * sizeof(atomic_t);
+ proc_set_size(entry, (1 + prof_len) * sizeof(atomic_t));
hotcpu_notifier(profile_cpu_callback, 0);
return 0;
}
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index acbd28424d8..aed981a3f69 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -17,6 +17,7 @@
#include <linux/ptrace.h>
#include <linux/security.h>
#include <linux/signal.h>
+#include <linux/uio.h>
#include <linux/audit.h>
#include <linux/pid_namespace.h>
#include <linux/syscalls.h>
@@ -24,6 +25,7 @@
#include <linux/regset.h>
#include <linux/hw_breakpoint.h>
#include <linux/cn_proc.h>
+#include <linux/compat.h>
static int ptrace_trapping_sleep_fn(void *flags)
@@ -618,6 +620,81 @@ static int ptrace_setsiginfo(struct task_struct *child, const siginfo_t *info)
return error;
}
+static int ptrace_peek_siginfo(struct task_struct *child,
+ unsigned long addr,
+ unsigned long data)
+{
+ struct ptrace_peeksiginfo_args arg;
+ struct sigpending *pending;
+ struct sigqueue *q;
+ int ret, i;
+
+ ret = copy_from_user(&arg, (void __user *) addr,
+ sizeof(struct ptrace_peeksiginfo_args));
+ if (ret)
+ return -EFAULT;
+
+ if (arg.flags & ~PTRACE_PEEKSIGINFO_SHARED)
+ return -EINVAL; /* unknown flags */
+
+ if (arg.nr < 0)
+ return -EINVAL;
+
+ if (arg.flags & PTRACE_PEEKSIGINFO_SHARED)
+ pending = &child->signal->shared_pending;
+ else
+ pending = &child->pending;
+
+ for (i = 0; i < arg.nr; ) {
+ siginfo_t info;
+ s32 off = arg.off + i;
+
+ spin_lock_irq(&child->sighand->siglock);
+ list_for_each_entry(q, &pending->list, list) {
+ if (!off--) {
+ copy_siginfo(&info, &q->info);
+ break;
+ }
+ }
+ spin_unlock_irq(&child->sighand->siglock);
+
+ if (off >= 0) /* beyond the end of the list */
+ break;
+
+#ifdef CONFIG_COMPAT
+ if (unlikely(is_compat_task())) {
+ compat_siginfo_t __user *uinfo = compat_ptr(data);
+
+ ret = copy_siginfo_to_user32(uinfo, &info);
+ ret |= __put_user(info.si_code, &uinfo->si_code);
+ } else
+#endif
+ {
+ siginfo_t __user *uinfo = (siginfo_t __user *) data;
+
+ ret = copy_siginfo_to_user(uinfo, &info);
+ ret |= __put_user(info.si_code, &uinfo->si_code);
+ }
+
+ if (ret) {
+ ret = -EFAULT;
+ break;
+ }
+
+ data += sizeof(siginfo_t);
+ i++;
+
+ if (signal_pending(current))
+ break;
+
+ cond_resched();
+ }
+
+ if (i > 0)
+ return i;
+
+ return ret;
+}
#ifdef PTRACE_SINGLESTEP
#define is_singlestep(request) ((request) == PTRACE_SINGLESTEP)
@@ -748,6 +825,10 @@ int ptrace_request(struct task_struct *child, long request,
ret = put_user(child->ptrace_message, datalp);
break;
+ case PTRACE_PEEKSIGINFO:
+ ret = ptrace_peek_siginfo(child, addr, data);
+ break;
+
case PTRACE_GETSIGINFO:
ret = ptrace_getsiginfo(child, &siginfo);
if (!ret)
diff --git a/kernel/range.c b/kernel/range.c
index 9b8ae2d6ed6..071b0ab455c 100644
--- a/kernel/range.c
+++ b/kernel/range.c
@@ -97,7 +97,8 @@ void subtract_range(struct range *range, int az, u64 start, u64 end)
range[i].end = range[j].end;
range[i].start = end;
} else {
- printk(KERN_ERR "run of slot in ranges\n");
+ pr_err("%s: run out of slot in ranges\n",
+ __func__);
}
range[j].end = start;
continue;
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 2f8530bc4b1..16ea6792501 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -64,7 +64,7 @@
static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
-#define RCU_STATE_INITIALIZER(sname, cr) { \
+#define RCU_STATE_INITIALIZER(sname, sabbr, cr) { \
.level = { &sname##_state.node[0] }, \
.call = cr, \
.fqs_state = RCU_GP_IDLE, \
@@ -76,13 +76,14 @@ static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
.barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
.onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \
.name = #sname, \
+ .abbr = sabbr, \
}
struct rcu_state rcu_sched_state =
- RCU_STATE_INITIALIZER(rcu_sched, call_rcu_sched);
+ RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched);
DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
-struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, call_rcu_bh);
+struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh);
DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
static struct rcu_state *rcu_state;
@@ -223,6 +224,8 @@ static ulong jiffies_till_next_fqs = RCU_JIFFIES_TILL_FORCE_QS;
module_param(jiffies_till_first_fqs, ulong, 0644);
module_param(jiffies_till_next_fqs, ulong, 0644);
+static void rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
+ struct rcu_data *rdp);
static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *));
static void force_quiescent_state(struct rcu_state *rsp);
static int rcu_pending(int cpu);
@@ -310,6 +313,8 @@ cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
if (rcu_gp_in_progress(rsp))
return 0; /* No, a grace period is already in progress. */
+ if (rcu_nocb_needs_gp(rsp))
+ return 1; /* Yes, a no-CBs CPU needs one. */
if (!rdp->nxttail[RCU_NEXT_TAIL])
return 0; /* No, this is a no-CBs (or offline) CPU. */
if (*rdp->nxttail[RCU_NEXT_READY_TAIL])
@@ -794,6 +799,16 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
rdp->offline_fqs++;
return 1;
}
+
+ /*
+ * There is a possibility that a CPU in adaptive-ticks state
+ * might run in the kernel with the scheduling-clock tick disabled
+ * for an extended time period. Invoke rcu_kick_nohz_cpu() to
+ * force the CPU to restart the scheduling-clock tick in this
+ * CPU is in this state.
+ */
+ rcu_kick_nohz_cpu(rdp->cpu);
+
return 0;
}
@@ -1035,10 +1050,11 @@ static void init_callback_list(struct rcu_data *rdp)
{
int i;
+ if (init_nocb_callback_list(rdp))
+ return;
rdp->nxtlist = NULL;
for (i = 0; i < RCU_NEXT_SIZE; i++)
rdp->nxttail[i] = &rdp->nxtlist;
- init_nocb_callback_list(rdp);
}
/*
@@ -1071,6 +1087,120 @@ static unsigned long rcu_cbs_completed(struct rcu_state *rsp,
}
/*
+ * Trace-event helper function for rcu_start_future_gp() and
+ * rcu_nocb_wait_gp().
+ */
+static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
+ unsigned long c, char *s)
+{
+ trace_rcu_future_grace_period(rdp->rsp->name, rnp->gpnum,
+ rnp->completed, c, rnp->level,
+ rnp->grplo, rnp->grphi, s);
+}
+
+/*
+ * Start some future grace period, as needed to handle newly arrived
+ * callbacks. The required future grace periods are recorded in each
+ * rcu_node structure's ->need_future_gp field.
+ *
+ * The caller must hold the specified rcu_node structure's ->lock.
+ */
+static unsigned long __maybe_unused
+rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp)
+{
+ unsigned long c;
+ int i;
+ struct rcu_node *rnp_root = rcu_get_root(rdp->rsp);
+
+ /*
+ * Pick up grace-period number for new callbacks. If this
+ * grace period is already marked as needed, return to the caller.
+ */
+ c = rcu_cbs_completed(rdp->rsp, rnp);
+ trace_rcu_future_gp(rnp, rdp, c, "Startleaf");
+ if (rnp->need_future_gp[c & 0x1]) {
+ trace_rcu_future_gp(rnp, rdp, c, "Prestartleaf");
+ return c;
+ }
+
+ /*
+ * If either this rcu_node structure or the root rcu_node structure
+ * believe that a grace period is in progress, then we must wait
+ * for the one following, which is in "c". Because our request
+ * will be noticed at the end of the current grace period, we don't
+ * need to explicitly start one.
+ */
+ if (rnp->gpnum != rnp->completed ||
+ ACCESS_ONCE(rnp->gpnum) != ACCESS_ONCE(rnp->completed)) {
+ rnp->need_future_gp[c & 0x1]++;
+ trace_rcu_future_gp(rnp, rdp, c, "Startedleaf");
+ return c;
+ }
+
+ /*
+ * There might be no grace period in progress. If we don't already
+ * hold it, acquire the root rcu_node structure's lock in order to
+ * start one (if needed).
+ */
+ if (rnp != rnp_root)
+ raw_spin_lock(&rnp_root->lock);
+
+ /*
+ * Get a new grace-period number. If there really is no grace
+ * period in progress, it will be smaller than the one we obtained
+ * earlier. Adjust callbacks as needed. Note that even no-CBs
+ * CPUs have a ->nxtcompleted[] array, so no no-CBs checks needed.
+ */
+ c = rcu_cbs_completed(rdp->rsp, rnp_root);
+ for (i = RCU_DONE_TAIL; i < RCU_NEXT_TAIL; i++)
+ if (ULONG_CMP_LT(c, rdp->nxtcompleted[i]))
+ rdp->nxtcompleted[i] = c;
+
+ /*
+ * If the needed for the required grace period is already
+ * recorded, trace and leave.
+ */
+ if (rnp_root->need_future_gp[c & 0x1]) {
+ trace_rcu_future_gp(rnp, rdp, c, "Prestartedroot");
+ goto unlock_out;
+ }
+
+ /* Record the need for the future grace period. */
+ rnp_root->need_future_gp[c & 0x1]++;
+
+ /* If a grace period is not already in progress, start one. */
+ if (rnp_root->gpnum != rnp_root->completed) {
+ trace_rcu_future_gp(rnp, rdp, c, "Startedleafroot");
+ } else {
+ trace_rcu_future_gp(rnp, rdp, c, "Startedroot");
+ rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp);
+ }
+unlock_out:
+ if (rnp != rnp_root)
+ raw_spin_unlock(&rnp_root->lock);
+ return c;
+}
+
+/*
+ * Clean up any old requests for the just-ended grace period. Also return
+ * whether any additional grace periods have been requested. Also invoke
+ * rcu_nocb_gp_cleanup() in order to wake up any no-callbacks kthreads
+ * waiting for this grace period to complete.
+ */
+static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
+{
+ int c = rnp->completed;
+ int needmore;
+ struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
+
+ rcu_nocb_gp_cleanup(rsp, rnp);
+ rnp->need_future_gp[c & 0x1] = 0;
+ needmore = rnp->need_future_gp[(c + 1) & 0x1];
+ trace_rcu_future_gp(rnp, rdp, c, needmore ? "CleanupMore" : "Cleanup");
+ return needmore;
+}
+
+/*
* If there is room, assign a ->completed number to any callbacks on
* this CPU that have not already been assigned. Also accelerate any
* callbacks that were previously assigned a ->completed number that has
@@ -1129,6 +1259,8 @@ static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
rdp->nxttail[i] = rdp->nxttail[RCU_NEXT_TAIL];
rdp->nxtcompleted[i] = c;
}
+ /* Record any needed additional grace periods. */
+ rcu_start_future_gp(rnp, rdp);
/* Trace depending on how much we were able to accelerate. */
if (!*rdp->nxttail[RCU_WAIT_TAIL])
@@ -1308,9 +1440,9 @@ static int rcu_gp_init(struct rcu_state *rsp)
rdp = this_cpu_ptr(rsp->rda);
rcu_preempt_check_blocked_tasks(rnp);
rnp->qsmask = rnp->qsmaskinit;
- rnp->gpnum = rsp->gpnum;
+ ACCESS_ONCE(rnp->gpnum) = rsp->gpnum;
WARN_ON_ONCE(rnp->completed != rsp->completed);
- rnp->completed = rsp->completed;
+ ACCESS_ONCE(rnp->completed) = rsp->completed;
if (rnp == rdp->mynode)
rcu_start_gp_per_cpu(rsp, rnp, rdp);
rcu_preempt_boost_start_gp(rnp);
@@ -1319,7 +1451,8 @@ static int rcu_gp_init(struct rcu_state *rsp)
rnp->grphi, rnp->qsmask);
raw_spin_unlock_irq(&rnp->lock);
#ifdef CONFIG_PROVE_RCU_DELAY
- if ((prandom_u32() % (rcu_num_nodes * 8)) == 0)
+ if ((prandom_u32() % (rcu_num_nodes * 8)) == 0 &&
+ system_state == SYSTEM_RUNNING)
schedule_timeout_uninterruptible(2);
#endif /* #ifdef CONFIG_PROVE_RCU_DELAY */
cond_resched();
@@ -1361,6 +1494,7 @@ int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
static void rcu_gp_cleanup(struct rcu_state *rsp)
{
unsigned long gp_duration;
+ int nocb = 0;
struct rcu_data *rdp;
struct rcu_node *rnp = rcu_get_root(rsp);
@@ -1390,17 +1524,23 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
*/
rcu_for_each_node_breadth_first(rsp, rnp) {
raw_spin_lock_irq(&rnp->lock);
- rnp->completed = rsp->gpnum;
+ ACCESS_ONCE(rnp->completed) = rsp->gpnum;
+ rdp = this_cpu_ptr(rsp->rda);
+ if (rnp == rdp->mynode)
+ __rcu_process_gp_end(rsp, rnp, rdp);
+ nocb += rcu_future_gp_cleanup(rsp, rnp);
raw_spin_unlock_irq(&rnp->lock);
cond_resched();
}
rnp = rcu_get_root(rsp);
raw_spin_lock_irq(&rnp->lock);
+ rcu_nocb_gp_set(rnp, nocb);
rsp->completed = rsp->gpnum; /* Declare grace period done. */
trace_rcu_grace_period(rsp->name, rsp->completed, "end");
rsp->fqs_state = RCU_GP_IDLE;
rdp = this_cpu_ptr(rsp->rda);
+ rcu_advance_cbs(rsp, rnp, rdp); /* Reduce false positives below. */
if (cpu_needs_another_gp(rsp, rdp))
rsp->gp_flags = 1;
raw_spin_unlock_irq(&rnp->lock);
@@ -1476,57 +1616,62 @@ static int __noreturn rcu_gp_kthread(void *arg)
/*
* Start a new RCU grace period if warranted, re-initializing the hierarchy
* in preparation for detecting the next grace period. The caller must hold
- * the root node's ->lock, which is released before return. Hard irqs must
- * be disabled.
+ * the root node's ->lock and hard irqs must be disabled.
*
* Note that it is legal for a dying CPU (which is marked as offline) to
* invoke this function. This can happen when the dying CPU reports its
* quiescent state.
*/
static void
-rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
- __releases(rcu_get_root(rsp)->lock)
+rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
+ struct rcu_data *rdp)
{
- struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
- struct rcu_node *rnp = rcu_get_root(rsp);
-
- if (!rsp->gp_kthread ||
- !cpu_needs_another_gp(rsp, rdp)) {
+ if (!rsp->gp_kthread || !cpu_needs_another_gp(rsp, rdp)) {
/*
* Either we have not yet spawned the grace-period
* task, this CPU does not need another grace period,
* or a grace period is already in progress.
* Either way, don't start a new grace period.
*/
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
return;
}
-
- /*
- * Because there is no grace period in progress right now,
- * any callbacks we have up to this point will be satisfied
- * by the next grace period. So this is a good place to
- * assign a grace period number to recently posted callbacks.
- */
- rcu_accelerate_cbs(rsp, rnp, rdp);
-
rsp->gp_flags = RCU_GP_FLAG_INIT;
- raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */
-
- /* Ensure that CPU is aware of completion of last grace period. */
- rcu_process_gp_end(rsp, rdp);
- local_irq_restore(flags);
/* Wake up rcu_gp_kthread() to start the grace period. */
wake_up(&rsp->gp_wq);
}
/*
+ * Similar to rcu_start_gp_advanced(), but also advance the calling CPU's
+ * callbacks. Note that rcu_start_gp_advanced() cannot do this because it
+ * is invoked indirectly from rcu_advance_cbs(), which would result in
+ * endless recursion -- or would do so if it wasn't for the self-deadlock
+ * that is encountered beforehand.
+ */
+static void
+rcu_start_gp(struct rcu_state *rsp)
+{
+ struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
+ struct rcu_node *rnp = rcu_get_root(rsp);
+
+ /*
+ * If there is no grace period in progress right now, any
+ * callbacks we have up to this point will be satisfied by the
+ * next grace period. Also, advancing the callbacks reduces the
+ * probability of false positives from cpu_needs_another_gp()
+ * resulting in pointless grace periods. So, advance callbacks
+ * then start the grace period!
+ */
+ rcu_advance_cbs(rsp, rnp, rdp);
+ rcu_start_gp_advanced(rsp, rnp, rdp);
+}
+
+/*
* Report a full set of quiescent states to the specified rcu_state
* data structure. This involves cleaning up after the prior grace
* period and letting rcu_start_gp() start up the next grace period
- * if one is needed. Note that the caller must hold rnp->lock, as
- * required by rcu_start_gp(), which will release it.
+ * if one is needed. Note that the caller must hold rnp->lock, which
+ * is released before return.
*/
static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
__releases(rcu_get_root(rsp)->lock)
@@ -1685,7 +1830,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
struct rcu_node *rnp, struct rcu_data *rdp)
{
/* No-CBs CPUs do not have orphanable callbacks. */
- if (is_nocb_cpu(rdp->cpu))
+ if (rcu_is_nocb_cpu(rdp->cpu))
return;
/*
@@ -2124,7 +2269,8 @@ __rcu_process_callbacks(struct rcu_state *rsp)
local_irq_save(flags);
if (cpu_needs_another_gp(rsp, rdp)) {
raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */
- rcu_start_gp(rsp, flags); /* releases above lock */
+ rcu_start_gp(rsp);
+ raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags);
} else {
local_irq_restore(flags);
}
@@ -2169,7 +2315,8 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
static void invoke_rcu_core(void)
{
- raise_softirq(RCU_SOFTIRQ);
+ if (cpu_online(smp_processor_id()))
+ raise_softirq(RCU_SOFTIRQ);
}
/*
@@ -2204,11 +2351,11 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
/* Start a new grace period if one not already started. */
if (!rcu_gp_in_progress(rsp)) {
- unsigned long nestflag;
struct rcu_node *rnp_root = rcu_get_root(rsp);
- raw_spin_lock_irqsave(&rnp_root->lock, nestflag);
- rcu_start_gp(rsp, nestflag); /* rlses rnp_root->lock */
+ raw_spin_lock(&rnp_root->lock);
+ rcu_start_gp(rsp);
+ raw_spin_unlock(&rnp_root->lock);
} else {
/* Give the grace period a kick. */
rdp->blimit = LONG_MAX;
@@ -2628,19 +2775,27 @@ static int rcu_pending(int cpu)
}
/*
- * Check to see if any future RCU-related work will need to be done
- * by the current CPU, even if none need be done immediately, returning
- * 1 if so.
+ * Return true if the specified CPU has any callback. If all_lazy is
+ * non-NULL, store an indication of whether all callbacks are lazy.
+ * (If there are no callbacks, all of them are deemed to be lazy.)
*/
-static int rcu_cpu_has_callbacks(int cpu)
+static int rcu_cpu_has_callbacks(int cpu, bool *all_lazy)
{
+ bool al = true;
+ bool hc = false;
+ struct rcu_data *rdp;
struct rcu_state *rsp;
- /* RCU callbacks either ready or pending? */
- for_each_rcu_flavor(rsp)
- if (per_cpu_ptr(rsp->rda, cpu)->nxtlist)
- return 1;
- return 0;
+ for_each_rcu_flavor(rsp) {
+ rdp = per_cpu_ptr(rsp->rda, cpu);
+ if (rdp->qlen != rdp->qlen_lazy)
+ al = false;
+ if (rdp->nxtlist)
+ hc = true;
+ }
+ if (all_lazy)
+ *all_lazy = al;
+ return hc;
}
/*
@@ -2747,10 +2902,10 @@ static void _rcu_barrier(struct rcu_state *rsp)
* corresponding CPU's preceding callbacks have been invoked.
*/
for_each_possible_cpu(cpu) {
- if (!cpu_online(cpu) && !is_nocb_cpu(cpu))
+ if (!cpu_online(cpu) && !rcu_is_nocb_cpu(cpu))
continue;
rdp = per_cpu_ptr(rsp->rda, cpu);
- if (is_nocb_cpu(cpu)) {
+ if (rcu_is_nocb_cpu(cpu)) {
_rcu_barrier_trace(rsp, "OnlineNoCB", cpu,
rsp->n_barrier_done);
atomic_inc(&rsp->barrier_cpu_count);
@@ -2859,7 +3014,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
atomic_set(&rdp->dynticks->dynticks,
(atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1);
- rcu_prepare_for_idle_init(cpu);
raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
/* Add CPU to rcu_node bitmasks. */
@@ -2909,7 +3063,6 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
struct rcu_node *rnp = rdp->mynode;
struct rcu_state *rsp;
- int ret = NOTIFY_OK;
trace_rcu_utilization("Start CPU hotplug");
switch (action) {
@@ -2923,21 +3076,12 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
rcu_boost_kthread_setaffinity(rnp, -1);
break;
case CPU_DOWN_PREPARE:
- if (nocb_cpu_expendable(cpu))
- rcu_boost_kthread_setaffinity(rnp, cpu);
- else
- ret = NOTIFY_BAD;
+ rcu_boost_kthread_setaffinity(rnp, cpu);
break;
case CPU_DYING:
case CPU_DYING_FROZEN:
- /*
- * The whole machine is "stopped" except this CPU, so we can
- * touch any data without introducing corruption. We send the
- * dying CPU's callbacks to an arbitrarily chosen online CPU.
- */
for_each_rcu_flavor(rsp)
rcu_cleanup_dying_cpu(rsp);
- rcu_cleanup_after_idle(cpu);
break;
case CPU_DEAD:
case CPU_DEAD_FROZEN:
@@ -2950,7 +3094,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
break;
}
trace_rcu_utilization("End CPU hotplug");
- return ret;
+ return NOTIFY_OK;
}
/*
@@ -3085,6 +3229,7 @@ static void __init rcu_init_one(struct rcu_state *rsp,
}
rnp->level = i;
INIT_LIST_HEAD(&rnp->blkd_tasks);
+ rcu_init_one_nocb(rnp);
}
}
@@ -3170,8 +3315,7 @@ void __init rcu_init(void)
rcu_init_one(&rcu_sched_state, &rcu_sched_data);
rcu_init_one(&rcu_bh_state, &rcu_bh_data);
__rcu_init_preempt();
- rcu_init_nocb();
- open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
+ open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
/*
* We don't need protection against CPU-hotplug here because
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index c896b5045d9..da77a8f57ff 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -88,18 +88,13 @@ struct rcu_dynticks {
int dynticks_nmi_nesting; /* Track NMI nesting level. */
atomic_t dynticks; /* Even value for idle, else odd. */
#ifdef CONFIG_RCU_FAST_NO_HZ
- int dyntick_drain; /* Prepare-for-idle state variable. */
- unsigned long dyntick_holdoff;
- /* No retries for the jiffy of failure. */
- struct timer_list idle_gp_timer;
- /* Wake up CPU sleeping with callbacks. */
- unsigned long idle_gp_timer_expires;
- /* When to wake up CPU (for repost). */
- bool idle_first_pass; /* First pass of attempt to go idle? */
+ bool all_lazy; /* Are all CPU's CBs lazy? */
unsigned long nonlazy_posted;
/* # times non-lazy CBs posted to CPU. */
unsigned long nonlazy_posted_snap;
/* idle-period nonlazy_posted snapshot. */
+ unsigned long last_accelerate;
+ /* Last jiffy CBs were accelerated. */
int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */
#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
};
@@ -134,9 +129,6 @@ struct rcu_node {
/* elements that need to drain to allow the */
/* current expedited grace period to */
/* complete (only for TREE_PREEMPT_RCU). */
- atomic_t wakemask; /* CPUs whose kthread needs to be awakened. */
- /* Since this has meaning only for leaf */
- /* rcu_node structures, 32 bits suffices. */
unsigned long qsmaskinit;
/* Per-GP initial value for qsmask & expmask. */
unsigned long grpmask; /* Mask to apply to parent qsmask. */
@@ -196,6 +188,12 @@ struct rcu_node {
/* Refused to boost: not sure why, though. */
/* This can happen due to race conditions. */
#endif /* #ifdef CONFIG_RCU_BOOST */
+#ifdef CONFIG_RCU_NOCB_CPU
+ wait_queue_head_t nocb_gp_wq[2];
+ /* Place for rcu_nocb_kthread() to wait GP. */
+#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
+ int need_future_gp[2];
+ /* Counts of upcoming no-CB GP requests. */
raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp;
} ____cacheline_internodealigned_in_smp;
@@ -328,6 +326,11 @@ struct rcu_data {
struct task_struct *nocb_kthread;
#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
+ /* 8) RCU CPU stall data. */
+#ifdef CONFIG_RCU_CPU_STALL_INFO
+ unsigned int softirq_snap; /* Snapshot of softirq activity. */
+#endif /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
+
int cpu;
struct rcu_state *rsp;
};
@@ -375,12 +378,6 @@ struct rcu_state {
struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */
void (*call)(struct rcu_head *head, /* call_rcu() flavor. */
void (*func)(struct rcu_head *head));
-#ifdef CONFIG_RCU_NOCB_CPU
- void (*call_remote)(struct rcu_head *head,
- void (*func)(struct rcu_head *head));
- /* call_rcu() flavor, but for */
- /* placing on remote CPU. */
-#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
/* The following fields are guarded by the root rcu_node's lock. */
@@ -443,6 +440,7 @@ struct rcu_state {
unsigned long gp_max; /* Maximum GP duration in */
/* jiffies. */
char *name; /* Name of structure. */
+ char abbr; /* Abbreviated name. */
struct list_head flavors; /* List of RCU flavors. */
};
@@ -520,7 +518,6 @@ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
struct rcu_node *rnp);
#endif /* #ifdef CONFIG_RCU_BOOST */
static void __cpuinit rcu_prepare_kthreads(int cpu);
-static void rcu_prepare_for_idle_init(int cpu);
static void rcu_cleanup_after_idle(int cpu);
static void rcu_prepare_for_idle(int cpu);
static void rcu_idle_count_callbacks_posted(void);
@@ -529,16 +526,18 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu);
static void print_cpu_stall_info_end(void);
static void zero_cpu_stall_ticks(struct rcu_data *rdp);
static void increment_cpu_stall_ticks(void);
-static bool is_nocb_cpu(int cpu);
+static int rcu_nocb_needs_gp(struct rcu_state *rsp);
+static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq);
+static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp);
+static void rcu_init_one_nocb(struct rcu_node *rnp);
static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
bool lazy);
static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
struct rcu_data *rdp);
-static bool nocb_cpu_expendable(int cpu);
static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp);
-static void init_nocb_callback_list(struct rcu_data *rdp);
-static void __init rcu_init_nocb(void);
+static void rcu_kick_nohz_cpu(int cpu);
+static bool init_nocb_callback_list(struct rcu_data *rdp);
#endif /* #ifndef RCU_TREE_NONCORE */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index c1cc7e17ff9..170814dc418 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -28,6 +28,7 @@
#include <linux/gfp.h>
#include <linux/oom.h>
#include <linux/smpboot.h>
+#include <linux/tick.h>
#define RCU_KTHREAD_PRIO 1
@@ -85,11 +86,21 @@ static void __init rcu_bootup_announce_oddness(void)
if (nr_cpu_ids != NR_CPUS)
printk(KERN_INFO "\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids);
#ifdef CONFIG_RCU_NOCB_CPU
+#ifndef CONFIG_RCU_NOCB_CPU_NONE
+ if (!have_rcu_nocb_mask) {
+ alloc_bootmem_cpumask_var(&rcu_nocb_mask);
+ have_rcu_nocb_mask = true;
+ }
+#ifdef CONFIG_RCU_NOCB_CPU_ZERO
+ pr_info("\tExperimental no-CBs CPU 0\n");
+ cpumask_set_cpu(0, rcu_nocb_mask);
+#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */
+#ifdef CONFIG_RCU_NOCB_CPU_ALL
+ pr_info("\tExperimental no-CBs for all CPUs\n");
+ cpumask_setall(rcu_nocb_mask);
+#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */
+#endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */
if (have_rcu_nocb_mask) {
- if (cpumask_test_cpu(0, rcu_nocb_mask)) {
- cpumask_clear_cpu(0, rcu_nocb_mask);
- pr_info("\tCPU 0: illegal no-CBs CPU (cleared).\n");
- }
cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask);
pr_info("\tExperimental no-CBs CPUs: %s.\n", nocb_buf);
if (rcu_nocb_poll)
@@ -101,7 +112,7 @@ static void __init rcu_bootup_announce_oddness(void)
#ifdef CONFIG_TREE_PREEMPT_RCU
struct rcu_state rcu_preempt_state =
- RCU_STATE_INITIALIZER(rcu_preempt, call_rcu);
+ RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu);
DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
static struct rcu_state *rcu_state = &rcu_preempt_state;
@@ -1533,14 +1544,7 @@ static void __cpuinit rcu_prepare_kthreads(int cpu)
int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
{
*delta_jiffies = ULONG_MAX;
- return rcu_cpu_has_callbacks(cpu);
-}
-
-/*
- * Because we do not have RCU_FAST_NO_HZ, don't bother initializing for it.
- */
-static void rcu_prepare_for_idle_init(int cpu)
-{
+ return rcu_cpu_has_callbacks(cpu, NULL);
}
/*
@@ -1577,16 +1581,6 @@ static void rcu_idle_count_callbacks_posted(void)
*
* The following three proprocessor symbols control this state machine:
*
- * RCU_IDLE_FLUSHES gives the maximum number of times that we will attempt
- * to satisfy RCU. Beyond this point, it is better to incur a periodic
- * scheduling-clock interrupt than to loop through the state machine
- * at full power.
- * RCU_IDLE_OPT_FLUSHES gives the number of RCU_IDLE_FLUSHES that are
- * optional if RCU does not need anything immediately from this
- * CPU, even if this CPU still has RCU callbacks queued. The first
- * times through the state machine are mandatory: we need to give
- * the state machine a chance to communicate a quiescent state
- * to the RCU core.
* RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted
* to sleep in dyntick-idle mode with RCU callbacks pending. This
* is sized to be roughly one RCU grace period. Those energy-efficiency
@@ -1602,186 +1596,108 @@ static void rcu_idle_count_callbacks_posted(void)
* adjustment, they can be converted into kernel config parameters, though
* making the state machine smarter might be a better option.
*/
-#define RCU_IDLE_FLUSHES 5 /* Number of dyntick-idle tries. */
-#define RCU_IDLE_OPT_FLUSHES 3 /* Optional dyntick-idle tries. */
#define RCU_IDLE_GP_DELAY 4 /* Roughly one grace period. */
#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */
-extern int tick_nohz_enabled;
-
-/*
- * Does the specified flavor of RCU have non-lazy callbacks pending on
- * the specified CPU? Both RCU flavor and CPU are specified by the
- * rcu_data structure.
- */
-static bool __rcu_cpu_has_nonlazy_callbacks(struct rcu_data *rdp)
-{
- return rdp->qlen != rdp->qlen_lazy;
-}
+static int rcu_idle_gp_delay = RCU_IDLE_GP_DELAY;
+module_param(rcu_idle_gp_delay, int, 0644);
+static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY;
+module_param(rcu_idle_lazy_gp_delay, int, 0644);
-#ifdef CONFIG_TREE_PREEMPT_RCU
+extern int tick_nohz_enabled;
/*
- * Are there non-lazy RCU-preempt callbacks? (There cannot be if there
- * is no RCU-preempt in the kernel.)
+ * Try to advance callbacks for all flavors of RCU on the current CPU.
+ * Afterwards, if there are any callbacks ready for immediate invocation,
+ * return true.
*/
-static bool rcu_preempt_cpu_has_nonlazy_callbacks(int cpu)
+static bool rcu_try_advance_all_cbs(void)
{
- struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
-
- return __rcu_cpu_has_nonlazy_callbacks(rdp);
-}
-
-#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
+ bool cbs_ready = false;
+ struct rcu_data *rdp;
+ struct rcu_node *rnp;
+ struct rcu_state *rsp;
-static bool rcu_preempt_cpu_has_nonlazy_callbacks(int cpu)
-{
- return 0;
-}
+ for_each_rcu_flavor(rsp) {
+ rdp = this_cpu_ptr(rsp->rda);
+ rnp = rdp->mynode;
-#endif /* else #ifdef CONFIG_TREE_PREEMPT_RCU */
+ /*
+ * Don't bother checking unless a grace period has
+ * completed since we last checked and there are
+ * callbacks not yet ready to invoke.
+ */
+ if (rdp->completed != rnp->completed &&
+ rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL])
+ rcu_process_gp_end(rsp, rdp);
-/*
- * Does any flavor of RCU have non-lazy callbacks on the specified CPU?
- */
-static bool rcu_cpu_has_nonlazy_callbacks(int cpu)
-{
- return __rcu_cpu_has_nonlazy_callbacks(&per_cpu(rcu_sched_data, cpu)) ||
- __rcu_cpu_has_nonlazy_callbacks(&per_cpu(rcu_bh_data, cpu)) ||
- rcu_preempt_cpu_has_nonlazy_callbacks(cpu);
+ if (cpu_has_callbacks_ready_to_invoke(rdp))
+ cbs_ready = true;
+ }
+ return cbs_ready;
}
/*
- * Allow the CPU to enter dyntick-idle mode if either: (1) There are no
- * callbacks on this CPU, (2) this CPU has not yet attempted to enter
- * dyntick-idle mode, or (3) this CPU is in the process of attempting to
- * enter dyntick-idle mode. Otherwise, if we have recently tried and failed
- * to enter dyntick-idle mode, we refuse to try to enter it. After all,
- * it is better to incur scheduling-clock interrupts than to spin
- * continuously for the same time duration!
+ * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready
+ * to invoke. If the CPU has callbacks, try to advance them. Tell the
+ * caller to set the timeout based on whether or not there are non-lazy
+ * callbacks.
*
- * The delta_jiffies argument is used to store the time when RCU is
- * going to need the CPU again if it still has callbacks. The reason
- * for this is that rcu_prepare_for_idle() might need to post a timer,
- * but if so, it will do so after tick_nohz_stop_sched_tick() has set
- * the wakeup time for this CPU. This means that RCU's timer can be
- * delayed until the wakeup time, which defeats the purpose of posting
- * a timer.
+ * The caller must have disabled interrupts.
*/
-int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
+int rcu_needs_cpu(int cpu, unsigned long *dj)
{
struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
- /* Flag a new idle sojourn to the idle-entry state machine. */
- rdtp->idle_first_pass = 1;
+ /* Snapshot to detect later posting of non-lazy callback. */
+ rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
+
/* If no callbacks, RCU doesn't need the CPU. */
- if (!rcu_cpu_has_callbacks(cpu)) {
- *delta_jiffies = ULONG_MAX;
+ if (!rcu_cpu_has_callbacks(cpu, &rdtp->all_lazy)) {
+ *dj = ULONG_MAX;
return 0;
}
- if (rdtp->dyntick_holdoff == jiffies) {
- /* RCU recently tried and failed, so don't try again. */
- *delta_jiffies = 1;
+
+ /* Attempt to advance callbacks. */
+ if (rcu_try_advance_all_cbs()) {
+ /* Some ready to invoke, so initiate later invocation. */
+ invoke_rcu_core();
return 1;
}
- /* Set up for the possibility that RCU will post a timer. */
- if (rcu_cpu_has_nonlazy_callbacks(cpu)) {
- *delta_jiffies = round_up(RCU_IDLE_GP_DELAY + jiffies,
- RCU_IDLE_GP_DELAY) - jiffies;
+ rdtp->last_accelerate = jiffies;
+
+ /* Request timer delay depending on laziness, and round. */
+ if (rdtp->all_lazy) {
+ *dj = round_up(rcu_idle_gp_delay + jiffies,
+ rcu_idle_gp_delay) - jiffies;
} else {
- *delta_jiffies = jiffies + RCU_IDLE_LAZY_GP_DELAY;
- *delta_jiffies = round_jiffies(*delta_jiffies) - jiffies;
+ *dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies;
}
return 0;
}
/*
- * Handler for smp_call_function_single(). The only point of this
- * handler is to wake the CPU up, so the handler does only tracing.
- */
-void rcu_idle_demigrate(void *unused)
-{
- trace_rcu_prep_idle("Demigrate");
-}
-
-/*
- * Timer handler used to force CPU to start pushing its remaining RCU
- * callbacks in the case where it entered dyntick-idle mode with callbacks
- * pending. The hander doesn't really need to do anything because the
- * real work is done upon re-entry to idle, or by the next scheduling-clock
- * interrupt should idle not be re-entered.
- *
- * One special case: the timer gets migrated without awakening the CPU
- * on which the timer was scheduled on. In this case, we must wake up
- * that CPU. We do so with smp_call_function_single().
- */
-static void rcu_idle_gp_timer_func(unsigned long cpu_in)
-{
- int cpu = (int)cpu_in;
-
- trace_rcu_prep_idle("Timer");
- if (cpu != smp_processor_id())
- smp_call_function_single(cpu, rcu_idle_demigrate, NULL, 0);
- else
- WARN_ON_ONCE(1); /* Getting here can hang the system... */
-}
-
-/*
- * Initialize the timer used to pull CPUs out of dyntick-idle mode.
- */
-static void rcu_prepare_for_idle_init(int cpu)
-{
- struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
-
- rdtp->dyntick_holdoff = jiffies - 1;
- setup_timer(&rdtp->idle_gp_timer, rcu_idle_gp_timer_func, cpu);
- rdtp->idle_gp_timer_expires = jiffies - 1;
- rdtp->idle_first_pass = 1;
-}
-
-/*
- * Clean up for exit from idle. Because we are exiting from idle, there
- * is no longer any point to ->idle_gp_timer, so cancel it. This will
- * do nothing if this timer is not active, so just cancel it unconditionally.
- */
-static void rcu_cleanup_after_idle(int cpu)
-{
- struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
-
- del_timer(&rdtp->idle_gp_timer);
- trace_rcu_prep_idle("Cleanup after idle");
- rdtp->tick_nohz_enabled_snap = ACCESS_ONCE(tick_nohz_enabled);
-}
-
-/*
- * Check to see if any RCU-related work can be done by the current CPU,
- * and if so, schedule a softirq to get it done. This function is part
- * of the RCU implementation; it is -not- an exported member of the RCU API.
- *
- * The idea is for the current CPU to clear out all work required by the
- * RCU core for the current grace period, so that this CPU can be permitted
- * to enter dyntick-idle mode. In some cases, it will need to be awakened
- * at the end of the grace period by whatever CPU ends the grace period.
- * This allows CPUs to go dyntick-idle more quickly, and to reduce the
- * number of wakeups by a modest integer factor.
- *
- * Because it is not legal to invoke rcu_process_callbacks() with irqs
- * disabled, we do one pass of force_quiescent_state(), then do a
- * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked
- * later. The ->dyntick_drain field controls the sequencing.
+ * Prepare a CPU for idle from an RCU perspective. The first major task
+ * is to sense whether nohz mode has been enabled or disabled via sysfs.
+ * The second major task is to check to see if a non-lazy callback has
+ * arrived at a CPU that previously had only lazy callbacks. The third
+ * major task is to accelerate (that is, assign grace-period numbers to)
+ * any recently arrived callbacks.
*
* The caller must have disabled interrupts.
*/
static void rcu_prepare_for_idle(int cpu)
{
- struct timer_list *tp;
+ struct rcu_data *rdp;
struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
+ struct rcu_node *rnp;
+ struct rcu_state *rsp;
int tne;
/* Handle nohz enablement switches conservatively. */
tne = ACCESS_ONCE(tick_nohz_enabled);
if (tne != rdtp->tick_nohz_enabled_snap) {
- if (rcu_cpu_has_callbacks(cpu))
+ if (rcu_cpu_has_callbacks(cpu, NULL))
invoke_rcu_core(); /* force nohz to see update. */
rdtp->tick_nohz_enabled_snap = tne;
return;
@@ -1789,125 +1705,56 @@ static void rcu_prepare_for_idle(int cpu)
if (!tne)
return;
- /* Adaptive-tick mode, where usermode execution is idle to RCU. */
- if (!is_idle_task(current)) {
- rdtp->dyntick_holdoff = jiffies - 1;
- if (rcu_cpu_has_nonlazy_callbacks(cpu)) {
- trace_rcu_prep_idle("User dyntick with callbacks");
- rdtp->idle_gp_timer_expires =
- round_up(jiffies + RCU_IDLE_GP_DELAY,
- RCU_IDLE_GP_DELAY);
- } else if (rcu_cpu_has_callbacks(cpu)) {
- rdtp->idle_gp_timer_expires =
- round_jiffies(jiffies + RCU_IDLE_LAZY_GP_DELAY);
- trace_rcu_prep_idle("User dyntick with lazy callbacks");
- } else {
- return;
- }
- tp = &rdtp->idle_gp_timer;
- mod_timer_pinned(tp, rdtp->idle_gp_timer_expires);
+ /* If this is a no-CBs CPU, no callbacks, just return. */
+ if (rcu_is_nocb_cpu(cpu))
return;
- }
/*
- * If this is an idle re-entry, for example, due to use of
- * RCU_NONIDLE() or the new idle-loop tracing API within the idle
- * loop, then don't take any state-machine actions, unless the
- * momentary exit from idle queued additional non-lazy callbacks.
- * Instead, repost the ->idle_gp_timer if this CPU has callbacks
- * pending.
+ * If a non-lazy callback arrived at a CPU having only lazy
+ * callbacks, invoke RCU core for the side-effect of recalculating
+ * idle duration on re-entry to idle.
*/
- if (!rdtp->idle_first_pass &&
- (rdtp->nonlazy_posted == rdtp->nonlazy_posted_snap)) {
- if (rcu_cpu_has_callbacks(cpu)) {
- tp = &rdtp->idle_gp_timer;
- mod_timer_pinned(tp, rdtp->idle_gp_timer_expires);
- }
+ if (rdtp->all_lazy &&
+ rdtp->nonlazy_posted != rdtp->nonlazy_posted_snap) {
+ invoke_rcu_core();
return;
}
- rdtp->idle_first_pass = 0;
- rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted - 1;
/*
- * If there are no callbacks on this CPU, enter dyntick-idle mode.
- * Also reset state to avoid prejudicing later attempts.
+ * If we have not yet accelerated this jiffy, accelerate all
+ * callbacks on this CPU.
*/
- if (!rcu_cpu_has_callbacks(cpu)) {
- rdtp->dyntick_holdoff = jiffies - 1;
- rdtp->dyntick_drain = 0;
- trace_rcu_prep_idle("No callbacks");
+ if (rdtp->last_accelerate == jiffies)
return;
+ rdtp->last_accelerate = jiffies;
+ for_each_rcu_flavor(rsp) {
+ rdp = per_cpu_ptr(rsp->rda, cpu);
+ if (!*rdp->nxttail[RCU_DONE_TAIL])
+ continue;
+ rnp = rdp->mynode;
+ raw_spin_lock(&rnp->lock); /* irqs already disabled. */
+ rcu_accelerate_cbs(rsp, rnp, rdp);
+ raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
}
+}
- /*
- * If in holdoff mode, just return. We will presumably have
- * refrained from disabling the scheduling-clock tick.
- */
- if (rdtp->dyntick_holdoff == jiffies) {
- trace_rcu_prep_idle("In holdoff");
- return;
- }
+/*
+ * Clean up for exit from idle. Attempt to advance callbacks based on
+ * any grace periods that elapsed while the CPU was idle, and if any
+ * callbacks are now ready to invoke, initiate invocation.
+ */
+static void rcu_cleanup_after_idle(int cpu)
+{
+ struct rcu_data *rdp;
+ struct rcu_state *rsp;
- /* Check and update the ->dyntick_drain sequencing. */
- if (rdtp->dyntick_drain <= 0) {
- /* First time through, initialize the counter. */
- rdtp->dyntick_drain = RCU_IDLE_FLUSHES;
- } else if (rdtp->dyntick_drain <= RCU_IDLE_OPT_FLUSHES &&
- !rcu_pending(cpu) &&
- !local_softirq_pending()) {
- /* Can we go dyntick-idle despite still having callbacks? */
- rdtp->dyntick_drain = 0;
- rdtp->dyntick_holdoff = jiffies;
- if (rcu_cpu_has_nonlazy_callbacks(cpu)) {
- trace_rcu_prep_idle("Dyntick with callbacks");
- rdtp->idle_gp_timer_expires =
- round_up(jiffies + RCU_IDLE_GP_DELAY,
- RCU_IDLE_GP_DELAY);
- } else {
- rdtp->idle_gp_timer_expires =
- round_jiffies(jiffies + RCU_IDLE_LAZY_GP_DELAY);
- trace_rcu_prep_idle("Dyntick with lazy callbacks");
- }
- tp = &rdtp->idle_gp_timer;
- mod_timer_pinned(tp, rdtp->idle_gp_timer_expires);
- rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
- return; /* Nothing more to do immediately. */
- } else if (--(rdtp->dyntick_drain) <= 0) {
- /* We have hit the limit, so time to give up. */
- rdtp->dyntick_holdoff = jiffies;
- trace_rcu_prep_idle("Begin holdoff");
- invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */
+ if (rcu_is_nocb_cpu(cpu))
return;
- }
-
- /*
- * Do one step of pushing the remaining RCU callbacks through
- * the RCU core state machine.
- */
-#ifdef CONFIG_TREE_PREEMPT_RCU
- if (per_cpu(rcu_preempt_data, cpu).nxtlist) {
- rcu_preempt_qs(cpu);
- force_quiescent_state(&rcu_preempt_state);
- }
-#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
- if (per_cpu(rcu_sched_data, cpu).nxtlist) {
- rcu_sched_qs(cpu);
- force_quiescent_state(&rcu_sched_state);
- }
- if (per_cpu(rcu_bh_data, cpu).nxtlist) {
- rcu_bh_qs(cpu);
- force_quiescent_state(&rcu_bh_state);
- }
-
- /*
- * If RCU callbacks are still pending, RCU still needs this CPU.
- * So try forcing the callbacks through the grace period.
- */
- if (rcu_cpu_has_callbacks(cpu)) {
- trace_rcu_prep_idle("More callbacks");
- invoke_rcu_core();
- } else {
- trace_rcu_prep_idle("Callbacks drained");
+ rcu_try_advance_all_cbs();
+ for_each_rcu_flavor(rsp) {
+ rdp = per_cpu_ptr(rsp->rda, cpu);
+ if (cpu_has_callbacks_ready_to_invoke(rdp))
+ invoke_rcu_core();
}
}
@@ -2015,16 +1862,13 @@ early_initcall(rcu_register_oom_notifier);
static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
{
struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
- struct timer_list *tltp = &rdtp->idle_gp_timer;
- char c;
+ unsigned long nlpd = rdtp->nonlazy_posted - rdtp->nonlazy_posted_snap;
- c = rdtp->dyntick_holdoff == jiffies ? 'H' : '.';
- if (timer_pending(tltp))
- sprintf(cp, "drain=%d %c timer=%lu",
- rdtp->dyntick_drain, c, tltp->expires - jiffies);
- else
- sprintf(cp, "drain=%d %c timer not pending",
- rdtp->dyntick_drain, c);
+ sprintf(cp, "last_accelerate: %04lx/%04lx, nonlazy_posted: %ld, %c%c",
+ rdtp->last_accelerate & 0xffff, jiffies & 0xffff,
+ ulong2long(nlpd),
+ rdtp->all_lazy ? 'L' : '.',
+ rdtp->tick_nohz_enabled_snap ? '.' : 'D');
}
#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */
@@ -2070,10 +1914,11 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
ticks_value = rsp->gpnum - rdp->gpnum;
}
print_cpu_stall_fast_no_hz(fast_no_hz, cpu);
- printk(KERN_ERR "\t%d: (%lu %s) idle=%03x/%llx/%d %s\n",
+ printk(KERN_ERR "\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u %s\n",
cpu, ticks_value, ticks_title,
atomic_read(&rdtp->dynticks) & 0xfff,
rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting,
+ rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu),
fast_no_hz);
}
@@ -2087,6 +1932,7 @@ static void print_cpu_stall_info_end(void)
static void zero_cpu_stall_ticks(struct rcu_data *rdp)
{
rdp->ticks_this_gp = 0;
+ rdp->softirq_snap = kstat_softirqs_cpu(RCU_SOFTIRQ, smp_processor_id());
}
/* Increment ->ticks_this_gp for all flavors of RCU. */
@@ -2165,8 +2011,49 @@ static int __init parse_rcu_nocb_poll(char *arg)
}
early_param("rcu_nocb_poll", parse_rcu_nocb_poll);
+/*
+ * Do any no-CBs CPUs need another grace period?
+ *
+ * Interrupts must be disabled. If the caller does not hold the root
+ * rnp_node structure's ->lock, the results are advisory only.
+ */
+static int rcu_nocb_needs_gp(struct rcu_state *rsp)
+{
+ struct rcu_node *rnp = rcu_get_root(rsp);
+
+ return rnp->need_future_gp[(ACCESS_ONCE(rnp->completed) + 1) & 0x1];
+}
+
+/*
+ * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended
+ * grace period.
+ */
+static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
+{
+ wake_up_all(&rnp->nocb_gp_wq[rnp->completed & 0x1]);
+}
+
+/*
+ * Set the root rcu_node structure's ->need_future_gp field
+ * based on the sum of those of all rcu_node structures. This does
+ * double-count the root rcu_node structure's requests, but this
+ * is necessary to handle the possibility of a rcu_nocb_kthread()
+ * having awakened during the time that the rcu_node structures
+ * were being updated for the end of the previous grace period.
+ */
+static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
+{
+ rnp->need_future_gp[(rnp->completed + 1) & 0x1] += nrq;
+}
+
+static void rcu_init_one_nocb(struct rcu_node *rnp)
+{
+ init_waitqueue_head(&rnp->nocb_gp_wq[0]);
+ init_waitqueue_head(&rnp->nocb_gp_wq[1]);
+}
+
/* Is the specified CPU a no-CPUs CPU? */
-static bool is_nocb_cpu(int cpu)
+bool rcu_is_nocb_cpu(int cpu)
{
if (have_rcu_nocb_mask)
return cpumask_test_cpu(cpu, rcu_nocb_mask);
@@ -2224,9 +2111,16 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
bool lazy)
{
- if (!is_nocb_cpu(rdp->cpu))
+ if (!rcu_is_nocb_cpu(rdp->cpu))
return 0;
__call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy);
+ if (__is_kfree_rcu_offset((unsigned long)rhp->func))
+ trace_rcu_kfree_callback(rdp->rsp->name, rhp,
+ (unsigned long)rhp->func,
+ rdp->qlen_lazy, rdp->qlen);
+ else
+ trace_rcu_callback(rdp->rsp->name, rhp,
+ rdp->qlen_lazy, rdp->qlen);
return 1;
}
@@ -2241,7 +2135,7 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
long qll = rsp->qlen_lazy;
/* If this is not a no-CBs CPU, tell the caller to do it the old way. */
- if (!is_nocb_cpu(smp_processor_id()))
+ if (!rcu_is_nocb_cpu(smp_processor_id()))
return 0;
rsp->qlen = 0;
rsp->qlen_lazy = 0;
@@ -2265,95 +2159,36 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
}
/*
- * There must be at least one non-no-CBs CPU in operation at any given
- * time, because no-CBs CPUs are not capable of initiating grace periods
- * independently. This function therefore complains if the specified
- * CPU is the last non-no-CBs CPU, allowing the CPU-hotplug system to
- * avoid offlining the last such CPU. (Recursion is a wonderful thing,
- * but you have to have a base case!)
+ * If necessary, kick off a new grace period, and either way wait
+ * for a subsequent grace period to complete.
*/
-static bool nocb_cpu_expendable(int cpu)
+static void rcu_nocb_wait_gp(struct rcu_data *rdp)
{
- cpumask_var_t non_nocb_cpus;
- int ret;
+ unsigned long c;
+ bool d;
+ unsigned long flags;
+ struct rcu_node *rnp = rdp->mynode;
+
+ raw_spin_lock_irqsave(&rnp->lock, flags);
+ c = rcu_start_future_gp(rnp, rdp);
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
/*
- * If there are no no-CB CPUs or if this CPU is not a no-CB CPU,
- * then offlining this CPU is harmless. Let it happen.
+ * Wait for the grace period. Do so interruptibly to avoid messing
+ * up the load average.
*/
- if (!have_rcu_nocb_mask || is_nocb_cpu(cpu))
- return 1;
-
- /* If no memory, play it safe and keep the CPU around. */
- if (!alloc_cpumask_var(&non_nocb_cpus, GFP_NOIO))
- return 0;
- cpumask_andnot(non_nocb_cpus, cpu_online_mask, rcu_nocb_mask);
- cpumask_clear_cpu(cpu, non_nocb_cpus);
- ret = !cpumask_empty(non_nocb_cpus);
- free_cpumask_var(non_nocb_cpus);
- return ret;
-}
-
-/*
- * Helper structure for remote registry of RCU callbacks.
- * This is needed for when a no-CBs CPU needs to start a grace period.
- * If it just invokes call_rcu(), the resulting callback will be queued,
- * which can result in deadlock.
- */
-struct rcu_head_remote {
- struct rcu_head *rhp;
- call_rcu_func_t *crf;
- void (*func)(struct rcu_head *rhp);
-};
-
-/*
- * Register a callback as specified by the rcu_head_remote struct.
- * This function is intended to be invoked via smp_call_function_single().
- */
-static void call_rcu_local(void *arg)
-{
- struct rcu_head_remote *rhrp =
- container_of(arg, struct rcu_head_remote, rhp);
-
- rhrp->crf(rhrp->rhp, rhrp->func);
-}
-
-/*
- * Set up an rcu_head_remote structure and the invoke call_rcu_local()
- * on CPU 0 (which is guaranteed to be a non-no-CBs CPU) via
- * smp_call_function_single().
- */
-static void invoke_crf_remote(struct rcu_head *rhp,
- void (*func)(struct rcu_head *rhp),
- call_rcu_func_t crf)
-{
- struct rcu_head_remote rhr;
-
- rhr.rhp = rhp;
- rhr.crf = crf;
- rhr.func = func;
- smp_call_function_single(0, call_rcu_local, &rhr, 1);
-}
-
-/*
- * Helper functions to be passed to wait_rcu_gp(), each of which
- * invokes invoke_crf_remote() to register a callback appropriately.
- */
-static void __maybe_unused
-call_rcu_preempt_remote(struct rcu_head *rhp,
- void (*func)(struct rcu_head *rhp))
-{
- invoke_crf_remote(rhp, func, call_rcu);
-}
-static void call_rcu_bh_remote(struct rcu_head *rhp,
- void (*func)(struct rcu_head *rhp))
-{
- invoke_crf_remote(rhp, func, call_rcu_bh);
-}
-static void call_rcu_sched_remote(struct rcu_head *rhp,
- void (*func)(struct rcu_head *rhp))
-{
- invoke_crf_remote(rhp, func, call_rcu_sched);
+ trace_rcu_future_gp(rnp, rdp, c, "StartWait");
+ for (;;) {
+ wait_event_interruptible(
+ rnp->nocb_gp_wq[c & 0x1],
+ (d = ULONG_CMP_GE(ACCESS_ONCE(rnp->completed), c)));
+ if (likely(d))
+ break;
+ flush_signals(current);
+ trace_rcu_future_gp(rnp, rdp, c, "ResumeWait");
+ }
+ trace_rcu_future_gp(rnp, rdp, c, "EndWait");
+ smp_mb(); /* Ensure that CB invocation happens after GP end. */
}
/*
@@ -2390,7 +2225,7 @@ static int rcu_nocb_kthread(void *arg)
cl = atomic_long_xchg(&rdp->nocb_q_count_lazy, 0);
ACCESS_ONCE(rdp->nocb_p_count) += c;
ACCESS_ONCE(rdp->nocb_p_count_lazy) += cl;
- wait_rcu_gp(rdp->rsp->call_remote);
+ rcu_nocb_wait_gp(rdp);
/* Each pass through the following loop invokes a callback. */
trace_rcu_batch_start(rdp->rsp->name, cl, c, -1);
@@ -2436,36 +2271,40 @@ static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp)
return;
for_each_cpu(cpu, rcu_nocb_mask) {
rdp = per_cpu_ptr(rsp->rda, cpu);
- t = kthread_run(rcu_nocb_kthread, rdp, "rcuo%d", cpu);
+ t = kthread_run(rcu_nocb_kthread, rdp,
+ "rcuo%c/%d", rsp->abbr, cpu);
BUG_ON(IS_ERR(t));
ACCESS_ONCE(rdp->nocb_kthread) = t;
}
}
/* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */
-static void init_nocb_callback_list(struct rcu_data *rdp)
+static bool init_nocb_callback_list(struct rcu_data *rdp)
{
if (rcu_nocb_mask == NULL ||
!cpumask_test_cpu(rdp->cpu, rcu_nocb_mask))
- return;
+ return false;
rdp->nxttail[RCU_NEXT_TAIL] = NULL;
+ return true;
}
-/* Initialize the ->call_remote fields in the rcu_state structures. */
-static void __init rcu_init_nocb(void)
+#else /* #ifdef CONFIG_RCU_NOCB_CPU */
+
+static int rcu_nocb_needs_gp(struct rcu_state *rsp)
{
-#ifdef CONFIG_PREEMPT_RCU
- rcu_preempt_state.call_remote = call_rcu_preempt_remote;
-#endif /* #ifdef CONFIG_PREEMPT_RCU */
- rcu_bh_state.call_remote = call_rcu_bh_remote;
- rcu_sched_state.call_remote = call_rcu_sched_remote;
+ return 0;
}
-#else /* #ifdef CONFIG_RCU_NOCB_CPU */
+static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
+{
+}
-static bool is_nocb_cpu(int cpu)
+static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
+{
+}
+
+static void rcu_init_one_nocb(struct rcu_node *rnp)
{
- return false;
}
static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
@@ -2480,11 +2319,6 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
return 0;
}
-static bool nocb_cpu_expendable(int cpu)
-{
- return 1;
-}
-
static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
{
}
@@ -2493,12 +2327,26 @@ static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp)
{
}
-static void init_nocb_callback_list(struct rcu_data *rdp)
+static bool init_nocb_callback_list(struct rcu_data *rdp)
{
+ return false;
}
-static void __init rcu_init_nocb(void)
+#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
+
+/*
+ * An adaptive-ticks CPU can potentially execute in kernel mode for an
+ * arbitrarily long period of time with the scheduling-clock tick turned
+ * off. RCU will be paying attention to this CPU because it is in the
+ * kernel, but the CPU cannot be guaranteed to be executing the RCU state
+ * machine because the scheduling-clock tick has been disabled. Therefore,
+ * if an adaptive-ticks CPU is failing to respond to the current grace
+ * period and has not be idle from an RCU perspective, kick it.
+ */
+static void rcu_kick_nohz_cpu(int cpu)
{
+#ifdef CONFIG_NO_HZ_FULL
+ if (tick_nohz_full_cpu(cpu))
+ smp_send_reschedule(cpu);
+#endif /* #ifdef CONFIG_NO_HZ_FULL */
}
-
-#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 0d095dcaa67..cf6c1741293 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -46,8 +46,6 @@
#define RCU_TREE_NONCORE
#include "rcutree.h"
-#define ulong2long(a) (*(long *)(&(a)))
-
static int r_open(struct inode *inode, struct file *file,
const struct seq_operations *op)
{
@@ -97,7 +95,7 @@ static const struct file_operations rcubarrier_fops = {
.open = rcubarrier_open,
.read = seq_read,
.llseek = no_llseek,
- .release = seq_release,
+ .release = single_release,
};
#ifdef CONFIG_RCU_BOOST
@@ -208,7 +206,7 @@ static const struct file_operations rcuexp_fops = {
.open = rcuexp_open,
.read = seq_read,
.llseek = no_llseek,
- .release = seq_release,
+ .release = single_release,
};
#ifdef CONFIG_RCU_BOOST
@@ -308,7 +306,7 @@ static const struct file_operations rcuhier_fops = {
.open = rcuhier_open,
.read = seq_read,
.llseek = no_llseek,
- .release = seq_release,
+ .release = single_release,
};
static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp)
@@ -350,7 +348,7 @@ static const struct file_operations rcugp_fops = {
.open = rcugp_open,
.read = seq_read,
.llseek = no_llseek,
- .release = seq_release,
+ .release = single_release,
};
static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp)
diff --git a/kernel/relay.c b/kernel/relay.c
index 01ab081ac53..b91488ba2e5 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -234,7 +234,6 @@ static void relay_destroy_buf(struct rchan_buf *buf)
static void relay_remove_buf(struct kref *kref)
{
struct rchan_buf *buf = container_of(kref, struct rchan_buf, kref);
- buf->chan->cb->remove_buf_file(buf->dentry);
relay_destroy_buf(buf);
}
@@ -484,6 +483,7 @@ static void relay_close_buf(struct rchan_buf *buf)
{
buf->finalized = 1;
del_timer_sync(&buf->timer);
+ buf->chan->cb->remove_buf_file(buf->dentry);
kref_put(&buf->kref, relay_remove_buf);
}
@@ -588,7 +588,7 @@ struct rchan *relay_open(const char *base_filename,
chan->version = RELAYFS_CHANNEL_VERSION;
chan->n_subbufs = n_subbufs;
chan->subbuf_size = subbuf_size;
- chan->alloc_size = FIX_SIZE(subbuf_size * n_subbufs);
+ chan->alloc_size = PAGE_ALIGN(subbuf_size * n_subbufs);
chan->parent = parent;
chan->private_data = private_data;
if (base_filename) {
@@ -1099,8 +1099,7 @@ static size_t relay_file_read_end_pos(struct rchan_buf *buf,
static int subbuf_read_actor(size_t read_start,
struct rchan_buf *buf,
size_t avail,
- read_descriptor_t *desc,
- read_actor_t actor)
+ read_descriptor_t *desc)
{
void *from;
int ret = 0;
@@ -1121,15 +1120,13 @@ static int subbuf_read_actor(size_t read_start,
typedef int (*subbuf_actor_t) (size_t read_start,
struct rchan_buf *buf,
size_t avail,
- read_descriptor_t *desc,
- read_actor_t actor);
+ read_descriptor_t *desc);
/*
* relay_file_read_subbufs - read count bytes, bridging subbuf boundaries
*/
static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos,
subbuf_actor_t subbuf_actor,
- read_actor_t actor,
read_descriptor_t *desc)
{
struct rchan_buf *buf = filp->private_data;
@@ -1150,7 +1147,7 @@ static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos,
break;
avail = min(desc->count, avail);
- ret = subbuf_actor(read_start, buf, avail, desc, actor);
+ ret = subbuf_actor(read_start, buf, avail, desc);
if (desc->error < 0)
break;
@@ -1174,8 +1171,7 @@ static ssize_t relay_file_read(struct file *filp,
desc.count = count;
desc.arg.buf = buffer;
desc.error = 0;
- return relay_file_read_subbufs(filp, ppos, subbuf_read_actor,
- NULL, &desc);
+ return relay_file_read_subbufs(filp, ppos, subbuf_read_actor, &desc);
}
static void relay_consume_bytes(struct rchan_buf *rbuf, int bytes_consumed)
diff --git a/kernel/rwsem.c b/kernel/rwsem.c
index b3c6c3fcd84..cfff1435bdf 100644
--- a/kernel/rwsem.c
+++ b/kernel/rwsem.c
@@ -126,6 +126,15 @@ void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest)
EXPORT_SYMBOL(_down_write_nest_lock);
+void down_read_non_owner(struct rw_semaphore *sem)
+{
+ might_sleep();
+
+ __down_read(sem);
+}
+
+EXPORT_SYMBOL(down_read_non_owner);
+
void down_write_nested(struct rw_semaphore *sem, int subclass)
{
might_sleep();
@@ -136,6 +145,13 @@ void down_write_nested(struct rw_semaphore *sem, int subclass)
EXPORT_SYMBOL(down_write_nested);
+void up_read_non_owner(struct rw_semaphore *sem)
+{
+ __up_read(sem);
+}
+
+EXPORT_SYMBOL(up_read_non_owner);
+
#endif
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index f06d249e103..deaf90e4a1d 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -16,3 +16,4 @@ obj-$(CONFIG_SMP) += cpupri.o
obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
obj-$(CONFIG_SCHEDSTATS) += stats.o
obj-$(CONFIG_SCHED_DEBUG) += debug.o
+obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d8285eb0cde..58453b8272f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -512,11 +512,6 @@ static inline void init_hrtick(void)
* the target CPU.
*/
#ifdef CONFIG_SMP
-
-#ifndef tsk_is_polling
-#define tsk_is_polling(t) 0
-#endif
-
void resched_task(struct task_struct *p)
{
int cpu;
@@ -549,7 +544,7 @@ void resched_cpu(int cpu)
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
-#ifdef CONFIG_NO_HZ
+#ifdef CONFIG_NO_HZ_COMMON
/*
* In the semi idle case, use the nearest busy cpu for migrating timers
* from an idle cpu. This is good for power-savings.
@@ -587,7 +582,7 @@ unlock:
* account when the CPU goes back to idle and evaluates the timer
* wheel for the next timer event.
*/
-void wake_up_idle_cpu(int cpu)
+static void wake_up_idle_cpu(int cpu)
{
struct rq *rq = cpu_rq(cpu);
@@ -617,20 +612,56 @@ void wake_up_idle_cpu(int cpu)
smp_send_reschedule(cpu);
}
+static bool wake_up_full_nohz_cpu(int cpu)
+{
+ if (tick_nohz_full_cpu(cpu)) {
+ if (cpu != smp_processor_id() ||
+ tick_nohz_tick_stopped())
+ smp_send_reschedule(cpu);
+ return true;
+ }
+
+ return false;
+}
+
+void wake_up_nohz_cpu(int cpu)
+{
+ if (!wake_up_full_nohz_cpu(cpu))
+ wake_up_idle_cpu(cpu);
+}
+
static inline bool got_nohz_idle_kick(void)
{
int cpu = smp_processor_id();
return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
}
-#else /* CONFIG_NO_HZ */
+#else /* CONFIG_NO_HZ_COMMON */
static inline bool got_nohz_idle_kick(void)
{
return false;
}
-#endif /* CONFIG_NO_HZ */
+#endif /* CONFIG_NO_HZ_COMMON */
+
+#ifdef CONFIG_NO_HZ_FULL
+bool sched_can_stop_tick(void)
+{
+ struct rq *rq;
+
+ rq = this_rq();
+
+ /* Make sure rq->nr_running update is visible after the IPI */
+ smp_rmb();
+
+ /* More than one running task need preemption */
+ if (rq->nr_running > 1)
+ return false;
+
+ return true;
+}
+#endif /* CONFIG_NO_HZ_FULL */
void sched_avg_update(struct rq *rq)
{
@@ -1288,8 +1319,8 @@ static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
static void
ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
{
- trace_sched_wakeup(p, true);
check_preempt_curr(rq, p, wake_flags);
+ trace_sched_wakeup(p, true);
p->state = TASK_RUNNING;
#ifdef CONFIG_SMP
@@ -1362,7 +1393,8 @@ static void sched_ttwu_pending(void)
void scheduler_ipi(void)
{
- if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
+ if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()
+ && !tick_nohz_full_cpu(smp_processor_id()))
return;
/*
@@ -1379,6 +1411,7 @@ void scheduler_ipi(void)
* somewhat pessimize the simple resched case.
*/
irq_enter();
+ tick_nohz_full_check();
sched_ttwu_pending();
/*
@@ -1860,6 +1893,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
kprobe_flush_task(prev);
put_task_struct(prev);
}
+
+ tick_nohz_task_switch(current);
}
#ifdef CONFIG_SMP
@@ -2123,7 +2158,7 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
return load >> FSHIFT;
}
-#ifdef CONFIG_NO_HZ
+#ifdef CONFIG_NO_HZ_COMMON
/*
* Handle NO_HZ for the global load-average.
*
@@ -2349,12 +2384,12 @@ static void calc_global_nohz(void)
smp_wmb();
calc_load_idx++;
}
-#else /* !CONFIG_NO_HZ */
+#else /* !CONFIG_NO_HZ_COMMON */
static inline long calc_load_fold_idle(void) { return 0; }
static inline void calc_global_nohz(void) { }
-#endif /* CONFIG_NO_HZ */
+#endif /* CONFIG_NO_HZ_COMMON */
/*
* calc_load - update the avenrun load estimates 10 ticks after the
@@ -2514,7 +2549,7 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
sched_avg_update(this_rq);
}
-#ifdef CONFIG_NO_HZ
+#ifdef CONFIG_NO_HZ_COMMON
/*
* There is no sane way to deal with nohz on smp when using jiffies because the
* cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
@@ -2574,7 +2609,7 @@ void update_cpu_load_nohz(void)
}
raw_spin_unlock(&this_rq->lock);
}
-#endif /* CONFIG_NO_HZ */
+#endif /* CONFIG_NO_HZ_COMMON */
/*
* Called from scheduler_tick()
@@ -2701,8 +2736,35 @@ void scheduler_tick(void)
rq->idle_balance = idle_cpu(cpu);
trigger_load_balance(rq, cpu);
#endif
+ rq_last_tick_reset(rq);
}
+#ifdef CONFIG_NO_HZ_FULL
+/**
+ * scheduler_tick_max_deferment
+ *
+ * Keep at least one tick per second when a single
+ * active task is running because the scheduler doesn't
+ * yet completely support full dynticks environment.
+ *
+ * This makes sure that uptime, CFS vruntime, load
+ * balancing, etc... continue to move forward, even
+ * with a very low granularity.
+ */
+u64 scheduler_tick_max_deferment(void)
+{
+ struct rq *rq = this_rq();
+ unsigned long next, now = ACCESS_ONCE(jiffies);
+
+ next = rq->last_sched_tick + HZ;
+
+ if (time_before_eq(next, now))
+ return 0;
+
+ return jiffies_to_usecs(next - now) * NSEC_PER_USEC;
+}
+#endif
+
notrace unsigned long get_parent_ip(unsigned long addr)
{
if (in_lock_functions(addr)) {
@@ -3039,11 +3101,13 @@ EXPORT_SYMBOL(preempt_schedule);
asmlinkage void __sched preempt_schedule_irq(void)
{
struct thread_info *ti = current_thread_info();
+ enum ctx_state prev_state;
/* Catch callers which need to be fixed */
BUG_ON(ti->preempt_count || !irqs_disabled());
- user_exit();
+ prev_state = exception_enter();
+
do {
add_preempt_count(PREEMPT_ACTIVE);
local_irq_enable();
@@ -3057,6 +3121,8 @@ asmlinkage void __sched preempt_schedule_irq(void)
*/
barrier();
} while (need_resched());
+
+ exception_exit(prev_state);
}
#endif /* CONFIG_PREEMPT */
@@ -4587,6 +4653,7 @@ void sched_show_task(struct task_struct *p)
task_pid_nr(p), ppid,
(unsigned long)task_thread_info(p)->flags);
+ print_worker_info(KERN_INFO, p);
show_stack(p, NULL);
}
@@ -6204,7 +6271,7 @@ static void sched_init_numa(void)
* 'level' contains the number of unique distances, excluding the
* identity distance node_distance(i,i).
*
- * The sched_domains_nume_distance[] array includes the actual distance
+ * The sched_domains_numa_distance[] array includes the actual distance
* numbers.
*/
@@ -6817,11 +6884,15 @@ int in_sched_functions(unsigned long addr)
}
#ifdef CONFIG_CGROUP_SCHED
+/*
+ * Default task group.
+ * Every task in system belongs to this group at bootup.
+ */
struct task_group root_task_group;
LIST_HEAD(task_groups);
#endif
-DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
+DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
void __init sched_init(void)
{
@@ -6858,7 +6929,7 @@ void __init sched_init(void)
#endif /* CONFIG_RT_GROUP_SCHED */
#ifdef CONFIG_CPUMASK_OFFSTACK
for_each_possible_cpu(i) {
- per_cpu(load_balance_tmpmask, i) = (void *)ptr;
+ per_cpu(load_balance_mask, i) = (void *)ptr;
ptr += cpumask_size();
}
#endif /* CONFIG_CPUMASK_OFFSTACK */
@@ -6884,12 +6955,6 @@ void __init sched_init(void)
#endif /* CONFIG_CGROUP_SCHED */
-#ifdef CONFIG_CGROUP_CPUACCT
- root_cpuacct.cpustat = &kernel_cpustat;
- root_cpuacct.cpuusage = alloc_percpu(u64);
- /* Too early, not expected to fail */
- BUG_ON(!root_cpuacct.cpuusage);
-#endif
for_each_possible_cpu(i) {
struct rq *rq;
@@ -6953,9 +7018,12 @@ void __init sched_init(void)
INIT_LIST_HEAD(&rq->cfs_tasks);
rq_attach_root(rq, &def_root_domain);
-#ifdef CONFIG_NO_HZ
+#ifdef CONFIG_NO_HZ_COMMON
rq->nohz_flags = 0;
#endif
+#ifdef CONFIG_NO_HZ_FULL
+ rq->last_sched_tick = 0;
+#endif
#endif
init_rq_hrtick(rq);
atomic_set(&rq->nr_iowait, 0);
@@ -7411,7 +7479,7 @@ unlock:
return err;
}
-int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
+static int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
{
u64 rt_runtime, rt_period;
@@ -7423,7 +7491,7 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
}
-long sched_group_rt_runtime(struct task_group *tg)
+static long sched_group_rt_runtime(struct task_group *tg)
{
u64 rt_runtime_us;
@@ -7435,7 +7503,7 @@ long sched_group_rt_runtime(struct task_group *tg)
return rt_runtime_us;
}
-int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
+static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
{
u64 rt_runtime, rt_period;
@@ -7448,7 +7516,7 @@ int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
}
-long sched_group_rt_period(struct task_group *tg)
+static long sched_group_rt_period(struct task_group *tg)
{
u64 rt_period_us;
@@ -7483,7 +7551,7 @@ static int sched_rt_global_constraints(void)
return ret;
}
-int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
+static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
{
/* Don't accept realtime tasks when there is no way for them to run */
if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
@@ -7991,226 +8059,6 @@ struct cgroup_subsys cpu_cgroup_subsys = {
#endif /* CONFIG_CGROUP_SCHED */
-#ifdef CONFIG_CGROUP_CPUACCT
-
-/*
- * CPU accounting code for task groups.
- *
- * Based on the work by Paul Menage (menage@google.com) and Balbir Singh
- * (balbir@in.ibm.com).
- */
-
-struct cpuacct root_cpuacct;
-
-/* create a new cpu accounting group */
-static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp)
-{
- struct cpuacct *ca;
-
- if (!cgrp->parent)
- return &root_cpuacct.css;
-
- ca = kzalloc(sizeof(*ca), GFP_KERNEL);
- if (!ca)
- goto out;
-
- ca->cpuusage = alloc_percpu(u64);
- if (!ca->cpuusage)
- goto out_free_ca;
-
- ca->cpustat = alloc_percpu(struct kernel_cpustat);
- if (!ca->cpustat)
- goto out_free_cpuusage;
-
- return &ca->css;
-
-out_free_cpuusage:
- free_percpu(ca->cpuusage);
-out_free_ca:
- kfree(ca);
-out:
- return ERR_PTR(-ENOMEM);
-}
-
-/* destroy an existing cpu accounting group */
-static void cpuacct_css_free(struct cgroup *cgrp)
-{
- struct cpuacct *ca = cgroup_ca(cgrp);
-
- free_percpu(ca->cpustat);
- free_percpu(ca->cpuusage);
- kfree(ca);
-}
-
-static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
-{
- u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
- u64 data;
-
-#ifndef CONFIG_64BIT
- /*
- * Take rq->lock to make 64-bit read safe on 32-bit platforms.
- */
- raw_spin_lock_irq(&cpu_rq(cpu)->lock);
- data = *cpuusage;
- raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
-#else
- data = *cpuusage;
-#endif
-
- return data;
-}
-
-static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
-{
- u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
-
-#ifndef CONFIG_64BIT
- /*
- * Take rq->lock to make 64-bit write safe on 32-bit platforms.
- */
- raw_spin_lock_irq(&cpu_rq(cpu)->lock);
- *cpuusage = val;
- raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
-#else
- *cpuusage = val;
-#endif
-}
-
-/* return total cpu usage (in nanoseconds) of a group */
-static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
-{
- struct cpuacct *ca = cgroup_ca(cgrp);
- u64 totalcpuusage = 0;
- int i;
-
- for_each_present_cpu(i)
- totalcpuusage += cpuacct_cpuusage_read(ca, i);
-
- return totalcpuusage;
-}
-
-static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
- u64 reset)
-{
- struct cpuacct *ca = cgroup_ca(cgrp);
- int err = 0;
- int i;
-
- if (reset) {
- err = -EINVAL;
- goto out;
- }
-
- for_each_present_cpu(i)
- cpuacct_cpuusage_write(ca, i, 0);
-
-out:
- return err;
-}
-
-static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
- struct seq_file *m)
-{
- struct cpuacct *ca = cgroup_ca(cgroup);
- u64 percpu;
- int i;
-
- for_each_present_cpu(i) {
- percpu = cpuacct_cpuusage_read(ca, i);
- seq_printf(m, "%llu ", (unsigned long long) percpu);
- }
- seq_printf(m, "\n");
- return 0;
-}
-
-static const char *cpuacct_stat_desc[] = {
- [CPUACCT_STAT_USER] = "user",
- [CPUACCT_STAT_SYSTEM] = "system",
-};
-
-static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
- struct cgroup_map_cb *cb)
-{
- struct cpuacct *ca = cgroup_ca(cgrp);
- int cpu;
- s64 val = 0;
-
- for_each_online_cpu(cpu) {
- struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
- val += kcpustat->cpustat[CPUTIME_USER];
- val += kcpustat->cpustat[CPUTIME_NICE];
- }
- val = cputime64_to_clock_t(val);
- cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val);
-
- val = 0;
- for_each_online_cpu(cpu) {
- struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
- val += kcpustat->cpustat[CPUTIME_SYSTEM];
- val += kcpustat->cpustat[CPUTIME_IRQ];
- val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
- }
-
- val = cputime64_to_clock_t(val);
- cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
-
- return 0;
-}
-
-static struct cftype files[] = {
- {
- .name = "usage",
- .read_u64 = cpuusage_read,
- .write_u64 = cpuusage_write,
- },
- {
- .name = "usage_percpu",
- .read_seq_string = cpuacct_percpu_seq_read,
- },
- {
- .name = "stat",
- .read_map = cpuacct_stats_show,
- },
- { } /* terminate */
-};
-
-/*
- * charge this task's execution time to its accounting group.
- *
- * called with rq->lock held.
- */
-void cpuacct_charge(struct task_struct *tsk, u64 cputime)
-{
- struct cpuacct *ca;
- int cpu;
-
- if (unlikely(!cpuacct_subsys.active))
- return;
-
- cpu = task_cpu(tsk);
-
- rcu_read_lock();
-
- ca = task_ca(tsk);
-
- for (; ca; ca = parent_ca(ca)) {
- u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
- *cpuusage += cputime;
- }
-
- rcu_read_unlock();
-}
-
-struct cgroup_subsys cpuacct_subsys = {
- .name = "cpuacct",
- .css_alloc = cpuacct_css_alloc,
- .css_free = cpuacct_css_free,
- .subsys_id = cpuacct_subsys_id,
- .base_cftypes = files,
-};
-#endif /* CONFIG_CGROUP_CPUACCT */
-
void dump_cpu_task(int cpu)
{
pr_info("Task dump for CPU %d:\n", cpu);
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
new file mode 100644
index 00000000000..dbb7e2cd95e
--- /dev/null
+++ b/kernel/sched/cpuacct.c
@@ -0,0 +1,296 @@
+#include <linux/cgroup.h>
+#include <linux/slab.h>
+#include <linux/percpu.h>
+#include <linux/spinlock.h>
+#include <linux/cpumask.h>
+#include <linux/seq_file.h>
+#include <linux/rcupdate.h>
+#include <linux/kernel_stat.h>
+#include <linux/err.h>
+
+#include "sched.h"
+
+/*
+ * CPU accounting code for task groups.
+ *
+ * Based on the work by Paul Menage (menage@google.com) and Balbir Singh
+ * (balbir@in.ibm.com).
+ */
+
+/* Time spent by the tasks of the cpu accounting group executing in ... */
+enum cpuacct_stat_index {
+ CPUACCT_STAT_USER, /* ... user mode */
+ CPUACCT_STAT_SYSTEM, /* ... kernel mode */
+
+ CPUACCT_STAT_NSTATS,
+};
+
+/* track cpu usage of a group of tasks and its child groups */
+struct cpuacct {
+ struct cgroup_subsys_state css;
+ /* cpuusage holds pointer to a u64-type object on every cpu */
+ u64 __percpu *cpuusage;
+ struct kernel_cpustat __percpu *cpustat;
+};
+
+/* return cpu accounting group corresponding to this container */
+static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
+{
+ return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
+ struct cpuacct, css);
+}
+
+/* return cpu accounting group to which this task belongs */
+static inline struct cpuacct *task_ca(struct task_struct *tsk)
+{
+ return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
+ struct cpuacct, css);
+}
+
+static inline struct cpuacct *__parent_ca(struct cpuacct *ca)
+{
+ return cgroup_ca(ca->css.cgroup->parent);
+}
+
+static inline struct cpuacct *parent_ca(struct cpuacct *ca)
+{
+ if (!ca->css.cgroup->parent)
+ return NULL;
+ return cgroup_ca(ca->css.cgroup->parent);
+}
+
+static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage);
+static struct cpuacct root_cpuacct = {
+ .cpustat = &kernel_cpustat,
+ .cpuusage = &root_cpuacct_cpuusage,
+};
+
+/* create a new cpu accounting group */
+static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp)
+{
+ struct cpuacct *ca;
+
+ if (!cgrp->parent)
+ return &root_cpuacct.css;
+
+ ca = kzalloc(sizeof(*ca), GFP_KERNEL);
+ if (!ca)
+ goto out;
+
+ ca->cpuusage = alloc_percpu(u64);
+ if (!ca->cpuusage)
+ goto out_free_ca;
+
+ ca->cpustat = alloc_percpu(struct kernel_cpustat);
+ if (!ca->cpustat)
+ goto out_free_cpuusage;
+
+ return &ca->css;
+
+out_free_cpuusage:
+ free_percpu(ca->cpuusage);
+out_free_ca:
+ kfree(ca);
+out:
+ return ERR_PTR(-ENOMEM);
+}
+
+/* destroy an existing cpu accounting group */
+static void cpuacct_css_free(struct cgroup *cgrp)
+{
+ struct cpuacct *ca = cgroup_ca(cgrp);
+
+ free_percpu(ca->cpustat);
+ free_percpu(ca->cpuusage);
+ kfree(ca);
+}
+
+static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
+{
+ u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
+ u64 data;
+
+#ifndef CONFIG_64BIT
+ /*
+ * Take rq->lock to make 64-bit read safe on 32-bit platforms.
+ */
+ raw_spin_lock_irq(&cpu_rq(cpu)->lock);
+ data = *cpuusage;
+ raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
+#else
+ data = *cpuusage;
+#endif
+
+ return data;
+}
+
+static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
+{
+ u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
+
+#ifndef CONFIG_64BIT
+ /*
+ * Take rq->lock to make 64-bit write safe on 32-bit platforms.
+ */
+ raw_spin_lock_irq(&cpu_rq(cpu)->lock);
+ *cpuusage = val;
+ raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
+#else
+ *cpuusage = val;
+#endif
+}
+
+/* return total cpu usage (in nanoseconds) of a group */
+static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
+{
+ struct cpuacct *ca = cgroup_ca(cgrp);
+ u64 totalcpuusage = 0;
+ int i;
+
+ for_each_present_cpu(i)
+ totalcpuusage += cpuacct_cpuusage_read(ca, i);
+
+ return totalcpuusage;
+}
+
+static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
+ u64 reset)
+{
+ struct cpuacct *ca = cgroup_ca(cgrp);
+ int err = 0;
+ int i;
+
+ if (reset) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ for_each_present_cpu(i)
+ cpuacct_cpuusage_write(ca, i, 0);
+
+out:
+ return err;
+}
+
+static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
+ struct seq_file *m)
+{
+ struct cpuacct *ca = cgroup_ca(cgroup);
+ u64 percpu;
+ int i;
+
+ for_each_present_cpu(i) {
+ percpu = cpuacct_cpuusage_read(ca, i);
+ seq_printf(m, "%llu ", (unsigned long long) percpu);
+ }
+ seq_printf(m, "\n");
+ return 0;
+}
+
+static const char * const cpuacct_stat_desc[] = {
+ [CPUACCT_STAT_USER] = "user",
+ [CPUACCT_STAT_SYSTEM] = "system",
+};
+
+static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
+ struct cgroup_map_cb *cb)
+{
+ struct cpuacct *ca = cgroup_ca(cgrp);
+ int cpu;
+ s64 val = 0;
+
+ for_each_online_cpu(cpu) {
+ struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
+ val += kcpustat->cpustat[CPUTIME_USER];
+ val += kcpustat->cpustat[CPUTIME_NICE];
+ }
+ val = cputime64_to_clock_t(val);
+ cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val);
+
+ val = 0;
+ for_each_online_cpu(cpu) {
+ struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
+ val += kcpustat->cpustat[CPUTIME_SYSTEM];
+ val += kcpustat->cpustat[CPUTIME_IRQ];
+ val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
+ }
+
+ val = cputime64_to_clock_t(val);
+ cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
+
+ return 0;
+}
+
+static struct cftype files[] = {
+ {
+ .name = "usage",
+ .read_u64 = cpuusage_read,
+ .write_u64 = cpuusage_write,
+ },
+ {
+ .name = "usage_percpu",
+ .read_seq_string = cpuacct_percpu_seq_read,
+ },
+ {
+ .name = "stat",
+ .read_map = cpuacct_stats_show,
+ },
+ { } /* terminate */
+};
+
+/*
+ * charge this task's execution time to its accounting group.
+ *
+ * called with rq->lock held.
+ */
+void cpuacct_charge(struct task_struct *tsk, u64 cputime)
+{
+ struct cpuacct *ca;
+ int cpu;
+
+ cpu = task_cpu(tsk);
+
+ rcu_read_lock();
+
+ ca = task_ca(tsk);
+
+ while (true) {
+ u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
+ *cpuusage += cputime;
+
+ ca = parent_ca(ca);
+ if (!ca)
+ break;
+ }
+
+ rcu_read_unlock();
+}
+
+/*
+ * Add user/system time to cpuacct.
+ *
+ * Note: it's the caller that updates the account of the root cgroup.
+ */
+void cpuacct_account_field(struct task_struct *p, int index, u64 val)
+{
+ struct kernel_cpustat *kcpustat;
+ struct cpuacct *ca;
+
+ rcu_read_lock();
+ ca = task_ca(p);
+ while (ca != &root_cpuacct) {
+ kcpustat = this_cpu_ptr(ca->cpustat);
+ kcpustat->cpustat[index] += val;
+ ca = __parent_ca(ca);
+ }
+ rcu_read_unlock();
+}
+
+struct cgroup_subsys cpuacct_subsys = {
+ .name = "cpuacct",
+ .css_alloc = cpuacct_css_alloc,
+ .css_free = cpuacct_css_free,
+ .subsys_id = cpuacct_subsys_id,
+ .base_cftypes = files,
+ .early_init = 1,
+};
diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h
new file mode 100644
index 00000000000..ed605624a5e
--- /dev/null
+++ b/kernel/sched/cpuacct.h
@@ -0,0 +1,17 @@
+#ifdef CONFIG_CGROUP_CPUACCT
+
+extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
+extern void cpuacct_account_field(struct task_struct *p, int index, u64 val);
+
+#else
+
+static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime)
+{
+}
+
+static inline void
+cpuacct_account_field(struct task_struct *p, int index, u64 val)
+{
+}
+
+#endif
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index e93cca92f38..cc2dc3eea8a 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -115,10 +115,6 @@ static int irqtime_account_si_update(void)
static inline void task_group_account_field(struct task_struct *p, int index,
u64 tmp)
{
-#ifdef CONFIG_CGROUP_CPUACCT
- struct kernel_cpustat *kcpustat;
- struct cpuacct *ca;
-#endif
/*
* Since all updates are sure to touch the root cgroup, we
* get ourselves ahead and touch it first. If the root cgroup
@@ -127,19 +123,7 @@ static inline void task_group_account_field(struct task_struct *p, int index,
*/
__get_cpu_var(kernel_cpustat).cpustat[index] += tmp;
-#ifdef CONFIG_CGROUP_CPUACCT
- if (unlikely(!cpuacct_subsys.active))
- return;
-
- rcu_read_lock();
- ca = task_ca(p);
- while (ca && (ca != &root_cpuacct)) {
- kcpustat = this_cpu_ptr(ca->cpustat);
- kcpustat->cpustat[index] += tmp;
- ca = parent_ca(ca);
- }
- rcu_read_unlock();
-#endif
+ cpuacct_account_field(p, index, tmp);
}
/*
@@ -388,7 +372,84 @@ static inline void irqtime_account_process_tick(struct task_struct *p, int user_
struct rq *rq) {}
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+/*
+ * Use precise platform statistics if available:
+ */
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+
+#ifndef __ARCH_HAS_VTIME_TASK_SWITCH
+void vtime_task_switch(struct task_struct *prev)
+{
+ if (!vtime_accounting_enabled())
+ return;
+
+ if (is_idle_task(prev))
+ vtime_account_idle(prev);
+ else
+ vtime_account_system(prev);
+
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+ vtime_account_user(prev);
+#endif
+ arch_vtime_task_switch(prev);
+}
+#endif
+
+/*
+ * Archs that account the whole time spent in the idle task
+ * (outside irq) as idle time can rely on this and just implement
+ * vtime_account_system() and vtime_account_idle(). Archs that
+ * have other meaning of the idle time (s390 only includes the
+ * time spent by the CPU when it's in low power mode) must override
+ * vtime_account().
+ */
+#ifndef __ARCH_HAS_VTIME_ACCOUNT
+void vtime_account_irq_enter(struct task_struct *tsk)
+{
+ if (!vtime_accounting_enabled())
+ return;
+
+ if (!in_interrupt()) {
+ /*
+ * If we interrupted user, context_tracking_in_user()
+ * is 1 because the context tracking don't hook
+ * on irq entry/exit. This way we know if
+ * we need to flush user time on kernel entry.
+ */
+ if (context_tracking_in_user()) {
+ vtime_account_user(tsk);
+ return;
+ }
+
+ if (is_idle_task(tsk)) {
+ vtime_account_idle(tsk);
+ return;
+ }
+ }
+ vtime_account_system(tsk);
+}
+EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
+#endif /* __ARCH_HAS_VTIME_ACCOUNT */
+#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
+
+
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
+{
+ *ut = p->utime;
+ *st = p->stime;
+}
+
+void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
+{
+ struct task_cputime cputime;
+
+ thread_group_cputime(p, &cputime);
+
+ *ut = cputime.utime;
+ *st = cputime.stime;
+}
+#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
/*
* Account a single tick of cpu time.
* @p: the process that the cpu time gets accounted to
@@ -443,96 +504,50 @@ void account_idle_ticks(unsigned long ticks)
account_idle_time(jiffies_to_cputime(ticks));
}
-#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
-
-/*
- * Use precise platform statistics if available:
- */
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING
-void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
-{
- *ut = p->utime;
- *st = p->stime;
-}
-
-void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
-{
- struct task_cputime cputime;
-
- thread_group_cputime(p, &cputime);
-
- *ut = cputime.utime;
- *st = cputime.stime;
-}
-
-#ifndef __ARCH_HAS_VTIME_TASK_SWITCH
-void vtime_task_switch(struct task_struct *prev)
-{
- if (!vtime_accounting_enabled())
- return;
-
- if (is_idle_task(prev))
- vtime_account_idle(prev);
- else
- vtime_account_system(prev);
-
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
- vtime_account_user(prev);
-#endif
- arch_vtime_task_switch(prev);
-}
-#endif
/*
- * Archs that account the whole time spent in the idle task
- * (outside irq) as idle time can rely on this and just implement
- * vtime_account_system() and vtime_account_idle(). Archs that
- * have other meaning of the idle time (s390 only includes the
- * time spent by the CPU when it's in low power mode) must override
- * vtime_account().
+ * Perform (stime * rtime) / total, but avoid multiplication overflow by
+ * loosing precision when the numbers are big.
*/
-#ifndef __ARCH_HAS_VTIME_ACCOUNT
-void vtime_account_irq_enter(struct task_struct *tsk)
+static cputime_t scale_stime(u64 stime, u64 rtime, u64 total)
{
- if (!vtime_accounting_enabled())
- return;
+ u64 scaled;
- if (!in_interrupt()) {
- /*
- * If we interrupted user, context_tracking_in_user()
- * is 1 because the context tracking don't hook
- * on irq entry/exit. This way we know if
- * we need to flush user time on kernel entry.
- */
- if (context_tracking_in_user()) {
- vtime_account_user(tsk);
- return;
+ for (;;) {
+ /* Make sure "rtime" is the bigger of stime/rtime */
+ if (stime > rtime) {
+ u64 tmp = rtime; rtime = stime; stime = tmp;
}
- if (is_idle_task(tsk)) {
- vtime_account_idle(tsk);
- return;
- }
- }
- vtime_account_system(tsk);
-}
-EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
-#endif /* __ARCH_HAS_VTIME_ACCOUNT */
+ /* Make sure 'total' fits in 32 bits */
+ if (total >> 32)
+ goto drop_precision;
-#else /* !CONFIG_VIRT_CPU_ACCOUNTING */
+ /* Does rtime (and thus stime) fit in 32 bits? */
+ if (!(rtime >> 32))
+ break;
-static cputime_t scale_stime(cputime_t stime, cputime_t rtime, cputime_t total)
-{
- u64 temp = (__force u64) rtime;
+ /* Can we just balance rtime/stime rather than dropping bits? */
+ if (stime >> 31)
+ goto drop_precision;
- temp *= (__force u64) stime;
+ /* We can grow stime and shrink rtime and try to make them both fit */
+ stime <<= 1;
+ rtime >>= 1;
+ continue;
- if (sizeof(cputime_t) == 4)
- temp = div_u64(temp, (__force u32) total);
- else
- temp = div64_u64(temp, (__force u64) total);
+drop_precision:
+ /* We drop from rtime, it has more bits than stime */
+ rtime >>= 1;
+ total >>= 1;
+ }
- return (__force cputime_t) temp;
+ /*
+ * Make sure gcc understands that this is a 32x32->64 multiply,
+ * followed by a 64/32->64 divide.
+ */
+ scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total);
+ return (__force cputime_t) scaled;
}
/*
@@ -543,7 +558,13 @@ static void cputime_adjust(struct task_cputime *curr,
struct cputime *prev,
cputime_t *ut, cputime_t *st)
{
- cputime_t rtime, stime, total;
+ cputime_t rtime, stime, utime, total;
+
+ if (vtime_accounting_enabled()) {
+ *ut = curr->utime;
+ *st = curr->stime;
+ return;
+ }
stime = curr->stime;
total = stime + curr->utime;
@@ -560,10 +581,22 @@ static void cputime_adjust(struct task_cputime *curr,
*/
rtime = nsecs_to_cputime(curr->sum_exec_runtime);
- if (total)
- stime = scale_stime(stime, rtime, total);
- else
+ /*
+ * Update userspace visible utime/stime values only if actual execution
+ * time is bigger than already exported. Note that can happen, that we
+ * provided bigger values due to scaling inaccuracy on big numbers.
+ */
+ if (prev->stime + prev->utime >= rtime)
+ goto out;
+
+ if (total) {
+ stime = scale_stime((__force u64)stime,
+ (__force u64)rtime, (__force u64)total);
+ utime = rtime - stime;
+ } else {
stime = rtime;
+ utime = 0;
+ }
/*
* If the tick based count grows faster than the scheduler one,
@@ -571,8 +604,9 @@ static void cputime_adjust(struct task_cputime *curr,
* Let's enforce monotonicity.
*/
prev->stime = max(prev->stime, stime);
- prev->utime = max(prev->utime, rtime - prev->stime);
+ prev->utime = max(prev->utime, utime);
+out:
*ut = prev->utime;
*st = prev->stime;
}
@@ -597,7 +631,7 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime
thread_group_cputime(p, &cputime);
cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st);
}
-#endif /* !CONFIG_VIRT_CPU_ACCOUNTING */
+#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
static unsigned long long vtime_delta(struct task_struct *tsk)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7a33e5986fc..c61a614465c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -431,13 +431,13 @@ void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec);
* Scheduling class tree data structure manipulation methods:
*/
-static inline u64 max_vruntime(u64 min_vruntime, u64 vruntime)
+static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
{
- s64 delta = (s64)(vruntime - min_vruntime);
+ s64 delta = (s64)(vruntime - max_vruntime);
if (delta > 0)
- min_vruntime = vruntime;
+ max_vruntime = vruntime;
- return min_vruntime;
+ return max_vruntime;
}
static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
@@ -473,6 +473,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
vruntime = min_vruntime(vruntime, se->vruntime);
}
+ /* ensure we never gain time by being placed backwards. */
cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
#ifndef CONFIG_64BIT
smp_wmb();
@@ -652,7 +653,7 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
}
/*
- * We calculate the vruntime slice of a to be inserted task
+ * We calculate the vruntime slice of a to-be-inserted task.
*
* vs = s/w
*/
@@ -1562,6 +1563,27 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
} /* migrations, e.g. sleep=0 leave decay_count == 0 */
}
+
+/*
+ * Update the rq's load with the elapsed running time before entering
+ * idle. if the last scheduled task is not a CFS task, idle_enter will
+ * be the only way to update the runnable statistic.
+ */
+void idle_enter_fair(struct rq *this_rq)
+{
+ update_rq_runnable_avg(this_rq, 1);
+}
+
+/*
+ * Update the rq's load with the elapsed idle time before a task is
+ * scheduled. if the newly scheduled task is not a CFS task, idle_exit will
+ * be the only way to update the runnable statistic.
+ */
+void idle_exit_fair(struct rq *this_rq)
+{
+ update_rq_runnable_avg(this_rq, 0);
+}
+
#else
static inline void update_entity_load_avg(struct sched_entity *se,
int update_cfs_rq) {}
@@ -3874,12 +3896,16 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
int tsk_cache_hot = 0;
/*
* We do not migrate tasks that are:
- * 1) running (obviously), or
+ * 1) throttled_lb_pair, or
* 2) cannot be migrated to this CPU due to cpus_allowed, or
- * 3) are cache-hot on their current CPU.
+ * 3) running (obviously), or
+ * 4) are cache-hot on their current CPU.
*/
+ if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
+ return 0;
+
if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
- int new_dst_cpu;
+ int cpu;
schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
@@ -3894,12 +3920,15 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED))
return 0;
- new_dst_cpu = cpumask_first_and(env->dst_grpmask,
- tsk_cpus_allowed(p));
- if (new_dst_cpu < nr_cpu_ids) {
- env->flags |= LBF_SOME_PINNED;
- env->new_dst_cpu = new_dst_cpu;
+ /* Prevent to re-select dst_cpu via env's cpus */
+ for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
+ if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
+ env->flags |= LBF_SOME_PINNED;
+ env->new_dst_cpu = cpu;
+ break;
+ }
}
+
return 0;
}
@@ -3920,20 +3949,17 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd);
if (!tsk_cache_hot ||
env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
-#ifdef CONFIG_SCHEDSTATS
+
if (tsk_cache_hot) {
schedstat_inc(env->sd, lb_hot_gained[env->idle]);
schedstat_inc(p, se.statistics.nr_forced_migrations);
}
-#endif
+
return 1;
}
- if (tsk_cache_hot) {
- schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
- return 0;
- }
- return 1;
+ schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
+ return 0;
}
/*
@@ -3948,9 +3974,6 @@ static int move_one_task(struct lb_env *env)
struct task_struct *p, *n;
list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
- if (throttled_lb_pair(task_group(p), env->src_rq->cpu, env->dst_cpu))
- continue;
-
if (!can_migrate_task(p, env))
continue;
@@ -4002,7 +4025,7 @@ static int move_tasks(struct lb_env *env)
break;
}
- if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
+ if (!can_migrate_task(p, env))
goto next;
load = task_h_load(p);
@@ -4013,9 +4036,6 @@ static int move_tasks(struct lb_env *env)
if ((load / 2) > env->imbalance)
goto next;
- if (!can_migrate_task(p, env))
- goto next;
-
move_task(p, env);
pulled++;
env->imbalance -= load;
@@ -4245,7 +4265,7 @@ static inline int get_sd_load_idx(struct sched_domain *sd,
return load_idx;
}
-unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
+static unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
{
return SCHED_POWER_SCALE;
}
@@ -4255,7 +4275,7 @@ unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
return default_scale_freq_power(sd, cpu);
}
-unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
+static unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
{
unsigned long weight = sd->span_weight;
unsigned long smt_gain = sd->smt_gain;
@@ -4270,7 +4290,7 @@ unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
return default_scale_smt_power(sd, cpu);
}
-unsigned long scale_rt_power(int cpu)
+static unsigned long scale_rt_power(int cpu)
{
struct rq *rq = cpu_rq(cpu);
u64 total, available, age_stamp, avg;
@@ -4960,7 +4980,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
#define MAX_PINNED_INTERVAL 512
/* Working cpumask for load_balance and load_balance_newidle. */
-DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
+DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
static int need_active_balance(struct lb_env *env)
{
@@ -4991,11 +5011,10 @@ static int load_balance(int this_cpu, struct rq *this_rq,
int *balance)
{
int ld_moved, cur_ld_moved, active_balance = 0;
- int lb_iterations, max_lb_iterations;
struct sched_group *group;
struct rq *busiest;
unsigned long flags;
- struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
+ struct cpumask *cpus = __get_cpu_var(load_balance_mask);
struct lb_env env = {
.sd = sd,
@@ -5007,8 +5026,14 @@ static int load_balance(int this_cpu, struct rq *this_rq,
.cpus = cpus,
};
+ /*
+ * For NEWLY_IDLE load_balancing, we don't need to consider
+ * other cpus in our group
+ */
+ if (idle == CPU_NEWLY_IDLE)
+ env.dst_grpmask = NULL;
+
cpumask_copy(cpus, cpu_active_mask);
- max_lb_iterations = cpumask_weight(env.dst_grpmask);
schedstat_inc(sd, lb_count[idle]);
@@ -5034,7 +5059,6 @@ redo:
schedstat_add(sd, lb_imbalance[idle], env.imbalance);
ld_moved = 0;
- lb_iterations = 1;
if (busiest->nr_running > 1) {
/*
* Attempt to move tasks. If find_busiest_group has found
@@ -5061,17 +5085,17 @@ more_balance:
double_rq_unlock(env.dst_rq, busiest);
local_irq_restore(flags);
- if (env.flags & LBF_NEED_BREAK) {
- env.flags &= ~LBF_NEED_BREAK;
- goto more_balance;
- }
-
/*
* some other cpu did the load balance for us.
*/
if (cur_ld_moved && env.dst_cpu != smp_processor_id())
resched_cpu(env.dst_cpu);
+ if (env.flags & LBF_NEED_BREAK) {
+ env.flags &= ~LBF_NEED_BREAK;
+ goto more_balance;
+ }
+
/*
* Revisit (affine) tasks on src_cpu that couldn't be moved to
* us and move them to an alternate dst_cpu in our sched_group
@@ -5091,14 +5115,17 @@ more_balance:
* moreover subsequent load balance cycles should correct the
* excess load moved.
*/
- if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 &&
- lb_iterations++ < max_lb_iterations) {
+ if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {
env.dst_rq = cpu_rq(env.new_dst_cpu);
env.dst_cpu = env.new_dst_cpu;
env.flags &= ~LBF_SOME_PINNED;
env.loop = 0;
env.loop_break = sched_nr_migrate_break;
+
+ /* Prevent to re-select dst_cpu via env's cpus */
+ cpumask_clear_cpu(env.dst_cpu, env.cpus);
+
/*
* Go back to "more_balance" rather than "redo" since we
* need to continue with same src_cpu.
@@ -5219,8 +5246,6 @@ void idle_balance(int this_cpu, struct rq *this_rq)
if (this_rq->avg_idle < sysctl_sched_migration_cost)
return;
- update_rq_runnable_avg(this_rq, 1);
-
/*
* Drop the rq->lock, but keep IRQ/preempt disabled.
*/
@@ -5330,7 +5355,7 @@ out_unlock:
return 0;
}
-#ifdef CONFIG_NO_HZ
+#ifdef CONFIG_NO_HZ_COMMON
/*
* idle load balancing details
* - When one of the busy CPUs notice that there may be an idle rebalancing
@@ -5395,13 +5420,16 @@ static inline void set_cpu_sd_state_busy(void)
struct sched_domain *sd;
int cpu = smp_processor_id();
- if (!test_bit(NOHZ_IDLE, nohz_flags(cpu)))
- return;
- clear_bit(NOHZ_IDLE, nohz_flags(cpu));
-
rcu_read_lock();
- for_each_domain(cpu, sd)
+ sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd);
+
+ if (!sd || !sd->nohz_idle)
+ goto unlock;
+ sd->nohz_idle = 0;
+
+ for (; sd; sd = sd->parent)
atomic_inc(&sd->groups->sgp->nr_busy_cpus);
+unlock:
rcu_read_unlock();
}
@@ -5410,13 +5438,16 @@ void set_cpu_sd_state_idle(void)
struct sched_domain *sd;
int cpu = smp_processor_id();
- if (test_bit(NOHZ_IDLE, nohz_flags(cpu)))
- return;
- set_bit(NOHZ_IDLE, nohz_flags(cpu));
-
rcu_read_lock();
- for_each_domain(cpu, sd)
+ sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd);
+
+ if (!sd || sd->nohz_idle)
+ goto unlock;
+ sd->nohz_idle = 1;
+
+ for (; sd; sd = sd->parent)
atomic_dec(&sd->groups->sgp->nr_busy_cpus);
+unlock:
rcu_read_unlock();
}
@@ -5468,7 +5499,7 @@ void update_max_interval(void)
* It checks each scheduling domain to see if it is due to be balanced,
* and initiates a balancing operation if so.
*
- * Balancing parameters are set up in arch_init_sched_domains.
+ * Balancing parameters are set up in init_sched_domains.
*/
static void rebalance_domains(int cpu, enum cpu_idle_type idle)
{
@@ -5506,10 +5537,11 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
if (time_after_eq(jiffies, sd->last_balance + interval)) {
if (load_balance(cpu, rq, sd, idle, &balance)) {
/*
- * We've pulled tasks over so either we're no
- * longer idle.
+ * The LBF_SOME_PINNED logic could have changed
+ * env->dst_cpu, so we can't know our idle
+ * state even if we migrated tasks. Update it.
*/
- idle = CPU_NOT_IDLE;
+ idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
}
sd->last_balance = jiffies;
}
@@ -5540,9 +5572,9 @@ out:
rq->next_balance = next_balance;
}
-#ifdef CONFIG_NO_HZ
+#ifdef CONFIG_NO_HZ_COMMON
/*
- * In CONFIG_NO_HZ case, the idle balance kickee will do the
+ * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
* rebalancing for all the cpus for whom scheduler ticks are stopped.
*/
static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
@@ -5685,7 +5717,7 @@ void trigger_load_balance(struct rq *rq, int cpu)
if (time_after_eq(jiffies, rq->next_balance) &&
likely(!on_null_domain(cpu)))
raise_softirq(SCHED_SOFTIRQ);
-#ifdef CONFIG_NO_HZ
+#ifdef CONFIG_NO_HZ_COMMON
if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
nohz_balancer_kick(cpu);
#endif
@@ -6155,7 +6187,7 @@ __init void init_sched_fair_class(void)
#ifdef CONFIG_SMP
open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
-#ifdef CONFIG_NO_HZ
+#ifdef CONFIG_NO_HZ_COMMON
nohz.next_balance = jiffies;
zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
cpu_notifier(sched_ilb_notifier, 0);
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index b6baf370cae..d8da01008d3 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -13,6 +13,17 @@ select_task_rq_idle(struct task_struct *p, int sd_flag, int flags)
{
return task_cpu(p); /* IDLE tasks as never migrated */
}
+
+static void pre_schedule_idle(struct rq *rq, struct task_struct *prev)
+{
+ idle_exit_fair(rq);
+ rq_last_tick_reset(rq);
+}
+
+static void post_schedule_idle(struct rq *rq)
+{
+ idle_enter_fair(rq);
+}
#endif /* CONFIG_SMP */
/*
* Idle tasks are unconditionally rescheduled:
@@ -25,6 +36,10 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
static struct task_struct *pick_next_task_idle(struct rq *rq)
{
schedstat_inc(rq, sched_goidle);
+#ifdef CONFIG_SMP
+ /* Trigger the post schedule to do an idle_enter for CFS */
+ rq->post_schedule = 1;
+#endif
return rq->idle;
}
@@ -86,6 +101,8 @@ const struct sched_class idle_sched_class = {
#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_idle,
+ .pre_schedule = pre_schedule_idle,
+ .post_schedule = post_schedule_idle,
#endif
.set_curr_task = set_curr_task_idle,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index cc03cfdf469..ce39224d615 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -5,8 +5,10 @@
#include <linux/mutex.h>
#include <linux/spinlock.h>
#include <linux/stop_machine.h>
+#include <linux/tick.h>
#include "cpupri.h"
+#include "cpuacct.h"
extern __read_mostly int scheduler_running;
@@ -33,6 +35,31 @@ extern __read_mostly int scheduler_running;
*/
#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
+/*
+ * Increase resolution of nice-level calculations for 64-bit architectures.
+ * The extra resolution improves shares distribution and load balancing of
+ * low-weight task groups (eg. nice +19 on an autogroup), deeper taskgroup
+ * hierarchies, especially on larger systems. This is not a user-visible change
+ * and does not change the user-interface for setting shares/weights.
+ *
+ * We increase resolution only if we have enough bits to allow this increased
+ * resolution (i.e. BITS_PER_LONG > 32). The costs for increasing resolution
+ * when BITS_PER_LONG <= 32 are pretty high and the returns do not justify the
+ * increased costs.
+ */
+#if 0 /* BITS_PER_LONG > 32 -- currently broken: it increases power usage under light load */
+# define SCHED_LOAD_RESOLUTION 10
+# define scale_load(w) ((w) << SCHED_LOAD_RESOLUTION)
+# define scale_load_down(w) ((w) >> SCHED_LOAD_RESOLUTION)
+#else
+# define SCHED_LOAD_RESOLUTION 0
+# define scale_load(w) (w)
+# define scale_load_down(w) (w)
+#endif
+
+#define SCHED_LOAD_SHIFT (10 + SCHED_LOAD_RESOLUTION)
+#define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT)
+
#define NICE_0_LOAD SCHED_LOAD_SCALE
#define NICE_0_SHIFT SCHED_LOAD_SHIFT
@@ -154,11 +181,6 @@ struct task_group {
#define MAX_SHARES (1UL << 18)
#endif
-/* Default task group.
- * Every task in system belong to this group at bootup.
- */
-extern struct task_group root_task_group;
-
typedef int (*tg_visitor)(struct task_group *, void *);
extern int walk_tg_tree_from(struct task_group *from,
@@ -196,6 +218,18 @@ extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
struct sched_rt_entity *rt_se, int cpu,
struct sched_rt_entity *parent);
+extern struct task_group *sched_create_group(struct task_group *parent);
+extern void sched_online_group(struct task_group *tg,
+ struct task_group *parent);
+extern void sched_destroy_group(struct task_group *tg);
+extern void sched_offline_group(struct task_group *tg);
+
+extern void sched_move_task(struct task_struct *tsk);
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
+#endif
+
#else /* CONFIG_CGROUP_SCHED */
struct cfs_bandwidth { };
@@ -372,10 +406,13 @@ struct rq {
#define CPU_LOAD_IDX_MAX 5
unsigned long cpu_load[CPU_LOAD_IDX_MAX];
unsigned long last_load_update_tick;
-#ifdef CONFIG_NO_HZ
+#ifdef CONFIG_NO_HZ_COMMON
u64 nohz_stamp;
unsigned long nohz_flags;
#endif
+#ifdef CONFIG_NO_HZ_FULL
+ unsigned long last_sched_tick;
+#endif
int skip_clock_update;
/* capture load from *all* tasks on this cpu: */
@@ -547,6 +584,62 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
DECLARE_PER_CPU(struct sched_domain *, sd_llc);
DECLARE_PER_CPU(int, sd_llc_id);
+struct sched_group_power {
+ atomic_t ref;
+ /*
+ * CPU power of this group, SCHED_LOAD_SCALE being max power for a
+ * single CPU.
+ */
+ unsigned int power, power_orig;
+ unsigned long next_update;
+ /*
+ * Number of busy cpus in this group.
+ */
+ atomic_t nr_busy_cpus;
+
+ unsigned long cpumask[0]; /* iteration mask */
+};
+
+struct sched_group {
+ struct sched_group *next; /* Must be a circular list */
+ atomic_t ref;
+
+ unsigned int group_weight;
+ struct sched_group_power *sgp;
+
+ /*
+ * The CPUs this group covers.
+ *
+ * NOTE: this field is variable length. (Allocated dynamically
+ * by attaching extra space to the end of the structure,
+ * depending on how many CPUs the kernel has booted up with)
+ */
+ unsigned long cpumask[0];
+};
+
+static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
+{
+ return to_cpumask(sg->cpumask);
+}
+
+/*
+ * cpumask masking which cpus in the group are allowed to iterate up the domain
+ * tree.
+ */
+static inline struct cpumask *sched_group_mask(struct sched_group *sg)
+{
+ return to_cpumask(sg->sgp->cpumask);
+}
+
+/**
+ * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
+ * @group: The group whose first cpu is to be returned.
+ */
+static inline unsigned int group_first_cpu(struct sched_group *group)
+{
+ return cpumask_first(sched_group_cpus(group));
+}
+
extern int group_balance_cpu(struct sched_group *sg);
#endif /* CONFIG_SMP */
@@ -784,6 +877,12 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
}
#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
+/*
+ * wake flags
+ */
+#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */
+#define WF_FORK 0x02 /* child wakeup after fork */
+#define WF_MIGRATED 0x4 /* internal use, task got migrated */
static inline void update_load_add(struct load_weight *lw, unsigned long inc)
{
@@ -856,14 +955,61 @@ static const u32 prio_to_wmult[40] = {
/* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
};
-/* Time spent by the tasks of the cpu accounting group executing in ... */
-enum cpuacct_stat_index {
- CPUACCT_STAT_USER, /* ... user mode */
- CPUACCT_STAT_SYSTEM, /* ... kernel mode */
+#define ENQUEUE_WAKEUP 1
+#define ENQUEUE_HEAD 2
+#ifdef CONFIG_SMP
+#define ENQUEUE_WAKING 4 /* sched_class::task_waking was called */
+#else
+#define ENQUEUE_WAKING 0
+#endif
- CPUACCT_STAT_NSTATS,
-};
+#define DEQUEUE_SLEEP 1
+struct sched_class {
+ const struct sched_class *next;
+
+ void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
+ void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
+ void (*yield_task) (struct rq *rq);
+ bool (*yield_to_task) (struct rq *rq, struct task_struct *p, bool preempt);
+
+ void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags);
+
+ struct task_struct * (*pick_next_task) (struct rq *rq);
+ void (*put_prev_task) (struct rq *rq, struct task_struct *p);
+
+#ifdef CONFIG_SMP
+ int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags);
+ void (*migrate_task_rq)(struct task_struct *p, int next_cpu);
+
+ void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
+ void (*post_schedule) (struct rq *this_rq);
+ void (*task_waking) (struct task_struct *task);
+ void (*task_woken) (struct rq *this_rq, struct task_struct *task);
+
+ void (*set_cpus_allowed)(struct task_struct *p,
+ const struct cpumask *newmask);
+
+ void (*rq_online)(struct rq *rq);
+ void (*rq_offline)(struct rq *rq);
+#endif
+
+ void (*set_curr_task) (struct rq *rq);
+ void (*task_tick) (struct rq *rq, struct task_struct *p, int queued);
+ void (*task_fork) (struct task_struct *p);
+
+ void (*switched_from) (struct rq *this_rq, struct task_struct *task);
+ void (*switched_to) (struct rq *this_rq, struct task_struct *task);
+ void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
+ int oldprio);
+
+ unsigned int (*get_rr_interval) (struct rq *rq,
+ struct task_struct *task);
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ void (*task_move_group) (struct task_struct *p, int on_rq);
+#endif
+};
#define sched_class_highest (&stop_sched_class)
#define for_each_class(class) \
@@ -877,9 +1023,23 @@ extern const struct sched_class idle_sched_class;
#ifdef CONFIG_SMP
+extern void update_group_power(struct sched_domain *sd, int cpu);
+
extern void trigger_load_balance(struct rq *rq, int cpu);
extern void idle_balance(int this_cpu, struct rq *this_rq);
+/*
+ * Only depends on SMP, FAIR_GROUP_SCHED may be removed when runnable_avg
+ * becomes useful in lb
+ */
+#if defined(CONFIG_FAIR_GROUP_SCHED)
+extern void idle_enter_fair(struct rq *this_rq);
+extern void idle_exit_fair(struct rq *this_rq);
+#else
+static inline void idle_enter_fair(struct rq *this_rq) {}
+static inline void idle_exit_fair(struct rq *this_rq) {}
+#endif
+
#else /* CONFIG_SMP */
static inline void idle_balance(int cpu, struct rq *rq)
@@ -891,7 +1051,6 @@ static inline void idle_balance(int cpu, struct rq *rq)
extern void sysrq_sched_debug_show(void);
extern void sched_init_granularity(void);
extern void update_max_interval(void);
-extern void update_group_power(struct sched_domain *sd, int cpu);
extern int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu);
extern void init_sched_rt_class(void);
extern void init_sched_fair_class(void);
@@ -904,45 +1063,6 @@ extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime
extern void update_idle_cpu_load(struct rq *this_rq);
-#ifdef CONFIG_CGROUP_CPUACCT
-#include <linux/cgroup.h>
-/* track cpu usage of a group of tasks and its child groups */
-struct cpuacct {
- struct cgroup_subsys_state css;
- /* cpuusage holds pointer to a u64-type object on every cpu */
- u64 __percpu *cpuusage;
- struct kernel_cpustat __percpu *cpustat;
-};
-
-extern struct cgroup_subsys cpuacct_subsys;
-extern struct cpuacct root_cpuacct;
-
-/* return cpu accounting group corresponding to this container */
-static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
-{
- return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
- struct cpuacct, css);
-}
-
-/* return cpu accounting group to which this task belongs */
-static inline struct cpuacct *task_ca(struct task_struct *tsk)
-{
- return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
- struct cpuacct, css);
-}
-
-static inline struct cpuacct *parent_ca(struct cpuacct *ca)
-{
- if (!ca || !ca->css.cgroup->parent)
- return NULL;
- return cgroup_ca(ca->css.cgroup->parent);
-}
-
-extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
-#else
-static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
-#endif
-
#ifdef CONFIG_PARAVIRT
static inline u64 steal_ticks(u64 steal)
{
@@ -956,6 +1076,16 @@ static inline u64 steal_ticks(u64 steal)
static inline void inc_nr_running(struct rq *rq)
{
rq->nr_running++;
+
+#ifdef CONFIG_NO_HZ_FULL
+ if (rq->nr_running == 2) {
+ if (tick_nohz_full_cpu(rq->cpu)) {
+ /* Order rq->nr_running write against the IPI */
+ smp_wmb();
+ smp_send_reschedule(rq->cpu);
+ }
+ }
+#endif
}
static inline void dec_nr_running(struct rq *rq)
@@ -963,6 +1093,13 @@ static inline void dec_nr_running(struct rq *rq)
rq->nr_running--;
}
+static inline void rq_last_tick_reset(struct rq *rq)
+{
+#ifdef CONFIG_NO_HZ_FULL
+ rq->last_sched_tick = jiffies;
+#endif
+}
+
extern void update_rq_clock(struct rq *rq);
extern void activate_task(struct rq *rq, struct task_struct *p, int flags);
@@ -1183,11 +1320,10 @@ extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq);
extern void account_cfs_bandwidth_used(int enabled, int was_enabled);
-#ifdef CONFIG_NO_HZ
+#ifdef CONFIG_NO_HZ_COMMON
enum rq_nohz_flag_bits {
NOHZ_TICK_STOPPED,
NOHZ_BALANCE_KICK,
- NOHZ_IDLE,
};
#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index e036eda1a9c..da98af347e8 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -130,16 +130,11 @@ static int schedstat_open(struct inode *inode, struct file *file)
return seq_open(file, &schedstat_sops);
}
-static int schedstat_release(struct inode *inode, struct file *file)
-{
- return 0;
-};
-
static const struct file_operations proc_schedstat_operations = {
.open = schedstat_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = schedstat_release,
+ .release = seq_release,
};
static int __init proc_schedstat_init(void)
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 5af44b59377..b7a10048a32 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -160,6 +160,8 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
case BPF_S_ALU_AND_X:
case BPF_S_ALU_OR_K:
case BPF_S_ALU_OR_X:
+ case BPF_S_ALU_XOR_K:
+ case BPF_S_ALU_XOR_X:
case BPF_S_ALU_LSH_K:
case BPF_S_ALU_LSH_X:
case BPF_S_ALU_RSH_K:
diff --git a/kernel/semaphore.c b/kernel/semaphore.c
index 4567fc020fe..6815171a4ff 100644
--- a/kernel/semaphore.c
+++ b/kernel/semaphore.c
@@ -193,7 +193,7 @@ EXPORT_SYMBOL(up);
struct semaphore_waiter {
struct list_head list;
struct task_struct *task;
- int up;
+ bool up;
};
/*
@@ -209,12 +209,12 @@ static inline int __sched __down_common(struct semaphore *sem, long state,
list_add_tail(&waiter.list, &sem->wait_list);
waiter.task = task;
- waiter.up = 0;
+ waiter.up = false;
for (;;) {
if (signal_pending_state(state, task))
goto interrupted;
- if (timeout <= 0)
+ if (unlikely(timeout <= 0))
goto timed_out;
__set_task_state(task, state);
raw_spin_unlock_irq(&sem->lock);
@@ -258,6 +258,6 @@ static noinline void __sched __up(struct semaphore *sem)
struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list,
struct semaphore_waiter, list);
list_del(&waiter->list);
- waiter->up = 1;
+ waiter->up = true;
wake_up_process(waiter->task);
}
diff --git a/kernel/signal.c b/kernel/signal.c
index 598dc06be42..113411bfe8b 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -32,6 +32,7 @@
#include <linux/user_namespace.h>
#include <linux/uprobes.h>
#include <linux/compat.h>
+#include <linux/cn_proc.h>
#define CREATE_TRACE_POINTS
#include <trace/events/signal.h>
@@ -854,12 +855,14 @@ static void ptrace_trap_notify(struct task_struct *t)
* Returns true if the signal should be actually delivered, otherwise
* it should be dropped.
*/
-static int prepare_signal(int sig, struct task_struct *p, bool force)
+static bool prepare_signal(int sig, struct task_struct *p, bool force)
{
struct signal_struct *signal = p->signal;
struct task_struct *t;
- if (unlikely(signal->flags & SIGNAL_GROUP_EXIT)) {
+ if (signal->flags & (SIGNAL_GROUP_EXIT | SIGNAL_GROUP_COREDUMP)) {
+ if (signal->flags & SIGNAL_GROUP_COREDUMP)
+ return sig == SIGKILL;
/*
* The process is in the middle of dying, nothing to do.
*/
@@ -1160,8 +1163,7 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
static void print_fatal_signal(int signr)
{
struct pt_regs *regs = signal_pt_regs();
- printk(KERN_INFO "%s/%d: potentially unexpected fatal signal %d.\n",
- current->comm, task_pid_nr(current), signr);
+ printk(KERN_INFO "potentially unexpected fatal signal %d.\n", signr);
#if defined(__i386__) && !defined(__arch_um__)
printk(KERN_INFO "code at %08lx: ", regs->ip);
@@ -2350,6 +2352,7 @@ relock:
if (sig_kernel_coredump(signr)) {
if (print_fatal_signals)
print_fatal_signal(info->si_signo);
+ proc_coredump_connector(current);
/*
* If it was able to dump core, this kills all
* other threads in the group and synchronizes with
diff --git a/kernel/smp.c b/kernel/smp.c
index 8e451f3ff51..4dba0f7b72a 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -100,16 +100,16 @@ void __init call_function_init(void)
* previous function call. For multi-cpu calls its even more interesting
* as we'll have to ensure no other cpu is observing our csd.
*/
-static void csd_lock_wait(struct call_single_data *data)
+static void csd_lock_wait(struct call_single_data *csd)
{
- while (data->flags & CSD_FLAG_LOCK)
+ while (csd->flags & CSD_FLAG_LOCK)
cpu_relax();
}
-static void csd_lock(struct call_single_data *data)
+static void csd_lock(struct call_single_data *csd)
{
- csd_lock_wait(data);
- data->flags = CSD_FLAG_LOCK;
+ csd_lock_wait(csd);
+ csd->flags |= CSD_FLAG_LOCK;
/*
* prevent CPU from reordering the above assignment
@@ -119,16 +119,16 @@ static void csd_lock(struct call_single_data *data)
smp_mb();
}
-static void csd_unlock(struct call_single_data *data)
+static void csd_unlock(struct call_single_data *csd)
{
- WARN_ON(!(data->flags & CSD_FLAG_LOCK));
+ WARN_ON(!(csd->flags & CSD_FLAG_LOCK));
/*
* ensure we're all done before releasing data:
*/
smp_mb();
- data->flags &= ~CSD_FLAG_LOCK;
+ csd->flags &= ~CSD_FLAG_LOCK;
}
/*
@@ -137,7 +137,7 @@ static void csd_unlock(struct call_single_data *data)
* ->func, ->info, and ->flags set.
*/
static
-void generic_exec_single(int cpu, struct call_single_data *data, int wait)
+void generic_exec_single(int cpu, struct call_single_data *csd, int wait)
{
struct call_single_queue *dst = &per_cpu(call_single_queue, cpu);
unsigned long flags;
@@ -145,7 +145,7 @@ void generic_exec_single(int cpu, struct call_single_data *data, int wait)
raw_spin_lock_irqsave(&dst->lock, flags);
ipi = list_empty(&dst->list);
- list_add_tail(&data->list, &dst->list);
+ list_add_tail(&csd->list, &dst->list);
raw_spin_unlock_irqrestore(&dst->lock, flags);
/*
@@ -163,7 +163,7 @@ void generic_exec_single(int cpu, struct call_single_data *data, int wait)
arch_send_call_function_single_ipi(cpu);
if (wait)
- csd_lock_wait(data);
+ csd_lock_wait(csd);
}
/*
@@ -173,7 +173,6 @@ void generic_exec_single(int cpu, struct call_single_data *data, int wait)
void generic_smp_call_function_single_interrupt(void)
{
struct call_single_queue *q = &__get_cpu_var(call_single_queue);
- unsigned int data_flags;
LIST_HEAD(list);
/*
@@ -186,25 +185,26 @@ void generic_smp_call_function_single_interrupt(void)
raw_spin_unlock(&q->lock);
while (!list_empty(&list)) {
- struct call_single_data *data;
+ struct call_single_data *csd;
+ unsigned int csd_flags;
- data = list_entry(list.next, struct call_single_data, list);
- list_del(&data->list);
+ csd = list_entry(list.next, struct call_single_data, list);
+ list_del(&csd->list);
/*
- * 'data' can be invalid after this call if flags == 0
+ * 'csd' can be invalid after this call if flags == 0
* (when called through generic_exec_single()),
* so save them away before making the call:
*/
- data_flags = data->flags;
+ csd_flags = csd->flags;
- data->func(data->info);
+ csd->func(csd->info);
/*
* Unlocked CSDs are valid through generic_exec_single():
*/
- if (data_flags & CSD_FLAG_LOCK)
- csd_unlock(data);
+ if (csd_flags & CSD_FLAG_LOCK)
+ csd_unlock(csd);
}
}
@@ -249,16 +249,16 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
local_irq_restore(flags);
} else {
if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) {
- struct call_single_data *data = &d;
+ struct call_single_data *csd = &d;
if (!wait)
- data = &__get_cpu_var(csd_data);
+ csd = &__get_cpu_var(csd_data);
- csd_lock(data);
+ csd_lock(csd);
- data->func = func;
- data->info = info;
- generic_exec_single(cpu, data, wait);
+ csd->func = func;
+ csd->info = info;
+ generic_exec_single(cpu, csd, wait);
} else {
err = -ENXIO; /* CPU not online */
}
@@ -325,7 +325,7 @@ EXPORT_SYMBOL_GPL(smp_call_function_any);
* pre-allocated data structure. Useful for embedding @data inside
* other structures, for instance.
*/
-void __smp_call_function_single(int cpu, struct call_single_data *data,
+void __smp_call_function_single(int cpu, struct call_single_data *csd,
int wait)
{
unsigned int this_cpu;
@@ -343,11 +343,11 @@ void __smp_call_function_single(int cpu, struct call_single_data *data,
if (cpu == this_cpu) {
local_irq_save(flags);
- data->func(data->info);
+ csd->func(csd->info);
local_irq_restore(flags);
} else {
- csd_lock(data);
- generic_exec_single(cpu, data, wait);
+ csd_lock(csd);
+ generic_exec_single(cpu, csd, wait);
}
put_cpu();
}
@@ -369,7 +369,7 @@ void __smp_call_function_single(int cpu, struct call_single_data *data,
void smp_call_function_many(const struct cpumask *mask,
smp_call_func_t func, void *info, bool wait)
{
- struct call_function_data *data;
+ struct call_function_data *cfd;
int cpu, next_cpu, this_cpu = smp_processor_id();
/*
@@ -401,24 +401,24 @@ void smp_call_function_many(const struct cpumask *mask,
return;
}
- data = &__get_cpu_var(cfd_data);
+ cfd = &__get_cpu_var(cfd_data);
- cpumask_and(data->cpumask, mask, cpu_online_mask);
- cpumask_clear_cpu(this_cpu, data->cpumask);
+ cpumask_and(cfd->cpumask, mask, cpu_online_mask);
+ cpumask_clear_cpu(this_cpu, cfd->cpumask);
/* Some callers race with other cpus changing the passed mask */
- if (unlikely(!cpumask_weight(data->cpumask)))
+ if (unlikely(!cpumask_weight(cfd->cpumask)))
return;
/*
- * After we put an entry into the list, data->cpumask
- * may be cleared again when another CPU sends another IPI for
- * a SMP function call, so data->cpumask will be zero.
+ * After we put an entry into the list, cfd->cpumask may be cleared
+ * again when another CPU sends another IPI for a SMP function call, so
+ * cfd->cpumask will be zero.
*/
- cpumask_copy(data->cpumask_ipi, data->cpumask);
+ cpumask_copy(cfd->cpumask_ipi, cfd->cpumask);
- for_each_cpu(cpu, data->cpumask) {
- struct call_single_data *csd = per_cpu_ptr(data->csd, cpu);
+ for_each_cpu(cpu, cfd->cpumask) {
+ struct call_single_data *csd = per_cpu_ptr(cfd->csd, cpu);
struct call_single_queue *dst =
&per_cpu(call_single_queue, cpu);
unsigned long flags;
@@ -433,12 +433,13 @@ void smp_call_function_many(const struct cpumask *mask,
}
/* Send a message to all CPUs in the map */
- arch_send_call_function_ipi_mask(data->cpumask_ipi);
+ arch_send_call_function_ipi_mask(cfd->cpumask_ipi);
if (wait) {
- for_each_cpu(cpu, data->cpumask) {
- struct call_single_data *csd =
- per_cpu_ptr(data->csd, cpu);
+ for_each_cpu(cpu, cfd->cpumask) {
+ struct call_single_data *csd;
+
+ csd = per_cpu_ptr(cfd->csd, cpu);
csd_lock_wait(csd);
}
}
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 14d7758074a..b5197dcb0da 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -329,6 +329,19 @@ static inline void invoke_softirq(void)
wakeup_softirqd();
}
+static inline void tick_irq_exit(void)
+{
+#ifdef CONFIG_NO_HZ_COMMON
+ int cpu = smp_processor_id();
+
+ /* Make sure that timer wheel updates are propagated */
+ if ((idle_cpu(cpu) && !need_resched()) || tick_nohz_full_cpu(cpu)) {
+ if (!in_interrupt())
+ tick_nohz_irq_exit();
+ }
+#endif
+}
+
/*
* Exit an interrupt context. Process softirqs if needed and possible:
*/
@@ -346,11 +359,7 @@ void irq_exit(void)
if (!in_interrupt() && local_softirq_pending())
invoke_softirq();
-#ifdef CONFIG_NO_HZ
- /* Make sure that timer wheel updates are propagated */
- if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched())
- tick_nohz_irq_exit();
-#endif
+ tick_irq_exit();
rcu_irq_exit();
}
@@ -620,8 +629,7 @@ static void remote_softirq_receive(void *data)
unsigned long flags;
int softirq;
- softirq = cp->priv;
-
+ softirq = *(int *)cp->info;
local_irq_save(flags);
__local_trigger(cp, softirq);
local_irq_restore(flags);
@@ -631,9 +639,8 @@ static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softir
{
if (cpu_online(cpu)) {
cp->func = remote_softirq_receive;
- cp->info = cp;
+ cp->info = &softirq;
cp->flags = 0;
- cp->priv = softirq;
__smp_call_function_single(cpu, cp, 0);
return 0;
diff --git a/kernel/sys.c b/kernel/sys.c
index 0da73cf73e6..b95d3c72ba2 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -49,6 +49,11 @@
#include <linux/user_namespace.h>
#include <linux/binfmts.h>
+#include <linux/sched.h>
+#include <linux/rcupdate.h>
+#include <linux/uidgid.h>
+#include <linux/cred.h>
+
#include <linux/kmsg_dump.h>
/* Move somewhere else to avoid recompiling? */
#include <generated/utsrelease.h>
@@ -1044,6 +1049,67 @@ change_okay:
return old_fsgid;
}
+/**
+ * sys_getpid - return the thread group id of the current process
+ *
+ * Note, despite the name, this returns the tgid not the pid. The tgid and
+ * the pid are identical unless CLONE_THREAD was specified on clone() in
+ * which case the tgid is the same in all threads of the same group.
+ *
+ * This is SMP safe as current->tgid does not change.
+ */
+SYSCALL_DEFINE0(getpid)
+{
+ return task_tgid_vnr(current);
+}
+
+/* Thread ID - the internal kernel "pid" */
+SYSCALL_DEFINE0(gettid)
+{
+ return task_pid_vnr(current);
+}
+
+/*
+ * Accessing ->real_parent is not SMP-safe, it could
+ * change from under us. However, we can use a stale
+ * value of ->real_parent under rcu_read_lock(), see
+ * release_task()->call_rcu(delayed_put_task_struct).
+ */
+SYSCALL_DEFINE0(getppid)
+{
+ int pid;
+
+ rcu_read_lock();
+ pid = task_tgid_vnr(rcu_dereference(current->real_parent));
+ rcu_read_unlock();
+
+ return pid;
+}
+
+SYSCALL_DEFINE0(getuid)
+{
+ /* Only we change this so SMP safe */
+ return from_kuid_munged(current_user_ns(), current_uid());
+}
+
+SYSCALL_DEFINE0(geteuid)
+{
+ /* Only we change this so SMP safe */
+ return from_kuid_munged(current_user_ns(), current_euid());
+}
+
+SYSCALL_DEFINE0(getgid)
+{
+ /* Only we change this so SMP safe */
+ return from_kgid_munged(current_user_ns(), current_gid());
+}
+
+SYSCALL_DEFINE0(getegid)
+{
+ /* Only we change this so SMP safe */
+ return from_kgid_munged(current_user_ns(), current_egid());
+}
+
void do_sys_times(struct tms *tms)
{
cputime_t tgutime, tgstime, cutime, cstime;
@@ -1785,13 +1851,26 @@ SYSCALL_DEFINE2(getrusage, int, who, struct rusage __user *, ru)
return getrusage(current, who, ru);
}
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE2(getrusage, int, who, struct compat_rusage __user *, ru)
+{
+ struct rusage r;
+
+ if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN &&
+ who != RUSAGE_THREAD)
+ return -EINVAL;
+
+ k_getrusage(current, who, &r);
+ return put_compat_rusage(&r, ru);
+}
+#endif
+
SYSCALL_DEFINE1(umask, int, mask)
{
mask = xchg(&current->fs->umask, mask & S_IRWXUGO);
return mask;
}
-#ifdef CONFIG_CHECKPOINT_RESTORE
static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
{
struct fd exe;
@@ -1985,17 +2064,12 @@ out:
return error;
}
+#ifdef CONFIG_CHECKPOINT_RESTORE
static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr)
{
return put_user(me->clear_child_tid, tid_addr);
}
-
-#else /* CONFIG_CHECKPOINT_RESTORE */
-static int prctl_set_mm(int opt, unsigned long addr,
- unsigned long arg4, unsigned long arg5)
-{
- return -EINVAL;
-}
+#else
static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr)
{
return -EINVAL;
@@ -2245,3 +2319,148 @@ int orderly_poweroff(bool force)
return 0;
}
EXPORT_SYMBOL_GPL(orderly_poweroff);
+
+/**
+ * do_sysinfo - fill in sysinfo struct
+ * @info: pointer to buffer to fill
+ */
+static int do_sysinfo(struct sysinfo *info)
+{
+ unsigned long mem_total, sav_total;
+ unsigned int mem_unit, bitcount;
+ struct timespec tp;
+
+ memset(info, 0, sizeof(struct sysinfo));
+
+ ktime_get_ts(&tp);
+ monotonic_to_bootbased(&tp);
+ info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
+
+ get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT);
+
+ info->procs = nr_threads;
+
+ si_meminfo(info);
+ si_swapinfo(info);
+
+ /*
+ * If the sum of all the available memory (i.e. ram + swap)
+ * is less than can be stored in a 32 bit unsigned long then
+ * we can be binary compatible with 2.2.x kernels. If not,
+ * well, in that case 2.2.x was broken anyways...
+ *
+ * -Erik Andersen <andersee@debian.org>
+ */
+
+ mem_total = info->totalram + info->totalswap;
+ if (mem_total < info->totalram || mem_total < info->totalswap)
+ goto out;
+ bitcount = 0;
+ mem_unit = info->mem_unit;
+ while (mem_unit > 1) {
+ bitcount++;
+ mem_unit >>= 1;
+ sav_total = mem_total;
+ mem_total <<= 1;
+ if (mem_total < sav_total)
+ goto out;
+ }
+
+ /*
+ * If mem_total did not overflow, multiply all memory values by
+ * info->mem_unit and set it to 1. This leaves things compatible
+ * with 2.2.x, and also retains compatibility with earlier 2.4.x
+ * kernels...
+ */
+
+ info->mem_unit = 1;
+ info->totalram <<= bitcount;
+ info->freeram <<= bitcount;
+ info->sharedram <<= bitcount;
+ info->bufferram <<= bitcount;
+ info->totalswap <<= bitcount;
+ info->freeswap <<= bitcount;
+ info->totalhigh <<= bitcount;
+ info->freehigh <<= bitcount;
+
+out:
+ return 0;
+}
+
+SYSCALL_DEFINE1(sysinfo, struct sysinfo __user *, info)
+{
+ struct sysinfo val;
+
+ do_sysinfo(&val);
+
+ if (copy_to_user(info, &val, sizeof(struct sysinfo)))
+ return -EFAULT;
+
+ return 0;
+}
+
+#ifdef CONFIG_COMPAT
+struct compat_sysinfo {
+ s32 uptime;
+ u32 loads[3];
+ u32 totalram;
+ u32 freeram;
+ u32 sharedram;
+ u32 bufferram;
+ u32 totalswap;
+ u32 freeswap;
+ u16 procs;
+ u16 pad;
+ u32 totalhigh;
+ u32 freehigh;
+ u32 mem_unit;
+ char _f[20-2*sizeof(u32)-sizeof(int)];
+};
+
+COMPAT_SYSCALL_DEFINE1(sysinfo, struct compat_sysinfo __user *, info)
+{
+ struct sysinfo s;
+
+ do_sysinfo(&s);
+
+ /* Check to see if any memory value is too large for 32-bit and scale
+ * down if needed
+ */
+ if ((s.totalram >> 32) || (s.totalswap >> 32)) {
+ int bitcount = 0;
+
+ while (s.mem_unit < PAGE_SIZE) {
+ s.mem_unit <<= 1;
+ bitcount++;
+ }
+
+ s.totalram >>= bitcount;
+ s.freeram >>= bitcount;
+ s.sharedram >>= bitcount;
+ s.bufferram >>= bitcount;
+ s.totalswap >>= bitcount;
+ s.freeswap >>= bitcount;
+ s.totalhigh >>= bitcount;
+ s.freehigh >>= bitcount;
+ }
+
+ if (!access_ok(VERIFY_WRITE, info, sizeof(struct compat_sysinfo)) ||
+ __put_user(s.uptime, &info->uptime) ||
+ __put_user(s.loads[0], &info->loads[0]) ||
+ __put_user(s.loads[1], &info->loads[1]) ||
+ __put_user(s.loads[2], &info->loads[2]) ||
+ __put_user(s.totalram, &info->totalram) ||
+ __put_user(s.freeram, &info->freeram) ||
+ __put_user(s.sharedram, &info->sharedram) ||
+ __put_user(s.bufferram, &info->bufferram) ||
+ __put_user(s.totalswap, &info->totalswap) ||
+ __put_user(s.freeswap, &info->freeswap) ||
+ __put_user(s.procs, &info->procs) ||
+ __put_user(s.totalhigh, &info->totalhigh) ||
+ __put_user(s.freehigh, &info->freehigh) ||
+ __put_user(s.mem_unit, &info->mem_unit))
+ return -EFAULT;
+
+ return 0;
+}
+#endif /* CONFIG_COMPAT */
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 395084d4ce1..7078052284f 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -20,6 +20,7 @@ cond_syscall(sys_quotactl);
cond_syscall(sys32_quotactl);
cond_syscall(sys_acct);
cond_syscall(sys_lookup_dcookie);
+cond_syscall(compat_sys_lookup_dcookie);
cond_syscall(sys_swapon);
cond_syscall(sys_swapoff);
cond_syscall(sys_kexec_load);
@@ -155,7 +156,7 @@ cond_syscall(compat_sys_process_vm_writev);
cond_syscall(sys_pciconfig_read);
cond_syscall(sys_pciconfig_write);
cond_syscall(sys_pciconfig_iobase);
-cond_syscall(sys32_ipc);
+cond_syscall(compat_sys_s390_ipc);
cond_syscall(ppc_rtas);
cond_syscall(sys_spu_run);
cond_syscall(sys_spu_create);
@@ -199,6 +200,7 @@ cond_syscall(sys_perf_event_open);
/* fanotify! */
cond_syscall(sys_fanotify_init);
cond_syscall(sys_fanotify_mark);
+cond_syscall(compat_sys_fanotify_mark);
/* open by handle */
cond_syscall(sys_name_to_handle_at);
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index ebf72358e86..aea4a9ea6fc 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -15,6 +15,7 @@
#include <linux/netdevice.h>
#include <linux/kernel.h>
#include <linux/slab.h>
+#include <linux/compat.h>
#ifdef CONFIG_SYSCTL_SYSCALL
@@ -1447,7 +1448,6 @@ SYSCALL_DEFINE1(sysctl, struct __sysctl_args __user *, args)
#ifdef CONFIG_COMPAT
-#include <asm/compat.h>
struct compat_sysctl_args {
compat_uptr_t name;
@@ -1459,7 +1459,7 @@ struct compat_sysctl_args {
compat_ulong_t __unused[4];
};
-asmlinkage long compat_sys_sysctl(struct compat_sysctl_args __user *args)
+COMPAT_SYSCALL_DEFINE1(sysctl, struct compat_sysctl_args __user *, args)
{
struct compat_sysctl_args tmp;
compat_size_t __user *compat_oldlenp;
diff --git a/kernel/time.c b/kernel/time.c
index f8342a41efa..d3617dbd3dc 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -138,13 +138,14 @@ int persistent_clock_is_local;
*/
static inline void warp_clock(void)
{
- struct timespec adjust;
+ if (sys_tz.tz_minuteswest != 0) {
+ struct timespec adjust;
- adjust = current_kernel_time();
- if (sys_tz.tz_minuteswest != 0)
persistent_clock_is_local = 1;
- adjust.tv_sec += sys_tz.tz_minuteswest * 60;
- do_settimeofday(&adjust);
+ adjust.tv_sec = sys_tz.tz_minuteswest * 60;
+ adjust.tv_nsec = 0;
+ timekeeping_inject_offset(&adjust);
+ }
}
/*
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 24510d84efd..e4c07b0692b 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -64,20 +64,88 @@ config GENERIC_CMOS_UPDATE
if GENERIC_CLOCKEVENTS
menu "Timers subsystem"
-# Core internal switch. Selected by NO_HZ / HIGH_RES_TIMERS. This is
+# Core internal switch. Selected by NO_HZ_COMMON / HIGH_RES_TIMERS. This is
# only related to the tick functionality. Oneshot clockevent devices
# are supported independ of this.
config TICK_ONESHOT
bool
-config NO_HZ
- bool "Tickless System (Dynamic Ticks)"
+config NO_HZ_COMMON
+ bool
depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
select TICK_ONESHOT
+
+choice
+ prompt "Timer tick handling"
+ default NO_HZ_IDLE if NO_HZ
+
+config HZ_PERIODIC
+ bool "Periodic timer ticks (constant rate, no dynticks)"
+ help
+ This option keeps the tick running periodically at a constant
+ rate, even when the CPU doesn't need it.
+
+config NO_HZ_IDLE
+ bool "Idle dynticks system (tickless idle)"
+ depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
+ select NO_HZ_COMMON
+ help
+ This option enables a tickless idle system: timer interrupts
+ will only trigger on an as-needed basis when the system is idle.
+ This is usually interesting for energy saving.
+
+ Most of the time you want to say Y here.
+
+config NO_HZ_FULL
+ bool "Full dynticks system (tickless)"
+ # NO_HZ_COMMON dependency
+ depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
+ # We need at least one periodic CPU for timekeeping
+ depends on SMP
+ # RCU_USER_QS dependency
+ depends on HAVE_CONTEXT_TRACKING
+ # VIRT_CPU_ACCOUNTING_GEN dependency
+ depends on 64BIT
+ select NO_HZ_COMMON
+ select RCU_USER_QS
+ select RCU_NOCB_CPU
+ select VIRT_CPU_ACCOUNTING_GEN
+ select CONTEXT_TRACKING_FORCE
+ select IRQ_WORK
+ help
+ Adaptively try to shutdown the tick whenever possible, even when
+ the CPU is running tasks. Typically this requires running a single
+ task on the CPU. Chances for running tickless are maximized when
+ the task mostly runs in userspace and has few kernel activity.
+
+ You need to fill up the nohz_full boot parameter with the
+ desired range of dynticks CPUs.
+
+ This is implemented at the expense of some overhead in user <-> kernel
+ transitions: syscalls, exceptions and interrupts. Even when it's
+ dynamically off.
+
+ Say N.
+
+endchoice
+
+config NO_HZ_FULL_ALL
+ bool "Full dynticks system on all CPUs by default"
+ depends on NO_HZ_FULL
+ help
+ If the user doesn't pass the nohz_full boot option to
+ define the range of full dynticks CPUs, consider that all
+ CPUs in the system are full dynticks by default.
+ Note the boot CPU will still be kept outside the range to
+ handle the timekeeping duty.
+
+config NO_HZ
+ bool "Old Idle dynticks config"
+ depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
help
- This option enables a tickless system: timer interrupts will
- only trigger on an as-needed basis both when the system is
- busy and when the system is idle.
+ This is the old config entry that enables dynticks idle.
+ We keep it around for a little while to enforce backward
+ compatibility with older config files.
config HIGH_RES_TIMERS
bool "High Resolution Timer Support"
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 072bb066bb7..12ff13a838c 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -18,13 +18,14 @@
#include <linux/rtc.h>
#include "tick-internal.h"
+#include "ntp_internal.h"
/*
* NTP timekeeping variables:
+ *
+ * Note: All of the NTP state is protected by the timekeeping locks.
*/
-DEFINE_RAW_SPINLOCK(ntp_lock);
-
/* USER_HZ period (usecs): */
unsigned long tick_usec = TICK_USEC;
@@ -53,9 +54,6 @@ static int time_state = TIME_OK;
/* clock status bits: */
static int time_status = STA_UNSYNC;
-/* TAI offset (secs): */
-static long time_tai;
-
/* time adjustment (nsecs): */
static s64 time_offset;
@@ -134,8 +132,6 @@ static inline void pps_reset_freq_interval(void)
/**
* pps_clear - Clears the PPS state variables
- *
- * Must be called while holding a write on the ntp_lock
*/
static inline void pps_clear(void)
{
@@ -150,8 +146,6 @@ static inline void pps_clear(void)
/* Decrease pps_valid to indicate that another second has passed since
* the last PPS signal. When it reaches 0, indicate that PPS signal is
* missing.
- *
- * Must be called while holding a write on the ntp_lock
*/
static inline void pps_dec_valid(void)
{
@@ -346,10 +340,6 @@ static void ntp_update_offset(long offset)
*/
void ntp_clear(void)
{
- unsigned long flags;
-
- raw_spin_lock_irqsave(&ntp_lock, flags);
-
time_adjust = 0; /* stop active adjtime() */
time_status |= STA_UNSYNC;
time_maxerror = NTP_PHASE_LIMIT;
@@ -362,20 +352,12 @@ void ntp_clear(void)
/* Clear PPS state variables */
pps_clear();
- raw_spin_unlock_irqrestore(&ntp_lock, flags);
-
}
u64 ntp_tick_length(void)
{
- unsigned long flags;
- s64 ret;
-
- raw_spin_lock_irqsave(&ntp_lock, flags);
- ret = tick_length;
- raw_spin_unlock_irqrestore(&ntp_lock, flags);
- return ret;
+ return tick_length;
}
@@ -393,9 +375,6 @@ int second_overflow(unsigned long secs)
{
s64 delta;
int leap = 0;
- unsigned long flags;
-
- raw_spin_lock_irqsave(&ntp_lock, flags);
/*
* Leap second processing. If in leap-insert state at the end of the
@@ -415,7 +394,6 @@ int second_overflow(unsigned long secs)
else if (secs % 86400 == 0) {
leap = -1;
time_state = TIME_OOP;
- time_tai++;
printk(KERN_NOTICE
"Clock: inserting leap second 23:59:60 UTC\n");
}
@@ -425,7 +403,6 @@ int second_overflow(unsigned long secs)
time_state = TIME_OK;
else if ((secs + 1) % 86400 == 0) {
leap = 1;
- time_tai--;
time_state = TIME_WAIT;
printk(KERN_NOTICE
"Clock: deleting leap second 23:59:59 UTC\n");
@@ -479,8 +456,6 @@ int second_overflow(unsigned long secs)
time_adjust = 0;
out:
- raw_spin_unlock_irqrestore(&ntp_lock, flags);
-
return leap;
}
@@ -575,11 +550,10 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts)
time_status |= txc->status & ~STA_RONLY;
}
-/*
- * Called with ntp_lock held, so we can access and modify
- * all the global NTP state:
- */
-static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts)
+
+static inline void process_adjtimex_modes(struct timex *txc,
+ struct timespec *ts,
+ s32 *time_tai)
{
if (txc->modes & ADJ_STATUS)
process_adj_status(txc, ts);
@@ -613,7 +587,7 @@ static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts
}
if (txc->modes & ADJ_TAI && txc->constant > 0)
- time_tai = txc->constant;
+ *time_tai = txc->constant;
if (txc->modes & ADJ_OFFSET)
ntp_update_offset(txc->offset);
@@ -625,16 +599,13 @@ static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts
ntp_update_frequency();
}
-/*
- * adjtimex mainly allows reading (and writing, if superuser) of
- * kernel time-keeping variables. used by xntpd.
+
+
+/**
+ * ntp_validate_timex - Ensures the timex is ok for use in do_adjtimex
*/
-int do_adjtimex(struct timex *txc)
+int ntp_validate_timex(struct timex *txc)
{
- struct timespec ts;
- int result;
-
- /* Validate the data before disabling interrupts */
if (txc->modes & ADJ_ADJTIME) {
/* singleshot must not be used with any other mode bits */
if (!(txc->modes & ADJ_OFFSET_SINGLESHOT))
@@ -646,7 +617,6 @@ int do_adjtimex(struct timex *txc)
/* In order to modify anything, you gotta be super-user! */
if (txc->modes && !capable(CAP_SYS_TIME))
return -EPERM;
-
/*
* if the quartz is off by more than 10% then
* something is VERY wrong!
@@ -657,22 +627,20 @@ int do_adjtimex(struct timex *txc)
return -EINVAL;
}
- if (txc->modes & ADJ_SETOFFSET) {
- struct timespec delta;
- delta.tv_sec = txc->time.tv_sec;
- delta.tv_nsec = txc->time.tv_usec;
- if (!capable(CAP_SYS_TIME))
- return -EPERM;
- if (!(txc->modes & ADJ_NANO))
- delta.tv_nsec *= 1000;
- result = timekeeping_inject_offset(&delta);
- if (result)
- return result;
- }
+ if ((txc->modes & ADJ_SETOFFSET) && (!capable(CAP_SYS_TIME)))
+ return -EPERM;
- getnstimeofday(&ts);
+ return 0;
+}
- raw_spin_lock_irq(&ntp_lock);
+
+/*
+ * adjtimex mainly allows reading (and writing, if superuser) of
+ * kernel time-keeping variables. used by xntpd.
+ */
+int __do_adjtimex(struct timex *txc, struct timespec *ts, s32 *time_tai)
+{
+ int result;
if (txc->modes & ADJ_ADJTIME) {
long save_adjust = time_adjust;
@@ -687,7 +655,7 @@ int do_adjtimex(struct timex *txc)
/* If there are input parameters, then process them: */
if (txc->modes)
- process_adjtimex_modes(txc, &ts);
+ process_adjtimex_modes(txc, ts, time_tai);
txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ,
NTP_SCALE_SHIFT);
@@ -709,15 +677,13 @@ int do_adjtimex(struct timex *txc)
txc->precision = 1;
txc->tolerance = MAXFREQ_SCALED / PPM_SCALE;
txc->tick = tick_usec;
- txc->tai = time_tai;
+ txc->tai = *time_tai;
/* fill PPS status fields */
pps_fill_timex(txc);
- raw_spin_unlock_irq(&ntp_lock);
-
- txc->time.tv_sec = ts.tv_sec;
- txc->time.tv_usec = ts.tv_nsec;
+ txc->time.tv_sec = ts->tv_sec;
+ txc->time.tv_usec = ts->tv_nsec;
if (!(time_status & STA_NANO))
txc->time.tv_usec /= NSEC_PER_USEC;
@@ -894,7 +860,7 @@ static void hardpps_update_phase(long error)
}
/*
- * hardpps() - discipline CPU clock oscillator to external PPS signal
+ * __hardpps() - discipline CPU clock oscillator to external PPS signal
*
* This routine is called at each PPS signal arrival in order to
* discipline the CPU clock oscillator to the PPS signal. It takes two
@@ -905,15 +871,13 @@ static void hardpps_update_phase(long error)
* This code is based on David Mills's reference nanokernel
* implementation. It was mostly rewritten but keeps the same idea.
*/
-void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
+void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
{
struct pps_normtime pts_norm, freq_norm;
unsigned long flags;
pts_norm = pps_normalize_ts(*phase_ts);
- raw_spin_lock_irqsave(&ntp_lock, flags);
-
/* clear the error bits, they will be set again if needed */
time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR);
@@ -925,7 +889,6 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
* just start the frequency interval */
if (unlikely(pps_fbase.tv_sec == 0)) {
pps_fbase = *raw_ts;
- raw_spin_unlock_irqrestore(&ntp_lock, flags);
return;
}
@@ -940,7 +903,6 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
time_status |= STA_PPSJITTER;
/* restart the frequency calibration interval */
pps_fbase = *raw_ts;
- raw_spin_unlock_irqrestore(&ntp_lock, flags);
pr_err("hardpps: PPSJITTER: bad pulse\n");
return;
}
@@ -957,10 +919,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
hardpps_update_phase(pts_norm.nsec);
- raw_spin_unlock_irqrestore(&ntp_lock, flags);
}
-EXPORT_SYMBOL(hardpps);
-
#endif /* CONFIG_NTP_PPS */
static int __init ntp_tick_adj_setup(char *str)
diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h
new file mode 100644
index 00000000000..1950cb4ca2a
--- /dev/null
+++ b/kernel/time/ntp_internal.h
@@ -0,0 +1,12 @@
+#ifndef _LINUX_NTP_INTERNAL_H
+#define _LINUX_NTP_INTERNAL_H
+
+extern void ntp_init(void);
+extern void ntp_clear(void);
+/* Returns how long ticks are at present, in ns / 2^NTP_SCALE_SHIFT. */
+extern u64 ntp_tick_length(void);
+extern int second_overflow(unsigned long secs);
+extern int ntp_validate_timex(struct timex *);
+extern int __do_adjtimex(struct timex *, struct timespec *, s32 *);
+extern void __hardpps(const struct timespec *, const struct timespec *);
+#endif /* _LINUX_NTP_INTERNAL_H */
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 7f32fe0e52c..206bbfb34e0 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -28,9 +28,8 @@
*/
static struct tick_device tick_broadcast_device;
-/* FIXME: Use cpumask_var_t. */
-static DECLARE_BITMAP(tick_broadcast_mask, NR_CPUS);
-static DECLARE_BITMAP(tmpmask, NR_CPUS);
+static cpumask_var_t tick_broadcast_mask;
+static cpumask_var_t tmpmask;
static DEFINE_RAW_SPINLOCK(tick_broadcast_lock);
static int tick_broadcast_force;
@@ -50,7 +49,7 @@ struct tick_device *tick_get_broadcast_device(void)
struct cpumask *tick_get_broadcast_mask(void)
{
- return to_cpumask(tick_broadcast_mask);
+ return tick_broadcast_mask;
}
/*
@@ -67,6 +66,8 @@ static void tick_broadcast_start_periodic(struct clock_event_device *bc)
*/
int tick_check_broadcast_device(struct clock_event_device *dev)
{
+ struct clock_event_device *cur = tick_broadcast_device.evtdev;
+
if ((dev->features & CLOCK_EVT_FEAT_DUMMY) ||
(tick_broadcast_device.evtdev &&
tick_broadcast_device.evtdev->rating >= dev->rating) ||
@@ -74,9 +75,21 @@ int tick_check_broadcast_device(struct clock_event_device *dev)
return 0;
clockevents_exchange_device(tick_broadcast_device.evtdev, dev);
+ if (cur)
+ cur->event_handler = clockevents_handle_noop;
tick_broadcast_device.evtdev = dev;
- if (!cpumask_empty(tick_get_broadcast_mask()))
+ if (!cpumask_empty(tick_broadcast_mask))
tick_broadcast_start_periodic(dev);
+ /*
+ * Inform all cpus about this. We might be in a situation
+ * where we did not switch to oneshot mode because the per cpu
+ * devices are affected by CLOCK_EVT_FEAT_C3STOP and the lack
+ * of a oneshot capable broadcast device. Without that
+ * notification the systems stays stuck in periodic mode
+ * forever.
+ */
+ if (dev->features & CLOCK_EVT_FEAT_ONESHOT)
+ tick_clock_notify();
return 1;
}
@@ -124,7 +137,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
if (!tick_device_is_functional(dev)) {
dev->event_handler = tick_handle_periodic;
tick_device_setup_broadcast_func(dev);
- cpumask_set_cpu(cpu, tick_get_broadcast_mask());
+ cpumask_set_cpu(cpu, tick_broadcast_mask);
tick_broadcast_start_periodic(tick_broadcast_device.evtdev);
ret = 1;
} else {
@@ -135,7 +148,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
*/
if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) {
int cpu = smp_processor_id();
- cpumask_clear_cpu(cpu, tick_get_broadcast_mask());
+ cpumask_clear_cpu(cpu, tick_broadcast_mask);
tick_broadcast_clear_oneshot(cpu);
} else {
tick_device_setup_broadcast_func(dev);
@@ -199,9 +212,8 @@ static void tick_do_periodic_broadcast(void)
{
raw_spin_lock(&tick_broadcast_lock);
- cpumask_and(to_cpumask(tmpmask),
- cpu_online_mask, tick_get_broadcast_mask());
- tick_do_broadcast(to_cpumask(tmpmask));
+ cpumask_and(tmpmask, cpu_online_mask, tick_broadcast_mask);
+ tick_do_broadcast(tmpmask);
raw_spin_unlock(&tick_broadcast_lock);
}
@@ -264,13 +276,12 @@ static void tick_do_broadcast_on_off(unsigned long *reason)
if (!tick_device_is_functional(dev))
goto out;
- bc_stopped = cpumask_empty(tick_get_broadcast_mask());
+ bc_stopped = cpumask_empty(tick_broadcast_mask);
switch (*reason) {
case CLOCK_EVT_NOTIFY_BROADCAST_ON:
case CLOCK_EVT_NOTIFY_BROADCAST_FORCE:
- if (!cpumask_test_cpu(cpu, tick_get_broadcast_mask())) {
- cpumask_set_cpu(cpu, tick_get_broadcast_mask());
+ if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) {
if (tick_broadcast_device.mode ==
TICKDEV_MODE_PERIODIC)
clockevents_shutdown(dev);
@@ -280,8 +291,7 @@ static void tick_do_broadcast_on_off(unsigned long *reason)
break;
case CLOCK_EVT_NOTIFY_BROADCAST_OFF:
if (!tick_broadcast_force &&
- cpumask_test_cpu(cpu, tick_get_broadcast_mask())) {
- cpumask_clear_cpu(cpu, tick_get_broadcast_mask());
+ cpumask_test_and_clear_cpu(cpu, tick_broadcast_mask)) {
if (tick_broadcast_device.mode ==
TICKDEV_MODE_PERIODIC)
tick_setup_periodic(dev, 0);
@@ -289,7 +299,7 @@ static void tick_do_broadcast_on_off(unsigned long *reason)
break;
}
- if (cpumask_empty(tick_get_broadcast_mask())) {
+ if (cpumask_empty(tick_broadcast_mask)) {
if (!bc_stopped)
clockevents_shutdown(bc);
} else if (bc_stopped) {
@@ -338,10 +348,10 @@ void tick_shutdown_broadcast(unsigned int *cpup)
raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
bc = tick_broadcast_device.evtdev;
- cpumask_clear_cpu(cpu, tick_get_broadcast_mask());
+ cpumask_clear_cpu(cpu, tick_broadcast_mask);
if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) {
- if (bc && cpumask_empty(tick_get_broadcast_mask()))
+ if (bc && cpumask_empty(tick_broadcast_mask))
clockevents_shutdown(bc);
}
@@ -377,13 +387,13 @@ int tick_resume_broadcast(void)
switch (tick_broadcast_device.mode) {
case TICKDEV_MODE_PERIODIC:
- if (!cpumask_empty(tick_get_broadcast_mask()))
+ if (!cpumask_empty(tick_broadcast_mask))
tick_broadcast_start_periodic(bc);
broadcast = cpumask_test_cpu(smp_processor_id(),
- tick_get_broadcast_mask());
+ tick_broadcast_mask);
break;
case TICKDEV_MODE_ONESHOT:
- if (!cpumask_empty(tick_get_broadcast_mask()))
+ if (!cpumask_empty(tick_broadcast_mask))
broadcast = tick_resume_broadcast_oneshot(bc);
break;
}
@@ -396,25 +406,58 @@ int tick_resume_broadcast(void)
#ifdef CONFIG_TICK_ONESHOT
-/* FIXME: use cpumask_var_t. */
-static DECLARE_BITMAP(tick_broadcast_oneshot_mask, NR_CPUS);
+static cpumask_var_t tick_broadcast_oneshot_mask;
+static cpumask_var_t tick_broadcast_pending_mask;
+static cpumask_var_t tick_broadcast_force_mask;
/*
* Exposed for debugging: see timer_list.c
*/
struct cpumask *tick_get_broadcast_oneshot_mask(void)
{
- return to_cpumask(tick_broadcast_oneshot_mask);
+ return tick_broadcast_oneshot_mask;
}
-static int tick_broadcast_set_event(ktime_t expires, int force)
+/*
+ * Called before going idle with interrupts disabled. Checks whether a
+ * broadcast event from the other core is about to happen. We detected
+ * that in tick_broadcast_oneshot_control(). The callsite can use this
+ * to avoid a deep idle transition as we are about to get the
+ * broadcast IPI right away.
+ */
+int tick_check_broadcast_expired(void)
{
- struct clock_event_device *bc = tick_broadcast_device.evtdev;
+ return cpumask_test_cpu(smp_processor_id(), tick_broadcast_force_mask);
+}
+
+/*
+ * Set broadcast interrupt affinity
+ */
+static void tick_broadcast_set_affinity(struct clock_event_device *bc,
+ const struct cpumask *cpumask)
+{
+ if (!(bc->features & CLOCK_EVT_FEAT_DYNIRQ))
+ return;
+
+ if (cpumask_equal(bc->cpumask, cpumask))
+ return;
+
+ bc->cpumask = cpumask;
+ irq_set_affinity(bc->irq, bc->cpumask);
+}
+
+static int tick_broadcast_set_event(struct clock_event_device *bc, int cpu,
+ ktime_t expires, int force)
+{
+ int ret;
if (bc->mode != CLOCK_EVT_MODE_ONESHOT)
clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
- return clockevents_program_event(bc, expires, force);
+ ret = clockevents_program_event(bc, expires, force);
+ if (!ret)
+ tick_broadcast_set_affinity(bc, cpumask_of(cpu));
+ return ret;
}
int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
@@ -429,7 +472,7 @@ int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
*/
void tick_check_oneshot_broadcast(int cpu)
{
- if (cpumask_test_cpu(cpu, to_cpumask(tick_broadcast_oneshot_mask))) {
+ if (cpumask_test_cpu(cpu, tick_broadcast_oneshot_mask)) {
struct tick_device *td = &per_cpu(tick_cpu_device, cpu);
clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_ONESHOT);
@@ -443,27 +486,39 @@ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev)
{
struct tick_device *td;
ktime_t now, next_event;
- int cpu;
+ int cpu, next_cpu = 0;
raw_spin_lock(&tick_broadcast_lock);
again:
dev->next_event.tv64 = KTIME_MAX;
next_event.tv64 = KTIME_MAX;
- cpumask_clear(to_cpumask(tmpmask));
+ cpumask_clear(tmpmask);
now = ktime_get();
/* Find all expired events */
- for_each_cpu(cpu, tick_get_broadcast_oneshot_mask()) {
+ for_each_cpu(cpu, tick_broadcast_oneshot_mask) {
td = &per_cpu(tick_cpu_device, cpu);
- if (td->evtdev->next_event.tv64 <= now.tv64)
- cpumask_set_cpu(cpu, to_cpumask(tmpmask));
- else if (td->evtdev->next_event.tv64 < next_event.tv64)
+ if (td->evtdev->next_event.tv64 <= now.tv64) {
+ cpumask_set_cpu(cpu, tmpmask);
+ /*
+ * Mark the remote cpu in the pending mask, so
+ * it can avoid reprogramming the cpu local
+ * timer in tick_broadcast_oneshot_control().
+ */
+ cpumask_set_cpu(cpu, tick_broadcast_pending_mask);
+ } else if (td->evtdev->next_event.tv64 < next_event.tv64) {
next_event.tv64 = td->evtdev->next_event.tv64;
+ next_cpu = cpu;
+ }
}
+ /* Take care of enforced broadcast requests */
+ cpumask_or(tmpmask, tmpmask, tick_broadcast_force_mask);
+ cpumask_clear(tick_broadcast_force_mask);
+
/*
* Wakeup the cpus which have an expired event.
*/
- tick_do_broadcast(to_cpumask(tmpmask));
+ tick_do_broadcast(tmpmask);
/*
* Two reasons for reprogram:
@@ -480,7 +535,7 @@ again:
* Rearm the broadcast device. If event expired,
* repeat the above
*/
- if (tick_broadcast_set_event(next_event, 0))
+ if (tick_broadcast_set_event(dev, next_cpu, next_event, 0))
goto again;
}
raw_spin_unlock(&tick_broadcast_lock);
@@ -495,6 +550,7 @@ void tick_broadcast_oneshot_control(unsigned long reason)
struct clock_event_device *bc, *dev;
struct tick_device *td;
unsigned long flags;
+ ktime_t now;
int cpu;
/*
@@ -519,21 +575,84 @@ void tick_broadcast_oneshot_control(unsigned long reason)
raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) {
- if (!cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) {
- cpumask_set_cpu(cpu, tick_get_broadcast_oneshot_mask());
+ WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask));
+ if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) {
clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN);
- if (dev->next_event.tv64 < bc->next_event.tv64)
- tick_broadcast_set_event(dev->next_event, 1);
+ /*
+ * We only reprogram the broadcast timer if we
+ * did not mark ourself in the force mask and
+ * if the cpu local event is earlier than the
+ * broadcast event. If the current CPU is in
+ * the force mask, then we are going to be
+ * woken by the IPI right away.
+ */
+ if (!cpumask_test_cpu(cpu, tick_broadcast_force_mask) &&
+ dev->next_event.tv64 < bc->next_event.tv64)
+ tick_broadcast_set_event(bc, cpu, dev->next_event, 1);
}
} else {
- if (cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) {
- cpumask_clear_cpu(cpu,
- tick_get_broadcast_oneshot_mask());
+ if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) {
clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
- if (dev->next_event.tv64 != KTIME_MAX)
- tick_program_event(dev->next_event, 1);
+ if (dev->next_event.tv64 == KTIME_MAX)
+ goto out;
+ /*
+ * The cpu which was handling the broadcast
+ * timer marked this cpu in the broadcast
+ * pending mask and fired the broadcast
+ * IPI. So we are going to handle the expired
+ * event anyway via the broadcast IPI
+ * handler. No need to reprogram the timer
+ * with an already expired event.
+ */
+ if (cpumask_test_and_clear_cpu(cpu,
+ tick_broadcast_pending_mask))
+ goto out;
+
+ /*
+ * If the pending bit is not set, then we are
+ * either the CPU handling the broadcast
+ * interrupt or we got woken by something else.
+ *
+ * We are not longer in the broadcast mask, so
+ * if the cpu local expiry time is already
+ * reached, we would reprogram the cpu local
+ * timer with an already expired event.
+ *
+ * This can lead to a ping-pong when we return
+ * to idle and therefor rearm the broadcast
+ * timer before the cpu local timer was able
+ * to fire. This happens because the forced
+ * reprogramming makes sure that the event
+ * will happen in the future and depending on
+ * the min_delta setting this might be far
+ * enough out that the ping-pong starts.
+ *
+ * If the cpu local next_event has expired
+ * then we know that the broadcast timer
+ * next_event has expired as well and
+ * broadcast is about to be handled. So we
+ * avoid reprogramming and enforce that the
+ * broadcast handler, which did not run yet,
+ * will invoke the cpu local handler.
+ *
+ * We cannot call the handler directly from
+ * here, because we might be in a NOHZ phase
+ * and we did not go through the irq_enter()
+ * nohz fixups.
+ */
+ now = ktime_get();
+ if (dev->next_event.tv64 <= now.tv64) {
+ cpumask_set_cpu(cpu, tick_broadcast_force_mask);
+ goto out;
+ }
+ /*
+ * We got woken by something else. Reprogram
+ * the cpu local timer device.
+ */
+ tick_program_event(dev->next_event, 1);
}
}
+out:
raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
}
@@ -544,7 +663,7 @@ void tick_broadcast_oneshot_control(unsigned long reason)
*/
static void tick_broadcast_clear_oneshot(int cpu)
{
- cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask());
+ cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask);
}
static void tick_broadcast_init_next_event(struct cpumask *mask,
@@ -574,7 +693,8 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
bc->event_handler = tick_handle_oneshot_broadcast;
/* Take the do_timer update */
- tick_do_timer_cpu = cpu;
+ if (!tick_nohz_full_cpu(cpu))
+ tick_do_timer_cpu = cpu;
/*
* We must be careful here. There might be other CPUs
@@ -582,17 +702,16 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
* oneshot_mask bits for those and program the
* broadcast device to fire.
*/
- cpumask_copy(to_cpumask(tmpmask), tick_get_broadcast_mask());
- cpumask_clear_cpu(cpu, to_cpumask(tmpmask));
- cpumask_or(tick_get_broadcast_oneshot_mask(),
- tick_get_broadcast_oneshot_mask(),
- to_cpumask(tmpmask));
+ cpumask_copy(tmpmask, tick_broadcast_mask);
+ cpumask_clear_cpu(cpu, tmpmask);
+ cpumask_or(tick_broadcast_oneshot_mask,
+ tick_broadcast_oneshot_mask, tmpmask);
- if (was_periodic && !cpumask_empty(to_cpumask(tmpmask))) {
+ if (was_periodic && !cpumask_empty(tmpmask)) {
clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
- tick_broadcast_init_next_event(to_cpumask(tmpmask),
+ tick_broadcast_init_next_event(tmpmask,
tick_next_period);
- tick_broadcast_set_event(tick_next_period, 1);
+ tick_broadcast_set_event(bc, cpu, tick_next_period, 1);
} else
bc->next_event.tv64 = KTIME_MAX;
} else {
@@ -640,7 +759,7 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
* Clear the broadcast mask flag for the dead cpu, but do not
* stop the broadcast device!
*/
- cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask());
+ cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask);
raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
}
@@ -664,3 +783,14 @@ bool tick_broadcast_oneshot_available(void)
}
#endif
+
+void __init tick_broadcast_init(void)
+{
+ alloc_cpumask_var(&tick_broadcast_mask, GFP_NOWAIT);
+ alloc_cpumask_var(&tmpmask, GFP_NOWAIT);
+#ifdef CONFIG_TICK_ONESHOT
+ alloc_cpumask_var(&tick_broadcast_oneshot_mask, GFP_NOWAIT);
+ alloc_cpumask_var(&tick_broadcast_pending_mask, GFP_NOWAIT);
+ alloc_cpumask_var(&tick_broadcast_force_mask, GFP_NOWAIT);
+#endif
+}
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index b1600a6973f..5d3fb100bc0 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -163,7 +163,10 @@ static void tick_setup_device(struct tick_device *td,
* this cpu:
*/
if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) {
- tick_do_timer_cpu = cpu;
+ if (!tick_nohz_full_cpu(cpu))
+ tick_do_timer_cpu = cpu;
+ else
+ tick_do_timer_cpu = TICK_DO_TIMER_NONE;
tick_next_period = ktime_get();
tick_period = ktime_set(0, NSEC_PER_SEC / HZ);
}
@@ -323,6 +326,7 @@ static void tick_shutdown(unsigned int *cpup)
*/
dev->mode = CLOCK_EVT_MODE_UNUSED;
clockevents_exchange_device(dev, NULL);
+ dev->event_handler = clockevents_handle_noop;
td->evtdev = NULL;
}
raw_spin_unlock_irqrestore(&tick_device_lock, flags);
@@ -416,4 +420,5 @@ static struct notifier_block tick_notifier = {
void __init tick_init(void)
{
clockevents_register_notifier(&tick_notifier);
+ tick_broadcast_init();
}
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index cf3e59ed6dc..f0299eae460 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -4,6 +4,8 @@
#include <linux/hrtimer.h>
#include <linux/tick.h>
+extern seqlock_t jiffies_lock;
+
#ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD
#define TICK_DO_TIMER_NONE -1
@@ -94,7 +96,7 @@ extern void tick_broadcast_on_off(unsigned long reason, int *oncpu);
extern void tick_shutdown_broadcast(unsigned int *cpup);
extern void tick_suspend_broadcast(void);
extern int tick_resume_broadcast(void);
-
+extern void tick_broadcast_init(void);
extern void
tick_set_periodic_handler(struct clock_event_device *dev, int broadcast);
@@ -119,6 +121,7 @@ static inline void tick_broadcast_on_off(unsigned long reason, int *oncpu) { }
static inline void tick_shutdown_broadcast(unsigned int *cpup) { }
static inline void tick_suspend_broadcast(void) { }
static inline int tick_resume_broadcast(void) { return 0; }
+static inline void tick_broadcast_init(void) { }
/*
* Set the periodic handler in non broadcast mode
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index a19a39952c1..bc67d4245e1 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -21,11 +21,15 @@
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/irq_work.h>
+#include <linux/posix-timers.h>
+#include <linux/perf_event.h>
#include <asm/irq_regs.h>
#include "tick-internal.h"
+#include <trace/events/timer.h>
+
/*
* Per cpu nohz control structure
*/
@@ -104,7 +108,7 @@ static void tick_sched_do_timer(ktime_t now)
{
int cpu = smp_processor_id();
-#ifdef CONFIG_NO_HZ
+#ifdef CONFIG_NO_HZ_COMMON
/*
* Check if the do_timer duty was dropped. We don't care about
* concurrency: This happens only when the cpu in charge went
@@ -112,7 +116,8 @@ static void tick_sched_do_timer(ktime_t now)
* this duty, then the jiffies update is still serialized by
* jiffies_lock.
*/
- if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
+ if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)
+ && !tick_nohz_full_cpu(cpu))
tick_do_timer_cpu = cpu;
#endif
@@ -123,7 +128,7 @@ static void tick_sched_do_timer(ktime_t now)
static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
{
-#ifdef CONFIG_NO_HZ
+#ifdef CONFIG_NO_HZ_COMMON
/*
* When we are idle and the tick is stopped, we have to touch
* the watchdog as we might not schedule for a really long
@@ -142,10 +147,226 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
profile_tick(CPU_PROFILING);
}
+#ifdef CONFIG_NO_HZ_FULL
+static cpumask_var_t nohz_full_mask;
+bool have_nohz_full_mask;
+
+static bool can_stop_full_tick(void)
+{
+ WARN_ON_ONCE(!irqs_disabled());
+
+ if (!sched_can_stop_tick()) {
+ trace_tick_stop(0, "more than 1 task in runqueue\n");
+ return false;
+ }
+
+ if (!posix_cpu_timers_can_stop_tick(current)) {
+ trace_tick_stop(0, "posix timers running\n");
+ return false;
+ }
+
+ if (!perf_event_can_stop_tick()) {
+ trace_tick_stop(0, "perf events running\n");
+ return false;
+ }
+
+ /* sched_clock_tick() needs us? */
+#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
+ /*
+ * TODO: kick full dynticks CPUs when
+ * sched_clock_stable is set.
+ */
+ if (!sched_clock_stable) {
+ trace_tick_stop(0, "unstable sched clock\n");
+ return false;
+ }
+#endif
+
+ return true;
+}
+
+static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now);
+
+/*
+ * Re-evaluate the need for the tick on the current CPU
+ * and restart it if necessary.
+ */
+void tick_nohz_full_check(void)
+{
+ struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
+
+ if (tick_nohz_full_cpu(smp_processor_id())) {
+ if (ts->tick_stopped && !is_idle_task(current)) {
+ if (!can_stop_full_tick())
+ tick_nohz_restart_sched_tick(ts, ktime_get());
+ }
+ }
+}
+
+static void nohz_full_kick_work_func(struct irq_work *work)
+{
+ tick_nohz_full_check();
+}
+
+static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
+ .func = nohz_full_kick_work_func,
+};
+
+/*
+ * Kick the current CPU if it's full dynticks in order to force it to
+ * re-evaluate its dependency on the tick and restart it if necessary.
+ */
+void tick_nohz_full_kick(void)
+{
+ if (tick_nohz_full_cpu(smp_processor_id()))
+ irq_work_queue(&__get_cpu_var(nohz_full_kick_work));
+}
+
+static void nohz_full_kick_ipi(void *info)
+{
+ tick_nohz_full_check();
+}
+
+/*
+ * Kick all full dynticks CPUs in order to force these to re-evaluate
+ * their dependency on the tick and restart it if necessary.
+ */
+void tick_nohz_full_kick_all(void)
+{
+ if (!have_nohz_full_mask)
+ return;
+
+ preempt_disable();
+ smp_call_function_many(nohz_full_mask,
+ nohz_full_kick_ipi, NULL, false);
+ preempt_enable();
+}
+
+/*
+ * Re-evaluate the need for the tick as we switch the current task.
+ * It might need the tick due to per task/process properties:
+ * perf events, posix cpu timers, ...
+ */
+void tick_nohz_task_switch(struct task_struct *tsk)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+
+ if (!tick_nohz_full_cpu(smp_processor_id()))
+ goto out;
+
+ if (tick_nohz_tick_stopped() && !can_stop_full_tick())
+ tick_nohz_full_kick();
+
+out:
+ local_irq_restore(flags);
+}
+
+int tick_nohz_full_cpu(int cpu)
+{
+ if (!have_nohz_full_mask)
+ return 0;
+
+ return cpumask_test_cpu(cpu, nohz_full_mask);
+}
+
+/* Parse the boot-time nohz CPU list from the kernel parameters. */
+static int __init tick_nohz_full_setup(char *str)
+{
+ int cpu;
+
+ alloc_bootmem_cpumask_var(&nohz_full_mask);
+ if (cpulist_parse(str, nohz_full_mask) < 0) {
+ pr_warning("NOHZ: Incorrect nohz_full cpumask\n");
+ return 1;
+ }
+
+ cpu = smp_processor_id();
+ if (cpumask_test_cpu(cpu, nohz_full_mask)) {
+ pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu);
+ cpumask_clear_cpu(cpu, nohz_full_mask);
+ }
+ have_nohz_full_mask = true;
+
+ return 1;
+}
+__setup("nohz_full=", tick_nohz_full_setup);
+
+static int __cpuinit tick_nohz_cpu_down_callback(struct notifier_block *nfb,
+ unsigned long action,
+ void *hcpu)
+{
+ unsigned int cpu = (unsigned long)hcpu;
+
+ switch (action & ~CPU_TASKS_FROZEN) {
+ case CPU_DOWN_PREPARE:
+ /*
+ * If we handle the timekeeping duty for full dynticks CPUs,
+ * we can't safely shutdown that CPU.
+ */
+ if (have_nohz_full_mask && tick_do_timer_cpu == cpu)
+ return -EINVAL;
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+/*
+ * Worst case string length in chunks of CPU range seems 2 steps
+ * separations: 0,2,4,6,...
+ * This is NR_CPUS + sizeof('\0')
+ */
+static char __initdata nohz_full_buf[NR_CPUS + 1];
+
+static int tick_nohz_init_all(void)
+{
+ int err = -1;
+
+#ifdef CONFIG_NO_HZ_FULL_ALL
+ if (!alloc_cpumask_var(&nohz_full_mask, GFP_KERNEL)) {
+ pr_err("NO_HZ: Can't allocate full dynticks cpumask\n");
+ return err;
+ }
+ err = 0;
+ cpumask_setall(nohz_full_mask);
+ cpumask_clear_cpu(smp_processor_id(), nohz_full_mask);
+ have_nohz_full_mask = true;
+#endif
+ return err;
+}
+
+void __init tick_nohz_init(void)
+{
+ int cpu;
+
+ if (!have_nohz_full_mask) {
+ if (tick_nohz_init_all() < 0)
+ return;
+ }
+
+ cpu_notifier(tick_nohz_cpu_down_callback, 0);
+
+ /* Make sure full dynticks CPU are also RCU nocbs */
+ for_each_cpu(cpu, nohz_full_mask) {
+ if (!rcu_is_nocb_cpu(cpu)) {
+ pr_warning("NO_HZ: CPU %d is not RCU nocb: "
+ "cleared from nohz_full range", cpu);
+ cpumask_clear_cpu(cpu, nohz_full_mask);
+ }
+ }
+
+ cpulist_scnprintf(nohz_full_buf, sizeof(nohz_full_buf), nohz_full_mask);
+ pr_info("NO_HZ: Full dynticks CPUs: %s.\n", nohz_full_buf);
+}
+#else
+#define have_nohz_full_mask (0)
+#endif
+
/*
* NOHZ - aka dynamic tick functionality
*/
-#ifdef CONFIG_NO_HZ
+#ifdef CONFIG_NO_HZ_COMMON
/*
* NO HZ enabled ?
*/
@@ -345,11 +566,12 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
delta_jiffies = rcu_delta_jiffies;
}
}
+
/*
- * Do not stop the tick, if we are only one off
- * or if the cpu is required for rcu
+ * Do not stop the tick, if we are only one off (or less)
+ * or if the cpu is required for RCU:
*/
- if (!ts->tick_stopped && delta_jiffies == 1)
+ if (!ts->tick_stopped && delta_jiffies <= 1)
goto out;
/* Schedule the tick, if we are at least one jiffie off */
@@ -378,6 +600,13 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
time_delta = KTIME_MAX;
}
+#ifdef CONFIG_NO_HZ_FULL
+ if (!ts->inidle) {
+ time_delta = min(time_delta,
+ scheduler_tick_max_deferment());
+ }
+#endif
+
/*
* calculate the expiry time for the next timer wheel
* timer. delta_jiffies >= NEXT_TIMER_MAX_DELTA signals
@@ -421,6 +650,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
ts->tick_stopped = 1;
+ trace_tick_stop(1, " ");
}
/*
@@ -457,6 +687,24 @@ out:
return ret;
}
+static void tick_nohz_full_stop_tick(struct tick_sched *ts)
+{
+#ifdef CONFIG_NO_HZ_FULL
+ int cpu = smp_processor_id();
+
+ if (!tick_nohz_full_cpu(cpu) || is_idle_task(current))
+ return;
+
+ if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE)
+ return;
+
+ if (!can_stop_full_tick())
+ return;
+
+ tick_nohz_stop_sched_tick(ts, ktime_get(), cpu);
+#endif
+}
+
static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
{
/*
@@ -482,13 +730,28 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts)
if (ratelimit < 10 &&
(local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) {
- printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
- (unsigned int) local_softirq_pending());
+ pr_warn("NOHZ: local_softirq_pending %02x\n",
+ (unsigned int) local_softirq_pending());
ratelimit++;
}
return false;
}
+ if (have_nohz_full_mask) {
+ /*
+ * Keep the tick alive to guarantee timekeeping progression
+ * if there are full dynticks CPUs around
+ */
+ if (tick_do_timer_cpu == cpu)
+ return false;
+ /*
+ * Boot safety: make sure the timekeeping duty has been
+ * assigned before entering dyntick-idle mode,
+ */
+ if (tick_do_timer_cpu == TICK_DO_TIMER_NONE)
+ return false;
+ }
+
return true;
}
@@ -568,12 +831,13 @@ void tick_nohz_irq_exit(void)
{
struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
- if (!ts->inidle)
- return;
-
- /* Cancel the timer because CPU already waken up from the C-states*/
- menu_hrtimer_cancel();
- __tick_nohz_idle_enter(ts);
+ if (ts->inidle) {
+ /* Cancel the timer because CPU already waken up from the C-states*/
+ menu_hrtimer_cancel();
+ __tick_nohz_idle_enter(ts);
+ } else {
+ tick_nohz_full_stop_tick(ts);
+ }
}
/**
@@ -802,7 +1066,7 @@ static inline void tick_check_nohz(int cpu)
static inline void tick_nohz_switch_to_nohz(void) { }
static inline void tick_check_nohz(int cpu) { }
-#endif /* NO_HZ */
+#endif /* CONFIG_NO_HZ_COMMON */
/*
* Called from irq_enter to notify about the possible interruption of idle()
@@ -887,14 +1151,14 @@ void tick_setup_sched_timer(void)
now = ktime_get();
}
-#ifdef CONFIG_NO_HZ
+#ifdef CONFIG_NO_HZ_COMMON
if (tick_nohz_enabled)
ts->nohz_mode = NOHZ_MODE_HIGHRES;
#endif
}
#endif /* HIGH_RES_TIMERS */
-#if defined CONFIG_NO_HZ || defined CONFIG_HIGH_RES_TIMERS
+#if defined CONFIG_NO_HZ_COMMON || defined CONFIG_HIGH_RES_TIMERS
void tick_cancel_sched_timer(int cpu)
{
struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 9a0bc98fbe1..98cd470bbe4 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -23,8 +23,13 @@
#include <linux/stop_machine.h>
#include <linux/pvclock_gtod.h>
+#include "tick-internal.h"
+#include "ntp_internal.h"
static struct timekeeper timekeeper;
+static DEFINE_RAW_SPINLOCK(timekeeper_lock);
+static seqcount_t timekeeper_seq;
+static struct timekeeper shadow_timekeeper;
/* flag for if timekeeping is suspended */
int __read_mostly timekeeping_suspended;
@@ -67,6 +72,7 @@ static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec wtm)
tk->wall_to_monotonic = wtm;
set_normalized_timespec(&tmp, -wtm.tv_sec, -wtm.tv_nsec);
tk->offs_real = timespec_to_ktime(tmp);
+ tk->offs_tai = ktime_sub(tk->offs_real, ktime_set(tk->tai_offset, 0));
}
static void tk_set_sleep_time(struct timekeeper *tk, struct timespec t)
@@ -96,7 +102,7 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
old_clock = tk->clock;
tk->clock = clock;
- clock->cycle_last = clock->read(clock);
+ tk->cycle_last = clock->cycle_last = clock->read(clock);
/* Do the ns -> cycle conversion first, using original mult */
tmp = NTP_INTERVAL_LENGTH;
@@ -201,8 +207,6 @@ static void update_pvclock_gtod(struct timekeeper *tk)
/**
* pvclock_gtod_register_notifier - register a pvclock timedata update listener
- *
- * Must hold write on timekeeper.lock
*/
int pvclock_gtod_register_notifier(struct notifier_block *nb)
{
@@ -210,11 +214,10 @@ int pvclock_gtod_register_notifier(struct notifier_block *nb)
unsigned long flags;
int ret;
- write_seqlock_irqsave(&tk->lock, flags);
+ raw_spin_lock_irqsave(&timekeeper_lock, flags);
ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb);
- /* update timekeeping data */
update_pvclock_gtod(tk);
- write_sequnlock_irqrestore(&tk->lock, flags);
+ raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
return ret;
}
@@ -223,25 +226,22 @@ EXPORT_SYMBOL_GPL(pvclock_gtod_register_notifier);
/**
* pvclock_gtod_unregister_notifier - unregister a pvclock
* timedata update listener
- *
- * Must hold write on timekeeper.lock
*/
int pvclock_gtod_unregister_notifier(struct notifier_block *nb)
{
- struct timekeeper *tk = &timekeeper;
unsigned long flags;
int ret;
- write_seqlock_irqsave(&tk->lock, flags);
+ raw_spin_lock_irqsave(&timekeeper_lock, flags);
ret = raw_notifier_chain_unregister(&pvclock_gtod_chain, nb);
- write_sequnlock_irqrestore(&tk->lock, flags);
+ raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
return ret;
}
EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier);
-/* must hold write on timekeeper.lock */
-static void timekeeping_update(struct timekeeper *tk, bool clearntp)
+/* must hold timekeeper_lock */
+static void timekeeping_update(struct timekeeper *tk, bool clearntp, bool mirror)
{
if (clearntp) {
tk->ntp_error = 0;
@@ -249,6 +249,9 @@ static void timekeeping_update(struct timekeeper *tk, bool clearntp)
}
update_vsyscall(tk);
update_pvclock_gtod(tk);
+
+ if (mirror)
+ memcpy(&shadow_timekeeper, &timekeeper, sizeof(timekeeper));
}
/**
@@ -267,7 +270,7 @@ static void timekeeping_forward_now(struct timekeeper *tk)
clock = tk->clock;
cycle_now = clock->read(clock);
cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
- clock->cycle_last = cycle_now;
+ tk->cycle_last = clock->cycle_last = cycle_now;
tk->xtime_nsec += cycle_delta * tk->mult;
@@ -294,12 +297,12 @@ int __getnstimeofday(struct timespec *ts)
s64 nsecs = 0;
do {
- seq = read_seqbegin(&tk->lock);
+ seq = read_seqcount_begin(&timekeeper_seq);
ts->tv_sec = tk->xtime_sec;
nsecs = timekeeping_get_ns(tk);
- } while (read_seqretry(&tk->lock, seq));
+ } while (read_seqcount_retry(&timekeeper_seq, seq));
ts->tv_nsec = 0;
timespec_add_ns(ts, nsecs);
@@ -335,11 +338,11 @@ ktime_t ktime_get(void)
WARN_ON(timekeeping_suspended);
do {
- seq = read_seqbegin(&tk->lock);
+ seq = read_seqcount_begin(&timekeeper_seq);
secs = tk->xtime_sec + tk->wall_to_monotonic.tv_sec;
nsecs = timekeeping_get_ns(tk) + tk->wall_to_monotonic.tv_nsec;
- } while (read_seqretry(&tk->lock, seq));
+ } while (read_seqcount_retry(&timekeeper_seq, seq));
/*
* Use ktime_set/ktime_add_ns to create a proper ktime on
* 32-bit architectures without CONFIG_KTIME_SCALAR.
@@ -366,12 +369,12 @@ void ktime_get_ts(struct timespec *ts)
WARN_ON(timekeeping_suspended);
do {
- seq = read_seqbegin(&tk->lock);
+ seq = read_seqcount_begin(&timekeeper_seq);
ts->tv_sec = tk->xtime_sec;
nsec = timekeeping_get_ns(tk);
tomono = tk->wall_to_monotonic;
- } while (read_seqretry(&tk->lock, seq));
+ } while (read_seqcount_retry(&timekeeper_seq, seq));
ts->tv_sec += tomono.tv_sec;
ts->tv_nsec = 0;
@@ -379,6 +382,50 @@ void ktime_get_ts(struct timespec *ts)
}
EXPORT_SYMBOL_GPL(ktime_get_ts);
+
+/**
+ * timekeeping_clocktai - Returns the TAI time of day in a timespec
+ * @ts: pointer to the timespec to be set
+ *
+ * Returns the time of day in a timespec.
+ */
+void timekeeping_clocktai(struct timespec *ts)
+{
+ struct timekeeper *tk = &timekeeper;
+ unsigned long seq;
+ u64 nsecs;
+
+ WARN_ON(timekeeping_suspended);
+
+ do {
+ seq = read_seqcount_begin(&timekeeper_seq);
+
+ ts->tv_sec = tk->xtime_sec + tk->tai_offset;
+ nsecs = timekeeping_get_ns(tk);
+
+ } while (read_seqcount_retry(&timekeeper_seq, seq));
+
+ ts->tv_nsec = 0;
+ timespec_add_ns(ts, nsecs);
+
+}
+EXPORT_SYMBOL(timekeeping_clocktai);
+
+
+/**
+ * ktime_get_clocktai - Returns the TAI time of day in a ktime
+ *
+ * Returns the time of day in a ktime.
+ */
+ktime_t ktime_get_clocktai(void)
+{
+ struct timespec ts;
+
+ timekeeping_clocktai(&ts);
+ return timespec_to_ktime(ts);
+}
+EXPORT_SYMBOL(ktime_get_clocktai);
+
#ifdef CONFIG_NTP_PPS
/**
@@ -399,7 +446,7 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)
WARN_ON_ONCE(timekeeping_suspended);
do {
- seq = read_seqbegin(&tk->lock);
+ seq = read_seqcount_begin(&timekeeper_seq);
*ts_raw = tk->raw_time;
ts_real->tv_sec = tk->xtime_sec;
@@ -408,7 +455,7 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)
nsecs_raw = timekeeping_get_ns_raw(tk);
nsecs_real = timekeeping_get_ns(tk);
- } while (read_seqretry(&tk->lock, seq));
+ } while (read_seqcount_retry(&timekeeper_seq, seq));
timespec_add_ns(ts_raw, nsecs_raw);
timespec_add_ns(ts_real, nsecs_real);
@@ -448,7 +495,8 @@ int do_settimeofday(const struct timespec *tv)
if (!timespec_valid_strict(tv))
return -EINVAL;
- write_seqlock_irqsave(&tk->lock, flags);
+ raw_spin_lock_irqsave(&timekeeper_lock, flags);
+ write_seqcount_begin(&timekeeper_seq);
timekeeping_forward_now(tk);
@@ -460,9 +508,10 @@ int do_settimeofday(const struct timespec *tv)
tk_set_xtime(tk, tv);
- timekeeping_update(tk, true);
+ timekeeping_update(tk, true, true);
- write_sequnlock_irqrestore(&tk->lock, flags);
+ write_seqcount_end(&timekeeper_seq);
+ raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
/* signal hrtimers about time change */
clock_was_set();
@@ -487,7 +536,8 @@ int timekeeping_inject_offset(struct timespec *ts)
if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC)
return -EINVAL;
- write_seqlock_irqsave(&tk->lock, flags);
+ raw_spin_lock_irqsave(&timekeeper_lock, flags);
+ write_seqcount_begin(&timekeeper_seq);
timekeeping_forward_now(tk);
@@ -502,9 +552,10 @@ int timekeeping_inject_offset(struct timespec *ts)
tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, *ts));
error: /* even if we error out, we forwarded the time, so call update */
- timekeeping_update(tk, true);
+ timekeeping_update(tk, true, true);
- write_sequnlock_irqrestore(&tk->lock, flags);
+ write_seqcount_end(&timekeeper_seq);
+ raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
/* signal hrtimers about time change */
clock_was_set();
@@ -513,6 +564,52 @@ error: /* even if we error out, we forwarded the time, so call update */
}
EXPORT_SYMBOL(timekeeping_inject_offset);
+
+/**
+ * timekeeping_get_tai_offset - Returns current TAI offset from UTC
+ *
+ */
+s32 timekeeping_get_tai_offset(void)
+{
+ struct timekeeper *tk = &timekeeper;
+ unsigned int seq;
+ s32 ret;
+
+ do {
+ seq = read_seqcount_begin(&timekeeper_seq);
+ ret = tk->tai_offset;
+ } while (read_seqcount_retry(&timekeeper_seq, seq));
+
+ return ret;
+}
+
+/**
+ * __timekeeping_set_tai_offset - Lock free worker function
+ *
+ */
+static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset)
+{
+ tk->tai_offset = tai_offset;
+ tk->offs_tai = ktime_sub(tk->offs_real, ktime_set(tai_offset, 0));
+}
+
+/**
+ * timekeeping_set_tai_offset - Sets the current TAI offset from UTC
+ *
+ */
+void timekeeping_set_tai_offset(s32 tai_offset)
+{
+ struct timekeeper *tk = &timekeeper;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&timekeeper_lock, flags);
+ write_seqcount_begin(&timekeeper_seq);
+ __timekeeping_set_tai_offset(tk, tai_offset);
+ write_seqcount_end(&timekeeper_seq);
+ raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
+ clock_was_set();
+}
+
/**
* change_clocksource - Swaps clocksources if a new one is available
*
@@ -526,7 +623,8 @@ static int change_clocksource(void *data)
new = (struct clocksource *) data;
- write_seqlock_irqsave(&tk->lock, flags);
+ raw_spin_lock_irqsave(&timekeeper_lock, flags);
+ write_seqcount_begin(&timekeeper_seq);
timekeeping_forward_now(tk);
if (!new->enable || new->enable(new) == 0) {
@@ -535,9 +633,10 @@ static int change_clocksource(void *data)
if (old->disable)
old->disable(old);
}
- timekeeping_update(tk, true);
+ timekeeping_update(tk, true, true);
- write_sequnlock_irqrestore(&tk->lock, flags);
+ write_seqcount_end(&timekeeper_seq);
+ raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
return 0;
}
@@ -587,11 +686,11 @@ void getrawmonotonic(struct timespec *ts)
s64 nsecs;
do {
- seq = read_seqbegin(&tk->lock);
+ seq = read_seqcount_begin(&timekeeper_seq);
nsecs = timekeeping_get_ns_raw(tk);
*ts = tk->raw_time;
- } while (read_seqretry(&tk->lock, seq));
+ } while (read_seqcount_retry(&timekeeper_seq, seq));
timespec_add_ns(ts, nsecs);
}
@@ -607,11 +706,11 @@ int timekeeping_valid_for_hres(void)
int ret;
do {
- seq = read_seqbegin(&tk->lock);
+ seq = read_seqcount_begin(&timekeeper_seq);
ret = tk->clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
- } while (read_seqretry(&tk->lock, seq));
+ } while (read_seqcount_retry(&timekeeper_seq, seq));
return ret;
}
@@ -626,11 +725,11 @@ u64 timekeeping_max_deferment(void)
u64 ret;
do {
- seq = read_seqbegin(&tk->lock);
+ seq = read_seqcount_begin(&timekeeper_seq);
ret = tk->clock->max_idle_ns;
- } while (read_seqretry(&tk->lock, seq));
+ } while (read_seqcount_retry(&timekeeper_seq, seq));
return ret;
}
@@ -693,11 +792,10 @@ void __init timekeeping_init(void)
boot.tv_nsec = 0;
}
- seqlock_init(&tk->lock);
-
+ raw_spin_lock_irqsave(&timekeeper_lock, flags);
+ write_seqcount_begin(&timekeeper_seq);
ntp_init();
- write_seqlock_irqsave(&tk->lock, flags);
clock = clocksource_default_clock();
if (clock->enable)
clock->enable(clock);
@@ -716,7 +814,10 @@ void __init timekeeping_init(void)
tmp.tv_nsec = 0;
tk_set_sleep_time(tk, tmp);
- write_sequnlock_irqrestore(&tk->lock, flags);
+ memcpy(&shadow_timekeeper, &timekeeper, sizeof(timekeeper));
+
+ write_seqcount_end(&timekeeper_seq);
+ raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
}
/* time in seconds when suspend began */
@@ -764,15 +865,17 @@ void timekeeping_inject_sleeptime(struct timespec *delta)
if (has_persistent_clock())
return;
- write_seqlock_irqsave(&tk->lock, flags);
+ raw_spin_lock_irqsave(&timekeeper_lock, flags);
+ write_seqcount_begin(&timekeeper_seq);
timekeeping_forward_now(tk);
__timekeeping_inject_sleeptime(tk, delta);
- timekeeping_update(tk, true);
+ timekeeping_update(tk, true, true);
- write_sequnlock_irqrestore(&tk->lock, flags);
+ write_seqcount_end(&timekeeper_seq);
+ raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
/* signal hrtimers about time change */
clock_was_set();
@@ -788,26 +891,72 @@ void timekeeping_inject_sleeptime(struct timespec *delta)
static void timekeeping_resume(void)
{
struct timekeeper *tk = &timekeeper;
+ struct clocksource *clock = tk->clock;
unsigned long flags;
- struct timespec ts;
+ struct timespec ts_new, ts_delta;
+ cycle_t cycle_now, cycle_delta;
+ bool suspendtime_found = false;
- read_persistent_clock(&ts);
+ read_persistent_clock(&ts_new);
clockevents_resume();
clocksource_resume();
- write_seqlock_irqsave(&tk->lock, flags);
+ raw_spin_lock_irqsave(&timekeeper_lock, flags);
+ write_seqcount_begin(&timekeeper_seq);
+
+ /*
+ * After system resumes, we need to calculate the suspended time and
+ * compensate it for the OS time. There are 3 sources that could be
+ * used: Nonstop clocksource during suspend, persistent clock and rtc
+ * device.
+ *
+ * One specific platform may have 1 or 2 or all of them, and the
+ * preference will be:
+ * suspend-nonstop clocksource -> persistent clock -> rtc
+ * The less preferred source will only be tried if there is no better
+ * usable source. The rtc part is handled separately in rtc core code.
+ */
+ cycle_now = clock->read(clock);
+ if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) &&
+ cycle_now > clock->cycle_last) {
+ u64 num, max = ULLONG_MAX;
+ u32 mult = clock->mult;
+ u32 shift = clock->shift;
+ s64 nsec = 0;
+
+ cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
- if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) {
- ts = timespec_sub(ts, timekeeping_suspend_time);
- __timekeeping_inject_sleeptime(tk, &ts);
+ /*
+ * "cycle_delta * mutl" may cause 64 bits overflow, if the
+ * suspended time is too long. In that case we need do the
+ * 64 bits math carefully
+ */
+ do_div(max, mult);
+ if (cycle_delta > max) {
+ num = div64_u64(cycle_delta, max);
+ nsec = (((u64) max * mult) >> shift) * num;
+ cycle_delta -= num * max;
+ }
+ nsec += ((u64) cycle_delta * mult) >> shift;
+
+ ts_delta = ns_to_timespec(nsec);
+ suspendtime_found = true;
+ } else if (timespec_compare(&ts_new, &timekeeping_suspend_time) > 0) {
+ ts_delta = timespec_sub(ts_new, timekeeping_suspend_time);
+ suspendtime_found = true;
}
- /* re-base the last cycle value */
- tk->clock->cycle_last = tk->clock->read(tk->clock);
+
+ if (suspendtime_found)
+ __timekeeping_inject_sleeptime(tk, &ts_delta);
+
+ /* Re-base the last cycle value */
+ tk->cycle_last = clock->cycle_last = cycle_now;
tk->ntp_error = 0;
timekeeping_suspended = 0;
- timekeeping_update(tk, false);
- write_sequnlock_irqrestore(&tk->lock, flags);
+ timekeeping_update(tk, false, true);
+ write_seqcount_end(&timekeeper_seq);
+ raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
touch_softlockup_watchdog();
@@ -826,7 +975,8 @@ static int timekeeping_suspend(void)
read_persistent_clock(&timekeeping_suspend_time);
- write_seqlock_irqsave(&tk->lock, flags);
+ raw_spin_lock_irqsave(&timekeeper_lock, flags);
+ write_seqcount_begin(&timekeeper_seq);
timekeeping_forward_now(tk);
timekeeping_suspended = 1;
@@ -849,7 +999,8 @@ static int timekeeping_suspend(void)
timekeeping_suspend_time =
timespec_add(timekeeping_suspend_time, delta_delta);
}
- write_sequnlock_irqrestore(&tk->lock, flags);
+ write_seqcount_end(&timekeeper_seq);
+ raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
clocksource_suspend();
@@ -1099,6 +1250,8 @@ static inline void accumulate_nsecs_to_secs(struct timekeeper *tk)
tk_set_wall_to_mono(tk,
timespec_sub(tk->wall_to_monotonic, ts));
+ __timekeeping_set_tai_offset(tk, tk->tai_offset - leap);
+
clock_was_set_delayed();
}
}
@@ -1116,15 +1269,16 @@ static inline void accumulate_nsecs_to_secs(struct timekeeper *tk)
static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
u32 shift)
{
+ cycle_t interval = tk->cycle_interval << shift;
u64 raw_nsecs;
/* If the offset is smaller then a shifted interval, do nothing */
- if (offset < tk->cycle_interval<<shift)
+ if (offset < interval)
return offset;
/* Accumulate one shifted interval */
- offset -= tk->cycle_interval << shift;
- tk->clock->cycle_last += tk->cycle_interval << shift;
+ offset -= interval;
+ tk->cycle_last += interval;
tk->xtime_nsec += tk->xtime_interval << shift;
accumulate_nsecs_to_secs(tk);
@@ -1181,27 +1335,28 @@ static inline void old_vsyscall_fixup(struct timekeeper *tk)
static void update_wall_time(void)
{
struct clocksource *clock;
- struct timekeeper *tk = &timekeeper;
+ struct timekeeper *real_tk = &timekeeper;
+ struct timekeeper *tk = &shadow_timekeeper;
cycle_t offset;
int shift = 0, maxshift;
unsigned long flags;
- write_seqlock_irqsave(&tk->lock, flags);
+ raw_spin_lock_irqsave(&timekeeper_lock, flags);
/* Make sure we're fully resumed: */
if (unlikely(timekeeping_suspended))
goto out;
- clock = tk->clock;
+ clock = real_tk->clock;
#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
- offset = tk->cycle_interval;
+ offset = real_tk->cycle_interval;
#else
offset = (clock->read(clock) - clock->cycle_last) & clock->mask;
#endif
/* Check if there's really nothing to do */
- if (offset < tk->cycle_interval)
+ if (offset < real_tk->cycle_interval)
goto out;
/*
@@ -1238,11 +1393,24 @@ static void update_wall_time(void)
*/
accumulate_nsecs_to_secs(tk);
- timekeeping_update(tk, false);
-
+ write_seqcount_begin(&timekeeper_seq);
+ /* Update clock->cycle_last with the new value */
+ clock->cycle_last = tk->cycle_last;
+ /*
+ * Update the real timekeeper.
+ *
+ * We could avoid this memcpy by switching pointers, but that
+ * requires changes to all other timekeeper usage sites as
+ * well, i.e. move the timekeeper pointer getter into the
+ * spinlocked/seqcount protected sections. And we trade this
+ * memcpy under the timekeeper_seq against one before we start
+ * updating.
+ */
+ memcpy(real_tk, tk, sizeof(*tk));
+ timekeeping_update(real_tk, false, false);
+ write_seqcount_end(&timekeeper_seq);
out:
- write_sequnlock_irqrestore(&tk->lock, flags);
-
+ raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
}
/**
@@ -1289,13 +1457,13 @@ void get_monotonic_boottime(struct timespec *ts)
WARN_ON(timekeeping_suspended);
do {
- seq = read_seqbegin(&tk->lock);
+ seq = read_seqcount_begin(&timekeeper_seq);
ts->tv_sec = tk->xtime_sec;
nsec = timekeeping_get_ns(tk);
tomono = tk->wall_to_monotonic;
sleep = tk->total_sleep_time;
- } while (read_seqretry(&tk->lock, seq));
+ } while (read_seqcount_retry(&timekeeper_seq, seq));
ts->tv_sec += tomono.tv_sec + sleep.tv_sec;
ts->tv_nsec = 0;
@@ -1354,10 +1522,10 @@ struct timespec current_kernel_time(void)
unsigned long seq;
do {
- seq = read_seqbegin(&tk->lock);
+ seq = read_seqcount_begin(&timekeeper_seq);
now = tk_xtime(tk);
- } while (read_seqretry(&tk->lock, seq));
+ } while (read_seqcount_retry(&timekeeper_seq, seq));
return now;
}
@@ -1370,11 +1538,11 @@ struct timespec get_monotonic_coarse(void)
unsigned long seq;
do {
- seq = read_seqbegin(&tk->lock);
+ seq = read_seqcount_begin(&timekeeper_seq);
now = tk_xtime(tk);
mono = tk->wall_to_monotonic;
- } while (read_seqretry(&tk->lock, seq));
+ } while (read_seqcount_retry(&timekeeper_seq, seq));
set_normalized_timespec(&now, now.tv_sec + mono.tv_sec,
now.tv_nsec + mono.tv_nsec);
@@ -1405,11 +1573,11 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,
unsigned long seq;
do {
- seq = read_seqbegin(&tk->lock);
+ seq = read_seqcount_begin(&timekeeper_seq);
*xtim = tk_xtime(tk);
*wtom = tk->wall_to_monotonic;
*sleep = tk->total_sleep_time;
- } while (read_seqretry(&tk->lock, seq));
+ } while (read_seqcount_retry(&timekeeper_seq, seq));
}
#ifdef CONFIG_HIGH_RES_TIMERS
@@ -1421,7 +1589,8 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,
* Returns current monotonic time and updates the offsets
* Called from hrtimer_interupt() or retrigger_next_event()
*/
-ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot)
+ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot,
+ ktime_t *offs_tai)
{
struct timekeeper *tk = &timekeeper;
ktime_t now;
@@ -1429,14 +1598,15 @@ ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot)
u64 secs, nsecs;
do {
- seq = read_seqbegin(&tk->lock);
+ seq = read_seqcount_begin(&timekeeper_seq);
secs = tk->xtime_sec;
nsecs = timekeeping_get_ns(tk);
*offs_real = tk->offs_real;
*offs_boot = tk->offs_boot;
- } while (read_seqretry(&tk->lock, seq));
+ *offs_tai = tk->offs_tai;
+ } while (read_seqcount_retry(&timekeeper_seq, seq));
now = ktime_add_ns(ktime_set(secs, 0), nsecs);
now = ktime_sub(now, *offs_real);
@@ -1454,15 +1624,79 @@ ktime_t ktime_get_monotonic_offset(void)
struct timespec wtom;
do {
- seq = read_seqbegin(&tk->lock);
+ seq = read_seqcount_begin(&timekeeper_seq);
wtom = tk->wall_to_monotonic;
- } while (read_seqretry(&tk->lock, seq));
+ } while (read_seqcount_retry(&timekeeper_seq, seq));
return timespec_to_ktime(wtom);
}
EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset);
/**
+ * do_adjtimex() - Accessor function to NTP __do_adjtimex function
+ */
+int do_adjtimex(struct timex *txc)
+{
+ struct timekeeper *tk = &timekeeper;
+ unsigned long flags;
+ struct timespec ts;
+ s32 orig_tai, tai;
+ int ret;
+
+ /* Validate the data before disabling interrupts */
+ ret = ntp_validate_timex(txc);
+ if (ret)
+ return ret;
+
+ if (txc->modes & ADJ_SETOFFSET) {
+ struct timespec delta;
+ delta.tv_sec = txc->time.tv_sec;
+ delta.tv_nsec = txc->time.tv_usec;
+ if (!(txc->modes & ADJ_NANO))
+ delta.tv_nsec *= 1000;
+ ret = timekeeping_inject_offset(&delta);
+ if (ret)
+ return ret;
+ }
+
+ getnstimeofday(&ts);
+
+ raw_spin_lock_irqsave(&timekeeper_lock, flags);
+ write_seqcount_begin(&timekeeper_seq);
+
+ orig_tai = tai = tk->tai_offset;
+ ret = __do_adjtimex(txc, &ts, &tai);
+
+ if (tai != orig_tai) {
+ __timekeeping_set_tai_offset(tk, tai);
+ clock_was_set_delayed();
+ }
+ write_seqcount_end(&timekeeper_seq);
+ raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
+
+ return ret;
+}
+
+#ifdef CONFIG_NTP_PPS
+/**
+ * hardpps() - Accessor function to NTP __hardpps function
+ */
+void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&timekeeper_lock, flags);
+ write_seqcount_begin(&timekeeper_seq);
+
+ __hardpps(phase_ts, raw_ts);
+
+ write_seqcount_end(&timekeeper_seq);
+ raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
+}
+EXPORT_SYMBOL(hardpps);
+#endif
+
+/**
* xtime_update() - advances the timekeeping infrastructure
* @ticks: number of ticks, that have elapsed since the last call.
*
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index af5a7e9f164..3bdf2832301 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -20,6 +20,13 @@
#include <asm/uaccess.h>
+
+struct timer_list_iter {
+ int cpu;
+ bool second_pass;
+ u64 now;
+};
+
typedef void (*print_fn_t)(struct seq_file *m, unsigned int *classes);
DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases);
@@ -133,7 +140,6 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
int i;
- SEQ_printf(m, "\n");
SEQ_printf(m, "cpu: %d\n", cpu);
for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
SEQ_printf(m, " clock %d:\n", i);
@@ -187,6 +193,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
#undef P
#undef P_ns
+ SEQ_printf(m, "\n");
}
#ifdef CONFIG_GENERIC_CLOCKEVENTS
@@ -195,7 +202,6 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
{
struct clock_event_device *dev = td->evtdev;
- SEQ_printf(m, "\n");
SEQ_printf(m, "Tick Device: mode: %d\n", td->mode);
if (cpu < 0)
SEQ_printf(m, "Broadcast device\n");
@@ -230,12 +236,11 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
print_name_offset(m, dev->event_handler);
SEQ_printf(m, "\n");
SEQ_printf(m, " retries: %lu\n", dev->retries);
+ SEQ_printf(m, "\n");
}
-static void timer_list_show_tickdevices(struct seq_file *m)
+static void timer_list_show_tickdevices_header(struct seq_file *m)
{
- int cpu;
-
#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
print_tickdevice(m, tick_get_broadcast_device(), -1);
SEQ_printf(m, "tick_broadcast_mask: %08lx\n",
@@ -246,47 +251,104 @@ static void timer_list_show_tickdevices(struct seq_file *m)
#endif
SEQ_printf(m, "\n");
#endif
- for_each_online_cpu(cpu)
- print_tickdevice(m, tick_get_device(cpu), cpu);
- SEQ_printf(m, "\n");
}
-#else
-static void timer_list_show_tickdevices(struct seq_file *m) { }
#endif
+static inline void timer_list_header(struct seq_file *m, u64 now)
+{
+ SEQ_printf(m, "Timer List Version: v0.7\n");
+ SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
+ SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
+ SEQ_printf(m, "\n");
+}
+
static int timer_list_show(struct seq_file *m, void *v)
{
+ struct timer_list_iter *iter = v;
+ u64 now = ktime_to_ns(ktime_get());
+
+ if (iter->cpu == -1 && !iter->second_pass)
+ timer_list_header(m, now);
+ else if (!iter->second_pass)
+ print_cpu(m, iter->cpu, iter->now);
+#ifdef CONFIG_GENERIC_CLOCKEVENTS
+ else if (iter->cpu == -1 && iter->second_pass)
+ timer_list_show_tickdevices_header(m);
+ else
+ print_tickdevice(m, tick_get_device(iter->cpu), iter->cpu);
+#endif
+ return 0;
+}
+
+void sysrq_timer_list_show(void)
+{
u64 now = ktime_to_ns(ktime_get());
int cpu;
- SEQ_printf(m, "Timer List Version: v0.7\n");
- SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
- SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
+ timer_list_header(NULL, now);
for_each_online_cpu(cpu)
- print_cpu(m, cpu, now);
+ print_cpu(NULL, cpu, now);
- SEQ_printf(m, "\n");
- timer_list_show_tickdevices(m);
+#ifdef CONFIG_GENERIC_CLOCKEVENTS
+ timer_list_show_tickdevices_header(NULL);
+ for_each_online_cpu(cpu)
+ print_tickdevice(NULL, tick_get_device(cpu), cpu);
+#endif
+ return;
+}
- return 0;
+static void *timer_list_start(struct seq_file *file, loff_t *offset)
+{
+ struct timer_list_iter *iter = file->private;
+
+ if (!*offset) {
+ iter->cpu = -1;
+ iter->now = ktime_to_ns(ktime_get());
+ } else if (iter->cpu >= nr_cpu_ids) {
+#ifdef CONFIG_GENERIC_CLOCKEVENTS
+ if (!iter->second_pass) {
+ iter->cpu = -1;
+ iter->second_pass = true;
+ } else
+ return NULL;
+#else
+ return NULL;
+#endif
+ }
+ return iter;
}
-void sysrq_timer_list_show(void)
+static void *timer_list_next(struct seq_file *file, void *v, loff_t *offset)
+{
+ struct timer_list_iter *iter = file->private;
+ iter->cpu = cpumask_next(iter->cpu, cpu_online_mask);
+ ++*offset;
+ return timer_list_start(file, offset);
+}
+
+static void timer_list_stop(struct seq_file *seq, void *v)
{
- timer_list_show(NULL, NULL);
}
+static const struct seq_operations timer_list_sops = {
+ .start = timer_list_start,
+ .next = timer_list_next,
+ .stop = timer_list_stop,
+ .show = timer_list_show,
+};
+
static int timer_list_open(struct inode *inode, struct file *filp)
{
- return single_open(filp, timer_list_show, NULL);
+ return seq_open_private(filp, &timer_list_sops,
+ sizeof(struct timer_list_iter));
}
static const struct file_operations timer_list_fops = {
.open = timer_list_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = single_release,
+ .release = seq_release_private,
};
static int __init init_timer_list_procfs(void)
diff --git a/kernel/timer.c b/kernel/timer.c
index dbf7a78a1ef..a860bba3441 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1,7 +1,7 @@
/*
* linux/kernel/timer.c
*
- * Kernel internal timers, basic process system calls
+ * Kernel internal timers
*
* Copyright (C) 1991, 1992 Linus Torvalds
*
@@ -41,6 +41,7 @@
#include <linux/sched.h>
#include <linux/sched/sysctl.h>
#include <linux/slab.h>
+#include <linux/compat.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
@@ -738,7 +739,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
cpu = smp_processor_id();
-#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
+#if defined(CONFIG_NO_HZ_COMMON) && defined(CONFIG_SMP)
if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu))
cpu = get_nohz_timer_target();
#endif
@@ -930,14 +931,14 @@ void add_timer_on(struct timer_list *timer, int cpu)
debug_activate(timer, timer->expires);
internal_add_timer(base, timer);
/*
- * Check whether the other CPU is idle and needs to be
- * triggered to reevaluate the timer wheel when nohz is
- * active. We are protected against the other CPU fiddling
+ * Check whether the other CPU is in dynticks mode and needs
+ * to be triggered to reevaluate the timer wheel.
+ * We are protected against the other CPU fiddling
* with the timer by holding the timer base lock. This also
- * makes sure that a CPU on the way to idle can not evaluate
- * the timer wheel.
+ * makes sure that a CPU on the way to stop its tick can not
+ * evaluate the timer wheel.
*/
- wake_up_idle_cpu(cpu);
+ wake_up_nohz_cpu(cpu);
spin_unlock_irqrestore(&base->lock, flags);
}
EXPORT_SYMBOL_GPL(add_timer_on);
@@ -1188,7 +1189,7 @@ static inline void __run_timers(struct tvec_base *base)
spin_unlock_irq(&base->lock);
}
-#ifdef CONFIG_NO_HZ
+#ifdef CONFIG_NO_HZ_COMMON
/*
* Find out when the next timer event is due to happen. This
* is used on S/390 to stop all activity when a CPU is idle.
@@ -1395,61 +1396,6 @@ SYSCALL_DEFINE1(alarm, unsigned int, seconds)
#endif
-/**
- * sys_getpid - return the thread group id of the current process
- *
- * Note, despite the name, this returns the tgid not the pid. The tgid and
- * the pid are identical unless CLONE_THREAD was specified on clone() in
- * which case the tgid is the same in all threads of the same group.
- *
- * This is SMP safe as current->tgid does not change.
- */
-SYSCALL_DEFINE0(getpid)
-{
- return task_tgid_vnr(current);
-}
-
-/*
- * Accessing ->real_parent is not SMP-safe, it could
- * change from under us. However, we can use a stale
- * value of ->real_parent under rcu_read_lock(), see
- * release_task()->call_rcu(delayed_put_task_struct).
- */
-SYSCALL_DEFINE0(getppid)
-{
- int pid;
-
- rcu_read_lock();
- pid = task_tgid_vnr(rcu_dereference(current->real_parent));
- rcu_read_unlock();
-
- return pid;
-}
-
-SYSCALL_DEFINE0(getuid)
-{
- /* Only we change this so SMP safe */
- return from_kuid_munged(current_user_ns(), current_uid());
-}
-
-SYSCALL_DEFINE0(geteuid)
-{
- /* Only we change this so SMP safe */
- return from_kuid_munged(current_user_ns(), current_euid());
-}
-
-SYSCALL_DEFINE0(getgid)
-{
- /* Only we change this so SMP safe */
- return from_kgid_munged(current_user_ns(), current_gid());
-}
-
-SYSCALL_DEFINE0(getegid)
-{
- /* Only we change this so SMP safe */
- return from_kgid_munged(current_user_ns(), current_egid());
-}
-
static void process_timeout(unsigned long __data)
{
wake_up_process((struct task_struct *)__data);
@@ -1557,91 +1503,6 @@ signed long __sched schedule_timeout_uninterruptible(signed long timeout)
}
EXPORT_SYMBOL(schedule_timeout_uninterruptible);
-/* Thread ID - the internal kernel "pid" */
-SYSCALL_DEFINE0(gettid)
-{
- return task_pid_vnr(current);
-}
-
-/**
- * do_sysinfo - fill in sysinfo struct
- * @info: pointer to buffer to fill
- */
-int do_sysinfo(struct sysinfo *info)
-{
- unsigned long mem_total, sav_total;
- unsigned int mem_unit, bitcount;
- struct timespec tp;
-
- memset(info, 0, sizeof(struct sysinfo));
-
- ktime_get_ts(&tp);
- monotonic_to_bootbased(&tp);
- info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
-
- get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT);
-
- info->procs = nr_threads;
-
- si_meminfo(info);
- si_swapinfo(info);
-
- /*
- * If the sum of all the available memory (i.e. ram + swap)
- * is less than can be stored in a 32 bit unsigned long then
- * we can be binary compatible with 2.2.x kernels. If not,
- * well, in that case 2.2.x was broken anyways...
- *
- * -Erik Andersen <andersee@debian.org>
- */
-
- mem_total = info->totalram + info->totalswap;
- if (mem_total < info->totalram || mem_total < info->totalswap)
- goto out;
- bitcount = 0;
- mem_unit = info->mem_unit;
- while (mem_unit > 1) {
- bitcount++;
- mem_unit >>= 1;
- sav_total = mem_total;
- mem_total <<= 1;
- if (mem_total < sav_total)
- goto out;
- }
-
- /*
- * If mem_total did not overflow, multiply all memory values by
- * info->mem_unit and set it to 1. This leaves things compatible
- * with 2.2.x, and also retains compatibility with earlier 2.4.x
- * kernels...
- */
-
- info->mem_unit = 1;
- info->totalram <<= bitcount;
- info->freeram <<= bitcount;
- info->sharedram <<= bitcount;
- info->bufferram <<= bitcount;
- info->totalswap <<= bitcount;
- info->freeswap <<= bitcount;
- info->totalhigh <<= bitcount;
- info->freehigh <<= bitcount;
-
-out:
- return 0;
-}
-
-SYSCALL_DEFINE1(sysinfo, struct sysinfo __user *, info)
-{
- struct sysinfo val;
-
- do_sysinfo(&val);
-
- if (copy_to_user(info, &val, sizeof(struct sysinfo)))
- return -EFAULT;
-
- return 0;
-}
-
static int __cpuinit init_timers_cpu(int cpu)
{
int j;
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index ed58a3216a6..b8b8560bfb9 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1808,6 +1808,7 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
rwbs[i] = '\0';
}
+EXPORT_SYMBOL_GPL(blk_fill_rwbs);
#endif /* CONFIG_EVENT_TRACING */
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 9e014582e76..711ca7d3e7f 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -109,11 +109,6 @@ struct kretprobe_trace_entry_head {
unsigned long ret_ip;
};
-struct uprobe_trace_entry_head {
- struct trace_entry ent;
- unsigned long ip;
-};
-
/*
* trace_flag_type is an enumeration that holds different
* states when a trace occurs. These are:
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 8dad2a92dee..32494fb0ee6 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -28,6 +28,18 @@
#define UPROBE_EVENT_SYSTEM "uprobes"
+struct uprobe_trace_entry_head {
+ struct trace_entry ent;
+ unsigned long vaddr[];
+};
+
+#define SIZEOF_TRACE_ENTRY(is_return) \
+ (sizeof(struct uprobe_trace_entry_head) + \
+ sizeof(unsigned long) * (is_return ? 2 : 1))
+
+#define DATAOF_TRACE_ENTRY(entry, is_return) \
+ ((void*)(entry) + SIZEOF_TRACE_ENTRY(is_return))
+
struct trace_uprobe_filter {
rwlock_t rwlock;
int nr_systemwide;
@@ -64,6 +76,8 @@ static DEFINE_MUTEX(uprobe_lock);
static LIST_HEAD(uprobe_list);
static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs);
+static int uretprobe_dispatcher(struct uprobe_consumer *con,
+ unsigned long func, struct pt_regs *regs);
static inline void init_trace_uprobe_filter(struct trace_uprobe_filter *filter)
{
@@ -77,11 +91,16 @@ static inline bool uprobe_filter_is_empty(struct trace_uprobe_filter *filter)
return !filter->nr_systemwide && list_empty(&filter->perf_events);
}
+static inline bool is_ret_probe(struct trace_uprobe *tu)
+{
+ return tu->consumer.ret_handler != NULL;
+}
+
/*
* Allocate new trace_uprobe and initialize it (including uprobes).
*/
static struct trace_uprobe *
-alloc_trace_uprobe(const char *group, const char *event, int nargs)
+alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret)
{
struct trace_uprobe *tu;
@@ -106,6 +125,8 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs)
INIT_LIST_HEAD(&tu->list);
tu->consumer.handler = uprobe_dispatcher;
+ if (is_ret)
+ tu->consumer.ret_handler = uretprobe_dispatcher;
init_trace_uprobe_filter(&tu->filter);
return tu;
@@ -180,7 +201,7 @@ end:
/*
* Argument syntax:
- * - Add uprobe: p[:[GRP/]EVENT] PATH:SYMBOL[+offs] [FETCHARGS]
+ * - Add uprobe: p|r[:[GRP/]EVENT] PATH:SYMBOL [FETCHARGS]
*
* - Remove uprobe: -:[GRP/]EVENT
*/
@@ -192,20 +213,23 @@ static int create_trace_uprobe(int argc, char **argv)
char buf[MAX_EVENT_NAME_LEN];
struct path path;
unsigned long offset;
- bool is_delete;
+ bool is_delete, is_return;
int i, ret;
inode = NULL;
ret = 0;
is_delete = false;
+ is_return = false;
event = NULL;
group = NULL;
/* argc must be >= 1 */
if (argv[0][0] == '-')
is_delete = true;
+ else if (argv[0][0] == 'r')
+ is_return = true;
else if (argv[0][0] != 'p') {
- pr_info("Probe definition must be started with 'p' or '-'.\n");
+ pr_info("Probe definition must be started with 'p', 'r' or '-'.\n");
return -EINVAL;
}
@@ -303,7 +327,7 @@ static int create_trace_uprobe(int argc, char **argv)
kfree(tail);
}
- tu = alloc_trace_uprobe(group, event, argc);
+ tu = alloc_trace_uprobe(group, event, argc, is_return);
if (IS_ERR(tu)) {
pr_info("Failed to allocate trace_uprobe.(%d)\n", (int)PTR_ERR(tu));
ret = PTR_ERR(tu);
@@ -414,9 +438,10 @@ static void probes_seq_stop(struct seq_file *m, void *v)
static int probes_seq_show(struct seq_file *m, void *v)
{
struct trace_uprobe *tu = v;
+ char c = is_ret_probe(tu) ? 'r' : 'p';
int i;
- seq_printf(m, "p:%s/%s", tu->call.class->system, tu->call.name);
+ seq_printf(m, "%c:%s/%s", c, tu->call.class->system, tu->call.name);
seq_printf(m, " %s:0x%p", tu->filename, (void *)tu->offset);
for (i = 0; i < tu->nr_args; i++)
@@ -485,65 +510,81 @@ static const struct file_operations uprobe_profile_ops = {
.release = seq_release,
};
-/* uprobe handler */
-static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs)
+static void uprobe_trace_print(struct trace_uprobe *tu,
+ unsigned long func, struct pt_regs *regs)
{
struct uprobe_trace_entry_head *entry;
struct ring_buffer_event *event;
struct ring_buffer *buffer;
- u8 *data;
- int size, i, pc;
- unsigned long irq_flags;
+ void *data;
+ int size, i;
struct ftrace_event_call *call = &tu->call;
- local_save_flags(irq_flags);
- pc = preempt_count();
-
- size = sizeof(*entry) + tu->size;
-
+ size = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
- size, irq_flags, pc);
+ size + tu->size, 0, 0);
if (!event)
- return 0;
+ return;
entry = ring_buffer_event_data(event);
- entry->ip = instruction_pointer(task_pt_regs(current));
- data = (u8 *)&entry[1];
+ if (is_ret_probe(tu)) {
+ entry->vaddr[0] = func;
+ entry->vaddr[1] = instruction_pointer(regs);
+ data = DATAOF_TRACE_ENTRY(entry, true);
+ } else {
+ entry->vaddr[0] = instruction_pointer(regs);
+ data = DATAOF_TRACE_ENTRY(entry, false);
+ }
+
for (i = 0; i < tu->nr_args; i++)
call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset);
if (!filter_current_check_discard(buffer, call, entry, event))
- trace_buffer_unlock_commit(buffer, event, irq_flags, pc);
+ trace_buffer_unlock_commit(buffer, event, 0, 0);
+}
+/* uprobe handler */
+static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs)
+{
+ if (!is_ret_probe(tu))
+ uprobe_trace_print(tu, 0, regs);
return 0;
}
+static void uretprobe_trace_func(struct trace_uprobe *tu, unsigned long func,
+ struct pt_regs *regs)
+{
+ uprobe_trace_print(tu, func, regs);
+}
+
/* Event entry printers */
static enum print_line_t
print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *event)
{
- struct uprobe_trace_entry_head *field;
+ struct uprobe_trace_entry_head *entry;
struct trace_seq *s = &iter->seq;
struct trace_uprobe *tu;
u8 *data;
int i;
- field = (struct uprobe_trace_entry_head *)iter->ent;
+ entry = (struct uprobe_trace_entry_head *)iter->ent;
tu = container_of(event, struct trace_uprobe, call.event);
- if (!trace_seq_printf(s, "%s: (", tu->call.name))
- goto partial;
-
- if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET))
- goto partial;
-
- if (!trace_seq_puts(s, ")"))
- goto partial;
+ if (is_ret_probe(tu)) {
+ if (!trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)", tu->call.name,
+ entry->vaddr[1], entry->vaddr[0]))
+ goto partial;
+ data = DATAOF_TRACE_ENTRY(entry, true);
+ } else {
+ if (!trace_seq_printf(s, "%s: (0x%lx)", tu->call.name,
+ entry->vaddr[0]))
+ goto partial;
+ data = DATAOF_TRACE_ENTRY(entry, false);
+ }
- data = (u8 *)&field[1];
for (i = 0; i < tu->nr_args; i++) {
if (!tu->args[i].type->print(s, tu->args[i].name,
- data + tu->args[i].offset, field))
+ data + tu->args[i].offset, entry))
goto partial;
}
@@ -595,16 +636,23 @@ static void probe_event_disable(struct trace_uprobe *tu, int flag)
static int uprobe_event_define_fields(struct ftrace_event_call *event_call)
{
- int ret, i;
+ int ret, i, size;
struct uprobe_trace_entry_head field;
- struct trace_uprobe *tu = (struct trace_uprobe *)event_call->data;
+ struct trace_uprobe *tu = event_call->data;
- DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0);
+ if (is_ret_probe(tu)) {
+ DEFINE_FIELD(unsigned long, vaddr[0], FIELD_STRING_FUNC, 0);
+ DEFINE_FIELD(unsigned long, vaddr[1], FIELD_STRING_RETIP, 0);
+ size = SIZEOF_TRACE_ENTRY(true);
+ } else {
+ DEFINE_FIELD(unsigned long, vaddr[0], FIELD_STRING_IP, 0);
+ size = SIZEOF_TRACE_ENTRY(false);
+ }
/* Set argument names as fields */
for (i = 0; i < tu->nr_args; i++) {
ret = trace_define_field(event_call, tu->args[i].type->fmttype,
tu->args[i].name,
- sizeof(field) + tu->args[i].offset,
+ size + tu->args[i].offset,
tu->args[i].type->size,
tu->args[i].type->is_signed,
FILTER_OTHER);
@@ -622,8 +670,13 @@ static int __set_print_fmt(struct trace_uprobe *tu, char *buf, int len)
int i;
int pos = 0;
- fmt = "(%lx)";
- arg = "REC->" FIELD_STRING_IP;
+ if (is_ret_probe(tu)) {
+ fmt = "(%lx <- %lx)";
+ arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP;
+ } else {
+ fmt = "(%lx)";
+ arg = "REC->" FIELD_STRING_IP;
+ }
/* When len=0, we just calculate the needed length */
@@ -752,49 +805,68 @@ static bool uprobe_perf_filter(struct uprobe_consumer *uc,
return ret;
}
-/* uprobe profile handler */
-static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
+static void uprobe_perf_print(struct trace_uprobe *tu,
+ unsigned long func, struct pt_regs *regs)
{
struct ftrace_event_call *call = &tu->call;
struct uprobe_trace_entry_head *entry;
struct hlist_head *head;
- u8 *data;
- int size, __size, i;
- int rctx;
+ void *data;
+ int size, rctx, i;
- if (!uprobe_perf_filter(&tu->consumer, 0, current->mm))
- return UPROBE_HANDLER_REMOVE;
-
- __size = sizeof(*entry) + tu->size;
- size = ALIGN(__size + sizeof(u32), sizeof(u64));
- size -= sizeof(u32);
+ size = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
+ size = ALIGN(size + tu->size + sizeof(u32), sizeof(u64)) - sizeof(u32);
if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough"))
- return 0;
+ return;
preempt_disable();
+ head = this_cpu_ptr(call->perf_events);
+ if (hlist_empty(head))
+ goto out;
entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx);
if (!entry)
goto out;
- entry->ip = instruction_pointer(task_pt_regs(current));
- data = (u8 *)&entry[1];
+ if (is_ret_probe(tu)) {
+ entry->vaddr[0] = func;
+ entry->vaddr[1] = instruction_pointer(regs);
+ data = DATAOF_TRACE_ENTRY(entry, true);
+ } else {
+ entry->vaddr[0] = instruction_pointer(regs);
+ data = DATAOF_TRACE_ENTRY(entry, false);
+ }
+
for (i = 0; i < tu->nr_args; i++)
call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset);
- head = this_cpu_ptr(call->perf_events);
- perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head, NULL);
-
+ perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);
out:
preempt_enable();
+}
+
+/* uprobe profile handler */
+static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
+{
+ if (!uprobe_perf_filter(&tu->consumer, 0, current->mm))
+ return UPROBE_HANDLER_REMOVE;
+
+ if (!is_ret_probe(tu))
+ uprobe_perf_print(tu, 0, regs);
return 0;
}
+
+static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func,
+ struct pt_regs *regs)
+{
+ uprobe_perf_print(tu, func, regs);
+}
#endif /* CONFIG_PERF_EVENTS */
static
int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, void *data)
{
- struct trace_uprobe *tu = (struct trace_uprobe *)event->data;
+ struct trace_uprobe *tu = event->data;
switch (type) {
case TRACE_REG_REGISTER:
@@ -843,6 +915,23 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
return ret;
}
+static int uretprobe_dispatcher(struct uprobe_consumer *con,
+ unsigned long func, struct pt_regs *regs)
+{
+ struct trace_uprobe *tu;
+
+ tu = container_of(con, struct trace_uprobe, consumer);
+
+ if (tu->flags & TP_FLAG_TRACE)
+ uretprobe_trace_func(tu, func, regs);
+
+#ifdef CONFIG_PERF_EVENTS
+ if (tu->flags & TP_FLAG_PROFILE)
+ uretprobe_perf_func(tu, func, regs);
+#endif
+ return 0;
+}
+
static struct trace_event_functions uprobe_funcs = {
.trace = print_uprobe_event
};
diff --git a/kernel/uid16.c b/kernel/uid16.c
index d7948eb1022..f6c83d7ef00 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -18,67 +18,43 @@
SYSCALL_DEFINE3(chown16, const char __user *, filename, old_uid_t, user, old_gid_t, group)
{
- long ret = sys_chown(filename, low2highuid(user), low2highgid(group));
- /* avoid REGPARM breakage on x86: */
- asmlinkage_protect(3, ret, filename, user, group);
- return ret;
+ return sys_chown(filename, low2highuid(user), low2highgid(group));
}
SYSCALL_DEFINE3(lchown16, const char __user *, filename, old_uid_t, user, old_gid_t, group)
{
- long ret = sys_lchown(filename, low2highuid(user), low2highgid(group));
- /* avoid REGPARM breakage on x86: */
- asmlinkage_protect(3, ret, filename, user, group);
- return ret;
+ return sys_lchown(filename, low2highuid(user), low2highgid(group));
}
SYSCALL_DEFINE3(fchown16, unsigned int, fd, old_uid_t, user, old_gid_t, group)
{
- long ret = sys_fchown(fd, low2highuid(user), low2highgid(group));
- /* avoid REGPARM breakage on x86: */
- asmlinkage_protect(3, ret, fd, user, group);
- return ret;
+ return sys_fchown(fd, low2highuid(user), low2highgid(group));
}
SYSCALL_DEFINE2(setregid16, old_gid_t, rgid, old_gid_t, egid)
{
- long ret = sys_setregid(low2highgid(rgid), low2highgid(egid));
- /* avoid REGPARM breakage on x86: */
- asmlinkage_protect(2, ret, rgid, egid);
- return ret;
+ return sys_setregid(low2highgid(rgid), low2highgid(egid));
}
SYSCALL_DEFINE1(setgid16, old_gid_t, gid)
{
- long ret = sys_setgid(low2highgid(gid));
- /* avoid REGPARM breakage on x86: */
- asmlinkage_protect(1, ret, gid);
- return ret;
+ return sys_setgid(low2highgid(gid));
}
SYSCALL_DEFINE2(setreuid16, old_uid_t, ruid, old_uid_t, euid)
{
- long ret = sys_setreuid(low2highuid(ruid), low2highuid(euid));
- /* avoid REGPARM breakage on x86: */
- asmlinkage_protect(2, ret, ruid, euid);
- return ret;
+ return sys_setreuid(low2highuid(ruid), low2highuid(euid));
}
SYSCALL_DEFINE1(setuid16, old_uid_t, uid)
{
- long ret = sys_setuid(low2highuid(uid));
- /* avoid REGPARM breakage on x86: */
- asmlinkage_protect(1, ret, uid);
- return ret;
+ return sys_setuid(low2highuid(uid));
}
SYSCALL_DEFINE3(setresuid16, old_uid_t, ruid, old_uid_t, euid, old_uid_t, suid)
{
- long ret = sys_setresuid(low2highuid(ruid), low2highuid(euid),
+ return sys_setresuid(low2highuid(ruid), low2highuid(euid),
low2highuid(suid));
- /* avoid REGPARM breakage on x86: */
- asmlinkage_protect(3, ret, ruid, euid, suid);
- return ret;
}
SYSCALL_DEFINE3(getresuid16, old_uid_t __user *, ruidp, old_uid_t __user *, euidp, old_uid_t __user *, suidp)
@@ -100,11 +76,8 @@ SYSCALL_DEFINE3(getresuid16, old_uid_t __user *, ruidp, old_uid_t __user *, euid
SYSCALL_DEFINE3(setresgid16, old_gid_t, rgid, old_gid_t, egid, old_gid_t, sgid)
{
- long ret = sys_setresgid(low2highgid(rgid), low2highgid(egid),
+ return sys_setresgid(low2highgid(rgid), low2highgid(egid),
low2highgid(sgid));
- /* avoid REGPARM breakage on x86: */
- asmlinkage_protect(3, ret, rgid, egid, sgid);
- return ret;
}
@@ -127,18 +100,12 @@ SYSCALL_DEFINE3(getresgid16, old_gid_t __user *, rgidp, old_gid_t __user *, egid
SYSCALL_DEFINE1(setfsuid16, old_uid_t, uid)
{
- long ret = sys_setfsuid(low2highuid(uid));
- /* avoid REGPARM breakage on x86: */
- asmlinkage_protect(1, ret, uid);
- return ret;
+ return sys_setfsuid(low2highuid(uid));
}
SYSCALL_DEFINE1(setfsgid16, old_gid_t, gid)
{
- long ret = sys_setfsgid(low2highgid(gid));
- /* avoid REGPARM breakage on x86: */
- asmlinkage_protect(1, ret, gid);
- return ret;
+ return sys_setfsgid(low2highgid(gid));
}
static int groups16_to_user(old_gid_t __user *grouplist,
diff --git a/kernel/user.c b/kernel/user.c
index 8e635a18ab5..69b4c3d48cd 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -16,7 +16,7 @@
#include <linux/interrupt.h>
#include <linux/export.h>
#include <linux/user_namespace.h>
-#include <linux/proc_fs.h>
+#include <linux/proc_ns.h>
/*
* userns count is 1 for root user, 1 for init_uts_ns,
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index e134d8f365d..d8c30db06c5 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -9,7 +9,7 @@
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/user_namespace.h>
-#include <linux/proc_fs.h>
+#include <linux/proc_ns.h>
#include <linux/highuid.h>
#include <linux/cred.h>
#include <linux/securebits.h>
diff --git a/kernel/utsname.c b/kernel/utsname.c
index a47fc5de311..2fc8576efaa 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -15,7 +15,7 @@
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/user_namespace.h>
-#include <linux/proc_fs.h>
+#include <linux/proc_ns.h>
static struct uts_namespace *create_uts_ns(void)
{
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 4a944676358..05039e348f0 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -517,6 +517,11 @@ int proc_dowatchdog(struct ctl_table *table, int write,
return ret;
set_sample_period();
+ /*
+ * Watchdog threads shouldn't be enabled if they are
+ * disabled. The 'watchdog_disabled' variable check in
+ * watchdog_*_all_cpus() function takes care of this.
+ */
if (watchdog_enabled && watchdog_thresh)
watchdog_enable_all_cpus();
else
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 154aa12af48..4aa9f5bc6b2 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -46,6 +46,7 @@
#include <linux/rculist.h>
#include <linux/nodemask.h>
#include <linux/moduleparam.h>
+#include <linux/uaccess.h>
#include "workqueue_internal.h"
@@ -2197,6 +2198,7 @@ __acquires(&pool->lock)
worker->current_work = NULL;
worker->current_func = NULL;
worker->current_pwq = NULL;
+ worker->desc_valid = false;
pwq_dec_nr_in_flight(pwq, work_color);
}
@@ -4365,6 +4367,83 @@ unsigned int work_busy(struct work_struct *work)
}
EXPORT_SYMBOL_GPL(work_busy);
+/**
+ * set_worker_desc - set description for the current work item
+ * @fmt: printf-style format string
+ * @...: arguments for the format string
+ *
+ * This function can be called by a running work function to describe what
+ * the work item is about. If the worker task gets dumped, this
+ * information will be printed out together to help debugging. The
+ * description can be at most WORKER_DESC_LEN including the trailing '\0'.
+ */
+void set_worker_desc(const char *fmt, ...)
+{
+ struct worker *worker = current_wq_worker();
+ va_list args;
+
+ if (worker) {
+ va_start(args, fmt);
+ vsnprintf(worker->desc, sizeof(worker->desc), fmt, args);
+ va_end(args);
+ worker->desc_valid = true;
+ }
+}
+
+/**
+ * print_worker_info - print out worker information and description
+ * @log_lvl: the log level to use when printing
+ * @task: target task
+ *
+ * If @task is a worker and currently executing a work item, print out the
+ * name of the workqueue being serviced and worker description set with
+ * set_worker_desc() by the currently executing work item.
+ *
+ * This function can be safely called on any task as long as the
+ * task_struct itself is accessible. While safe, this function isn't
+ * synchronized and may print out mixups or garbages of limited length.
+ */
+void print_worker_info(const char *log_lvl, struct task_struct *task)
+{
+ work_func_t *fn = NULL;
+ char name[WQ_NAME_LEN] = { };
+ char desc[WORKER_DESC_LEN] = { };
+ struct pool_workqueue *pwq = NULL;
+ struct workqueue_struct *wq = NULL;
+ bool desc_valid = false;
+ struct worker *worker;
+
+ if (!(task->flags & PF_WQ_WORKER))
+ return;
+
+ /*
+ * This function is called without any synchronization and @task
+ * could be in any state. Be careful with dereferences.
+ */
+ worker = probe_kthread_data(task);
+
+ /*
+ * Carefully copy the associated workqueue's workfn and name. Keep
+ * the original last '\0' in case the original contains garbage.
+ */
+ probe_kernel_read(&fn, &worker->current_func, sizeof(fn));
+ probe_kernel_read(&pwq, &worker->current_pwq, sizeof(pwq));
+ probe_kernel_read(&wq, &pwq->wq, sizeof(wq));
+ probe_kernel_read(name, wq->name, sizeof(name) - 1);
+
+ /* copy worker description */
+ probe_kernel_read(&desc_valid, &worker->desc_valid, sizeof(desc_valid));
+ if (desc_valid)
+ probe_kernel_read(desc, worker->desc, sizeof(desc) - 1);
+
+ if (fn || name[0] || desc[0]) {
+ printk("%sWorkqueue: %s %pf", log_lvl, name, fn);
+ if (desc[0])
+ pr_cont(" (%s)", desc);
+ pr_cont("\n");
+ }
+}
+
/*
* CPU hotplug.
*
diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h
index 84ab6e1dc6f..ad83c96b2ec 100644
--- a/kernel/workqueue_internal.h
+++ b/kernel/workqueue_internal.h
@@ -29,15 +29,25 @@ struct worker {
struct work_struct *current_work; /* L: work being processed */
work_func_t current_func; /* L: current_work's fn */
struct pool_workqueue *current_pwq; /* L: current_work's pwq */
+ bool desc_valid; /* ->desc is valid */
struct list_head scheduled; /* L: scheduled works */
+
+ /* 64 bytes boundary on 64bit, 32 on 32bit */
+
struct task_struct *task; /* I: worker task */
struct worker_pool *pool; /* I: the associated pool */
/* L: for rescuers */
- /* 64 bytes boundary on 64bit, 32 on 32bit */
+
unsigned long last_active; /* L: last active timestamp */
unsigned int flags; /* X: flags */
int id; /* I: worker id */
+ /*
+ * Opaque string set with work_set_desc(). Printed out with task
+ * dump for debugging - WARN, BUG, panic or sysrq.
+ */
+ char desc[WORKER_DESC_LEN];
+
/* used only by rescuers to point to the target workqueue */
struct workqueue_struct *rescue_wq; /* I: the workqueue to rescue */
};