From 9b91528403894641385ec9491a8d7bb84cadccbc Mon Sep 17 00:00:00 2001 From: Will Drewry Date: Tue, 24 May 2011 10:44:51 -0500 Subject: [PATCH 1/2] RFC: perf, tracing: add filter list traversal interruption, secure computing This change extends the perf infrastructure to support two new perf_event attributes: 1. err_on_discard 2. require_secure And uses their resulting changes to allow perf to implement seccomp-like behavior. (1) err_on_discard informs the perf infrastructure to return an error when the given filter is discarded. This is only wired up in perf_tp_event. (2) require_secure informs the perf infrastructure to require the traced process to operate in secure computing mode 1+. If the process was not in a secure computing mode, it will be moved to mode=2. Mode 2 is reserved for perf use and not accessible via the normal prctl(PR_SET_SECCOMP) interface. This transition will occur anytime a 'require_secure' event is enabled. To make use of the attr changes, perf_tp_event and perf_trace_buf_submit were modified to return an int representing an error during processing. An err_on_discard event will trigger a return of -EACCES instead of returning 0. In addition, perf_tp_event will automatically skip any task_context events not for current in perf_tp_event. Tying it all together, the perf_syscall_enter function decides, based on the seccomp mode, to do_exit the process if the given event is unhandled or an err_on_discard-based error was returned. This could be extended to other perf hooks to respect require_secure and err_on_discard. Some of the open issues: - do these changes make sense in the context of perf? - err_on_discard can be triggered for non-task_context events. - require_secure without err_on_discard becomes a pure global syscall bitmask (not so great), but making err_on_discard implicit seems wrong. 
- require_secure is pretty meaningless as an attribute, but something like kill_on_fail will _only_ be true for sys_enter events with this change. - corollary: how do we ensure the ABI is usable and makes sense? - several system calls are unhooked by ftrace events which makes enable_on_exec not always as useful. This RFC patch allows them for simple testing, but without them, enable_on_exec bits are pointless as execve and set_thread_area are unhooked (on x86 at least). - tracepoints occur slightly later than seccomp hooks, but that can be changed per-arch and shouldn't represent a huge exposure. Example usage with a hacked-up builtin-record.c: SECURE=--secure perf record \ -e 'syscalls:sys_enter_access' \ -e 'syscalls:sys_enter_brk' \ -e 'syscalls:sys_enter_close' \ -e 'syscalls:sys_enter_exit_group' \ -e 'syscalls:sys_enter_fcntl64' \ -e 'syscalls:sys_enter_fstat64' \ -e 'syscalls:sys_enter_getdents64' \ -e 'syscalls:sys_enter_getpid' \ -e 'syscalls:sys_enter_getuid' \ -e 'syscalls:sys_enter_ioctl' \ -e 'syscalls:sys_enter_lstat64' \ -e 'syscalls:sys_enter_mmap_pgoff' \ -e 'syscalls:sys_enter_mprotect' \ -e 'syscalls:sys_enter_munmap' \ -e 'syscalls:sys_enter_open' \ -e 'syscalls:sys_enter_read' \ -e 'syscalls:sys_enter_stat64' \ -e 'syscalls:sys_enter_time' \ -e 'syscalls:sys_enter_newuname' \ -e 'syscalls:sys_enter_write' $SECURE --filter "fd == 1" \ --secure \ /bin/ls (Though a real ruleset would include sys_enter_execve and sys_enter_set_thread_area for this example.) This is an RFC. While this change is minimal, I'm not 100% confident in the ABI changes, attack surface, etc. The good part is that the code changes are minimal, the bad part is that much of the operation is opaque and the reused perf interfaces feel like they are being abused. 
Signed-off-by: Will Drewry --- include/linux/ftrace_event.h | 4 +- include/linux/perf_event.h | 10 +++++--- kernel/perf_event.c | 49 +++++++++++++++++++++++++++++++++++++--- kernel/seccomp.c | 8 ++++++ kernel/trace/trace_syscalls.c | 27 +++++++++++++++++----- 5 files changed, 82 insertions(+), 16 deletions(-) diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 22b32af..fc96280 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -272,11 +272,11 @@ extern void ftrace_profile_free_filter(struct perf_event *event); extern void *perf_trace_buf_prepare(int size, unsigned short type, struct pt_regs *regs, int *rctxp); -static inline void +static inline int perf_trace_buf_submit(void *raw_data, int size, int rctx, u64 addr, u64 count, struct pt_regs *regs, void *head) { - perf_tp_event(addr, count, raw_data, size, regs, head, rctx); + return perf_tp_event(addr, count, raw_data, size, regs, head, rctx); } #endif diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index ee9f1e7..5157779 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -216,8 +216,10 @@ struct perf_event_attr { precise_ip : 2, /* skid constraint */ mmap_data : 1, /* non-exec mmap data */ sample_id_all : 1, /* sample_type all events */ + err_on_discard: 1, /* propagate err internally */ + require_secure : 1, /* require seccomp mode == 2 */ - __reserved_1 : 45; + __reserved_1 : 43; union { __u32 wakeup_events; /* wakeup every n events */ @@ -1137,9 +1139,9 @@ static inline bool perf_paranoid_kernel(void) } extern void perf_event_init(void); -extern void perf_tp_event(u64 addr, u64 count, void *record, - int entry_size, struct pt_regs *regs, - struct hlist_head *head, int rctx); +extern int perf_tp_event(u64 addr, u64 count, void *record, + int entry_size, struct pt_regs *regs, + struct hlist_head *head, int rctx); extern void perf_bp_event(struct perf_event *event, void *data); #ifndef perf_misc_flags diff --git 
a/kernel/perf_event.c b/kernel/perf_event.c index 8e81a98..6758569 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -1638,6 +1638,15 @@ static int __perf_event_enable(void *info) */ perf_cgroup_set_timestamp(current, ctx); + /* Transition the task if required. */ + if (ctx->type == task_context && event->attr.require_secure) { +#ifdef CONFIG_SECCOMP + /* Don't allow perf events to escape mode = 1. */ + if (!current->seccomp.mode) + current->seccomp.mode = 2; +#endif + } + __perf_event_mark_enabled(event, ctx); if (!event_filter_match(event)) { @@ -1716,7 +1725,6 @@ void perf_event_enable(struct perf_event *event) */ if (event->state == PERF_EVENT_STATE_ERROR) event->state = PERF_EVENT_STATE_OFF; - retry: if (!ctx->is_active) { __perf_event_mark_enabled(event, ctx); @@ -2403,6 +2411,13 @@ static int event_enable_on_exec(struct perf_event *event, event->attr.enable_on_exec = 0; if (event->state >= PERF_EVENT_STATE_INACTIVE) return 0; + if (ctx->type == task_context && event->attr.require_secure) { +#ifdef CONFIG_SECCOMP + /* Don't allow perf events to escape mode = 1. 
*/ + if (!ctx->task->seccomp.mode) + ctx->task->seccomp.mode = 2; +#endif + } __perf_event_mark_enabled(event, ctx); @@ -5515,12 +5530,13 @@ static int perf_tp_event_match(struct perf_event *event, return 1; } -void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, - struct pt_regs *regs, struct hlist_head *head, int rctx) +int perf_tp_event(u64 addr, u64 count, void *record, int entry_size, + struct pt_regs *regs, struct hlist_head *head, int rctx) { struct perf_sample_data data; struct perf_event *event; struct hlist_node *node; + int ok = 0; struct perf_raw_record raw = { .size = entry_size, @@ -5531,11 +5547,23 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, data.raw = &raw; hlist_for_each_entry_rcu(event, node, head, hlist_entry) { - if (perf_tp_event_match(event, &data, regs)) + /* When the event is in the task context but doesn't match + * current, skip it. + */ + if (event->ctx->type == task_context && + event->ctx->task != current) { + continue; + } + if (perf_tp_event_match(event, &data, regs)) { perf_swevent_event(event, count, 1, &data, regs); + continue; + } + if (event->attr.err_on_discard) + ok = -EACCES; } perf_swevent_put_recursion_context(rctx); + return ok; } EXPORT_SYMBOL_GPL(perf_tp_event); @@ -6452,6 +6480,19 @@ SYSCALL_DEFINE5(perf_event_open, return -EINVAL; } + if (attr.require_secure) { + if (!attr.err_on_discard) + return -EINVAL; + if (attr.type != PERF_TYPE_TRACEPOINT) + return -ENOSYS; + if (!attr.disabled) + if (!current->seccomp.mode) + current->seccomp.mode = 2; + /* Disallow globally scoped secure traces. */ + if (pid == -1) + return -EINVAL; + } + /* * In cgroup mode, the pid argument is used to pass the fd * opened to the cgroup directory in cgroupfs. 
The cpu argument diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 57d4b13..39c4c99 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -47,6 +47,14 @@ void __secure_computing(int this_syscall) return; } while (*++syscall); break; + case 2: /* perf event only */ +#ifdef CONFIG_COMPAT + /* Disallow compat calls */ + if (is_compat_task()) + break; + /* Otherwise, let perf do the rest. */ + return; +#endif default: BUG(); } diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index ee7b5a0..0eafe0e 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -497,20 +497,21 @@ static int sys_perf_refcount_exit; static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) { - struct syscall_metadata *sys_data; + struct syscall_metadata *sys_data = NULL; struct syscall_trace_enter *rec; struct hlist_head *head; int syscall_nr; int rctx; int size; + int ok = -ENOENT; syscall_nr = syscall_get_nr(current, regs); if (!test_bit(syscall_nr, enabled_perf_enter_syscalls)) - return; + goto out; sys_data = syscall_nr_to_meta(syscall_nr); if (!sys_data) - return; + goto out; /* get the size after alignment with the u32 buffer size field */ size = sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec); @@ -519,19 +520,33 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "perf buffer not large enough")) - return; + goto out; rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size, sys_data->enter_event->event.type, regs, &rctx); if (!rec) - return; + goto out; rec->nr = syscall_nr; syscall_get_arguments(current, regs, 0, sys_data->nb_args, (unsigned long *)&rec->args); head = this_cpu_ptr(sys_data->enter_event->perf_events); - perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head); + ok = perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head); +out: + if (current->seccomp.mode && ok < 0) { + /* Send over audit subsystem */ + 
printk(KERN_INFO + "%s[%d]: system call %d (%s) blocked at ip:%lx\n", + current->comm, task_pid_nr(current), syscall_nr, + (sys_data ? sys_data->name : "unmapped"), + KSTK_EIP(current)); + if (ok != -EACCES) { + /* For testing, allow unhooked syscalls like exec and set_thread_area */ + return; + } + do_exit(SIGKILL); + } } int perf_sysenter_enable(struct ftrace_event_call *call) -- 1.7.0.4