// SPDX-License-Identifier: GPL-2.0
/*
 * linux/kernel/seccomp.c
 *
 * Copyright 2004-2005  Andrea Arcangeli <andrea@cpushare.com>
 *
 * Copyright (C) 2012 Google, Inc.
 * Will Drewry <wad@chromium.org>
 *
 * This defines a simple but solid secure-computing facility.
 *
 * Mode 1 uses a fixed list of allowed system calls.
 * Mode 2 allows user-defined system call filters in the form
 *        of Berkeley Packet Filters/Linux Socket Filters.
 */

#include <linux/refcount.h>
#include <linux/audit.h>
#include <linux/compat.h>
#include <linux/coredump.h>
#include <linux/kmemleak.h>
#include <linux/nospec.h>
#include <linux/prctl.h>
#include <linux/sched.h>
#include <linux/sched/task_stack.h>
#include <linux/seccomp.h>
#include <linux/slab.h>
#include <linux/syscalls.h>
#include <linux/sysctl.h>

#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
#include <asm/syscall.h>
#endif

#ifdef CONFIG_SECCOMP_FILTER
#include <linux/file.h>
#include <linux/filter.h>
#include <linux/pid.h>
#include <linux/ptrace.h>
#include <linux/security.h>
#include <linux/tracehook.h>
#include <linux/uaccess.h>
#include <linux/anon_inodes.h>

enum notify_state {
	SECCOMP_NOTIFY_INIT,
	SECCOMP_NOTIFY_SENT,
	SECCOMP_NOTIFY_REPLIED,
};

struct seccomp_knotif {
	/* The struct pid of the task whose filter triggered the notification */
	struct task_struct *task;

	/* The "cookie" for this request; this is unique for this filter. */
	u64 id;

	/*
	 * The seccomp data. This pointer is valid the entire time this
	 * notification is active, since it comes from __seccomp_filter which
	 * eclipses the entire lifecycle here.
	 */
	const struct seccomp_data *data;

	/*
	 * Notification states. When SECCOMP_RET_USER_NOTIF is returned, a
	 * struct seccomp_knotif is created and starts out in INIT. Once the
	 * handler reads the notification off of an FD, it transitions to SENT.
	 * If a signal is received the state transitions back to INIT and
	 * another message is sent. When the userspace handler replies, state
	 * transitions to REPLIED.
	 */
	enum notify_state state;

	/* The return values, only valid when in SECCOMP_NOTIFY_REPLIED */
	int error;
	long val;

	/* Signals when this has entered SECCOMP_NOTIFY_REPLIED */
	struct completion ready;

	struct list_head list;
};

/**
 * struct notification - container for seccomp userspace notifications. Since
 * most seccomp filters will not have notification listeners attached and this
 * structure is fairly large, we store the notification-specific stuff in a
 * separate structure.
 *
 * @request: A semaphore that users of this notification can wait on for
 *           changes. Actual reads and writes are still controlled with
 *           filter->notify_lock.
 * @next_id: The id of the next request.
 * @notifications: A list of struct seccomp_knotif elements.
 * @wqh: A wait queue for poll.
 */
struct notification {
	struct semaphore request;
	u64 next_id;
	struct list_head notifications;
	wait_queue_head_t wqh;
};

/**
 * struct seccomp_filter - container for seccomp BPF programs
 *
 * @usage: reference count to manage the object lifetime.
 *         get/put helpers should be used when accessing an instance
 *         outside of a lifetime-guarded section.  In general, this
 *         is only needed for handling filters shared across tasks.
 * @log: true if all actions except for SECCOMP_RET_ALLOW should be logged
 * @prev: points to a previously installed, or inherited, filter
 * @prog: the BPF program to evaluate
 * @notif: the struct that holds all notification related information
 * @notify_lock: A lock for all notification-related accesses.
 *
 * seccomp_filter objects are organized in a tree linked via the @prev
 * pointer.  For any task, it appears to be a singly-linked list starting
 * with current->seccomp.filter, the most recently attached or inherited filter.
 * However, multiple filters may share a @prev node, by way of fork(), which
 * results in a unidirectional tree existing in memory.  This is similar to
 * how namespaces work.
 *
 * seccomp_filter objects should never be modified after being attached
 * to a task_struct (other than @usage).
 */
struct seccomp_filter {
	refcount_t usage;
	bool log;
	struct seccomp_filter *prev;
	struct bpf_prog *prog;
	struct notification *notif;
	struct mutex notify_lock;
};

/* Limit any path through the tree to 256KB worth of instructions. */
#define MAX_INSNS_PER_PATH ((1 << 18) / sizeof(struct sock_filter))

/*
 * Endianness is explicitly ignored and left for BPF program authors to manage
 * as per the specific architecture.
 */
static void populate_seccomp_data(struct seccomp_data *sd)
{
	struct task_struct *task = current;
	struct pt_regs *regs = task_pt_regs(task);
	unsigned long args[6];

	sd->nr = syscall_get_nr(task, regs);
	sd->arch = syscall_get_arch();
	syscall_get_arguments(task, regs, 0, 6, args);
	sd->args[0] = args[0];
	sd->args[1] = args[1];
	sd->args[2] = args[2];
	sd->args[3] = args[3];
	sd->args[4] = args[4];
	sd->args[5] = args[5];
	sd->instruction_pointer = KSTK_EIP(task);
}

/**
 *	seccomp_check_filter - verify seccomp filter code
 *	@filter: filter to verify
 *	@flen: length of filter
 *
 * Takes a previously checked filter (by bpf_check_classic) and
 * redirects all filter code that loads struct sk_buff data
 * and related data through seccomp_bpf_load.  It also
 * enforces length and alignment checking of those loads.
 *
 * Returns 0 if the rule set is legal or -EINVAL if not.
 */
static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
{
	int pc;
	for (pc = 0; pc < flen; pc++) {
		struct sock_filter *ftest = &filter[pc];
		u16 code = ftest->code;
		u32 k = ftest->k;

		switch (code) {
		case BPF_LD | BPF_W | BPF_ABS:
			ftest->code = BPF_LDX | BPF_W | BPF_ABS;
			/* 32-bit aligned and not out of bounds. */
			if (k >= sizeof(struct seccomp_data) || k & 3)
				return -EINVAL;
			continue;
		case BPF_LD | BPF_W | BPF_LEN:
			ftest->code = BPF_LD | BPF_IMM;
			ftest->k = sizeof(struct seccomp_data);
			continue;
		case BPF_LDX | BPF_W | BPF_LEN:
			ftest->code = BPF_LDX | BPF_IMM;
			ftest->k = sizeof(struct seccomp_data);
			continue;
		/* Explicitly include allowed calls. */
		case BPF_RET | BPF_K:
		case BPF_RET | BPF_A:
		case BPF_ALU | BPF_ADD | BPF_K:
		case BPF_ALU | BPF_ADD | BPF_X:
		case BPF_ALU | BPF_SUB | BPF_K:
		case BPF_ALU | BPF_SUB | BPF_X:
		case BPF_ALU | BPF_MUL | BPF_K:
		case BPF_ALU | BPF_MUL | BPF_X:
		case BPF_ALU | BPF_DIV | BPF_K:
		case BPF_ALU | BPF_DIV | BPF_X:
		case BPF_ALU | BPF_AND | BPF_K:
		case BPF_ALU | BPF_AND | BPF_X:
		case BPF_ALU | BPF_OR | BPF_K:
		case BPF_ALU | BPF_OR | BPF_X:
		case BPF_ALU | BPF_XOR | BPF_K:
		case BPF_ALU | BPF_XOR | BPF_X:
		case BPF_ALU | BPF_LSH | BPF_K:
		case BPF_ALU | BPF_LSH | BPF_X:
		case BPF_ALU | BPF_RSH | BPF_K:
		case BPF_ALU | BPF_RSH | BPF_X:
		case BPF_ALU | BPF_NEG:
		case BPF_LD | BPF_IMM:
		case BPF_LDX | BPF_IMM:
		case BPF_MISC | BPF_TAX:
		case BPF_MISC | BPF_TXA:
		case BPF_LD | BPF_MEM:
		case BPF_LDX | BPF_MEM:
		case BPF_ST:
		case BPF_STX:
		case BPF_JMP | BPF_JA:
		case BPF_JMP | BPF_JEQ | BPF_K:
		case BPF_JMP | BPF_JEQ | BPF_X:
		case BPF_JMP | BPF_JGE | BPF_K:
		case BPF_JMP | BPF_JGE | BPF_X:
		case BPF_JMP | BPF_JGT | BPF_K:
		case BPF_JMP | BPF_JGT | BPF_X:
		case BPF_JMP | BPF_JSET | BPF_K:
		case BPF_JMP | BPF_JSET | BPF_X:
			continue;
		default:
			return -EINVAL;
		}
	}
	return 0;
}
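
/*
 * Illustrative sketch (userspace, hypothetical names): a filter this
 * checker accepts typically loads a field of struct seccomp_data with
 * BPF_LD|BPF_W|BPF_ABS and branches on it, e.g. to deny getpid():
 *
 *	struct sock_filter insns[] = {
 *		BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
 *			 offsetof(struct seccomp_data, nr)),
 *		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_getpid, 0, 1),
 *		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ERRNO | EPERM),
 *		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
 *	};
 *
 * seccomp_check_filter() rewrites the BPF_ABS load above so that it is
 * served from the seccomp_data buffer rather than a struct sk_buff.
 */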

/**
 * seccomp_run_filters - evaluates all seccomp filters against @sd
 * @sd: optional seccomp data to be passed to filters
 * @match: stores struct seccomp_filter that resulted in the return value,
 *         unless filter returned SECCOMP_RET_ALLOW, in which case it will
 *         be unchanged.
 *
 * Returns valid seccomp BPF response codes.
 */
#define ACTION_ONLY(ret) ((s32)((ret) & (SECCOMP_RET_ACTION_FULL)))
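/*
 * The action values are laid out so that, masked and compared as s32, a
 * "stronger" action compares lower: KILL_PROCESS (negative as s32) <
 * KILL_THREAD < TRAP < ERRNO < USER_NOTIF < TRACE < LOG < ALLOW.
 * seccomp_run_filters() relies on this to let the most restrictive
 * filter in the chain win.
 */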
static u32 seccomp_run_filters(const struct seccomp_data *sd,
			       struct seccomp_filter **match)
{
	u32 ret = SECCOMP_RET_ALLOW;
	/* Make sure cross-thread synced filter points somewhere sane. */
	struct seccomp_filter *f =
			READ_ONCE(current->seccomp.filter);

	/* Ensure unexpected behavior doesn't result in failing open. */
	if (WARN_ON(f == NULL))
		return SECCOMP_RET_KILL_PROCESS;

	/*
	 * All filters in the list are evaluated and the lowest BPF return
	 * value always takes priority (ignoring the DATA).
	 */
	for (; f; f = f->prev) {
		u32 cur_ret = BPF_PROG_RUN(f->prog, sd);

		if (ACTION_ONLY(cur_ret) < ACTION_ONLY(ret)) {
			ret = cur_ret;
			*match = f;
		}
	}
	return ret;
}
#endif /* CONFIG_SECCOMP_FILTER */

static inline bool seccomp_may_assign_mode(unsigned long seccomp_mode)
{
	assert_spin_locked(&current->sighand->siglock);

	if (current->seccomp.mode && current->seccomp.mode != seccomp_mode)
		return false;

	return true;
}

void __weak arch_seccomp_spec_mitigate(struct task_struct *task) { }

static inline void seccomp_assign_mode(struct task_struct *task,
				       unsigned long seccomp_mode,
				       unsigned long flags)
{
	assert_spin_locked(&task->sighand->siglock);

	task->seccomp.mode = seccomp_mode;
	/*
	 * Make sure TIF_SECCOMP cannot be set before the mode (and
	 * filter) is set.
	 */
	smp_mb__before_atomic();
	/* Assume default seccomp processes want spec flaw mitigation. */
	if ((flags & SECCOMP_FILTER_FLAG_SPEC_ALLOW) == 0)
		arch_seccomp_spec_mitigate(task);
	set_tsk_thread_flag(task, TIF_SECCOMP);
}

#ifdef CONFIG_SECCOMP_FILTER
/* Returns 1 if the parent is an ancestor of the child. */
static int is_ancestor(struct seccomp_filter *parent,
		       struct seccomp_filter *child)
{
	/* NULL is the root ancestor. */
	if (parent == NULL)
		return 1;
	for (; child; child = child->prev)
		if (child == parent)
			return 1;
	return 0;
}

/**
 * seccomp_can_sync_threads: checks if all threads can be synchronized
 *
 * Expects sighand and cred_guard_mutex locks to be held.
 *
 * Returns 0 on success, -ve on error, or the pid of a thread that was
 * either not in the correct seccomp mode or did not have an ancestral
 * seccomp filter.
 */
static inline pid_t seccomp_can_sync_threads(void)
{
	struct task_struct *thread, *caller;

	BUG_ON(!mutex_is_locked(&current->signal->cred_guard_mutex));
	assert_spin_locked(&current->sighand->siglock);

	/* Validate all threads being eligible for synchronization. */
	caller = current;
	for_each_thread(caller, thread) {
		pid_t failed;

		/* Skip current, since it is initiating the sync. */
		if (thread == caller)
			continue;

		if (thread->seccomp.mode == SECCOMP_MODE_DISABLED ||
		    (thread->seccomp.mode == SECCOMP_MODE_FILTER &&
		     is_ancestor(thread->seccomp.filter,
				 caller->seccomp.filter)))
			continue;

		/* Return the first thread that cannot be synchronized. */
		failed = task_pid_vnr(thread);
		/* If the pid cannot be resolved, then return -ESRCH */
		if (WARN_ON(failed == 0))
			failed = -ESRCH;
		return failed;
	}

	return 0;
}

/**
 * seccomp_sync_threads: sets all threads to use current's filter
 *
 * Expects sighand and cred_guard_mutex locks to be held, and for
 * seccomp_can_sync_threads() to have returned success already
 * without dropping the locks.
 *
 */
static inline void seccomp_sync_threads(unsigned long flags)
{
	struct task_struct *thread, *caller;

	BUG_ON(!mutex_is_locked(&current->signal->cred_guard_mutex));
	assert_spin_locked(&current->sighand->siglock);

	/* Synchronize all threads. */
	caller = current;
	for_each_thread(caller, thread) {
		/* Skip current, since it needs no changes. */
		if (thread == caller)
			continue;

		/* Get a task reference for the new leaf node. */
		get_seccomp_filter(caller);
		/*
		 * Drop the task reference to the shared ancestor since
		 * current's path will hold a reference.  (This also
		 * allows a put before the assignment.)
		 */
		put_seccomp_filter(thread);
		smp_store_release(&thread->seccomp.filter,
				  caller->seccomp.filter);

		/*
		 * Don't let an unprivileged task work around
		 * the no_new_privs restriction by creating
		 * a thread that sets it up, enters seccomp,
		 * then dies.
		 */
		if (task_no_new_privs(caller))
			task_set_no_new_privs(thread);

		/*
		 * Opt the other thread into seccomp if needed.
		 * As threads are considered to be trust-realm
		 * equivalent (see ptrace_may_access), it is safe to
		 * allow one thread to transition the other.
		 */
		if (thread->seccomp.mode == SECCOMP_MODE_DISABLED)
			seccomp_assign_mode(thread, SECCOMP_MODE_FILTER,
					    flags);
	}
}

/**
 * seccomp_prepare_filter: Prepares a seccomp filter for use.
 * @fprog: BPF program to install
 *
 * Returns filter on success or an ERR_PTR on failure.
 */
static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
{
	struct seccomp_filter *sfilter;
	int ret;
	const bool save_orig = IS_ENABLED(CONFIG_CHECKPOINT_RESTORE);

	if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
		return ERR_PTR(-EINVAL);

	BUG_ON(INT_MAX / fprog->len < sizeof(struct sock_filter));

	/*
	 * Installing a seccomp filter requires that the task has
	 * CAP_SYS_ADMIN in its namespace or be running with no_new_privs.
	 * This avoids scenarios where unprivileged tasks can affect the
	 * behavior of privileged children.
	 */
	if (!task_no_new_privs(current) &&
	    security_capable(current_cred(), current_user_ns(),
				     CAP_SYS_ADMIN, CAP_OPT_NOAUDIT) != 0)
		return ERR_PTR(-EACCES);

	/* Allocate a new seccomp_filter */
	sfilter = kzalloc(sizeof(*sfilter), GFP_KERNEL | __GFP_NOWARN);
	if (!sfilter)
		return ERR_PTR(-ENOMEM);

	mutex_init(&sfilter->notify_lock);
	ret = bpf_prog_create_from_user(&sfilter->prog, fprog,
					seccomp_check_filter, save_orig);
	if (ret < 0) {
		kfree(sfilter);
		return ERR_PTR(ret);
	}

	refcount_set(&sfilter->usage, 1);

	return sfilter;
}

/**
 * seccomp_prepare_user_filter - prepares a user-supplied sock_fprog
 * @user_filter: pointer to the user data containing a sock_fprog.
 *
 * Returns the prepared filter on success or an ERR_PTR on failure.
 */
static struct seccomp_filter *
seccomp_prepare_user_filter(const char __user *user_filter)
{
	struct sock_fprog fprog;
	struct seccomp_filter *filter = ERR_PTR(-EFAULT);

#ifdef CONFIG_COMPAT
	if (in_compat_syscall()) {
		struct compat_sock_fprog fprog32;
		if (copy_from_user(&fprog32, user_filter, sizeof(fprog32)))
			goto out;
		fprog.len = fprog32.len;
		fprog.filter = compat_ptr(fprog32.filter);
	} else /* falls through to the if below. */
#endif
	if (copy_from_user(&fprog, user_filter, sizeof(fprog)))
		goto out;
	filter = seccomp_prepare_filter(&fprog);
out:
	return filter;
}
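
/*
 * Illustrative (userspace, hypothetical names): the pointer handed in
 * here refers to a struct sock_fprog such as
 *
 *	struct sock_filter insns[] = { ... };
 *	struct sock_fprog prog = {
 *		.len = ARRAY_SIZE(insns),
 *		.filter = insns,
 *	};
 *
 * with the compat layout converted first for 32-bit callers, as above.
 */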

/**
 * seccomp_attach_filter: validate and attach filter
 * @flags:  flags to change filter behavior
 * @filter: seccomp filter to add to the current process
 *
 * Caller must be holding current->sighand->siglock lock.
 *
 * Returns 0 on success, -ve on error.
 */
static long seccomp_attach_filter(unsigned int flags,
				  struct seccomp_filter *filter)
{
	unsigned long total_insns;
	struct seccomp_filter *walker;

	assert_spin_locked(&current->sighand->siglock);

	/* Validate resulting filter length. */
	total_insns = filter->prog->len;
	for (walker = current->seccomp.filter; walker; walker = walker->prev)
		total_insns += walker->prog->len + 4;  /* 4 instr penalty */
	if (total_insns > MAX_INSNS_PER_PATH)
		return -ENOMEM;

	/* If thread sync has been requested, check that it is possible. */
	if (flags & SECCOMP_FILTER_FLAG_TSYNC) {
		int ret;

		ret = seccomp_can_sync_threads();
		if (ret)
			return ret;
	}

	/* Set log flag, if present. */
	if (flags & SECCOMP_FILTER_FLAG_LOG)
		filter->log = true;

	/*
	 * If there is an existing filter, make it the prev and don't drop its
	 * task reference.
	 */
	filter->prev = current->seccomp.filter;
	current->seccomp.filter = filter;

	/* Now that the new filter is in place, synchronize to all threads. */
	if (flags & SECCOMP_FILTER_FLAG_TSYNC)
		seccomp_sync_threads(flags);

	return 0;
}
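
/*
 * Userspace sketch (illustrative): attaching a filter to every thread
 * at once uses the TSYNC flag; on conflict the syscall returns the TID
 * of a thread that could not be synchronized:
 *
 *	prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
 *	ret = syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER,
 *		      SECCOMP_FILTER_FLAG_TSYNC, &prog);
 *	(a positive return is the TID of a diverging thread)
 */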

static void __get_seccomp_filter(struct seccomp_filter *filter)
{
	refcount_inc(&filter->usage);
}

/* get_seccomp_filter - increments the reference count of the filter on @tsk */
void get_seccomp_filter(struct task_struct *tsk)
{
	struct seccomp_filter *orig = tsk->seccomp.filter;
	if (!orig)
		return;
	__get_seccomp_filter(orig);
}

static inline void seccomp_filter_free(struct seccomp_filter *filter)
{
	if (filter) {
		bpf_prog_destroy(filter->prog);
		kfree(filter);
	}
}

static void __put_seccomp_filter(struct seccomp_filter *orig)
{
	/* Clean up single-reference branches iteratively. */
	while (orig && refcount_dec_and_test(&orig->usage)) {
		struct seccomp_filter *freeme = orig;
		orig = orig->prev;
		seccomp_filter_free(freeme);
	}
}

/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */
void put_seccomp_filter(struct task_struct *tsk)
{
	__put_seccomp_filter(tsk->seccomp.filter);
}

static void seccomp_init_siginfo(kernel_siginfo_t *info, int syscall, int reason)
{
	clear_siginfo(info);
	info->si_signo = SIGSYS;
	info->si_code = SYS_SECCOMP;
	info->si_call_addr = (void __user *)KSTK_EIP(current);
	info->si_errno = reason;
	info->si_arch = syscall_get_arch();
	info->si_syscall = syscall;
}

/**
 * seccomp_send_sigsys - signals the task to allow in-process syscall emulation
 * @syscall: syscall number to send to userland
 * @reason: filter-supplied reason code to send to userland (via si_errno)
 *
 * Forces a SIGSYS with a code of SYS_SECCOMP and related sigsys info.
 */
static void seccomp_send_sigsys(int syscall, int reason)
{
	struct kernel_siginfo info;
	seccomp_init_siginfo(&info, syscall, reason);
	force_sig_info(SIGSYS, &info, current);
}
#endif	/* CONFIG_SECCOMP_FILTER */

/* For use with seccomp_actions_logged */
#define SECCOMP_LOG_KILL_PROCESS	(1 << 0)
#define SECCOMP_LOG_KILL_THREAD		(1 << 1)
#define SECCOMP_LOG_TRAP		(1 << 2)
#define SECCOMP_LOG_ERRNO		(1 << 3)
#define SECCOMP_LOG_TRACE		(1 << 4)
#define SECCOMP_LOG_LOG			(1 << 5)
#define SECCOMP_LOG_ALLOW		(1 << 6)
#define SECCOMP_LOG_USER_NOTIF		(1 << 7)

static u32 seccomp_actions_logged = SECCOMP_LOG_KILL_PROCESS |
				    SECCOMP_LOG_KILL_THREAD  |
				    SECCOMP_LOG_TRAP  |
				    SECCOMP_LOG_ERRNO |
				    SECCOMP_LOG_USER_NOTIF |
				    SECCOMP_LOG_TRACE |
				    SECCOMP_LOG_LOG;

static inline void seccomp_log(unsigned long syscall, long signr, u32 action,
			       bool requested)
{
	bool log = false;

	switch (action) {
	case SECCOMP_RET_ALLOW:
		break;
	case SECCOMP_RET_TRAP:
		log = requested && seccomp_actions_logged & SECCOMP_LOG_TRAP;
		break;
	case SECCOMP_RET_ERRNO:
		log = requested && seccomp_actions_logged & SECCOMP_LOG_ERRNO;
		break;
	case SECCOMP_RET_TRACE:
		log = requested && seccomp_actions_logged & SECCOMP_LOG_TRACE;
		break;
	case SECCOMP_RET_USER_NOTIF:
		log = requested && seccomp_actions_logged & SECCOMP_LOG_USER_NOTIF;
		break;
	case SECCOMP_RET_LOG:
		log = seccomp_actions_logged & SECCOMP_LOG_LOG;
		break;
	case SECCOMP_RET_KILL_THREAD:
		log = seccomp_actions_logged & SECCOMP_LOG_KILL_THREAD;
		break;
	case SECCOMP_RET_KILL_PROCESS:
	default:
		log = seccomp_actions_logged & SECCOMP_LOG_KILL_PROCESS;
	}

	/*
	 * Emit an audit message when the action is RET_KILL_*, RET_LOG, or the
	 * FILTER_FLAG_LOG bit was set. The admin has the ability to silence
	 * any action from being logged by removing the action name from the
	 * seccomp_actions_logged sysctl.
	 */
	if (!log)
		return;

	audit_seccomp(syscall, signr, action);
}

/*
 * Secure computing mode 1 allows only read/write/exit/sigreturn.
 * To be fully secure this must be combined with rlimit
 * to limit the stack allocations too.
 */
static const int mode1_syscalls[] = {
	__NR_seccomp_read, __NR_seccomp_write, __NR_seccomp_exit, __NR_seccomp_sigreturn,
	0, /* null terminated */
};

static void __secure_computing_strict(int this_syscall)
{
	const int *syscall_whitelist = mode1_syscalls;
#ifdef CONFIG_COMPAT
	if (in_compat_syscall())
		syscall_whitelist = get_compat_mode1_syscalls();
#endif
	do {
		if (*syscall_whitelist == this_syscall)
			return;
	} while (*++syscall_whitelist);

#ifdef SECCOMP_DEBUG
	dump_stack();
#endif
	seccomp_log(this_syscall, SIGKILL, SECCOMP_RET_KILL_THREAD, true);
	do_exit(SIGKILL);
}

#ifndef CONFIG_HAVE_ARCH_SECCOMP_FILTER
void secure_computing_strict(int this_syscall)
{
	int mode = current->seccomp.mode;

	if (IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) &&
	    unlikely(current->ptrace & PT_SUSPEND_SECCOMP))
		return;

	if (mode == SECCOMP_MODE_DISABLED)
		return;
	else if (mode == SECCOMP_MODE_STRICT)
		__secure_computing_strict(this_syscall);
	else
		BUG();
}
#else

#ifdef CONFIG_SECCOMP_FILTER
static u64 seccomp_next_notify_id(struct seccomp_filter *filter)
{
	/*
	 * Note: overflow is ok here, the id just needs to be unique per
	 * filter.
	 */
	lockdep_assert_held(&filter->notify_lock);
	return filter->notif->next_id++;
}

/*
 * Block the current task until the userspace listener replies or goes
 * away.  The listener-supplied error/val pair becomes the syscall's
 * return value; if no listener is attached, fail with -ENOSYS.
 */
static void seccomp_do_user_notification(int this_syscall,
					 struct seccomp_filter *match,
					 const struct seccomp_data *sd)
{
	int err;
	long ret = 0;
	struct seccomp_knotif n = {};

	mutex_lock(&match->notify_lock);
	err = -ENOSYS;
	if (!match->notif)
		goto out;

	n.task = current;
	n.state = SECCOMP_NOTIFY_INIT;
	n.data = sd;
	n.id = seccomp_next_notify_id(match);
	init_completion(&n.ready);
	list_add(&n.list, &match->notif->notifications);

	up(&match->notif->request);
	wake_up_poll(&match->notif->wqh, EPOLLIN | EPOLLRDNORM);
	mutex_unlock(&match->notify_lock);

	/*
	 * This is where we wait for a reply from userspace.
	 */
	err = wait_for_completion_interruptible(&n.ready);
	mutex_lock(&match->notify_lock);
	if (err == 0) {
		ret = n.val;
		err = n.error;
	}

	/*
	 * Note that it's possible the listener died in between the time when
	 * we were notified of a response (or a signal) and when we were able to
	 * re-acquire the lock, so only delete from the list if the
	 * notification actually exists.
	 *
	 * Also note that this test is only valid because there's no way to
	 * *reattach* to a notifier right now. If one is added, we'll need to
	 * keep track of the notif itself and make sure they match here.
	 */
	if (match->notif)
		list_del(&n.list);
out:
	mutex_unlock(&match->notify_lock);
	syscall_set_return_value(current, task_pt_regs(current),
				 err, ret);
}

static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
			    const bool recheck_after_trace)
{
	u32 filter_ret, action;
	struct seccomp_filter *match = NULL;
	int data;
	struct seccomp_data sd_local;

	/*
	 * Make sure that any changes to mode from another thread have
	 * been seen after TIF_SECCOMP was seen.
	 */
	rmb();

	if (!sd) {
		populate_seccomp_data(&sd_local);
		sd = &sd_local;
	}

	filter_ret = seccomp_run_filters(sd, &match);
	data = filter_ret & SECCOMP_RET_DATA;
	action = filter_ret & SECCOMP_RET_ACTION_FULL;

	switch (action) {
	case SECCOMP_RET_ERRNO:
		/* Set low-order bits as an errno, capped at MAX_ERRNO. */
		if (data > MAX_ERRNO)
			data = MAX_ERRNO;
		syscall_set_return_value(current, task_pt_regs(current),
					 -data, 0);
		goto skip;

	case SECCOMP_RET_TRAP:
		/* Show the handler the original registers. */
		syscall_rollback(current, task_pt_regs(current));
		/* Let the filter pass back 16 bits of data. */
		seccomp_send_sigsys(this_syscall, data);
		goto skip;

	case SECCOMP_RET_TRACE:
		/* We've been put in this state by the ptracer already. */
		if (recheck_after_trace)
			return 0;

		/* ENOSYS these calls if there is no tracer attached. */
		if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) {
			syscall_set_return_value(current,
						 task_pt_regs(current),
						 -ENOSYS, 0);
			goto skip;
		}

		/* Allow the BPF to provide the event message */
		ptrace_event(PTRACE_EVENT_SECCOMP, data);
		/*
		 * The delivery of a fatal signal during event
		 * notification may silently skip tracer notification,
		 * which could leave us with a potentially unmodified
		 * syscall that the tracer would have liked to have
		 * changed. Since the process is about to die, we just
		 * force the syscall to be skipped and let the signal
		 * kill the process and correctly handle any tracer exit
		 * notifications.
		 */
		if (fatal_signal_pending(current))
			goto skip;
		/* Check if the tracer forced the syscall to be skipped. */
		this_syscall = syscall_get_nr(current, task_pt_regs(current));
		if (this_syscall < 0)
			goto skip;

		/*
		 * Recheck the syscall, since it may have changed. This
		 * intentionally uses a NULL struct seccomp_data to force
		 * a reload of all registers. This does not goto skip since
		 * a skip would have already been reported.
		 */
		if (__seccomp_filter(this_syscall, NULL, true))
			return -1;

		return 0;

	case SECCOMP_RET_USER_NOTIF:
		seccomp_do_user_notification(this_syscall, match, sd);
		goto skip;

	case SECCOMP_RET_LOG:
		seccomp_log(this_syscall, 0, action, true);
		return 0;

	case SECCOMP_RET_ALLOW:
		/*
		 * Note that the "match" filter will always be NULL for
		 * this action since SECCOMP_RET_ALLOW is the starting
		 * state in seccomp_run_filters().
		 */
		return 0;

	case SECCOMP_RET_KILL_THREAD:
	case SECCOMP_RET_KILL_PROCESS:
	default:
		seccomp_log(this_syscall, SIGSYS, action, true);
		/* Dump core only if this is the last remaining thread. */
		if (action == SECCOMP_RET_KILL_PROCESS ||
		    get_nr_threads(current) == 1) {
			kernel_siginfo_t info;

			/* Show the original registers in the dump. */
			syscall_rollback(current, task_pt_regs(current));
			/* Trigger a manual coredump since do_exit skips it. */
			seccomp_init_siginfo(&info, this_syscall, data);
			do_coredump(&info);
		}
		if (action == SECCOMP_RET_KILL_PROCESS)
			do_group_exit(SIGSYS);
		else
			do_exit(SIGSYS);
	}

	unreachable();

skip:
	seccomp_log(this_syscall, 0, action, match ? match->log : false);
	return -1;
}
#else
static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
			    const bool recheck_after_trace)
{
	BUG();
}
#endif

int __secure_computing(const struct seccomp_data *sd)
{
	int mode = current->seccomp.mode;
	int this_syscall;

	if (IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) &&
	    unlikely(current->ptrace & PT_SUSPEND_SECCOMP))
		return 0;

	this_syscall = sd ? sd->nr :
		syscall_get_nr(current, task_pt_regs(current));

	switch (mode) {
	case SECCOMP_MODE_STRICT:
		__secure_computing_strict(this_syscall);  /* may call do_exit */
		return 0;
	case SECCOMP_MODE_FILTER:
		return __seccomp_filter(this_syscall, sd, false);
	default:
		BUG();
	}
}
#endif /* CONFIG_HAVE_ARCH_SECCOMP_FILTER */

long prctl_get_seccomp(void)
{
	return current->seccomp.mode;
}

/**
 * seccomp_set_mode_strict: internal function for setting strict seccomp
 *
 * Once current->seccomp.mode is non-zero, it may not be changed.
 *
 * Returns 0 on success or -EINVAL on failure.
 */
static long seccomp_set_mode_strict(void)
{
	const unsigned long seccomp_mode = SECCOMP_MODE_STRICT;
	long ret = -EINVAL;

	spin_lock_irq(&current->sighand->siglock);

	if (!seccomp_may_assign_mode(seccomp_mode))
		goto out;

#ifdef TIF_NOTSC
	disable_TSC();
#endif
	seccomp_assign_mode(current, seccomp_mode, 0);
	ret = 0;

out:
	spin_unlock_irq(&current->sighand->siglock);

	return ret;
}
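
/*
 * Userspace sketch (illustrative): strict mode is entered through the
 * original prctl interface,
 *
 *	prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT);
 *
 * after which only read(), write(), _exit() and sigreturn() are allowed.
 */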

#ifdef CONFIG_SECCOMP_FILTER
static int seccomp_notify_release(struct inode *inode, struct file *file)
{
	struct seccomp_filter *filter = file->private_data;
	struct seccomp_knotif *knotif;

	if (!filter)
		return 0;

	mutex_lock(&filter->notify_lock);

	/*
	 * If this file is being closed because e.g. the task that owned it
	 * died, let's wake everyone up who was waiting on us.
	 */
	list_for_each_entry(knotif, &filter->notif->notifications, list) {
		if (knotif->state == SECCOMP_NOTIFY_REPLIED)
			continue;

		knotif->state = SECCOMP_NOTIFY_REPLIED;
		knotif->error = -ENOSYS;
		knotif->val = 0;

		complete(&knotif->ready);
	}

	kfree(filter->notif);
	filter->notif = NULL;
	mutex_unlock(&filter->notify_lock);
	__put_seccomp_filter(filter);
	return 0;
}

/* Hand the next unread notification to the userspace listener. */
static long seccomp_notify_recv(struct seccomp_filter *filter,
				void __user *buf)
{
	struct seccomp_knotif *knotif = NULL, *cur;
	struct seccomp_notif unotif;
	ssize_t ret;

	memset(&unotif, 0, sizeof(unotif));

	ret = down_interruptible(&filter->notif->request);
	if (ret < 0)
		return ret;

	mutex_lock(&filter->notify_lock);
	list_for_each_entry(cur, &filter->notif->notifications, list) {
		if (cur->state == SECCOMP_NOTIFY_INIT) {
			knotif = cur;
			break;
		}
	}

	/*
	 * If we didn't find a notification, it could be that the task was
	 * interrupted by a fatal signal between the time we were woken and
	 * when we were able to acquire the lock.
	 */
	if (!knotif) {
		ret = -ENOENT;
		goto out;
	}

	unotif.id = knotif->id;
	unotif.pid = task_pid_vnr(knotif->task);
	unotif.data = *(knotif->data);

	knotif->state = SECCOMP_NOTIFY_SENT;
	wake_up_poll(&filter->notif->wqh, EPOLLOUT | EPOLLWRNORM);
	ret = 0;
out:
	mutex_unlock(&filter->notify_lock);

	if (ret == 0 && copy_to_user(buf, &unotif, sizeof(unotif))) {
		ret = -EFAULT;

		/*
		 * Userspace screwed up. To make sure that we keep this
		 * notification alive, let's reset it back to INIT. It
		 * may have died when we released the lock, so we need to make
		 * sure it's still around.
		 */
		knotif = NULL;
		mutex_lock(&filter->notify_lock);
		list_for_each_entry(cur, &filter->notif->notifications, list) {
			if (cur->id == unotif.id) {
				knotif = cur;
				break;
			}
		}

		if (knotif) {
			knotif->state = SECCOMP_NOTIFY_INIT;
			up(&filter->notif->request);
		}
		mutex_unlock(&filter->notify_lock);
	}

	return ret;
}

/* Accept the listener's reply for one notification and wake the target. */
static long seccomp_notify_send(struct seccomp_filter *filter,
				void __user *buf)
{
	struct seccomp_notif_resp resp = {};
	struct seccomp_knotif *knotif = NULL, *cur;
	long ret;

	if (copy_from_user(&resp, buf, sizeof(resp)))
		return -EFAULT;

	if (resp.flags)
		return -EINVAL;

	ret = mutex_lock_interruptible(&filter->notify_lock);
	if (ret < 0)
		return ret;

	list_for_each_entry(cur, &filter->notif->notifications, list) {
		if (cur->id == resp.id) {
			knotif = cur;
			break;
		}
	}

	if (!knotif) {
		ret = -ENOENT;
		goto out;
	}

	/* Allow exactly one reply. */
	if (knotif->state != SECCOMP_NOTIFY_SENT) {
		ret = -EINPROGRESS;
		goto out;
	}

	ret = 0;
	knotif->state = SECCOMP_NOTIFY_REPLIED;
	knotif->error = resp.error;
	knotif->val = resp.val;
	complete(&knotif->ready);
out:
	mutex_unlock(&filter->notify_lock);
	return ret;
}

/*
 * Report whether a notification id is still valid, i.e. the target task
 * has not been interrupted or killed while the listener was handling it.
 */
static long seccomp_notify_id_valid(struct seccomp_filter *filter,
				    void __user *buf)
{
	struct seccomp_knotif *knotif = NULL;
	u64 id;
	long ret;

	if (copy_from_user(&id, buf, sizeof(id)))
		return -EFAULT;

	ret = mutex_lock_interruptible(&filter->notify_lock);
	if (ret < 0)
		return ret;

	ret = -ENOENT;
	list_for_each_entry(knotif, &filter->notif->notifications, list) {
		if (knotif->id == id) {
			if (knotif->state == SECCOMP_NOTIFY_SENT)
				ret = 0;
			goto out;
		}
	}

out:
	mutex_unlock(&filter->notify_lock);
	return ret;
}

static long seccomp_notify_ioctl(struct file *file, unsigned int cmd,
				 unsigned long arg)
{
	struct seccomp_filter *filter = file->private_data;
	void __user *buf = (void __user *)arg;

	switch (cmd) {
	case SECCOMP_IOCTL_NOTIF_RECV:
		return seccomp_notify_recv(filter, buf);
	case SECCOMP_IOCTL_NOTIF_SEND:
		return seccomp_notify_send(filter, buf);
	case SECCOMP_IOCTL_NOTIF_ID_VALID:
		return seccomp_notify_id_valid(filter, buf);
	default:
		return -EINVAL;
	}
}

static __poll_t seccomp_notify_poll(struct file *file,
				    struct poll_table_struct *poll_tab)
{
	struct seccomp_filter *filter = file->private_data;
	__poll_t ret = 0;
	struct seccomp_knotif *cur;

	poll_wait(file, &filter->notif->wqh, poll_tab);

	if (mutex_lock_interruptible(&filter->notify_lock) < 0)
		return EPOLLERR;

	list_for_each_entry(cur, &filter->notif->notifications, list) {
		if (cur->state == SECCOMP_NOTIFY_INIT)
			ret |= EPOLLIN | EPOLLRDNORM;
		if (cur->state == SECCOMP_NOTIFY_SENT)
			ret |= EPOLLOUT | EPOLLWRNORM;
		if ((ret & EPOLLIN) && (ret & EPOLLOUT))
			break;
	}

	mutex_unlock(&filter->notify_lock);

	return ret;
}

static const struct file_operations seccomp_notify_ops = {
	.poll = seccomp_notify_poll,
	.release = seccomp_notify_release,
	.unlocked_ioctl = seccomp_notify_ioctl,
};
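
/*
 * Userspace sketch (illustrative): a listener obtains this fd via
 * SECCOMP_FILTER_FLAG_NEW_LISTENER and then runs a recv/reply loop over
 * the ioctls wired up above:
 *
 *	struct seccomp_notif req;
 *	struct seccomp_notif_resp resp;
 *
 *	memset(&req, 0, sizeof(req));
 *	ioctl(fd, SECCOMP_IOCTL_NOTIF_RECV, &req);
 *	resp.id = req.id;
 *	resp.error = -EPERM;	(or 0, with resp.val, to emulate success)
 *	resp.val = 0;
 *	resp.flags = 0;
 *	ioctl(fd, SECCOMP_IOCTL_NOTIF_SEND, &resp);
 *
 * SECCOMP_IOCTL_NOTIF_ID_VALID can be used in between to check that the
 * target task has not died while the request was being handled.
 */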

/*
 * Wrap the filter's notification state in an anonymous file.  Only one
 * listener may exist per filter tree, hence the -EBUSY walk up @prev.
 */
static struct file *init_listener(struct seccomp_filter *filter)
{
	struct file *ret = ERR_PTR(-EBUSY);
	struct seccomp_filter *cur;

	for (cur = current->seccomp.filter; cur; cur = cur->prev) {
		if (cur->notif)
			goto out;
	}

	ret = ERR_PTR(-ENOMEM);
	filter->notif = kzalloc(sizeof(*(filter->notif)), GFP_KERNEL);
	if (!filter->notif)
		goto out;

	sema_init(&filter->notif->request, 0);
	filter->notif->next_id = get_random_u64();
	INIT_LIST_HEAD(&filter->notif->notifications);
	init_waitqueue_head(&filter->notif->wqh);

	ret = anon_inode_getfile("seccomp notify", &seccomp_notify_ops,
				 filter, O_RDWR);
	if (IS_ERR(ret))
		goto out_notif;

	/* The file has a reference to it now */
	__get_seccomp_filter(filter);

out_notif:
	if (IS_ERR(ret))
		kfree(filter->notif);
out:
	return ret;
}

/**
 * seccomp_set_mode_filter: internal function for setting seccomp filter
 * @flags:  flags to change filter behavior
 * @filter: struct sock_fprog containing filter
 *
 * This function may be called repeatedly to install additional filters.
 * Every filter successfully installed will be evaluated (in reverse order)
 * for each system call the task makes.
 *
 * Once current->seccomp.mode is non-zero, it may not be changed.
 *
 * Returns 0 on success or -EINVAL on failure.
 */
static long seccomp_set_mode_filter(unsigned int flags,
				    const char __user *filter)
{
	const unsigned long seccomp_mode = SECCOMP_MODE_FILTER;
	struct seccomp_filter *prepared = NULL;
	long ret = -EINVAL;
	int listener = -1;
	struct file *listener_f = NULL;

	/* Validate flags. */
	if (flags & ~SECCOMP_FILTER_FLAG_MASK)
		return -EINVAL;

	/* Prepare the new filter before holding any locks. */
	prepared = seccomp_prepare_user_filter(filter);
	if (IS_ERR(prepared))
		return PTR_ERR(prepared);

	if (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) {
		listener = get_unused_fd_flags(O_CLOEXEC);
		if (listener < 0) {
			ret = listener;
			goto out_free;
		}

		listener_f = init_listener(prepared);
		if (IS_ERR(listener_f)) {
			put_unused_fd(listener);
			ret = PTR_ERR(listener_f);
			goto out_free;
		}
	}

	/*
	 * Make sure we cannot change seccomp or nnp state via TSYNC
	 * while another thread is in the middle of calling exec.
	 */
	if (flags & SECCOMP_FILTER_FLAG_TSYNC &&
	    mutex_lock_killable(&current->signal->cred_guard_mutex))
		goto out_put_fd;

	spin_lock_irq(&current->sighand->siglock);

1289 1290 1291
	if (!seccomp_may_assign_mode(seccomp_mode))
		goto out;

	ret = seccomp_attach_filter(flags, prepared);
	if (ret)
		goto out;
	/* Do not free the successfully attached filter. */
	prepared = NULL;

	seccomp_assign_mode(current, seccomp_mode, flags);
out:
	spin_unlock_irq(&current->sighand->siglock);
	if (flags & SECCOMP_FILTER_FLAG_TSYNC)
		mutex_unlock(&current->signal->cred_guard_mutex);
out_put_fd:
	if (flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) {
		if (ret < 0) {
			listener_f->private_data = NULL;
			fput(listener_f);
			put_unused_fd(listener);
		} else {
			fd_install(listener, listener_f);
			ret = listener;
		}
	}
out_free:
	seccomp_filter_free(prepared);
	return ret;
}
#else
static inline long seccomp_set_mode_filter(unsigned int flags,
					   const char __user *filter)
{
	return -EINVAL;
}
#endif

static long seccomp_get_action_avail(const char __user *uaction)
{
	u32 action;

	if (copy_from_user(&action, uaction, sizeof(action)))
		return -EFAULT;

	switch (action) {
	case SECCOMP_RET_KILL_PROCESS:
	case SECCOMP_RET_KILL_THREAD:
	case SECCOMP_RET_TRAP:
	case SECCOMP_RET_ERRNO:
	case SECCOMP_RET_USER_NOTIF:
	case SECCOMP_RET_TRACE:
	case SECCOMP_RET_LOG:
	case SECCOMP_RET_ALLOW:
		break;
	default:
		return -EOPNOTSUPP;
	}

	return 0;
}

static long seccomp_get_notif_sizes(void __user *usizes)
{
	struct seccomp_notif_sizes sizes = {
		.seccomp_notif = sizeof(struct seccomp_notif),
		.seccomp_notif_resp = sizeof(struct seccomp_notif_resp),
		.seccomp_data = sizeof(struct seccomp_data),
	};

	if (copy_to_user(usizes, &sizes, sizeof(sizes)))
		return -EFAULT;

	return 0;
}
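
/*
 * Userspace sketch (illustrative): callers are expected to query the
 * struct sizes before using the notification API, so they keep working
 * if the structs grow in later kernels:
 *
 *	struct seccomp_notif_sizes sizes;
 *	syscall(__NR_seccomp, SECCOMP_GET_NOTIF_SIZES, 0, &sizes);
 */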

/* Common entry point for both prctl and syscall. */
static long do_seccomp(unsigned int op, unsigned int flags,
		       void __user *uargs)
{
	switch (op) {
	case SECCOMP_SET_MODE_STRICT:
		if (flags != 0 || uargs != NULL)
			return -EINVAL;
		return seccomp_set_mode_strict();
	case SECCOMP_SET_MODE_FILTER:
		return seccomp_set_mode_filter(flags, uargs);
	case SECCOMP_GET_ACTION_AVAIL:
		if (flags != 0)
			return -EINVAL;

		return seccomp_get_action_avail(uargs);
	case SECCOMP_GET_NOTIF_SIZES:
		if (flags != 0)
			return -EINVAL;

		return seccomp_get_notif_sizes(uargs);
	default:
		return -EINVAL;
	}
}

SYSCALL_DEFINE3(seccomp, unsigned int, op, unsigned int, flags,
			 void __user *, uargs)
{
	return do_seccomp(op, flags, uargs);
}
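
/*
 * Userspace sketch (illustrative): installing a filter through the
 * dedicated syscall rather than prctl, here also requesting a user
 * notification listener fd as the return value:
 *
 *	prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
 *	fd = syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER,
 *		     SECCOMP_FILTER_FLAG_NEW_LISTENER, &prog);
 */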

/**
 * prctl_set_seccomp: configures current->seccomp.mode
 * @seccomp_mode: requested mode to use
 * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER
 *
 * Returns 0 on success or -EINVAL on failure.
 */
long prctl_set_seccomp(unsigned long seccomp_mode, void __user *filter)
{
	unsigned int op;
	void __user *uargs;