bpf_trace.c 43.5 KB
Newer Older
1
// SPDX-License-Identifier: GPL-2.0
2
/* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com
3
 * Copyright (c) 2016 Facebook
4 5 6 7 8
 */
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/bpf.h>
9
#include <linux/bpf_perf_event.h>
10 11
#include <linux/filter.h>
#include <linux/uaccess.h>
12
#include <linux/ctype.h>
13
#include <linux/kprobes.h>
14
#include <linux/syscalls.h>
15
#include <linux/error-injection.h>
16

17 18
#include <asm/tlb.h>

19
#include "trace_probe.h"
20 21
#include "trace.h"

22 23 24
#define bpf_event_rcu_dereference(p)					\
	rcu_dereference_protected(p, lockdep_is_held(&bpf_event_mutex))

25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61
#ifdef CONFIG_MODULES
struct bpf_trace_module {
	struct module *module;
	struct list_head list;
};

static LIST_HEAD(bpf_trace_modules);
static DEFINE_MUTEX(bpf_module_mutex);

static struct bpf_raw_event_map *bpf_get_raw_tracepoint_module(const char *name)
{
	struct bpf_raw_event_map *btp, *ret = NULL;
	struct bpf_trace_module *btm;
	unsigned int i;

	mutex_lock(&bpf_module_mutex);
	list_for_each_entry(btm, &bpf_trace_modules, list) {
		for (i = 0; i < btm->module->num_bpf_raw_events; ++i) {
			btp = &btm->module->bpf_raw_events[i];
			if (!strcmp(btp->tp->name, name)) {
				if (try_module_get(btm->module))
					ret = btp;
				goto out;
			}
		}
	}
out:
	mutex_unlock(&bpf_module_mutex);
	return ret;
}
#else
static struct bpf_raw_event_map *bpf_get_raw_tracepoint_module(const char *name)
{
	return NULL;
}
#endif /* CONFIG_MODULES */

62
u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
Yonghong Song's avatar
Yonghong Song committed
63
u64 bpf_get_stack(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
64

65 66
/**
 * trace_call_bpf - invoke BPF program
67
 * @call: tracepoint event
68 69 70 71 72 73 74 75 76 77 78
 * @ctx: opaque context pointer
 *
 * kprobe handlers execute BPF programs via this helper.
 * Can be used from static tracepoints in the future.
 *
 * Return: BPF programs always return an integer which is interpreted by
 * kprobe handler as:
 * 0 - return from kprobe (event is filtered out)
 * 1 - store kprobe event into ring buffer
 * Other values are reserved and currently alias to 1
 */
79
unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98
{
	unsigned int ret;

	if (in_nmi()) /* not supported yet */
		return 1;

	preempt_disable();

	if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) {
		/*
		 * since some bpf program is already running on this cpu,
		 * don't call into another bpf program (same or different)
		 * and don't send kprobe event into ring-buffer,
		 * so return zero here
		 */
		ret = 0;
		goto out;
	}

99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114
	/*
	 * Instead of moving rcu_read_lock/rcu_dereference/rcu_read_unlock
	 * to all call sites, we did a bpf_prog_array_valid() there to check
	 * whether call->prog_array is empty or not, which is
	 * a heurisitc to speed up execution.
	 *
	 * If bpf_prog_array_valid() fetched prog_array was
	 * non-NULL, we go into trace_call_bpf() and do the actual
	 * proper rcu_dereference() under RCU lock.
	 * If it turns out that prog_array is NULL then, we bail out.
	 * For the opposite, if the bpf_prog_array_valid() fetched pointer
	 * was NULL, you'll skip the prog_array with the risk of missing
	 * out of events when it was updated in between this and the
	 * rcu_dereference() which is accepted risk.
	 */
	ret = BPF_PROG_RUN_ARRAY_CHECK(call->prog_array, ctx, BPF_PROG_RUN);
115 116 117 118 119 120 121 122 123

 out:
	__this_cpu_dec(bpf_prog_active);
	preempt_enable();

	return ret;
}
EXPORT_SYMBOL_GPL(trace_call_bpf);

124 125 126 127
#ifdef CONFIG_BPF_KPROBE_OVERRIDE
BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc)
{
	regs_set_return_value(regs, rc);
128
	override_function_with_return(regs);
129 130 131 132 133 134 135 136 137 138 139 140
	return 0;
}

static const struct bpf_func_proto bpf_override_return_proto = {
	.func		= bpf_override_return,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_ANYTHING,
};
#endif

141 142
BPF_CALL_3(bpf_probe_read_user, void *, dst, u32, size,
	   const void __user *, unsafe_ptr)
143
{
144
	int ret = probe_user_read(dst, unsafe_ptr, size);
145

146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185
	if (unlikely(ret < 0))
		memset(dst, 0, size);

	return ret;
}

static const struct bpf_func_proto bpf_probe_read_user_proto = {
	.func		= bpf_probe_read_user,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_UNINIT_MEM,
	.arg2_type	= ARG_CONST_SIZE_OR_ZERO,
	.arg3_type	= ARG_ANYTHING,
};

BPF_CALL_3(bpf_probe_read_user_str, void *, dst, u32, size,
	   const void __user *, unsafe_ptr)
{
	int ret = strncpy_from_unsafe_user(dst, unsafe_ptr, size);

	if (unlikely(ret < 0))
		memset(dst, 0, size);

	return ret;
}

static const struct bpf_func_proto bpf_probe_read_user_str_proto = {
	.func		= bpf_probe_read_user_str,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_UNINIT_MEM,
	.arg2_type	= ARG_CONST_SIZE_OR_ZERO,
	.arg3_type	= ARG_ANYTHING,
};

static __always_inline int
bpf_probe_read_kernel_common(void *dst, u32 size, const void *unsafe_ptr,
			     const bool compat)
{
	int ret = security_locked_down(LOCKDOWN_BPF_READ);
186

187 188 189 190
	if (unlikely(ret < 0))
		goto out;
	ret = compat ? probe_kernel_read(dst, unsafe_ptr, size) :
	      probe_kernel_read_strict(dst, unsafe_ptr, size);
191
	if (unlikely(ret < 0))
192
out:
193
		memset(dst, 0, size);
194 195
	return ret;
}
196

197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248
BPF_CALL_3(bpf_probe_read_kernel, void *, dst, u32, size,
	   const void *, unsafe_ptr)
{
	return bpf_probe_read_kernel_common(dst, size, unsafe_ptr, false);
}

static const struct bpf_func_proto bpf_probe_read_kernel_proto = {
	.func		= bpf_probe_read_kernel,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_UNINIT_MEM,
	.arg2_type	= ARG_CONST_SIZE_OR_ZERO,
	.arg3_type	= ARG_ANYTHING,
};

BPF_CALL_3(bpf_probe_read_compat, void *, dst, u32, size,
	   const void *, unsafe_ptr)
{
	return bpf_probe_read_kernel_common(dst, size, unsafe_ptr, true);
}

static const struct bpf_func_proto bpf_probe_read_compat_proto = {
	.func		= bpf_probe_read_compat,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_UNINIT_MEM,
	.arg2_type	= ARG_CONST_SIZE_OR_ZERO,
	.arg3_type	= ARG_ANYTHING,
};

static __always_inline int
bpf_probe_read_kernel_str_common(void *dst, u32 size, const void *unsafe_ptr,
				 const bool compat)
{
	int ret = security_locked_down(LOCKDOWN_BPF_READ);

	if (unlikely(ret < 0))
		goto out;
	/*
	 * The strncpy_from_unsafe_*() call will likely not fill the entire
	 * buffer, but that's okay in this circumstance as we're probing
	 * arbitrary memory anyway similar to bpf_probe_read_*() and might
	 * as well probe the stack. Thus, memory is explicitly cleared
	 * only in error case, so that improper users ignoring return
	 * code altogether don't copy garbage; otherwise length of string
	 * is returned that can be used for bpf_perf_event_output() et al.
	 */
	ret = compat ? strncpy_from_unsafe(dst, unsafe_ptr, size) :
	      strncpy_from_unsafe_strict(dst, unsafe_ptr, size);
	if (unlikely(ret < 0))
out:
		memset(dst, 0, size);
249
	return ret;
250 251
}

252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274
BPF_CALL_3(bpf_probe_read_kernel_str, void *, dst, u32, size,
	   const void *, unsafe_ptr)
{
	return bpf_probe_read_kernel_str_common(dst, size, unsafe_ptr, false);
}

static const struct bpf_func_proto bpf_probe_read_kernel_str_proto = {
	.func		= bpf_probe_read_kernel_str,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_UNINIT_MEM,
	.arg2_type	= ARG_CONST_SIZE_OR_ZERO,
	.arg3_type	= ARG_ANYTHING,
};

BPF_CALL_3(bpf_probe_read_compat_str, void *, dst, u32, size,
	   const void *, unsafe_ptr)
{
	return bpf_probe_read_kernel_str_common(dst, size, unsafe_ptr, true);
}

static const struct bpf_func_proto bpf_probe_read_compat_str_proto = {
	.func		= bpf_probe_read_compat_str,
275 276
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
277
	.arg1_type	= ARG_PTR_TO_UNINIT_MEM,
278
	.arg2_type	= ARG_CONST_SIZE_OR_ZERO,
279 280 281
	.arg3_type	= ARG_ANYTHING,
};

282
BPF_CALL_3(bpf_probe_write_user, void __user *, unsafe_ptr, const void *, src,
283
	   u32, size)
284 285 286 287 288 289 290 291
{
	/*
	 * Ensure we're in user context which is safe for the helper to
	 * run. This helper has no business in a kthread.
	 *
	 * access_ok() should prevent writing to non-user memory, but in
	 * some situations (nommu, temporary switch, etc) access_ok() does
	 * not provide enough validation, hence the check on KERNEL_DS.
292 293 294 295
	 *
	 * nmi_uaccess_okay() ensures the probe is not run in an interim
	 * state, when the task or mm are switched. This is specifically
	 * required to prevent the use of temporary mm.
296 297 298 299 300
	 */

	if (unlikely(in_interrupt() ||
		     current->flags & (PF_KTHREAD | PF_EXITING)))
		return -EPERM;
Al Viro's avatar
Al Viro committed
301
	if (unlikely(uaccess_kernel()))
302
		return -EPERM;
303 304
	if (unlikely(!nmi_uaccess_okay()))
		return -EPERM;
305

306
	return probe_user_write(unsafe_ptr, src, size);
307 308 309 310 311 312 313
}

static const struct bpf_func_proto bpf_probe_write_user_proto = {
	.func		= bpf_probe_write_user,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_ANYTHING,
314 315
	.arg2_type	= ARG_PTR_TO_MEM,
	.arg3_type	= ARG_CONST_SIZE,
316 317 318 319 320 321 322 323 324 325
};

static const struct bpf_func_proto *bpf_get_probe_write_proto(void)
{
	pr_warn_ratelimited("%s[%d] is installing a program with bpf_probe_write_user helper that may corrupt user memory!",
			    current->comm, task_pid_nr(current));

	return &bpf_probe_write_user_proto;
}

326
/*
327 328
 * Only limited trace_printk() conversion specifiers allowed:
 * %d %i %u %x %ld %li %lu %lx %lld %lli %llu %llx %p %s
329
 */
330 331
BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1,
	   u64, arg2, u64, arg3)
332
{
333
	bool str_seen = false;
334 335
	int mod[3] = {};
	int fmt_cnt = 0;
336 337
	u64 unsafe_addr;
	char buf[64];
338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363
	int i;

	/*
	 * bpf_check()->check_func_arg()->check_stack_boundary()
	 * guarantees that fmt points to bpf program stack,
	 * fmt_size bytes of it were initialized and fmt_size > 0
	 */
	if (fmt[--fmt_size] != 0)
		return -EINVAL;

	/* check format string for allowed specifiers */
	for (i = 0; i < fmt_size; i++) {
		if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i]))
			return -EINVAL;

		if (fmt[i] != '%')
			continue;

		if (fmt_cnt >= 3)
			return -EINVAL;

		/* fmt[i] != 0 && fmt[last] == 0, so we can access fmt[i + 1] */
		i++;
		if (fmt[i] == 'l') {
			mod[fmt_cnt]++;
			i++;
364
		} else if (fmt[i] == 'p' || fmt[i] == 's') {
365
			mod[fmt_cnt]++;
366 367 368 369
			/* disallow any further format extensions */
			if (fmt[i + 1] != 0 &&
			    !isspace(fmt[i + 1]) &&
			    !ispunct(fmt[i + 1]))
370 371
				return -EINVAL;
			fmt_cnt++;
372
			if (fmt[i] == 's') {
373 374 375 376 377 378 379
				if (str_seen)
					/* allow only one '%s' per fmt string */
					return -EINVAL;
				str_seen = true;

				switch (fmt_cnt) {
				case 1:
380 381
					unsafe_addr = arg1;
					arg1 = (long) buf;
382 383
					break;
				case 2:
384 385
					unsafe_addr = arg2;
					arg2 = (long) buf;
386 387
					break;
				case 3:
388 389
					unsafe_addr = arg3;
					arg3 = (long) buf;
390 391 392 393 394 395 396
					break;
				}
				buf[0] = 0;
				strncpy_from_unsafe(buf,
						    (void *) (long) unsafe_addr,
						    sizeof(buf));
			}
397 398 399 400 401 402 403 404
			continue;
		}

		if (fmt[i] == 'l') {
			mod[fmt_cnt]++;
			i++;
		}

405 406
		if (fmt[i] != 'i' && fmt[i] != 'd' &&
		    fmt[i] != 'u' && fmt[i] != 'x')
407 408 409 410
			return -EINVAL;
		fmt_cnt++;
	}

411 412 413 414 415
/* Horrid workaround for getting va_list handling working with different
 * argument type combinations generically for 32 and 64 bit archs.
 */
#define __BPF_TP_EMIT()	__BPF_ARG3_TP()
#define __BPF_TP(...)							\
416
	__trace_printk(0 /* Fake ip */,					\
417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440
		       fmt, ##__VA_ARGS__)

#define __BPF_ARG1_TP(...)						\
	((mod[0] == 2 || (mod[0] == 1 && __BITS_PER_LONG == 64))	\
	  ? __BPF_TP(arg1, ##__VA_ARGS__)				\
	  : ((mod[0] == 1 || (mod[0] == 0 && __BITS_PER_LONG == 32))	\
	      ? __BPF_TP((long)arg1, ##__VA_ARGS__)			\
	      : __BPF_TP((u32)arg1, ##__VA_ARGS__)))

#define __BPF_ARG2_TP(...)						\
	((mod[1] == 2 || (mod[1] == 1 && __BITS_PER_LONG == 64))	\
	  ? __BPF_ARG1_TP(arg2, ##__VA_ARGS__)				\
	  : ((mod[1] == 1 || (mod[1] == 0 && __BITS_PER_LONG == 32))	\
	      ? __BPF_ARG1_TP((long)arg2, ##__VA_ARGS__)		\
	      : __BPF_ARG1_TP((u32)arg2, ##__VA_ARGS__)))

#define __BPF_ARG3_TP(...)						\
	((mod[2] == 2 || (mod[2] == 1 && __BITS_PER_LONG == 64))	\
	  ? __BPF_ARG2_TP(arg3, ##__VA_ARGS__)				\
	  : ((mod[2] == 1 || (mod[2] == 0 && __BITS_PER_LONG == 32))	\
	      ? __BPF_ARG2_TP((long)arg3, ##__VA_ARGS__)		\
	      : __BPF_ARG2_TP((u32)arg3, ##__VA_ARGS__)))

	return __BPF_TP_EMIT();
441 442 443 444 445 446
}

static const struct bpf_func_proto bpf_trace_printk_proto = {
	.func		= bpf_trace_printk,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
447 448
	.arg1_type	= ARG_PTR_TO_MEM,
	.arg2_type	= ARG_CONST_SIZE,
449 450
};

451 452 453 454 455 456 457 458 459 460 461
const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
{
	/*
	 * this program might be calling bpf_trace_printk,
	 * so allocate per-cpu printk buffers
	 */
	trace_printk_init_buffers();

	return &bpf_trace_printk_proto;
}

462 463 464
static __always_inline int
get_map_perf_counter(struct bpf_map *map, u64 flags,
		     u64 *value, u64 *enabled, u64 *running)
465 466
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
467 468
	unsigned int cpu = smp_processor_id();
	u64 index = flags & BPF_F_INDEX_MASK;
469
	struct bpf_event_entry *ee;
470

471 472 473 474
	if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
		return -EINVAL;
	if (index == BPF_F_CURRENT_CPU)
		index = cpu;
475 476 477
	if (unlikely(index >= array->map.max_entries))
		return -E2BIG;

478
	ee = READ_ONCE(array->ptrs[index]);
479
	if (!ee)
480 481
		return -ENOENT;

482 483 484 485 486 487 488 489 490
	return perf_event_read_local(ee->event, value, enabled, running);
}

BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags)
{
	u64 value = 0;
	int err;

	err = get_map_perf_counter(map, flags, &value, NULL, NULL);
491
	/*
492 493
	 * this api is ugly since we miss [-22..-2] range of valid
	 * counter values, but that's uapi
494
	 */
495 496 497
	if (err)
		return err;
	return value;
498 499
}

500
static const struct bpf_func_proto bpf_perf_event_read_proto = {
501
	.func		= bpf_perf_event_read,
502
	.gpl_only	= true,
503 504 505 506 507
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_CONST_MAP_PTR,
	.arg2_type	= ARG_ANYTHING,
};

508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534
BPF_CALL_4(bpf_perf_event_read_value, struct bpf_map *, map, u64, flags,
	   struct bpf_perf_event_value *, buf, u32, size)
{
	int err = -EINVAL;

	if (unlikely(size != sizeof(struct bpf_perf_event_value)))
		goto clear;
	err = get_map_perf_counter(map, flags, &buf->counter, &buf->enabled,
				   &buf->running);
	if (unlikely(err))
		goto clear;
	return 0;
clear:
	memset(buf, 0, size);
	return err;
}

static const struct bpf_func_proto bpf_perf_event_read_value_proto = {
	.func		= bpf_perf_event_read_value,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_CONST_MAP_PTR,
	.arg2_type	= ARG_ANYTHING,
	.arg3_type	= ARG_PTR_TO_UNINIT_MEM,
	.arg4_type	= ARG_CONST_SIZE,
};

535 536
static __always_inline u64
__bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
537
			u64 flags, struct perf_sample_data *sd)
538 539
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
540
	unsigned int cpu = smp_processor_id();
541
	u64 index = flags & BPF_F_INDEX_MASK;
542
	struct bpf_event_entry *ee;
543 544
	struct perf_event *event;

545
	if (index == BPF_F_CURRENT_CPU)
546
		index = cpu;
547 548 549
	if (unlikely(index >= array->map.max_entries))
		return -E2BIG;

550
	ee = READ_ONCE(array->ptrs[index]);
551
	if (!ee)
552 553
		return -ENOENT;

554
	event = ee->event;
555 556 557 558
	if (unlikely(event->attr.type != PERF_TYPE_SOFTWARE ||
		     event->attr.config != PERF_COUNT_SW_BPF_OUTPUT))
		return -EINVAL;

559
	if (unlikely(event->oncpu != cpu))
560 561
		return -EOPNOTSUPP;

562
	return perf_event_output(event, sd, regs);
563 564
}

565 566 567 568 569 570 571 572 573 574
/*
 * Support executing tracepoints in normal, irq, and nmi context that each call
 * bpf_perf_event_output
 */
struct bpf_trace_sample_data {
	struct perf_sample_data sds[3];
};

static DEFINE_PER_CPU(struct bpf_trace_sample_data, bpf_trace_sds);
static DEFINE_PER_CPU(int, bpf_trace_nest_level);
575 576
BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,
	   u64, flags, void *, data, u64, size)
577
{
578 579
	struct bpf_trace_sample_data *sds = this_cpu_ptr(&bpf_trace_sds);
	int nest_level = this_cpu_inc_return(bpf_trace_nest_level);
580 581 582 583 584 585
	struct perf_raw_record raw = {
		.frag = {
			.size = size,
			.data = data,
		},
	};
586 587
	struct perf_sample_data *sd;
	int err;
588

589 590 591 592 593 594 595 596 597 598 599
	if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(sds->sds))) {
		err = -EBUSY;
		goto out;
	}

	sd = &sds->sds[nest_level - 1];

	if (unlikely(flags & ~(BPF_F_INDEX_MASK))) {
		err = -EINVAL;
		goto out;
	}
600

601 602 603
	perf_sample_data_init(sd, 0, 0);
	sd->raw = &raw;

604 605 606 607 608
	err = __bpf_perf_event_output(regs, map, flags, sd);

out:
	this_cpu_dec(bpf_trace_nest_level);
	return err;
609 610
}

611 612
static const struct bpf_func_proto bpf_perf_event_output_proto = {
	.func		= bpf_perf_event_output,
613
	.gpl_only	= true,
614 615 616 617
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_CONST_MAP_PTR,
	.arg3_type	= ARG_ANYTHING,
618
	.arg4_type	= ARG_PTR_TO_MEM,
619
	.arg5_type	= ARG_CONST_SIZE_OR_ZERO,
620 621
};

622 623 624 625 626 627
static DEFINE_PER_CPU(int, bpf_event_output_nest_level);
struct bpf_nested_pt_regs {
	struct pt_regs regs[3];
};
static DEFINE_PER_CPU(struct bpf_nested_pt_regs, bpf_pt_regs);
static DEFINE_PER_CPU(struct bpf_trace_sample_data, bpf_misc_sds);
628

629 630
u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
		     void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy)
631
{
632
	int nest_level = this_cpu_inc_return(bpf_event_output_nest_level);
633 634 635 636 637 638 639
	struct perf_raw_frag frag = {
		.copy		= ctx_copy,
		.size		= ctx_size,
		.data		= ctx,
	};
	struct perf_raw_record raw = {
		.frag = {
640 641 642
			{
				.next	= ctx_size ? &frag : NULL,
			},
643 644 645 646
			.size	= meta_size,
			.data	= meta,
		},
	};
647 648 649 650 651 652 653 654 655 656
	struct perf_sample_data *sd;
	struct pt_regs *regs;
	u64 ret;

	if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(bpf_misc_sds.sds))) {
		ret = -EBUSY;
		goto out;
	}
	sd = this_cpu_ptr(&bpf_misc_sds.sds[nest_level - 1]);
	regs = this_cpu_ptr(&bpf_pt_regs.regs[nest_level - 1]);
657 658

	perf_fetch_caller_regs(regs);
659 660
	perf_sample_data_init(sd, 0, 0);
	sd->raw = &raw;
661

662 663 664 665
	ret = __bpf_perf_event_output(regs, map, flags, sd);
out:
	this_cpu_dec(bpf_event_output_nest_level);
	return ret;
666 667
}

668
BPF_CALL_0(bpf_get_current_task)
669 670 671 672 673 674 675 676 677 678
{
	return (long) current;
}

static const struct bpf_func_proto bpf_get_current_task_proto = {
	.func		= bpf_get_current_task,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
};

679
BPF_CALL_2(bpf_current_task_under_cgroup, struct bpf_map *, map, u32, idx)
680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	struct cgroup *cgrp;

	if (unlikely(idx >= array->map.max_entries))
		return -E2BIG;

	cgrp = READ_ONCE(array->ptrs[idx]);
	if (unlikely(!cgrp))
		return -EAGAIN;

	return task_under_cgroup_hierarchy(current, cgrp);
}

static const struct bpf_func_proto bpf_current_task_under_cgroup_proto = {
	.func           = bpf_current_task_under_cgroup,
	.gpl_only       = false,
	.ret_type       = RET_INTEGER,
	.arg1_type      = ARG_CONST_MAP_PTR,
	.arg2_type      = ARG_ANYTHING,
};

702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734
struct send_signal_irq_work {
	struct irq_work irq_work;
	struct task_struct *task;
	u32 sig;
};

static DEFINE_PER_CPU(struct send_signal_irq_work, send_signal_work);

static void do_bpf_send_signal(struct irq_work *entry)
{
	struct send_signal_irq_work *work;

	work = container_of(entry, struct send_signal_irq_work, irq_work);
	group_send_sig_info(work->sig, SEND_SIG_PRIV, work->task, PIDTYPE_TGID);
}

BPF_CALL_1(bpf_send_signal, u32, sig)
{
	struct send_signal_irq_work *work = NULL;

	/* Similar to bpf_probe_write_user, task needs to be
	 * in a sound condition and kernel memory access be
	 * permitted in order to send signal to the current
	 * task.
	 */
	if (unlikely(current->flags & (PF_KTHREAD | PF_EXITING)))
		return -EPERM;
	if (unlikely(uaccess_kernel()))
		return -EPERM;
	if (unlikely(!nmi_uaccess_okay()))
		return -EPERM;

	if (in_nmi()) {
735 736 737 738 739 740
		/* Do an early check on signal validity. Otherwise,
		 * the error is lost in deferred irq_work.
		 */
		if (unlikely(!valid_signal(sig)))
			return -EINVAL;

741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764
		work = this_cpu_ptr(&send_signal_work);
		if (work->irq_work.flags & IRQ_WORK_BUSY)
			return -EBUSY;

		/* Add the current task, which is the target of sending signal,
		 * to the irq_work. The current task may change when queued
		 * irq works get executed.
		 */
		work->task = current;
		work->sig = sig;
		irq_work_queue(&work->irq_work);
		return 0;
	}

	return group_send_sig_info(sig, SEND_SIG_PRIV, current, PIDTYPE_TGID);
}

static const struct bpf_func_proto bpf_send_signal_proto = {
	.func		= bpf_send_signal,
	.gpl_only	= false,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_ANYTHING,
};

765 766
static const struct bpf_func_proto *
tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
767 768 769 770 771 772 773 774
{
	switch (func_id) {
	case BPF_FUNC_map_lookup_elem:
		return &bpf_map_lookup_elem_proto;
	case BPF_FUNC_map_update_elem:
		return &bpf_map_update_elem_proto;
	case BPF_FUNC_map_delete_elem:
		return &bpf_map_delete_elem_proto;
775 776 777 778 779 780
	case BPF_FUNC_map_push_elem:
		return &bpf_map_push_elem_proto;
	case BPF_FUNC_map_pop_elem:
		return &bpf_map_pop_elem_proto;
	case BPF_FUNC_map_peek_elem:
		return &bpf_map_peek_elem_proto;
781 782
	case BPF_FUNC_ktime_get_ns:
		return &bpf_ktime_get_ns_proto;
783 784
	case BPF_FUNC_tail_call:
		return &bpf_tail_call_proto;
785 786
	case BPF_FUNC_get_current_pid_tgid:
		return &bpf_get_current_pid_tgid_proto;
787 788
	case BPF_FUNC_get_current_task:
		return &bpf_get_current_task_proto;
789 790 791 792
	case BPF_FUNC_get_current_uid_gid:
		return &bpf_get_current_uid_gid_proto;
	case BPF_FUNC_get_current_comm:
		return &bpf_get_current_comm_proto;
793
	case BPF_FUNC_trace_printk:
794
		return bpf_get_trace_printk_proto();
795 796
	case BPF_FUNC_get_smp_processor_id:
		return &bpf_get_smp_processor_id_proto;
797 798
	case BPF_FUNC_get_numa_node_id:
		return &bpf_get_numa_node_id_proto;
799 800
	case BPF_FUNC_perf_event_read:
		return &bpf_perf_event_read_proto;
801 802
	case BPF_FUNC_probe_write_user:
		return bpf_get_probe_write_proto();
803 804
	case BPF_FUNC_current_task_under_cgroup:
		return &bpf_current_task_under_cgroup_proto;
805 806
	case BPF_FUNC_get_prandom_u32:
		return &bpf_get_prandom_u32_proto;
807 808 809 810 811 812 813 814 815 816
	case BPF_FUNC_probe_read_user:
		return &bpf_probe_read_user_proto;
	case BPF_FUNC_probe_read_kernel:
		return &bpf_probe_read_kernel_proto;
	case BPF_FUNC_probe_read:
		return &bpf_probe_read_compat_proto;
	case BPF_FUNC_probe_read_user_str:
		return &bpf_probe_read_user_str_proto;
	case BPF_FUNC_probe_read_kernel_str:
		return &bpf_probe_read_kernel_str_proto;
817
	case BPF_FUNC_probe_read_str:
818
		return &bpf_probe_read_compat_str_proto;
819
#ifdef CONFIG_CGROUPS
820 821
	case BPF_FUNC_get_current_cgroup_id:
		return &bpf_get_current_cgroup_id_proto;
822
#endif
823 824
	case BPF_FUNC_send_signal:
		return &bpf_send_signal_proto;
825 826 827 828 829
	default:
		return NULL;
	}
}

830 831
static const struct bpf_func_proto *
kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
832 833
{
	switch (func_id) {
834 835
	case BPF_FUNC_perf_event_output:
		return &bpf_perf_event_output_proto;
836 837
	case BPF_FUNC_get_stackid:
		return &bpf_get_stackid_proto;
Yonghong Song's avatar
Yonghong Song committed
838 839
	case BPF_FUNC_get_stack:
		return &bpf_get_stack_proto;
840 841
	case BPF_FUNC_perf_event_read_value:
		return &bpf_perf_event_read_value_proto;
842 843 844 845
#ifdef CONFIG_BPF_KPROBE_OVERRIDE
	case BPF_FUNC_override_return:
		return &bpf_override_return_proto;
#endif
846
	default:
847
		return tracing_func_proto(func_id, prog);
848 849 850 851
	}
}

/* bpf+kprobe programs can access fields of 'struct pt_regs' */
852
static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type,
853
					const struct bpf_prog *prog,
854
					struct bpf_insn_access_aux *info)
855 856 857 858 859 860 861
{
	if (off < 0 || off >= sizeof(struct pt_regs))
		return false;
	if (type != BPF_READ)
		return false;
	if (off % size != 0)
		return false;
862 863 864 865 866 867 868
	/*
	 * Assertion for 32 bit to make sure last 8 byte access
	 * (BPF_DW) to the last 4 byte member is disallowed.
	 */
	if (off + size > sizeof(struct pt_regs))
		return false;

869 870 871
	return true;
}

872
const struct bpf_verifier_ops kprobe_verifier_ops = {
873 874 875 876
	.get_func_proto  = kprobe_prog_func_proto,
	.is_valid_access = kprobe_prog_is_valid_access,
};

877 878 879
const struct bpf_prog_ops kprobe_prog_ops = {
};

880 881
BPF_CALL_5(bpf_perf_event_output_tp, void *, tp_buff, struct bpf_map *, map,
	   u64, flags, void *, data, u64, size)
882
{
883 884
	struct pt_regs *regs = *(struct pt_regs **)tp_buff;

885 886 887
	/*
	 * r1 points to perf tracepoint buffer where first 8 bytes are hidden
	 * from bpf program and contain a pointer to 'struct pt_regs'. Fetch it
888
	 * from there and call the same bpf_perf_event_output() helper inline.
889
	 */
890
	return ____bpf_perf_event_output(regs, map, flags, data, size);
891 892 893 894 895 896 897 898 899
}

static const struct bpf_func_proto bpf_perf_event_output_proto_tp = {
	.func		= bpf_perf_event_output_tp,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_CONST_MAP_PTR,
	.arg3_type	= ARG_ANYTHING,
900
	.arg4_type	= ARG_PTR_TO_MEM,
901
	.arg5_type	= ARG_CONST_SIZE_OR_ZERO,
902 903
};

904 905
BPF_CALL_3(bpf_get_stackid_tp, void *, tp_buff, struct bpf_map *, map,
	   u64, flags)
906
{
907
	struct pt_regs *regs = *(struct pt_regs **)tp_buff;
908

909 910 911 912 913 914 915
	/*
	 * Same comment as in bpf_perf_event_output_tp(), only that this time
	 * the other helper's function body cannot be inlined due to being
	 * external, thus we need to call raw helper function.
	 */
	return bpf_get_stackid((unsigned long) regs, (unsigned long) map,
			       flags, 0, 0);
916 917 918 919 920 921 922 923 924 925 926
}

static const struct bpf_func_proto bpf_get_stackid_proto_tp = {
	.func		= bpf_get_stackid_tp,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_CONST_MAP_PTR,
	.arg3_type	= ARG_ANYTHING,
};

Yonghong Song's avatar
Yonghong Song committed
927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945
BPF_CALL_4(bpf_get_stack_tp, void *, tp_buff, void *, buf, u32, size,
	   u64, flags)
{
	struct pt_regs *regs = *(struct pt_regs **)tp_buff;

	return bpf_get_stack((unsigned long) regs, (unsigned long) buf,
			     (unsigned long) size, flags, 0);
}

static const struct bpf_func_proto bpf_get_stack_proto_tp = {
	.func		= bpf_get_stack_tp,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,
	.arg4_type	= ARG_ANYTHING,
};

946 947
static const struct bpf_func_proto *
tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
948 949 950 951 952 953
{
	switch (func_id) {
	case BPF_FUNC_perf_event_output:
		return &bpf_perf_event_output_proto_tp;
	case BPF_FUNC_get_stackid:
		return &bpf_get_stackid_proto_tp;
Yonghong Song's avatar
Yonghong Song committed
954 955
	case BPF_FUNC_get_stack:
		return &bpf_get_stack_proto_tp;
956
	default:
957
		return tracing_func_proto(func_id, prog);
958 959 960 961
	}
}

static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type,
962
				    const struct bpf_prog *prog,
963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984
				    struct bpf_insn_access_aux *info)
{
	if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE)
		return false;
	if (type != BPF_READ)
		return false;
	if (off % size != 0)
		return false;

	BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(__u64));
	return true;
}

const struct bpf_verifier_ops tracepoint_verifier_ops = {
	.get_func_proto  = tp_prog_func_proto,
	.is_valid_access = tp_prog_is_valid_access,
};

const struct bpf_prog_ops tracepoint_prog_ops = {
};

BPF_CALL_3(bpf_perf_prog_read_value, struct bpf_perf_event_data_kern *, ctx,
985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000
	   struct bpf_perf_event_value *, buf, u32, size)
{
	int err = -EINVAL;

	if (unlikely(size != sizeof(struct bpf_perf_event_value)))
		goto clear;
	err = perf_event_read_local(ctx->event, &buf->counter, &buf->enabled,
				    &buf->running);
	if (unlikely(err))
		goto clear;
	return 0;
clear:
	memset(buf, 0, size);
	return err;
}

1001 1002
static const struct bpf_func_proto bpf_perf_prog_read_value_proto = {
         .func           = bpf_perf_prog_read_value,
1003 1004 1005 1006 1007 1008 1009
         .gpl_only       = true,
         .ret_type       = RET_INTEGER,
         .arg1_type      = ARG_PTR_TO_CTX,
         .arg2_type      = ARG_PTR_TO_UNINIT_MEM,
         .arg3_type      = ARG_CONST_SIZE,
};

1010 1011
static const struct bpf_func_proto *
pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
1012 1013 1014
{
	switch (func_id) {
	case BPF_FUNC_perf_event_output:
1015
		return &bpf_perf_event_output_proto_tp;
1016
	case BPF_FUNC_get_stackid:
1017
		return &bpf_get_stackid_proto_tp;
Yonghong Song's avatar
Yonghong Song committed
1018 1019
	case BPF_FUNC_get_stack:
		return &bpf_get_stack_proto_tp;
1020
	case BPF_FUNC_perf_prog_read_value:
1021
		return &bpf_perf_prog_read_value_proto;
1022
	default:
1023
		return tracing_func_proto(func_id, prog);
1024 1025 1026
	}
}

1027 1028 1029
/*
 * bpf_raw_tp_regs are separate from bpf_pt_regs used from skb/xdp
 * to avoid potential recursive reuse issue when/if tracepoints are added
1030 1031 1032 1033
 * inside bpf_*_event_output, bpf_get_stackid and/or bpf_get_stack.
 *
 * Since raw tracepoints run despite bpf_prog_active, support concurrent usage
 * in normal, irq, and nmi context.
1034
 */
1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057
struct bpf_raw_tp_regs {
	struct pt_regs regs[3];
};
static DEFINE_PER_CPU(struct bpf_raw_tp_regs, bpf_raw_tp_regs);
static DEFINE_PER_CPU(int, bpf_raw_tp_nest_level);
static struct pt_regs *get_bpf_raw_tp_regs(void)
{
	struct bpf_raw_tp_regs *tp_regs = this_cpu_ptr(&bpf_raw_tp_regs);
	int nest_level = this_cpu_inc_return(bpf_raw_tp_nest_level);

	if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(tp_regs->regs))) {
		this_cpu_dec(bpf_raw_tp_nest_level);
		return ERR_PTR(-EBUSY);
	}

	return &tp_regs->regs[nest_level - 1];
}

static void put_bpf_raw_tp_regs(void)
{
	this_cpu_dec(bpf_raw_tp_nest_level);
}

1058 1059 1060
BPF_CALL_5(bpf_perf_event_output_raw_tp, struct bpf_raw_tracepoint_args *, args,
	   struct bpf_map *, map, u64, flags, void *, data, u64, size)
{
1061 1062 1063 1064 1065
	struct pt_regs *regs = get_bpf_raw_tp_regs();
	int ret;

	if (IS_ERR(regs))
		return PTR_ERR(regs);
1066 1067

	perf_fetch_caller_regs(regs);
1068 1069 1070 1071
	ret = ____bpf_perf_event_output(regs, map, flags, data, size);

	put_bpf_raw_tp_regs();
	return ret;
1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084
}

static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = {
	.func		= bpf_perf_event_output_raw_tp,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_CONST_MAP_PTR,
	.arg3_type	= ARG_ANYTHING,
	.arg4_type	= ARG_PTR_TO_MEM,
	.arg5_type	= ARG_CONST_SIZE_OR_ZERO,
};

1085 1086
extern const struct bpf_func_proto bpf_skb_output_proto;

1087 1088 1089
BPF_CALL_3(bpf_get_stackid_raw_tp, struct bpf_raw_tracepoint_args *, args,
	   struct bpf_map *, map, u64, flags)
{
1090 1091 1092 1093 1094
	struct pt_regs *regs = get_bpf_raw_tp_regs();
	int ret;

	if (IS_ERR(regs))
		return PTR_ERR(regs);
1095 1096 1097

	perf_fetch_caller_regs(regs);
	/* similar to bpf_perf_event_output_tp, but pt_regs fetched differently */
1098 1099 1100 1101
	ret = bpf_get_stackid((unsigned long) regs, (unsigned long) map,
			      flags, 0, 0);
	put_bpf_raw_tp_regs();
	return ret;
1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112
}

static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = {
	.func		= bpf_get_stackid_raw_tp,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_CONST_MAP_PTR,
	.arg3_type	= ARG_ANYTHING,
};

Yonghong Song's avatar
Yonghong Song committed
1113 1114 1115
BPF_CALL_4(bpf_get_stack_raw_tp, struct bpf_raw_tracepoint_args *, args,
	   void *, buf, u32, size, u64, flags)
{
1116 1117 1118 1119 1120
	struct pt_regs *regs = get_bpf_raw_tp_regs();
	int ret;

	if (IS_ERR(regs))
		return PTR_ERR(regs);
Yonghong Song's avatar
Yonghong Song committed
1121 1122

	perf_fetch_caller_regs(regs);
1123 1124 1125 1126
	ret = bpf_get_stack((unsigned long) regs, (unsigned long) buf,
			    (unsigned long) size, flags, 0);
	put_bpf_raw_tp_regs();
	return ret;
Yonghong Song's avatar
Yonghong Song committed
1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138
}

static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = {
	.func		= bpf_get_stack_raw_tp,
	.gpl_only	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_PTR_TO_MEM,
	.arg3_type	= ARG_CONST_SIZE_OR_ZERO,
	.arg4_type	= ARG_ANYTHING,
};

1139 1140
static const struct bpf_func_proto *
raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
1141 1142 1143 1144 1145 1146
{
	switch (func_id) {
	case BPF_FUNC_perf_event_output:
		return &bpf_perf_event_output_proto_raw_tp;
	case BPF_FUNC_get_stackid:
		return &bpf_get_stackid_proto_raw_tp;
Yonghong Song's avatar
Yonghong Song committed
1147 1148
	case BPF_FUNC_get_stack:
		return &bpf_get_stack_proto_raw_tp;
1149
	default:
1150
		return tracing_func_proto(func_id, prog);
1151 1152 1153
	}
}

1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166
static const struct bpf_func_proto *
tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
	switch (func_id) {
#ifdef CONFIG_NET
	case BPF_FUNC_skb_output:
		return &bpf_skb_output_proto;
#endif
	default:
		return raw_tp_prog_func_proto(func_id, prog);
	}
}

1167 1168
static bool raw_tp_prog_is_valid_access(int off, int size,
					enum bpf_access_type type,
1169
					const struct bpf_prog *prog,
1170 1171
					struct bpf_insn_access_aux *info)
{
1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186
	if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS)
		return false;
	if (type != BPF_READ)
		return false;
	if (off % size != 0)
		return false;
	return true;
}

static bool tracing_prog_is_valid_access(int off, int size,
					 enum bpf_access_type type,
					 const struct bpf_prog *prog,
					 struct bpf_insn_access_aux *info)
{
	if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS)
1187 1188 1189 1190 1191
		return false;
	if (type != BPF_READ)
		return false;
	if (off % size != 0)
		return false;
1192
	return btf_ctx_access(off, size, type, prog, info);
1193