/*
 *  Kernel Probes (KProbes)
 *  kernel/kprobes.c
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 * Copyright (C) IBM Corporation, 2002, 2004
 *
 * 2002-Oct	Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel
 *		Probes initial implementation (includes suggestions from
 *		Rusty Russell).
 * 2004-Aug	Updated by Prasanna S Panchamukhi <prasanna@in.ibm.com> with
 *		hlists and exceptions notifier as suggested by Andi Kleen.
 * 2004-July	Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes
 *		interface to access function arguments.
 * 2004-Sep	Prasanna S Panchamukhi <prasanna@in.ibm.com> Changed Kprobes
 *		exceptions notifier to be first on the priority list.
 * 2005-May	Hien Nguyen <hien@us.ibm.com>, Jim Keniston
 *		<jkenisto@us.ibm.com> and Prasanna S Panchamukhi
 *		<prasanna@in.ibm.com> added function-return probes.
 */
#include <linux/kprobes.h>
#include <linux/hash.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/stddef.h>
#include <linux/export.h>
#include <linux/moduleloader.h>
#include <linux/kallsyms.h>
#include <linux/freezer.h>
#include <linux/seq_file.h>
#include <linux/debugfs.h>
#include <linux/sysctl.h>
#include <linux/kdebug.h>
#include <linux/memory.h>
#include <linux/ftrace.h>
#include <linux/cpu.h>
#include <linux/jump_label.h>

#include <asm/sections.h>
#include <asm/cacheflush.h>
#include <asm/errno.h>
#include <linux/uaccess.h>

#define KPROBE_HASH_BITS 6
#define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS)


/*
 * Some oddball architectures like 64bit powerpc have function descriptors
 * so this must be overridable.
 */
#ifndef kprobe_lookup_name
#define kprobe_lookup_name(name, addr) \
	addr = ((kprobe_opcode_t *)(kallsyms_lookup_name(name)))
#endif

static int kprobes_initialized;
static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];

/* NOTE: change this value only with kprobe_mutex held */
static bool kprobes_all_disarmed;

/* This protects kprobe_table and optimizing_list */
static DEFINE_MUTEX(kprobe_mutex);
static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
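/* Per-bucket locks protecting the kretprobe_inst_table hash buckets */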
static struct {
	raw_spinlock_t lock ____cacheline_aligned_in_smp;
} kretprobe_table_locks[KPROBE_TABLE_SIZE];

static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long hash)
{
	return &(kretprobe_table_locks[hash].lock);
}

/* Blacklist -- list of struct kprobe_blacklist_entry */
static LIST_HEAD(kprobe_blacklist);

#ifdef __ARCH_WANT_KPROBES_INSN_SLOT
/*
 * kprobe->ainsn.insn points to the copy of the instruction to be
 * single-stepped. x86_64, POWER4 and above have no-exec support and
 * stepping on the instruction on a vmalloced/kmalloced/data page
 * is a recipe for disaster
 */
struct kprobe_insn_page {
	struct list_head list;
	kprobe_opcode_t *insns;		/* Page of instruction slots */
	struct kprobe_insn_cache *cache;
	int nused;
	int ngarbage;
	char slot_used[];
};

#define KPROBE_INSN_PAGE_SIZE(slots)			\
	(offsetof(struct kprobe_insn_page, slot_used) +	\
	 (sizeof(char) * (slots)))

static int slots_per_page(struct kprobe_insn_cache *c)
{
	return PAGE_SIZE/(c->insn_size * sizeof(kprobe_opcode_t));
}

enum kprobe_slot_state {
	SLOT_CLEAN = 0,
	SLOT_DIRTY = 1,
	SLOT_USED = 2,
};

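/* Default .alloc/.free callbacks for instruction slot pages */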
static void *alloc_insn_page(void)
{
	return module_alloc(PAGE_SIZE);
}

static void free_insn_page(void *page)
{
	module_memfree(page);
}

struct kprobe_insn_cache kprobe_insn_slots = {
	.mutex = __MUTEX_INITIALIZER(kprobe_insn_slots.mutex),
	.alloc = alloc_insn_page,
	.free = free_insn_page,
	.pages = LIST_HEAD_INIT(kprobe_insn_slots.pages),
	.insn_size = MAX_INSN_SIZE,
	.nr_garbage = 0,
};
static int collect_garbage_slots(struct kprobe_insn_cache *c);

/**
 * __get_insn_slot() - Find a slot on an executable page for an instruction.
 * We allocate an executable page if there's no room on existing ones.
 */
kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c)
{
	struct kprobe_insn_page *kip;
	kprobe_opcode_t *slot = NULL;

	/* Since the slot array is not protected by rcu, we need a mutex */
	mutex_lock(&c->mutex);
 retry:
	rcu_read_lock();
	list_for_each_entry_rcu(kip, &c->pages, list) {
		if (kip->nused < slots_per_page(c)) {
			int i;
			for (i = 0; i < slots_per_page(c); i++) {
				if (kip->slot_used[i] == SLOT_CLEAN) {
					kip->slot_used[i] = SLOT_USED;
					kip->nused++;
					slot = kip->insns + (i * c->insn_size);
					rcu_read_unlock();
					goto out;
				}
			}
			/* kip->nused is broken. Fix it. */
			kip->nused = slots_per_page(c);
			WARN_ON(1);
		}
	}
	rcu_read_unlock();

	/* If there are any garbage slots, collect them and try again. */
	if (c->nr_garbage && collect_garbage_slots(c) == 0)
		goto retry;

	/* All out of space.  Need to allocate a new page. */
	kip = kmalloc(KPROBE_INSN_PAGE_SIZE(slots_per_page(c)), GFP_KERNEL);
	if (!kip)
		goto out;

	/*
	 * Use module_alloc so this page is within +/- 2GB of where the
	 * kernel image and loaded module images reside. This is required
	 * so x86_64 can correctly handle the %rip-relative fixups.
	 */
	kip->insns = c->alloc();
	if (!kip->insns) {
		kfree(kip);
		goto out;
	}
	INIT_LIST_HEAD(&kip->list);
	memset(kip->slot_used, SLOT_CLEAN, slots_per_page(c));
	kip->slot_used[0] = SLOT_USED;
	kip->nused = 1;
	kip->ngarbage = 0;
	kip->cache = c;
	list_add_rcu(&kip->list, &c->pages);
	slot = kip->insns;
out:
	mutex_unlock(&c->mutex);
	return slot;
}

/* Return 1 if all garbage slots are collected, otherwise 0. */
static int collect_one_slot(struct kprobe_insn_page *kip, int idx)
{
	kip->slot_used[idx] = SLOT_CLEAN;
	kip->nused--;
	if (kip->nused == 0) {
		/*
		 * Page is no longer in use.  Free it unless
		 * it's the last one.  We keep the last one
		 * so as not to have to set it up again the
		 * next time somebody inserts a probe.
		 */
		if (!list_is_singular(&kip->list)) {
			list_del_rcu(&kip->list);
			synchronize_rcu();
			kip->cache->free(kip->insns);
			kfree(kip);
		}
		return 1;
	}
	return 0;
}

static int collect_garbage_slots(struct kprobe_insn_cache *c)
{
	struct kprobe_insn_page *kip, *next;

	/* Ensure no one is still running on the garbage slots */
	synchronize_sched();

	list_for_each_entry_safe(kip, next, &c->pages, list) {
		int i;
		if (kip->ngarbage == 0)
			continue;
		kip->ngarbage = 0;	/* we will collect all garbage slots */
		for (i = 0; i < slots_per_page(c); i++) {
			if (kip->slot_used[i] == SLOT_DIRTY && collect_one_slot(kip, i))
				break;
		}
	}
	c->nr_garbage = 0;
	return 0;
}

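/* Return a slot to its cache; dirty slots are garbage-collected later */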
void __free_insn_slot(struct kprobe_insn_cache *c,
		      kprobe_opcode_t *slot, int dirty)
{
	struct kprobe_insn_page *kip;
	long idx;

	mutex_lock(&c->mutex);
	rcu_read_lock();
	list_for_each_entry_rcu(kip, &c->pages, list) {
		idx = ((long)slot - (long)kip->insns) /
			(c->insn_size * sizeof(kprobe_opcode_t));
		if (idx >= 0 && idx < slots_per_page(c))
			goto out;
	}
	/* Could not find this slot. */
	WARN_ON(1);
	kip = NULL;
out:
	rcu_read_unlock();
	/* Mark and sweep: this may sleep */
	if (kip) {
		/* Check for double free */
		WARN_ON(kip->slot_used[idx] != SLOT_USED);
		if (dirty) {
			kip->slot_used[idx] = SLOT_DIRTY;
			kip->ngarbage++;
			if (++c->nr_garbage > slots_per_page(c))
				collect_garbage_slots(c);
		} else {
			collect_one_slot(kip, idx);
		}
	}
	mutex_unlock(&c->mutex);
}

/*
 * Check whether the given address is on a page of kprobe instruction
 * slots. This is used to check whether an address found on the stack
 * is in a text area or not.
 */
bool __is_insn_slot_addr(struct kprobe_insn_cache *c, unsigned long addr)
{
	struct kprobe_insn_page *kip;
	bool ret = false;

	rcu_read_lock();
	list_for_each_entry_rcu(kip, &c->pages, list) {
		if (addr >= (unsigned long)kip->insns &&
		    addr < (unsigned long)kip->insns + PAGE_SIZE) {
			ret = true;
			break;
		}
	}
	rcu_read_unlock();

	return ret;
}

#ifdef CONFIG_OPTPROBES
/* For optimized_kprobe buffer */
struct kprobe_insn_cache kprobe_optinsn_slots = {
	.mutex = __MUTEX_INITIALIZER(kprobe_optinsn_slots.mutex),
	.alloc = alloc_insn_page,
	.free = free_insn_page,
	.pages = LIST_HEAD_INIT(kprobe_optinsn_slots.pages),
	/* .insn_size is initialized later */
	.nr_garbage = 0,
};
#endif
#endif

/* We have preemption disabled, so it is safe to use __ versions */
static inline void set_kprobe_instance(struct kprobe *kp)
{
	__this_cpu_write(kprobe_instance, kp);
}

static inline void reset_kprobe_instance(void)
{
	__this_cpu_write(kprobe_instance, NULL);
}

/*
 * This routine is called either:
 *	- under the kprobe_mutex - during kprobe_[un]register()
 *				OR
 *	- with preemption disabled - from arch/xxx/kernel/kprobes.c
 */
struct kprobe *get_kprobe(void *addr)
{
	struct hlist_head *head;
	struct kprobe *p;

	head = &kprobe_table[hash_ptr(addr, KPROBE_HASH_BITS)];
	hlist_for_each_entry_rcu(p, head, hlist) {
		if (p->addr == addr)
			return p;
	}

	return NULL;
}
NOKPROBE_SYMBOL(get_kprobe);

static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs);

/* Return true if the kprobe is an aggregator */
static inline int kprobe_aggrprobe(struct kprobe *p)
{
	return p->pre_handler == aggr_pre_handler;
}

/* Return true(!0) if the kprobe is unused */
static inline int kprobe_unused(struct kprobe *p)
{
	return kprobe_aggrprobe(p) && kprobe_disabled(p) &&
	       list_empty(&p->list);
}

/*
 * Keep all fields in the kprobe consistent
 */
static inline void copy_kprobe(struct kprobe *ap, struct kprobe *p)
{
	memcpy(&p->opcode, &ap->opcode, sizeof(kprobe_opcode_t));
	memcpy(&p->ainsn, &ap->ainsn, sizeof(struct arch_specific_insn));
}

#ifdef CONFIG_OPTPROBES
/* NOTE: change this value only with kprobe_mutex held */
static bool kprobes_allow_optimization;

/*
 * Call all pre_handlers on the list, but ignore their return values.
 * This must be called from the arch-dependent optimized caller.
 */
void opt_pre_handler(struct kprobe *p, struct pt_regs *regs)
{
	struct kprobe *kp;

	list_for_each_entry_rcu(kp, &p->list, list) {
		if (kp->pre_handler && likely(!kprobe_disabled(kp))) {
			set_kprobe_instance(kp);
			kp->pre_handler(kp, regs);
		}
		reset_kprobe_instance();
	}
}
NOKPROBE_SYMBOL(opt_pre_handler);

/* Free optimized instructions and optimized_kprobe */
static void free_aggr_kprobe(struct kprobe *p)
{
	struct optimized_kprobe *op;

	op = container_of(p, struct optimized_kprobe, kp);
	arch_remove_optimized_kprobe(op);
	arch_remove_kprobe(p);
	kfree(op);
}

/* Return true(!0) if the kprobe is ready for optimization. */
static inline int kprobe_optready(struct kprobe *p)
{
	struct optimized_kprobe *op;

	if (kprobe_aggrprobe(p)) {
		op = container_of(p, struct optimized_kprobe, kp);
		return arch_prepared_optinsn(&op->optinsn);
	}

	return 0;
}

/* Return true(!0) if the kprobe is disarmed. Note: p must be on hash list */
static inline int kprobe_disarmed(struct kprobe *p)
{
	struct optimized_kprobe *op;

	/* If the kprobe is not an aggr/opt probe, just return whether it is disabled */
	if (!kprobe_aggrprobe(p))
		return kprobe_disabled(p);

	op = container_of(p, struct optimized_kprobe, kp);

	return kprobe_disabled(p) && list_empty(&op->list);
}

/* Return true(!0) if the probe is queued on (un)optimizing lists */
static int kprobe_queued(struct kprobe *p)
{
	struct optimized_kprobe *op;

	if (kprobe_aggrprobe(p)) {
		op = container_of(p, struct optimized_kprobe, kp);
		if (!list_empty(&op->list))
			return 1;
	}
	return 0;
}

/*
 * Return an optimized kprobe whose optimizing code replaces
 * instructions including addr (excluding the breakpoint itself).
 */
static struct kprobe *get_optimized_kprobe(unsigned long addr)
{
	int i;
	struct kprobe *p = NULL;
	struct optimized_kprobe *op;

	/* Don't check i == 0, since that is a breakpoint case. */
	for (i = 1; !p && i < MAX_OPTIMIZED_LENGTH; i++)
		p = get_kprobe((void *)(addr - i));

	if (p && kprobe_optready(p)) {
		op = container_of(p, struct optimized_kprobe, kp);
		if (arch_within_optimized_kprobe(op, addr))
			return p;
	}

	return NULL;
}

/* Optimization staging list, protected by kprobe_mutex */
static LIST_HEAD(optimizing_list);
static LIST_HEAD(unoptimizing_list);
static LIST_HEAD(freeing_list);

static void kprobe_optimizer(struct work_struct *work);
static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer);
#define OPTIMIZE_DELAY 5

/*
 * Optimize (replace a breakpoint with a jump) kprobes listed on
 * optimizing_list.
 */
static void do_optimize_kprobes(void)
{
	/* Optimization is never done while all kprobes are disarmed */
	if (kprobes_all_disarmed || !kprobes_allow_optimization ||
	    list_empty(&optimizing_list))
		return;

	/*
	 * The optimization/unoptimization code refers to online_cpus via
	 * stop_machine(), and cpu-hotplug modifies online_cpus. At the
	 * same time, text_mutex is held both here and by the cpu-hotplug
	 * path. This combination can cause a deadlock (cpu-hotplug tries
	 * to lock text_mutex, but stop_machine() cannot be done because
	 * online_cpus has been changed).
	 * To avoid this deadlock, call get_online_cpus() to prevent
	 * cpu-hotplug before taking text_mutex.
	 */
	get_online_cpus();
	mutex_lock(&text_mutex);
	arch_optimize_kprobes(&optimizing_list);
	mutex_unlock(&text_mutex);
	put_online_cpus();
}

/*
 * Unoptimize (replace a jump with a breakpoint and remove the breakpoint
 * if needed) kprobes listed on unoptimizing_list.
 */
static void do_unoptimize_kprobes(void)
{
	struct optimized_kprobe *op, *tmp;

	/* Unoptimization must be done at any time */
	if (list_empty(&unoptimizing_list))
		return;

	/* Ditto to do_optimize_kprobes */
	get_online_cpus();
	mutex_lock(&text_mutex);
	arch_unoptimize_kprobes(&unoptimizing_list, &freeing_list);
	/* Loop over freeing_list for disarming */
	list_for_each_entry_safe(op, tmp, &freeing_list, list) {
		/* Disarm probes if marked disabled */
		if (kprobe_disabled(&op->kp))
			arch_disarm_kprobe(&op->kp);
		if (kprobe_unused(&op->kp)) {
			/*
			 * Remove unused probes from hash list. After waiting
			 * for synchronization, these probes are reclaimed.
			 * (reclaiming is done by do_free_cleaned_kprobes.)
			 */
			hlist_del_rcu(&op->kp.hlist);
		} else
			list_del_init(&op->list);
	}
	mutex_unlock(&text_mutex);
	put_online_cpus();
}

/* Reclaim all kprobes on the freeing_list */
static void do_free_cleaned_kprobes(void)
{
	struct optimized_kprobe *op, *tmp;

	list_for_each_entry_safe(op, tmp, &freeing_list, list) {
		BUG_ON(!kprobe_unused(&op->kp));
		list_del_init(&op->list);
		free_aggr_kprobe(&op->kp);
	}
}

/* Start optimizer after OPTIMIZE_DELAY passed */
static void kick_kprobe_optimizer(void)
{
	schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY);
}

/* Kprobe jump optimizer */
static void kprobe_optimizer(struct work_struct *work)
{
	mutex_lock(&kprobe_mutex);
	/* Lock modules while optimizing kprobes */
	mutex_lock(&module_mutex);

	/*
	 * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed)
	 * kprobes before waiting for the quiescence period.
	 */
	do_unoptimize_kprobes();

	/*
	 * Step 2: Wait for the quiescence period to ensure all running
	 * interrupts are done. Because an optprobe may modify multiple
	 * instructions, there is a chance that the Nth instruction is
	 * interrupted. In that case, the running interrupt can return
	 * to the 2nd-Nth byte of the jump instruction. This wait is for
	 * avoiding it.
	 */
	synchronize_sched();

	/* Step 3: Optimize kprobes after the quiescence period */
	do_optimize_kprobes();

	/* Step 4: Free cleaned kprobes after the quiescence period */
	do_free_cleaned_kprobes();

	mutex_unlock(&module_mutex);
	mutex_unlock(&kprobe_mutex);

	/* Step 5: Kick optimizer again if needed */
	if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list))
		kick_kprobe_optimizer();
}

/* Wait for completion of optimization and unoptimization */
static void wait_for_kprobe_optimizer(void)
{
	mutex_lock(&kprobe_mutex);

	while (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list)) {
		mutex_unlock(&kprobe_mutex);

		/* this will also make optimizing_work execute immediately */
		flush_delayed_work(&optimizing_work);
		/* @optimizing_work might not have been queued yet, relax */
		cpu_relax();

		mutex_lock(&kprobe_mutex);
	}

	mutex_unlock(&kprobe_mutex);
}

/* Optimize kprobe if p is ready to be optimized */
static void optimize_kprobe(struct kprobe *p)
{
	struct optimized_kprobe *op;

	/* Check if the kprobe is disabled or not ready for optimization. */
	if (!kprobe_optready(p) || !kprobes_allow_optimization ||
	    (kprobe_disabled(p) || kprobes_all_disarmed))
		return;

	/* Neither break_handler nor post_handler is supported. */
	if (p->break_handler || p->post_handler)
		return;

	op = container_of(p, struct optimized_kprobe, kp);

	/* Check that there are no other kprobes at the optimized instructions */
	if (arch_check_optimized_kprobe(op) < 0)
		return;

	/* Check if it is already optimized. */
	if (op->kp.flags & KPROBE_FLAG_OPTIMIZED)
		return;
	op->kp.flags |= KPROBE_FLAG_OPTIMIZED;

	if (!list_empty(&op->list))
		/* This is being unoptimized. Just dequeue the probe */
		list_del_init(&op->list);
	else {
		list_add(&op->list, &optimizing_list);
		kick_kprobe_optimizer();
	}
}

/* Shortcut for direct unoptimization */
static void force_unoptimize_kprobe(struct optimized_kprobe *op)
{
	get_online_cpus();
	arch_unoptimize_kprobe(op);
	put_online_cpus();
	if (kprobe_disabled(&op->kp))
		arch_disarm_kprobe(&op->kp);
}

/* Unoptimize a kprobe if p is optimized */
static void unoptimize_kprobe(struct kprobe *p, bool force)
{
	struct optimized_kprobe *op;

	if (!kprobe_aggrprobe(p) || kprobe_disarmed(p))
		return; /* This is neither an optprobe nor optimized */

	op = container_of(p, struct optimized_kprobe, kp);
	if (!kprobe_optimized(p)) {
		/* Unoptimized or unoptimizing case */
		if (force && !list_empty(&op->list)) {
			/*
			 * Only if this is an unoptimizing kprobe and it is
			 * forced, forcibly unoptimize it. (No need to
			 * unoptimize an already unoptimized kprobe again :)
			 */
			list_del_init(&op->list);
			force_unoptimize_kprobe(op);
		}
		return;
	}

	op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;
	if (!list_empty(&op->list)) {
		/* Dequeue from the optimization queue */
		list_del_init(&op->list);
		return;
	}
	/* Optimized kprobe case */
	if (force)
		/* Forcibly update the code: this is a special case */
		force_unoptimize_kprobe(op);
	else {
		list_add(&op->list, &unoptimizing_list);
		kick_kprobe_optimizer();
	}
}

/* Cancel unoptimizing for reuse */
static void reuse_unused_kprobe(struct kprobe *ap)
{
	struct optimized_kprobe *op;

	BUG_ON(!kprobe_unused(ap));
	/*
	 * An unused kprobe MUST be on the way of delayed unoptimizing (which
	 * means there is still a relative jump in place) and disabled.
	 */
	op = container_of(ap, struct optimized_kprobe, kp);
	if (unlikely(list_empty(&op->list)))
		printk(KERN_WARNING "Warning: found a stray unused "
			"aggrprobe@%p\n", ap->addr);
	/* Enable the probe again */
	ap->flags &= ~KPROBE_FLAG_DISABLED;
	/* Optimize it again (remove from op->list) */
	BUG_ON(!kprobe_optready(ap));
	optimize_kprobe(ap);
}

/* Remove optimized instructions */
static void kill_optimized_kprobe(struct kprobe *p)
{
	struct optimized_kprobe *op;

	op = container_of(p, struct optimized_kprobe, kp);
	if (!list_empty(&op->list))
		/* Dequeue from the (un)optimization queue */
		list_del_init(&op->list);
	op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED;

	if (kprobe_unused(p)) {
		/* Enqueue if it is unused */
		list_add(&op->list, &freeing_list);
		/*
		 * Remove unused probes from the hash list. After waiting
		 * for synchronization, this probe is reclaimed.
		 * (reclaiming is done by do_free_cleaned_kprobes().)
		 */
		hlist_del_rcu(&op->kp.hlist);
	}

	/* Don't touch the code, because it is already freed. */
	arch_remove_optimized_kprobe(op);
}

/* Try to prepare optimized instructions */
static void prepare_optimized_kprobe(struct kprobe *p)
{
	struct optimized_kprobe *op;

	op = container_of(p, struct optimized_kprobe, kp);
	arch_prepare_optimized_kprobe(op, p);
}

/* Allocate new optimized_kprobe and try to prepare optimized instructions */
static struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
{
	struct optimized_kprobe *op;

	op = kzalloc(sizeof(struct optimized_kprobe), GFP_KERNEL);
	if (!op)
		return NULL;

	INIT_LIST_HEAD(&op->list);
	op->kp.addr = p->addr;
	arch_prepare_optimized_kprobe(op, p);

	return &op->kp;
}

static void init_aggr_kprobe(struct kprobe *ap, struct kprobe *p);

/*
 * Prepare an optimized_kprobe and optimize it
 * NOTE: p must be a normal registered kprobe
 */
static void try_to_optimize_kprobe(struct kprobe *p)
{
	struct kprobe *ap;
	struct optimized_kprobe *op;

	/* It is impossible to optimize an ftrace-based kprobe */
	if (kprobe_ftrace(p))
		return;

	/* For preparing optimization, jump_label_text_reserved() is called */
	jump_label_lock();
	mutex_lock(&text_mutex);

	ap = alloc_aggr_kprobe(p);
	if (!ap)
		goto out;

	op = container_of(ap, struct optimized_kprobe, kp);
	if (!arch_prepared_optinsn(&op->optinsn)) {
		/* If setting up the optimization failed, fall back to a kprobe */
		arch_remove_optimized_kprobe(op);
		kfree(op);
		goto out;
	}

	init_aggr_kprobe(ap, p);
	optimize_kprobe(ap);	/* This just kicks optimizer thread */

out:
	mutex_unlock(&text_mutex);
	jump_label_unlock();
}

#ifdef CONFIG_SYSCTL
static void optimize_all_kprobes(void)
{
	struct hlist_head *head;
	struct kprobe *p;
	unsigned int i;

	mutex_lock(&kprobe_mutex);
	/* If optimization is already allowed, just return */
	if (kprobes_allow_optimization)
		goto out;

	kprobes_allow_optimization = true;
	for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
		head = &kprobe_table[i];
		hlist_for_each_entry_rcu(p, head, hlist)
			if (!kprobe_disabled(p))
				optimize_kprobe(p);
	}
	printk(KERN_INFO "Kprobes globally optimized\n");
out:
	mutex_unlock(&kprobe_mutex);
}

static void unoptimize_all_kprobes(void)
{
	struct hlist_head *head;
	struct kprobe *p;
	unsigned int i;

	mutex_lock(&kprobe_mutex);
	/* If optimization is already prohibited, just return */
	if (!kprobes_allow_optimization) {
		mutex_unlock(&kprobe_mutex);
		return;
	}

	kprobes_allow_optimization = false;
	for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
		head = &kprobe_table[i];
		hlist_for_each_entry_rcu(p, head, hlist) {
			if (!kprobe_disabled(p))
				unoptimize_kprobe(p, false);
		}
	}
	mutex_unlock(&kprobe_mutex);

	/* Wait for unoptimizing completion */
	wait_for_kprobe_optimizer();
	printk(KERN_INFO "Kprobes globally unoptimized\n");
}

static DEFINE_MUTEX(kprobe_sysctl_mutex);
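/* sysctl handler that turns kprobe optimization on or off globally */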
int sysctl_kprobes_optimization;
int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
				      void __user *buffer, size_t *length,
				      loff_t *ppos)
{
	int ret;

	mutex_lock(&kprobe_sysctl_mutex);
	sysctl_kprobes_optimization = kprobes_allow_optimization ? 1 : 0;
	ret = proc_dointvec_minmax(table, write, buffer, length, ppos);

	if (sysctl_kprobes_optimization)
		optimize_all_kprobes();
	else
		unoptimize_all_kprobes();
	mutex_unlock(&kprobe_sysctl_mutex);

	return ret;
}
#endif /* CONFIG_SYSCTL */

/* Put a breakpoint for a probe. Must be called with text_mutex locked */
static void __arm_kprobe(struct kprobe *p)
{
	struct kprobe *_p;

	/* Check collision with other optimized kprobes */
	_p = get_optimized_kprobe((unsigned long)p->addr);
	if (unlikely(_p))
		/* Fallback to unoptimized kprobe */
		unoptimize_kprobe(_p, true);

	arch_arm_kprobe(p);
	optimize_kprobe(p);	/* Try to optimize (add kprobe to a list) */
}

/* Remove the breakpoint of a probe. Must be called with text_mutex locked */
static void __disarm_kprobe(struct kprobe *p, bool reopt)
{
	struct kprobe *_p;

	/* Try to unoptimize */
	unoptimize_kprobe(p, kprobes_all_disarmed);

	if (!kprobe_queued(p)) {
		arch_disarm_kprobe(p);
		/* If another kprobe was blocked, optimize it. */
		_p = get_optimized_kprobe((unsigned long)p->addr);
		if (unlikely(_p) && reopt)
			optimize_kprobe(_p);
	}
	/* TODO: reoptimize others after unoptimizing this probe */
}

#else /* !CONFIG_OPTPROBES */

#define optimize_kprobe(p)			do {} while (0)
#define unoptimize_kprobe(p, f)			do {} while (0)
#define kill_optimized_kprobe(p)		do {} while (0)
#define prepare_optimized_kprobe(p)		do {} while (0)
#define try_to_optimize_kprobe(p)		do {} while (0)
#define __arm_kprobe(p)				arch_arm_kprobe(p)
#define __disarm_kprobe(p, o)			arch_disarm_kprobe(p)
#define kprobe_disarmed(p)			kprobe_disabled(p)
#define wait_for_kprobe_optimizer()		do {} while (0)

/* Without optimization, there should be no unused kprobes that can be reused */
static void reuse_unused_kprobe(struct kprobe *ap)
{
	printk(KERN_ERR "Error: There should be no unused kprobe here.\n");
	BUG_ON(kprobe_unused(ap));
}

static void free_aggr_kprobe(struct kprobe *p)
{
	arch_remove_kprobe(p);
	kfree(p);
}

static struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
{
	return kzalloc(sizeof(struct kprobe), GFP_KERNEL);
}
#endif /* CONFIG_OPTPROBES */

#ifdef CONFIG_KPROBES_ON_FTRACE
static struct ftrace_ops kprobe_ftrace_ops __read_mostly = {
	.func = kprobe_ftrace_handler,
	.flags = FTRACE_OPS_FL_SAVE_REGS | FTRACE_OPS_FL_IPMODIFY,
};
static int kprobe_ftrace_enabled;

/* Must ensure p->addr is really on ftrace */
static int prepare_kprobe(struct kprobe *p)
{
	if (!kprobe_ftrace(p))
		return arch_prepare_kprobe(p);

	return arch_prepare_kprobe_ftrace(p);
}

/* Caller must lock kprobe_mutex */
static void arm_kprobe_ftrace(struct kprobe *p)
{
	int ret;

	ret = ftrace_set_filter_ip(&kprobe_ftrace_ops,
				   (unsigned long)p->addr, 0, 0);
	WARN(ret < 0, "Failed to arm kprobe-ftrace at %p (%d)\n", p->addr, ret);
	kprobe_ftrace_enabled++;
	if (kprobe_ftrace_enabled == 1) {
		ret = register_ftrace_function(&kprobe_ftrace_ops);
		WARN(ret < 0, "Failed to init kprobe-ftrace (%d)\n", ret);
	}
}

/* Caller must lock kprobe_mutex */
static void disarm_kprobe_ftrace(struct kprobe *p)
{
	int ret;

	kprobe_ftrace_enabled--;
	if (kprobe_ftrace_enabled == 0) {
		ret = unregister_ftrace_function(&kprobe_ftrace_ops);
		WARN(ret < 0, "Failed to init kprobe-ftrace (%d)\n", ret);
	}
	ret = ftrace_set_filter_ip(&kprobe_ftrace_ops,
			   (unsigned long)p->addr, 1, 0);
	WARN(ret < 0, "Failed to disarm kprobe-ftrace at %p (%d)\n", p->addr, ret);
}
#else	/* !CONFIG_KPROBES_ON_FTRACE */
#define prepare_kprobe(p)	arch_prepare_kprobe(p)
#define arm_kprobe_ftrace(p)	do {} while (0)
#define disarm_kprobe_ftrace(p)	do {} while (0)
#endif

/* Arm a kprobe with text_mutex */
static void arm_kprobe(struct kprobe *kp)
{
	if (unlikely(kprobe_ftrace(kp))) {
		arm_kprobe_ftrace(kp);
		return;
	}
	/*
	 * Here, since __arm_kprobe() doesn't use stop_machine(),
	 * this doesn't cause deadlock on text_mutex. So, we don't
	 * need get_online_cpus().
	 */
	mutex_lock(&text_mutex);
	__arm_kprobe(kp);
	mutex_unlock(&text_mutex);
}

/* Disarm a kprobe with text_mutex */
static void disarm_kprobe(struct kprobe *kp, bool reopt)
{
	if (unlikely(kprobe_ftrace(kp))) {
		disarm_kprobe_ftrace(kp);
		return;
	}
	/* Ditto */
	mutex_lock(&text_mutex);
	__disarm_kprobe(kp, reopt);
	mutex_unlock(&text_mutex);
}

/*
 * Aggregate handlers for multiple kprobes support - these handlers
 * take care of invoking the individual kprobe handlers on p->list
 */
static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
{
	struct kprobe *kp;

	list_for_each_entry_rcu(kp, &p->list, list) {
		if (kp->pre_handler && likely(!kprobe_disabled(kp))) {
			set_kprobe_instance(kp);
			if (kp->pre_handler(kp, regs))
				return 1;
		}
		reset_kprobe_instance();
	}
	return 0;
}
NOKPROBE_SYMBOL(aggr_pre_handler);

static void aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
			      unsigned long flags)
{
	struct kprobe *kp;

	list_for_each_entry_rcu(kp, &p->list, list) {
		if (kp->post_handler && likely(!kprobe_disabled(kp))) {
			set_kprobe_instance(kp);
			kp->post_handler(kp, regs, flags);
			reset_kprobe_instance();
		}
	}
}
NOKPROBE_SYMBOL(aggr_post_handler);

static int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
			      int trapnr)
{
	struct kprobe *cur = __this_cpu_read(kprobe_instance);

	/*
	 * if we faulted "during" the execution of a user specified
	 * probe handler, invoke just that probe's fault handler
	 */
	if (cur && cur->fault_handler) {
		if (cur->fault_handler(cur, regs, trapnr))
			return 1;
	}
	return 0;
}
NOKPROBE_SYMBOL(aggr_fault_handler);

static int aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
{
	struct kprobe *cur = __this_cpu_read(kprobe_instance);
	int ret = 0;

	if (cur && cur->break_handler) {
		if (cur->break_handler(cur, regs))
			ret = 1;
	}
	reset_kprobe_instance();
	return ret;
}
NOKPROBE_SYMBOL(aggr_break_handler);

/* Walks the list and increments nmissed count for multiprobe case */
void kprobes_inc_nmissed_count(struct kprobe *p)
{
	struct kprobe *kp;
	if (!kprobe_aggrprobe(p)) {
		p->nmissed++;
	} else {
		list_for_each_entry_rcu(kp, &p->list, list)
			kp->nmissed++;
	}
	return;
}
NOKPROBE_SYMBOL(kprobes_inc_nmissed_count);

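/*
 * Put a used kretprobe instance back on its kretprobe's free list, or
 * on @head if the kretprobe is being unregistered.
 */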
void recycle_rp_inst(struct kretprobe_instance *ri,
		     struct hlist_head *head)
{
	struct kretprobe *rp = ri->rp;

	/* remove the rp inst from the kretprobe_inst_table */
	hlist_del(&ri->hlist);
	INIT_HLIST_NODE(&ri->hlist);
	if (likely(rp)) {
		raw_spin_lock(&rp->lock);
		hlist_add_head(&ri->hlist, &rp->free_instances);
		raw_spin_unlock(&rp->lock);
	} else
		/* Unregistering */
		hlist_add_head(&ri->hlist, head);
}
NOKPROBE_SYMBOL(recycle_rp_inst);

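/* Lock the kretprobe hash bucket for @tsk and return its head in @head */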
void kretprobe_hash_lock(struct task_struct *tsk,
			 struct hlist_head **head, unsigned long *flags)
__acquires(hlist_lock)
{
	unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
	raw_spinlock_t *hlist_lock;

	*head = &kretprobe_inst_table[hash];
	hlist_lock = kretprobe_table_lock_ptr(hash);
	raw_spin_lock_irqsave(hlist_lock, *flags);
}
NOKPROBE_SYMBOL(kretprobe_hash_lock);

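/* Lock a kretprobe hash bucket directly by its index */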
static void kretprobe_table_lock(unsigned long hash,
				 unsigned long *flags)
__acquires(hlist_lock)
{
	raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
	raw_spin_lock_irqsave(hlist_lock, *flags);
}
NOKPROBE_SYMBOL(kretprobe_table_lock);

void kretprobe_hash_unlock(struct task_struct *tsk,
			   unsigned long *flags)
__releases(hlist_lock)
{
	unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
	raw_spinlock_t *hlist_lock;

	hlist_lock = kretprobe_table_lock_ptr(hash);
	raw_spin_unlock_irqrestore(hlist_lock, *flags);
}
NOKPROBE_SYMBOL(kretprobe_hash_unlock);

static void kretprobe_table_unlock(unsigned long hash,
				   unsigned long *flags)
__releases(hlist_lock)
{
	raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
	raw_spin_unlock_irqrestore(hlist_lock, *flags);
}
NOKPROBE_SYMBOL(kretprobe_table_unlock);

/*
 * This function is called from finish_task_switch when task tk becomes dead,
 * so that we can recycle any function-return probe instances associated
 * with this task. These left-over instances represent probed functions
 * that have been called but will never return.
 */
void kprobe_flush_task(struct task_struct *tk)
{
	struct kretprobe_instance *ri;
	struct hlist_head *head, empty_rp;
	struct hlist_node *tmp;
	unsigned long hash, flags = 0;

	if (unlikely(!kprobes_initialized))
		/* Early boot.  kretprobe_table_locks not yet initialized. */
		return;

	INIT_HLIST_HEAD(&empty_rp);
	hash = hash_ptr(tk, KPROBE_HASH_BITS);
	head = &kretprobe_inst_table[hash];
	kretprobe_table_lock(hash, &flags);
	hlist_for_each_entry_safe(ri, tmp, head, hlist) {
		if (ri->task == tk)
			recycle_rp_inst(ri, &empty_rp);
	}
	kretprobe_table_unlock(hash, &flags);
	hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) {
		hlist_del(&ri->hlist);
		kfree(ri);
	}
}
NOKPROBE_SYMBOL(kprobe_flush_task);

static inline void free_rp_inst(struct kretprobe *rp)
{
	struct kretprobe_instance *ri;
	struct hlist_node *next;

	hlist_for_each_entry_safe(ri, next, &rp->free_instances, hlist) {
		hlist_del(&ri->hlist);
		kfree(ri);
	}
}

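/* Detach all in-flight instances from @rp and free its unused instances */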
static void cleanup_rp_inst(struct kretprobe *rp)
{
	unsigned long flags, hash;
	struct kretprobe_instance *ri;
	struct hlist_node *next;
	struct hlist_head *head;

	/* No race here */
	for (hash = 0; hash < KPROBE_TABLE_SIZE; hash++) {
		kretprobe_table_lock(hash, &flags);
		head = &kretprobe_inst_table[hash];
		hlist_for_each_entry_safe(ri, next, head, hlist) {
			if (ri->rp == rp)
				ri->rp = NULL;
		}
		kretprobe_table_unlock(hash, &flags);
	}
	free_rp_inst(rp);
}
NOKPROBE_SYMBOL(cleanup_rp_inst);

/*
 * Add the new probe to ap->list. Fail if this is the
 * second jprobe at the address - two jprobes can't coexist
 */
static int add_new_kprobe(struct kprobe *ap, struct kprobe *p)
{
	BUG_ON(kprobe_gone(ap) || kprobe_gone(p));

	if (p->break_handler || p->post_handler)
		unoptimize_kprobe(ap, true);	/* Fall back to normal kprobe */

	if (p->break_handler) {
		if (ap->break_handler)
			return -EEXIST;
		list_add_tail_rcu(&p->list, &ap->list);
		ap->break_handler = aggr_break_handler;
	} else
		list_add_rcu(&p->list, &ap->list);
	if (p->post_handler && !ap->post_handler)
		ap->post_handler = aggr_post_handler;

	return 0;
}

/*
 * Fill in the required fields of the "manager kprobe". Replace the
 * earlier kprobe in the hlist with the manager kprobe
 */
static void init_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
{