Commit fb40bd78 authored by Mathieu Desnoyers's avatar Mathieu Desnoyers Committed by Linus Torvalds

Linux Kernel Markers: support multiple probes

RCU style multiple probes support for the Linux Kernel Markers.  Common case
(one probe) is still fast and does not require dynamic allocation or a
supplementary pointer dereference on the fast path.

- Move preempt disable from the marker site to the callback.

Since we now have an internal callback, move the preempt disable/enable to the
callback instead of the marker site.

Since the callback change is done asynchronously (passing from a handler that
supports arguments to a handler that does not setup the arguments is no
arguments are passed), we can safely update it even if it is outside the
preempt disable section.

- Move probe arm to probe connection. Now, a connected probe is automatically
  armed.

Remove MARK_MAX_FORMAT_LEN, unused.

This patch modifies the Linux Kernel Markers API : it removes the probe
"arm/disarm" and changes the probe function prototype : it now expects a
va_list * instead of a "...".

If we want to have more than one probe connected to a marker at a given
time (LTTng, or blktrace, ssytemtap) then we need this patch. Without it,
connecting a second probe handler to a marker will fail.

It allow us, for instance, to do interesting combinations :

Do standard tracing with LTTng and, eventually, to compute statistics
with SystemTAP, or to have a special trigger on an event that would call
a systemtap script which would stop flight recorder tracing.
Signed-off-by: default avatarMathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Mike Mason <mmlnx@us.ibm.com>
Cc: Dipankar Sarma <dipankar@in.ibm.com>
Cc: David Smith <dsmith@redhat.com>
Cc: "Paul E. McKenney" <paulmck@us.ibm.com>
Cc: "Frank Ch. Eigler" <fche@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
parent 9170d2f6
...@@ -146,34 +146,28 @@ static void sputrace_log_item(const char *name, struct spu_context *ctx, ...@@ -146,34 +146,28 @@ static void sputrace_log_item(const char *name, struct spu_context *ctx,
wake_up(&sputrace_wait); wake_up(&sputrace_wait);
} }
static void spu_context_event(const struct marker *mdata, static void spu_context_event(void *probe_private, void *call_data,
void *private, const char *format, ...) const char *format, va_list *args)
{ {
struct spu_probe *p = mdata->private; struct spu_probe *p = probe_private;
va_list ap;
struct spu_context *ctx; struct spu_context *ctx;
struct spu *spu; struct spu *spu;
va_start(ap, format); ctx = va_arg(*args, struct spu_context *);
ctx = va_arg(ap, struct spu_context *); spu = va_arg(*args, struct spu *);
spu = va_arg(ap, struct spu *);
sputrace_log_item(p->name, ctx, spu); sputrace_log_item(p->name, ctx, spu);
va_end(ap);
} }
static void spu_context_nospu_event(const struct marker *mdata, static void spu_context_nospu_event(void *probe_private, void *call_data,
void *private, const char *format, ...) const char *format, va_list *args)
{ {
struct spu_probe *p = mdata->private; struct spu_probe *p = probe_private;
va_list ap;
struct spu_context *ctx; struct spu_context *ctx;
va_start(ap, format); ctx = va_arg(*args, struct spu_context *);
ctx = va_arg(ap, struct spu_context *);
sputrace_log_item(p->name, ctx, NULL); sputrace_log_item(p->name, ctx, NULL);
va_end(ap);
} }
struct spu_probe spu_probes[] = { struct spu_probe spu_probes[] = {
...@@ -219,10 +213,6 @@ static int __init sputrace_init(void) ...@@ -219,10 +213,6 @@ static int __init sputrace_init(void)
if (error) if (error)
printk(KERN_INFO "Unable to register probe %s\n", printk(KERN_INFO "Unable to register probe %s\n",
p->name); p->name);
error = marker_arm(p->name);
if (error)
printk(KERN_INFO "Unable to arm probe %s\n", p->name);
} }
return 0; return 0;
...@@ -238,7 +228,8 @@ static void __exit sputrace_exit(void) ...@@ -238,7 +228,8 @@ static void __exit sputrace_exit(void)
int i; int i;
for (i = 0; i < ARRAY_SIZE(spu_probes); i++) for (i = 0; i < ARRAY_SIZE(spu_probes); i++)
marker_probe_unregister(spu_probes[i].name); marker_probe_unregister(spu_probes[i].name,
spu_probes[i].probe_func, &spu_probes[i]);
remove_proc_entry("sputrace", NULL); remove_proc_entry("sputrace", NULL);
kfree(sputrace_log); kfree(sputrace_log);
......
...@@ -19,16 +19,23 @@ struct marker; ...@@ -19,16 +19,23 @@ struct marker;
/** /**
* marker_probe_func - Type of a marker probe function * marker_probe_func - Type of a marker probe function
* @mdata: pointer of type struct marker * @probe_private: probe private data
* @private_data: caller site private data * @call_private: call site private data
* @fmt: format string * @fmt: format string
* @...: variable argument list * @args: variable argument list pointer. Use a pointer to overcome C's
* inability to pass this around as a pointer in a portable manner in
* the callee otherwise.
* *
* Type of marker probe functions. They receive the mdata and need to parse the * Type of marker probe functions. They receive the mdata and need to parse the
* format string to recover the variable argument list. * format string to recover the variable argument list.
*/ */
typedef void marker_probe_func(const struct marker *mdata, typedef void marker_probe_func(void *probe_private, void *call_private,
void *private_data, const char *fmt, ...); const char *fmt, va_list *args);
struct marker_probe_closure {
marker_probe_func *func; /* Callback */
void *probe_private; /* Private probe data */
};
struct marker { struct marker {
const char *name; /* Marker name */ const char *name; /* Marker name */
...@@ -36,8 +43,11 @@ struct marker { ...@@ -36,8 +43,11 @@ struct marker {
* variable argument list. * variable argument list.
*/ */
char state; /* Marker state. */ char state; /* Marker state. */
marker_probe_func *call;/* Probe handler function pointer */ char ptype; /* probe type : 0 : single, 1 : multi */
void *private; /* Private probe data */ void (*call)(const struct marker *mdata, /* Probe wrapper */
void *call_private, const char *fmt, ...);
struct marker_probe_closure single;
struct marker_probe_closure *multi;
} __attribute__((aligned(8))); } __attribute__((aligned(8)));
#ifdef CONFIG_MARKERS #ifdef CONFIG_MARKERS
...@@ -49,7 +59,7 @@ struct marker { ...@@ -49,7 +59,7 @@ struct marker {
* not add unwanted padding between the beginning of the section and the * not add unwanted padding between the beginning of the section and the
* structure. Force alignment to the same alignment as the section start. * structure. Force alignment to the same alignment as the section start.
*/ */
#define __trace_mark(name, call_data, format, args...) \ #define __trace_mark(name, call_private, format, args...) \
do { \ do { \
static const char __mstrtab_name_##name[] \ static const char __mstrtab_name_##name[] \
__attribute__((section("__markers_strings"))) \ __attribute__((section("__markers_strings"))) \
...@@ -60,24 +70,23 @@ struct marker { ...@@ -60,24 +70,23 @@ struct marker {
static struct marker __mark_##name \ static struct marker __mark_##name \
__attribute__((section("__markers"), aligned(8))) = \ __attribute__((section("__markers"), aligned(8))) = \
{ __mstrtab_name_##name, __mstrtab_format_##name, \ { __mstrtab_name_##name, __mstrtab_format_##name, \
0, __mark_empty_function, NULL }; \ 0, 0, marker_probe_cb, \
{ __mark_empty_function, NULL}, NULL }; \
__mark_check_format(format, ## args); \ __mark_check_format(format, ## args); \
if (unlikely(__mark_##name.state)) { \ if (unlikely(__mark_##name.state)) { \
preempt_disable(); \
(*__mark_##name.call) \ (*__mark_##name.call) \
(&__mark_##name, call_data, \ (&__mark_##name, call_private, \
format, ## args); \ format, ## args); \
preempt_enable(); \
} \ } \
} while (0) } while (0)
extern void marker_update_probe_range(struct marker *begin, extern void marker_update_probe_range(struct marker *begin,
struct marker *end, struct module *probe_module, int *refcount); struct marker *end);
#else /* !CONFIG_MARKERS */ #else /* !CONFIG_MARKERS */
#define __trace_mark(name, call_data, format, args...) \ #define __trace_mark(name, call_private, format, args...) \
__mark_check_format(format, ## args) __mark_check_format(format, ## args)
static inline void marker_update_probe_range(struct marker *begin, static inline void marker_update_probe_range(struct marker *begin,
struct marker *end, struct module *probe_module, int *refcount) struct marker *end)
{ } { }
#endif /* CONFIG_MARKERS */ #endif /* CONFIG_MARKERS */
...@@ -92,8 +101,6 @@ static inline void marker_update_probe_range(struct marker *begin, ...@@ -92,8 +101,6 @@ static inline void marker_update_probe_range(struct marker *begin,
#define trace_mark(name, format, args...) \ #define trace_mark(name, format, args...) \
__trace_mark(name, NULL, format, ## args) __trace_mark(name, NULL, format, ## args)
#define MARK_MAX_FORMAT_LEN 1024
/** /**
* MARK_NOARGS - Format string for a marker with no argument. * MARK_NOARGS - Format string for a marker with no argument.
*/ */
...@@ -106,24 +113,30 @@ static inline void __printf(1, 2) __mark_check_format(const char *fmt, ...) ...@@ -106,24 +113,30 @@ static inline void __printf(1, 2) __mark_check_format(const char *fmt, ...)
extern marker_probe_func __mark_empty_function; extern marker_probe_func __mark_empty_function;
extern void marker_probe_cb(const struct marker *mdata,
void *call_private, const char *fmt, ...);
extern void marker_probe_cb_noarg(const struct marker *mdata,
void *call_private, const char *fmt, ...);
/* /*
* Connect a probe to a marker. * Connect a probe to a marker.
* private data pointer must be a valid allocated memory address, or NULL. * private data pointer must be a valid allocated memory address, or NULL.
*/ */
extern int marker_probe_register(const char *name, const char *format, extern int marker_probe_register(const char *name, const char *format,
marker_probe_func *probe, void *private); marker_probe_func *probe, void *probe_private);
/* /*
* Returns the private data given to marker_probe_register. * Returns the private data given to marker_probe_register.
*/ */
extern void *marker_probe_unregister(const char *name); extern int marker_probe_unregister(const char *name,
marker_probe_func *probe, void *probe_private);
/* /*
* Unregister a marker by providing the registered private data. * Unregister a marker by providing the registered private data.
*/ */
extern void *marker_probe_unregister_private_data(void *private); extern int marker_probe_unregister_private_data(marker_probe_func *probe,
void *probe_private);
extern int marker_arm(const char *name); extern void *marker_get_private_data(const char *name, marker_probe_func *probe,
extern int marker_disarm(const char *name); int num);
extern void *marker_get_private_data(const char *name);
#endif #endif
...@@ -465,7 +465,7 @@ int unregister_module_notifier(struct notifier_block * nb); ...@@ -465,7 +465,7 @@ int unregister_module_notifier(struct notifier_block * nb);
extern void print_modules(void); extern void print_modules(void);
extern void module_update_markers(struct module *probe_module, int *refcount); extern void module_update_markers(void);
#else /* !CONFIG_MODULES... */ #else /* !CONFIG_MODULES... */
#define EXPORT_SYMBOL(sym) #define EXPORT_SYMBOL(sym)
......
...@@ -27,22 +27,15 @@ ...@@ -27,22 +27,15 @@
extern struct marker __start___markers[]; extern struct marker __start___markers[];
extern struct marker __stop___markers[]; extern struct marker __stop___markers[];
/* Set to 1 to enable marker debug output */
const int marker_debug;
/* /*
* markers_mutex nests inside module_mutex. Markers mutex protects the builtin * markers_mutex nests inside module_mutex. Markers mutex protects the builtin
* and module markers, the hash table and deferred_sync. * and module markers and the hash table.
*/ */
static DEFINE_MUTEX(markers_mutex); static DEFINE_MUTEX(markers_mutex);
/*
* Marker deferred synchronization.
* Upon marker probe_unregister, we delay call to synchronize_sched() to
* accelerate mass unregistration (only when there is no more reference to a
* given module do we call synchronize_sched()). However, we need to make sure
* every critical region has ended before we re-arm a marker that has been
* unregistered and then registered back with a different probe data.
*/
static int deferred_sync;
/* /*
* Marker hash table, containing the active markers. * Marker hash table, containing the active markers.
* Protected by module_mutex. * Protected by module_mutex.
...@@ -50,12 +43,26 @@ static int deferred_sync; ...@@ -50,12 +43,26 @@ static int deferred_sync;
#define MARKER_HASH_BITS 6 #define MARKER_HASH_BITS 6
#define MARKER_TABLE_SIZE (1 << MARKER_HASH_BITS) #define MARKER_TABLE_SIZE (1 << MARKER_HASH_BITS)
/*
* Note about RCU :
* It is used to make sure every handler has finished using its private data
* between two consecutive operation (add or remove) on a given marker. It is
* also used to delay the free of multiple probes array until a quiescent state
* is reached.
* marker entries modifications are protected by the markers_mutex.
*/
struct marker_entry { struct marker_entry {
struct hlist_node hlist; struct hlist_node hlist;
char *format; char *format;
marker_probe_func *probe; void (*call)(const struct marker *mdata, /* Probe wrapper */
void *private; void *call_private, const char *fmt, ...);
struct marker_probe_closure single;
struct marker_probe_closure *multi;
int refcount; /* Number of times armed. 0 if disarmed. */ int refcount; /* Number of times armed. 0 if disarmed. */
struct rcu_head rcu;
void *oldptr;
char rcu_pending:1;
char ptype:1;
char name[0]; /* Contains name'\0'format'\0' */ char name[0]; /* Contains name'\0'format'\0' */
}; };
...@@ -63,7 +70,8 @@ static struct hlist_head marker_table[MARKER_TABLE_SIZE]; ...@@ -63,7 +70,8 @@ static struct hlist_head marker_table[MARKER_TABLE_SIZE];
/** /**
* __mark_empty_function - Empty probe callback * __mark_empty_function - Empty probe callback
* @mdata: pointer of type const struct marker * @probe_private: probe private data
* @call_private: call site private data
* @fmt: format string * @fmt: format string
* @...: variable argument list * @...: variable argument list
* *
...@@ -72,12 +80,266 @@ static struct hlist_head marker_table[MARKER_TABLE_SIZE]; ...@@ -72,12 +80,266 @@ static struct hlist_head marker_table[MARKER_TABLE_SIZE];
* though the function pointer change and the marker enabling are two distinct * though the function pointer change and the marker enabling are two distinct
* operations that modifies the execution flow of preemptible code. * operations that modifies the execution flow of preemptible code.
*/ */
void __mark_empty_function(const struct marker *mdata, void *private, void __mark_empty_function(void *probe_private, void *call_private,
const char *fmt, ...) const char *fmt, va_list *args)
{ {
} }
EXPORT_SYMBOL_GPL(__mark_empty_function); EXPORT_SYMBOL_GPL(__mark_empty_function);
/*
* marker_probe_cb Callback that prepares the variable argument list for probes.
* @mdata: pointer of type struct marker
* @call_private: caller site private data
* @fmt: format string
* @...: Variable argument list.
*
* Since we do not use "typical" pointer based RCU in the 1 argument case, we
* need to put a full smp_rmb() in this branch. This is why we do not use
* rcu_dereference() for the pointer read.
*/
void marker_probe_cb(const struct marker *mdata, void *call_private,
const char *fmt, ...)
{
va_list args;
char ptype;
/*
* disabling preemption to make sure the teardown of the callbacks can
* be done correctly when they are in modules and they insure RCU read
* coherency.
*/
preempt_disable();
ptype = ACCESS_ONCE(mdata->ptype);
if (likely(!ptype)) {
marker_probe_func *func;
/* Must read the ptype before ptr. They are not data dependant,
* so we put an explicit smp_rmb() here. */
smp_rmb();
func = ACCESS_ONCE(mdata->single.func);
/* Must read the ptr before private data. They are not data
* dependant, so we put an explicit smp_rmb() here. */
smp_rmb();
va_start(args, fmt);
func(mdata->single.probe_private, call_private, fmt, &args);
va_end(args);
} else {
struct marker_probe_closure *multi;
int i;
/*
* multi points to an array, therefore accessing the array
* depends on reading multi. However, even in this case,
* we must insure that the pointer is read _before_ the array
* data. Same as rcu_dereference, but we need a full smp_rmb()
* in the fast path, so put the explicit barrier here.
*/
smp_read_barrier_depends();
multi = ACCESS_ONCE(mdata->multi);
for (i = 0; multi[i].func; i++) {
va_start(args, fmt);
multi[i].func(multi[i].probe_private, call_private, fmt,
&args);
va_end(args);
}
}
preempt_enable();
}
EXPORT_SYMBOL_GPL(marker_probe_cb);
/*
* marker_probe_cb Callback that does not prepare the variable argument list.
* @mdata: pointer of type struct marker
* @call_private: caller site private data
* @fmt: format string
* @...: Variable argument list.
*
* Should be connected to markers "MARK_NOARGS".
*/
void marker_probe_cb_noarg(const struct marker *mdata,
void *call_private, const char *fmt, ...)
{
va_list args; /* not initialized */
char ptype;
preempt_disable();
ptype = ACCESS_ONCE(mdata->ptype);
if (likely(!ptype)) {
marker_probe_func *func;
/* Must read the ptype before ptr. They are not data dependant,
* so we put an explicit smp_rmb() here. */
smp_rmb();
func = ACCESS_ONCE(mdata->single.func);
/* Must read the ptr before private data. They are not data
* dependant, so we put an explicit smp_rmb() here. */
smp_rmb();
func(mdata->single.probe_private, call_private, fmt, &args);
} else {
struct marker_probe_closure *multi;
int i;
/*
* multi points to an array, therefore accessing the array
* depends on reading multi. However, even in this case,
* we must insure that the pointer is read _before_ the array
* data. Same as rcu_dereference, but we need a full smp_rmb()
* in the fast path, so put the explicit barrier here.
*/
smp_read_barrier_depends();
multi = ACCESS_ONCE(mdata->multi);
for (i = 0; multi[i].func; i++)
multi[i].func(multi[i].probe_private, call_private, fmt,
&args);
}
preempt_enable();
}
EXPORT_SYMBOL_GPL(marker_probe_cb_noarg);
static void free_old_closure(struct rcu_head *head)
{
struct marker_entry *entry = container_of(head,
struct marker_entry, rcu);
kfree(entry->oldptr);
/* Make sure we free the data before setting the pending flag to 0 */
smp_wmb();
entry->rcu_pending = 0;
}
static void debug_print_probes(struct marker_entry *entry)
{
int i;
if (!marker_debug)
return;
if (!entry->ptype) {
printk(KERN_DEBUG "Single probe : %p %p\n",
entry->single.func,
entry->single.probe_private);
} else {
for (i = 0; entry->multi[i].func; i++)
printk(KERN_DEBUG "Multi probe %d : %p %p\n", i,
entry->multi[i].func,
entry->multi[i].probe_private);
}
}
static struct marker_probe_closure *
marker_entry_add_probe(struct marker_entry *entry,
marker_probe_func *probe, void *probe_private)
{
int nr_probes = 0;
struct marker_probe_closure *old, *new;
WARN_ON(!probe);
debug_print_probes(entry);
old = entry->multi;
if (!entry->ptype) {
if (entry->single.func == probe &&
entry->single.probe_private == probe_private)
return ERR_PTR(-EBUSY);
if (entry->single.func == __mark_empty_function) {
/* 0 -> 1 probes */
entry->single.func = probe;
entry->single.probe_private = probe_private;
entry->refcount = 1;
entry->ptype = 0;
debug_print_probes(entry);
return NULL;
} else {
/* 1 -> 2 probes */
nr_probes = 1;
old = NULL;
}
} else {
/* (N -> N+1), (N != 0, 1) probes */
for (nr_probes = 0; old[nr_probes].func; nr_probes++)
if (old[nr_probes].func == probe
&& old[nr_probes].probe_private
== probe_private)
return ERR_PTR(-EBUSY);
}
/* + 2 : one for new probe, one for NULL func */
new = kzalloc((nr_probes + 2) * sizeof(struct marker_probe_closure),
GFP_KERNEL);
if (new == NULL)
return ERR_PTR(-ENOMEM);
if (!old)
new[0] = entry->single;
else
memcpy(new, old,
nr_probes * sizeof(struct marker_probe_closure));
new[nr_probes].func = probe;
new[nr_probes].probe_private = probe_private;
entry->refcount = nr_probes + 1;
entry->multi = new;
entry->ptype = 1;
debug_print_probes(entry);
return old;
}
static struct marker_probe_closure *
marker_entry_remove_probe(struct marker_entry *entry,
marker_probe_func *probe, void *probe_private)
{
int nr_probes = 0, nr_del = 0, i;
struct marker_probe_closure *old, *new;
old = entry->multi;
debug_print_probes(entry);
if (!entry->ptype) {
/* 0 -> N is an error */
WARN_ON(entry->single.func == __mark_empty_function);
/* 1 -> 0 probes */
WARN_ON(probe && entry->single.func != probe);
WARN_ON(entry->single.probe_private != probe_private);
entry->single.func = __mark_empty_function;
entry->refcount = 0;
entry->ptype = 0;
debug_print_probes(entry);
return NULL;
} else {
/* (N -> M), (N > 1, M >= 0) probes */
for (nr_probes = 0; old[nr_probes].func; nr_probes++) {
if ((!probe || old[nr_probes].func == probe)
&& old[nr_probes].probe_private
== probe_private)
nr_del++;
}
}
if (nr_probes - nr_del == 0) {
/* N -> 0, (N > 1) */
entry->single.func = __mark_empty_function;
entry->refcount = 0;
entry->ptype = 0;
} else if (nr_probes - nr_del == 1) {
/* N -> 1, (N > 1) */
for (i = 0; old[i].func; i++)
if ((probe && old[i].func != probe) ||
old[i].probe_private != probe_private)
entry->single = old[i];
entry->refcount = 1;
entry->ptype = 0;
} else {
int j = 0;
/* N -> M, (N > 1, M > 1) */
/* + 1 for NULL */
new = kzalloc((nr_probes - nr_del + 1)
* sizeof(struct marker_probe_closure), GFP_KERNEL);
if (new == NULL)
return ERR_PTR(-ENOMEM);
for (i = 0; old[i].func; i++)
if ((probe && old[i].func != probe) ||
old[i].probe_private != probe_private)
new[j++] = old[i];
entry->refcount = nr_probes - nr_del;
<