Purpose
BPF programs can easily access values of:
- Global variables (via libbpf’s ksym() and /proc/kallsyms symbol addresses)
- Function arguments (via PT_REGS_PARMx())
- Function return values (via PT_REGS_RC())
However, local variables in kernel functions remain difficult to access. This blog post explains a technique that allows BPF programs to access local variables.
To illustrate this we will consider the case of measuring the length of the list at xfs_inode->i_ioend_list
from inside xfs_end_io()
.
Modifications made to xfs_end_io()
To demonstrate the technique, we will define a new local variable called nr_entries
in xfs_end_io()
. The value of the variable is incremented as we process each entry in the list at xfs_inode->i_ioend_list
.
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 9d6a67c7d227..cecc39ed3517 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -150,7 +150,7 @@ xfs_end_ioend( * to complete it. Hence we have to be careful about holding the CPU for too * long in this loop. */ -void +void __attribute__((optimize("O0"))) xfs_end_io( struct work_struct *work) { @@ -159,6 +159,7 @@ xfs_end_io( struct iomap_ioend *ioend; struct list_head tmp; unsigned long flags; + int nr_entries = 0; spin_lock_irqsave(&ip->i_ioend_lock, flags); list_replace_init(&ip->i_ioend_list, &tmp); @@ -167,11 +168,14 @@ xfs_end_io( iomap_sort_ioends(&tmp); while ((ioend = list_first_entry_or_null(&tmp, struct iomap_ioend, io_list))) { + ++nr_entries; list_del_init(&ioend->io_list); iomap_ioend_try_merge(ioend, &tmp); xfs_end_ioend(ioend); cond_resched(); } + + return; } STATIC void
To find the length of the list, we need to read the value of nr_entries
just before the return
statement is executed.
Using perf to obtain the offset of the return
statement.
-
Find the line number of the return statement as determined by Perf.
# perf probe -L xfs_end_io -k ./vmlinux <xfs_end_io@/data/linux/fs/xfs/xfs_aops.c:0> 0 xfs_end_io( struct work_struct *work) { 3 struct xfs_inode *ip = 4 container_of(work, struct xfs_inode, i_ioend_work); 5 struct iomap_ioend *ioend; struct list_head tmp; unsigned long flags; int nr_entries = 0; 10 spin_lock_irqsave(&ip->i_ioend_lock, flags); list_replace_init(&ip->i_ioend_list, &tmp); 12 spin_unlock_irqrestore(&ip->i_ioend_lock, flags); iomap_sort_ioends(&tmp); 15 while ((ioend = list_first_entry_or_null(&tmp, struct iomap_ioend, io_list))) { 17 ++nr_entries; 18 list_del_init(&ioend->io_list); 19 iomap_ioend_try_merge(ioend, &tmp); 20 xfs_end_ioend(ioend); 21 cond_resched(); } 24 return; }
The relative line number
24
will be used in the next step. -
Find the function offset of the
return
statement.# perf probe -V xfs_end_io:24 -k ./vmlinux Available variables at xfs_end_io:24 @<xfs_end_io+283> int nr_entries long unsigned int flags struct iomap_ioend* ioend struct list_head tmp struct work_struct* work struct xfs_inode* ip
The above output indicates that the
return
statement maps to function offset xfs_end_io+283
. It also confirms that the variable nr_entries
is indeed accessible at that offset.
Find the register/memory location of nr_entries
.
Execute GDB’s disassemble
command to find the location of nr_entries
.
# gdb --batch -ex 'disassemble/m xfs_end_io' ./vmlinux Dump of assembler code for function xfs_end_io: 156 { 0xffffffff8146e65b <+0>: callq 0xffffffff81075d50 <__fentry__> 0xffffffff8146e660 <+5>: push %rbp 0xffffffff8146e661 <+6>: mov %rsp,%rbp 0xffffffff8146e664 <+9>: sub $0x78,%rsp 0xffffffff8146e668 <+13>: mov %rdi,-0x78(%rbp) 0xffffffff8146e66c <+17>: mov %gs:0x28,%rax 0xffffffff8146e675 <+26>: mov %rax,-0x8(%rbp) 0xffffffff8146e679 <+30>: xor %eax,%eax 157 struct xfs_inode *ip = 158 container_of(work, struct xfs_inode, i_ioend_work); 0xffffffff8146e67b <+32>: mov -0x78(%rbp),%rax 0xffffffff8146e67f <+36>: mov %rax,-0x68(%rbp) 0xffffffff8146e683 <+40>: mov -0x68(%rbp),%rax 0xffffffff8146e687 <+44>: sub $0x378,%rax 0xffffffff8146e68d <+50>: mov %rax,-0x60(%rbp) 159 struct iomap_ioend *ioend; 160 struct list_head tmp; 161 unsigned long flags; 162 int nr_entries = 0; 0xffffffff8146e691 <+54>: movl $0x0,-0x6c(%rbp) 163 164 spin_lock_irqsave(&ip->i_ioend_lock, flags); 0xffffffff8146e698 <+61>: mov -0x60(%rbp),%rax 0xffffffff8146e69c <+65>: add $0x370,%rax 0xffffffff8146e6a2 <+71>: mov %rax,-0x20(%rbp) 165 list_replace_init(&ip->i_ioend_list, &tmp); 0xffffffff8146e6b6 <+91>: mov -0x60(%rbp),%rax 0xffffffff8146e6ba <+95>: lea 0x398(%rax),%rdx 0xffffffff8146e6c1 <+102>: lea -0x18(%rbp),%rax 0xffffffff8146e6c5 <+106>: mov %rax,%rsi 0xffffffff8146e6c8 <+109>: mov %rdx,%rdi 0xffffffff8146e6cb <+112>: callq 0xffffffff8146f0ce <list_replace_init> [...] [...] 
171 ++nr_entries; 0xffffffff8146e707 <+172>: addl $0x1,-0x6c(%rbp) 172 list_del_init(&ioend->io_list); 0xffffffff8146e70b <+176>: mov -0x30(%rbp),%rax 0xffffffff8146e70f <+180>: mov %rax,%rdi 0xffffffff8146e712 <+183>: callq 0xffffffff8146f0eb <list_del_init> 173 iomap_ioend_try_merge(ioend, &tmp); 0xffffffff8146e717 <+188>: lea -0x18(%rbp),%rdx 0xffffffff8146e71b <+192>: mov -0x30(%rbp),%rax 0xffffffff8146e71f <+196>: mov %rdx,%rsi 0xffffffff8146e722 <+199>: mov %rax,%rdi 0xffffffff8146e725 <+202>: callq 0xffffffff813ce0f0 <iomap_ioend_try_merge> 174 xfs_end_ioend(ioend); 0xffffffff8146e72a <+207>: mov -0x30(%rbp),%rax 0xffffffff8146e72e <+211>: mov %rax,%rdi 0xffffffff8146e731 <+214>: callq 0xffffffff8146f263 <xfs_end_ioend> 175 cond_resched(); 176 } 177 178 return; 0xffffffff8146e776 <+283>: nop 0xffffffff8146e777 <+284>: mov -0x8(%rbp),%rax 0xffffffff8146e77b <+288>: xor %gs:0x28,%rax 0xffffffff8146e784 <+297>: je 0xffffffff8146e78b <xfs_end_io+304> 0xffffffff8146e786 <+299>: callq 0xffffffff81c9dd20 <__stack_chk_fail> 0xffffffff8146e78b <+304>: leaveq 0xffffffff8146e78c <+305>: retq End of assembler dump.
The above output shows that nr_entries
is available at memory location %rbp - 0x6c
.
BPF programs to measure the length of xfs_inode->i_ioend_list
.
Armed with the information obtained from the previous steps, we can now write a BCC/BPF program to measure the length of the list at xfs_inode->i_ioend_list
.
-
The following BCC Python program attaches a kprobe at offset
xfs_end_io+283
and prints statistics associated with the value of nr_entries
#!/usr/bin/python
"""Run a workload and report how long the xfs_end_io() I/O list was.

The script loads the BPF program in io-list-len.c, attaches it as a kprobe
at xfs_end_io+283, runs the workload given with -c, and then prints, for
each list size from 1 to MAX_IO_LIST_LEN, how many times that size was
observed and what percentage of all samples it represents.
"""
import argparse
import subprocess
import shlex
import time

import tabulate
from bcc import BPF

# Number of buckets in the "io_list_len" BPF array.
# Must match MAX_IO_LIST_LEN in io-list-len.c.
MAX_IO_LIST_LEN = 10

parser = argparse.ArgumentParser(description="Compute IO list's length")
parser.add_argument("-c", dest="cmdline", help="Workload's command line",
                    required=True)
args = parser.parse_args()

b = BPF(src_file="io-list-len.c")
# xfs_end_io+283 is the instruction that perf maps to the "return" statement.
b.attach_kprobe(event="xfs_end_io+283", fn_name="trace_xfs_end_io")

# Run the workload to completion; its stdout is discarded.
cmdline = shlex.split(args.cmdline)
proc = subprocess.Popen(cmdline, stdout=subprocess.DEVNULL)
proc.wait()

# Read every bucket once so the total and the per-row counts are consistent
# even if the probe keeps firing while the table is being built.
io_list_len = b["io_list_len"]
samples = [io_list_len[i].value for i in range(MAX_IO_LIST_LEN)]
total = sum(samples)

table = [["List size", "Nr samples", "Percent"]]
for i, nr_samples in enumerate(samples):
    # Bucket i holds the number of lists that had i + 1 entries.
    # Guard against a run in which the probe never fired (total == 0).
    percent = round((nr_samples * 100) / total, 2) if total else 0.0
    table.append([i + 1, nr_samples, percent])

print(tabulate.tabulate(table, headers='firstrow'))
The BCC Python program does the following:
- Adds a probe point at location
xfs_end_io+283
. - Executes the workload.
- Computes & prints statistics associated with the list length.
- Adds a probe point at location
-
The following is the corresponding C program which is executed when the kprobe is reached.
#include <uapi/linux/ptrace.h>
#include <linux/sched.h>

/* Number of histogram buckets; bucket i counts lists with i + 1 entries. */
#define MAX_IO_LIST_LEN 10

/*
 * Distance below %rbp at which nr_entries is stored, i.e. -0x6c(%rbp) in
 * the disassembly of the O0-compiled xfs_end_io(). This value is specific
 * to that build and must be re-derived whenever the kernel is rebuilt.
 */
#define NR_ENTRIES_RBP_OFFSET 0x6c

BPF_ARRAY(io_list_len, u64, MAX_IO_LIST_LEN);

/*
 * Kprobe handler attached at xfs_end_io+283 (the "return" statement).
 *
 * Reads the local variable nr_entries from the probed function's stack
 * frame and bumps the matching bucket of io_list_len. Samples that cannot
 * be read, or whose value falls outside 1..MAX_IO_LIST_LEN, are dropped.
 *
 * Always returns 0.
 */
int trace_xfs_end_io(struct pt_regs *ctx)
{
	u64 rbp = ctx->bp;
	u64 *valp;
	int nr_entries;
	int idx;

	/* Do not account a sample if the stack slot could not be read. */
	if (bpf_probe_read_kernel(&nr_entries, sizeof(nr_entries),
				  (void *)(rbp - NR_ENTRIES_RBP_OFFSET)) != 0) {
		return 0;
	}

	/*
	 * Reject both ends of the range before converting to an index: an
	 * empty list (nr_entries == 0) would otherwise produce key -1.
	 */
	if (nr_entries < 1 || nr_entries > MAX_IO_LIST_LEN) {
		return 0;
	}
	idx = nr_entries - 1;

	valp = io_list_len.lookup(&idx);
	if (valp != NULL) {
		/* Atomic add: the probe may fire concurrently on several CPUs. */
		lock_xadd(valp, 1);
	}

	return 0;
}
The C program does the following:
- Reads the data stored at location
rbp - 0x6c
. - Updates the statistics stored in a BPF array.
- Reads the data stored at location
Sample output
The following illustrates a sample execution of the BPF program.
# ./io-list-len.py -c 'xfs_io -f -c "pwrite -b 4k 0 10M" -c sync "pwrite -b 4k 10M 40M" /mnt/file.bin' List size Nr samples Percent ----------- ------------ --------- 1 12 70.59 2 5 29.41 3 0 0 4 0 0 5 0 0 6 0 0 7 0 0 8 0 0 9 0 0 10 0 0
Conclusion
As illustrated in this article, obtaining access to local variables is slightly complex. But in some cases this is the only way to gain insight into the problem being analyzed.