Accessing local variable values in BPF

May 2, 2023 | 10 minute read
Text Size 100%:

Purpose

BPF programs can easily access values of:

  • Global variables (via libbpf’s ksym() and /proc/kallsyms symbol addresses)
  • Function arguments (via PT_REGS_PARMx())
  • Function return values (via PT_REGS_RC())

However, local variables in kernel functions remain difficult to access. This blog explains a technique that allows BPF programs to access local variables.

To illustrate this we will consider the case of measuring the length of the list at xfs_inode->i_ioend_list from inside xfs_end_io().

Modifications made to xfs_end_io()

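To demonstrate the technique, we will define a new local variable called nr_entries in xfs_end_io(). The value of the variable is incremented as we process each entry in the list at xfs_inode->i_ioend_list. The function is also marked with __attribute__((optimize("O0"))) so that the compiler does not optimize the variable away, and an explicit return statement is added at the end of the function to give us a convenient location at which to attach a probe.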

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 9d6a67c7d227..cecc39ed3517 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -150,7 +150,7 @@ xfs_end_ioend(
 * to complete it. Hence we have to be careful about holding the CPU for too
 * long in this loop.
 */
-void
+void __attribute__((optimize("O0")))
 xfs_end_io(
     struct work_struct   *work)
 {
@@ -159,6 +159,7 @@ xfs_end_io(
     struct iomap_ioend   *ioend;
     struct list_head     tmp;
     unsigned long        flags;
+    int                  nr_entries = 0;

     spin_lock_irqsave(&ip->i_ioend_lock, flags);
     list_replace_init(&ip->i_ioend_list, &tmp);
@@ -167,11 +168,14 @@ xfs_end_io(
     iomap_sort_ioends(&tmp);
     while ((ioend = list_first_entry_or_null(&tmp, struct iomap_ioend,
             io_list))) {
+        ++nr_entries;
         list_del_init(&ioend->io_list);
         iomap_ioend_try_merge(ioend, &tmp);
         xfs_end_ioend(ioend);
         cond_resched();
     }
+
+    return;
 }

STATIC void

To find the length of the list, we need to read the value of nr_entries just before the return statement is executed.

Using perf to obtain the offset of the return statement.

  1. Find the line number of the return statement as determined by perf.

    # perf probe -L xfs_end_io -k ./vmlinux
    <xfs_end_io@/data/linux/fs/xfs/xfs_aops.c:0>
      0  xfs_end_io(
            struct work_struct   *work)
         {
      3      struct xfs_inode    *ip =
      4          container_of(work, struct xfs_inode, i_ioend_work);
      5      struct iomap_ioend  *ioend;
             struct list_head    tmp;
             unsigned long       flags;
             int                 nr_entries = 0;
    
      10     spin_lock_irqsave(&ip->i_ioend_lock, flags);
             list_replace_init(&ip->i_ioend_list, &tmp);
      12     spin_unlock_irqrestore(&ip->i_ioend_lock, flags);
    
             iomap_sort_ioends(&tmp);
      15     while ((ioend = list_first_entry_or_null(&tmp, struct iomap_ioend,
                     io_list))) {
      17         ++nr_entries;
      18         list_del_init(&ioend->io_list);
      19         iomap_ioend_try_merge(ioend, &tmp);
      20         xfs_end_ioend(ioend);
      21         cond_resched();
             }
    
      24     return;
         }

    The relative line number 24 (counted from the start of xfs_end_io()) will be used in the next step.

  2. Find the function offset of the return statement.

    # perf probe -V xfs_end_io:24 -k ./vmlinux
    Available variables at xfs_end_io:24
        @<xfs_end_io+283>
            int  nr_entries
            long unsigned int  flags
            struct iomap_ioend*  ioend
            struct list_head  tmp
            struct work_struct*  work
            struct xfs_inode*  ip

    The above output indicates that the return statement maps to function offset xfs_end_io+283. It also confirms that the variable nr_entries is indeed accessible at that offset.

Find the register/memory location of nr_entries.

Execute GDB’s disassemble command to find the location of nr_entries.

# gdb --batch -ex 'disassemble/m xfs_end_io' ./vmlinux
Dump of assembler code for function xfs_end_io:
156  {
   0xffffffff8146e65b <+0>:    callq  0xffffffff81075d50 <__fentry__>
   0xffffffff8146e660 <+5>:    push   %rbp
   0xffffffff8146e661 <+6>:    mov    %rsp,%rbp
   0xffffffff8146e664 <+9>:    sub    $0x78,%rsp
   0xffffffff8146e668 <+13>:   mov    %rdi,-0x78(%rbp)
   0xffffffff8146e66c <+17>:   mov    %gs:0x28,%rax
   0xffffffff8146e675 <+26>:   mov    %rax,-0x8(%rbp)
   0xffffffff8146e679 <+30>:   xor    %eax,%eax

157    struct xfs_inode  *ip =

158        container_of(work, struct xfs_inode, i_ioend_work);
   0xffffffff8146e67b <+32>:   mov    -0x78(%rbp),%rax
   0xffffffff8146e67f <+36>:   mov    %rax,-0x68(%rbp)
   0xffffffff8146e683 <+40>:   mov    -0x68(%rbp),%rax
   0xffffffff8146e687 <+44>:   sub    $0x378,%rax
   0xffffffff8146e68d <+50>:   mov    %rax,-0x60(%rbp)

159    struct iomap_ioend  *ioend;

160    struct list_head    tmp;

161    unsigned long       flags;

162    int                 nr_entries = 0;
   0xffffffff8146e691 <+54>:  movl   $0x0,-0x6c(%rbp)

163
164    spin_lock_irqsave(&ip->i_ioend_lock, flags);
   0xffffffff8146e698 <+61>:   mov    -0x60(%rbp),%rax
   0xffffffff8146e69c <+65>:   add    $0x370,%rax
   0xffffffff8146e6a2 <+71>:   mov    %rax,-0x20(%rbp)

165    list_replace_init(&ip->i_ioend_list, &tmp);
   0xffffffff8146e6b6 <+91>:   mov    -0x60(%rbp),%rax
   0xffffffff8146e6ba <+95>:   lea    0x398(%rax),%rdx
   0xffffffff8146e6c1 <+102>:  lea    -0x18(%rbp),%rax
   0xffffffff8146e6c5 <+106>:  mov    %rax,%rsi
   0xffffffff8146e6c8 <+109>:  mov    %rdx,%rdi
   0xffffffff8146e6cb <+112>:  callq  0xffffffff8146f0ce <list_replace_init>

[...]

[...]

171        ++nr_entries;
   0xffffffff8146e707 <+172>:  addl   $0x1,-0x6c(%rbp)

172        list_del_init(&ioend->io_list);
   0xffffffff8146e70b <+176>:  mov    -0x30(%rbp),%rax
   0xffffffff8146e70f <+180>:  mov    %rax,%rdi
   0xffffffff8146e712 <+183>:  callq  0xffffffff8146f0eb <list_del_init>

173        iomap_ioend_try_merge(ioend, &tmp);
   0xffffffff8146e717 <+188>:  lea    -0x18(%rbp),%rdx
   0xffffffff8146e71b <+192>:  mov    -0x30(%rbp),%rax
   0xffffffff8146e71f <+196>:  mov    %rdx,%rsi
   0xffffffff8146e722 <+199>:  mov    %rax,%rdi
   0xffffffff8146e725 <+202>:  callq  0xffffffff813ce0f0 <iomap_ioend_try_merge>

174        xfs_end_ioend(ioend);
   0xffffffff8146e72a <+207>:  mov    -0x30(%rbp),%rax
   0xffffffff8146e72e <+211>:  mov    %rax,%rdi
   0xffffffff8146e731 <+214>:  callq  0xffffffff8146f263 <xfs_end_ioend>

175        cond_resched();
176    }
177
178    return;
   0xffffffff8146e776 <+283>:  nop
   0xffffffff8146e777 <+284>:  mov    -0x8(%rbp),%rax
   0xffffffff8146e77b <+288>:  xor    %gs:0x28,%rax
   0xffffffff8146e784 <+297>:  je     0xffffffff8146e78b <xfs_end_io+304>
   0xffffffff8146e786 <+299>:  callq  0xffffffff81c9dd20 <__stack_chk_fail>
   0xffffffff8146e78b <+304>:  leaveq
   0xffffffff8146e78c <+305>:  retq

End of assembler dump.

The above output shows that nr_entries is available at memory location %rbp - 0x6c.

BPF programs to measure the length of xfs_inode->i_ioend_list.

Armed with the information obtained from the previous steps, we can now write a BCC/BPF program to measure the length of the list at xfs_inode->i_ioend_list.

  • The following BCC Python program attaches a kprobe at offset xfs_end_io+283 and prints statistics associated with the value of nr_entries.

    #!/usr/bin/python
    
    import argparse
    import subprocess
    import shlex
    import time
    import tabulate
    from bcc import BPF
    
    # Number of slots in the io_list_len BPF array. This mirrors the value of
    # MAX_IO_LIST_LEN defined in io-list-len.c and must be kept in sync with it.
    MAX_IO_LIST_LEN = 10
    
    parser = argparse.ArgumentParser(description="Compute IO list's length")
    parser.add_argument("-c", dest="cmdline", help="Workload's command line",
                        required=True)
    args = parser.parse_args()
    
    # Load the BPF program and attach it at the return statement of xfs_end_io().
    b = BPF(src_file="io-list-len.c")
    b.attach_kprobe(event="xfs_end_io+283", fn_name="trace_xfs_end_io")
    
    # Run the workload to completion while the probe collects samples.
    cmdline = shlex.split(args.cmdline)
    proc = subprocess.Popen(cmdline, stdout=subprocess.DEVNULL)
    proc.wait()
    
    # Read every slot exactly once so that the total and the per-row figures
    # are computed from the same snapshot of the map.
    samples = [b["io_list_len"][i].value for i in range(MAX_IO_LIST_LEN)]
    total = sum(samples)
    
    table = [["List size", "Nr samples", "Percent"]]
    for i, nr_samples in enumerate(samples):
        # Guard against a division by zero when the probe never fired.
        percent = round((nr_samples * 100) / total, 2) if total else 0
        # Slot i holds the number of calls that processed i + 1 entries.
        table.append([i + 1, nr_samples, percent])
    
    print(tabulate.tabulate(table, headers='firstrow'))

    The BCC Python program does the following:

    1. Adds a probe point at location xfs_end_io+283.
    2. Executes the workload.
    3. Computes & prints statistics associated with the list length.
  • The following is the corresponding C program which is executed when the kprobe is reached.

    #include <uapi/linux/ptrace.h>
    #include <linux/sched.h>
    
    #define MAX_IO_LIST_LEN 10
    
    BPF_ARRAY(io_list_len, u64, MAX_IO_LIST_LEN);
    
    /*
     * kprobe handler that the companion Python script attaches at
     * xfs_end_io+283, i.e. at the return statement of xfs_end_io().
     *
     * Reads the probed function's local variable nr_entries from its stack
     * frame and increments the matching slot of io_list_len.  Slot i counts
     * the calls that processed i + 1 entries; calls that processed no entries
     * or more than MAX_IO_LIST_LEN entries are not recorded.
     *
     * ctx: register state captured at the probe point.
     *
     * Always returns 0.
     */
    int trace_xfs_end_io(struct pt_regs *ctx)
    {
        u64 rbp = ctx->bp;
        u64 *valp;
        int nr_entries = 0;
        int idx;
    
        /* The disassembly shows nr_entries stored at -0x6c(%rbp). */
        if (bpf_probe_read_kernel(&nr_entries, sizeof(nr_entries),
                                  (void *)(rbp - 0x6c)) < 0)
            return 0;   /* The read failed; there is nothing to record. */
    
        /* A list of length 1 is accounted in slot 0, and so on. */
        idx = nr_entries - 1;
        if (idx < 0 || idx >= MAX_IO_LIST_LEN)
            return 0;
    
        valp = io_list_len.lookup(&idx);
        if (valp == NULL)
            return 0;
    
        /*
         * The handler may run on several CPUs at once, so bump the counter
         * atomically instead of with a separate lookup and update.
         */
        (void)__sync_fetch_and_add(valp, 1);
    
        return 0;
    }

    The C program does the following:

    1. Reads the data stored at location rbp - 0x6c.
    2. Updates the statistics stored in a BPF array.

Sample output

The following illustrates a sample execution of the BPF program.

# ./io-list-len.py -c 'xfs_io -f -c "pwrite -b 4k 0 10M" -c sync -c "pwrite -b 4k 10M 40M" /mnt/file.bin'
  List size    Nr samples    Percent
-----------  ------------  ---------
          1            12      70.59
          2             5      29.41
          3             0       0
          4             0       0
          5             0       0
          6             0       0
          7             0       0
          8             0       0
          9             0       0
         10             0       0

Conclusion

As illustrated in this article, obtaining access to local variables is slightly complex. But in some cases this is the only way to gain insight into the problem being analyzed.

Chandan Babu


Previous Post

Tuning glibc malloc on ARM: A Case Study

Richard Smith | 6 min read

Next Post


A case study of QEMU and AddressSanitizer

Dongli Zhang | 4 min read