#!/bin/ksh # fsmode - Trace filesystem I/O to establish random and sequential % # # Copyright (c) 2011, timothy.cook@oracle.com. See LICENSE below. # # DESCRIPTION # # This script uses the DTrace syscall, fsinfo and fbt providers # to trace system-wide filesystem I/O and compute how much I/O # is random versus sequential; meaning it happens at the file # offset following the last byte of the previous read or write. # # Some types of metadata operations are tracked as well, and # statistics are reported by mounted filesystem. # # NOTES # # There are some types of filesystem I/O - most notably that on # mmap()-ed files, for which tracing of file offset of an actual # access is much harder - in particular there are no read(), # write(), fop_read(), fop_write() calls at which the offset can # be observed. Filesystem I/O for which only fop_getpage() and/or # fop_putpage() activity is observed is reported under a third # class of I/O named "page". # # The DTrace probes will only catch I/O relating to filesystems # via the VFS interface in Solaris. It will not report on "raw" # device I/O, including for example ASM. This is probably not # hard to do however. # # With a small amount of work, the DTrace could be modified to # filter based on UID, PID, project, etc. # # The fsinfo DTrace provider was added to Solaris 10 6/06 # (Update 2). This utility therefore requires this version of # Solaris 10 as a minimum. # # LICENSE # This work is licensed under Artistic License 2.0, # Copyright (c) 2000-2006, The Perl Foundation. Details at # http://opensource.org/licenses/Artistic-2.0 # # WEBSITE # This script is described and archived at my blog, via this # permanent link: # https://blogs.oracle.com/timc/entry/random_or_sequential_i_o # #------------------------------------------------------------------------ case $# in 1 ) ELAPSED=$1 ;; * ) echo "usage: $0 " >&2 ; exit 1 ;; esac #SU=pfexec SU= $SU /usr/sbin/dtrace -n ' /* * fsmode.d - Collect stats on file access mode (random v sequential) * * Timothy.Cook@oracle.com, 22 Mar 2011 */ /* * syscall::{read,readv,pread}:entry (same for *write*) * - Trace thread until fbt::fop_read:entry * * fbt::fop_read:entry (same for fop_write) * - args[0] is a struct vnode * * will be same value (address) as the "fileinfo_t" provided * by fsinfo::: probes * - args[1] is a struct uio *, which provides: * _uio_offset._f (long long) * uio_iovcnt (int) * * fsinfo::fop_read:read (etc) * - argo is a "fileinfo_t"[1], which provides * string fi_name; name (basename of fi_pathname) * string fi_dirname; directory (dirname of fi_pathname) * string fi_pathname; full pathname * string fi_fs; filesystem * string fi_mount; mount point of file system * * [1] - actually translated from a vnode *, so no fi_offset provided * => http://bugs.opensolaris.org/bugdatabase/view_bug.do?bug_id=6405662 * * syscall::lseek:return * - Change of file pointer without I/O (new offset is arg0 unless it is -1) * * syscall::{read,readv,pread}:return (same for write) * - Provides number of bytes read (unless -1) * - Use this to determine where file ptr went to. * * syscall::{aioread,aiowrite}: * int aioread(int fildes, char *bufp, int bufs, off_t offset, * int whence, aio_result_t *resultp); 36 typedef struct aio_result_t { 37 ssize_t aio_return; // return value of read or write 38 int aio_errno; // errno generated by the IO 39 } aio_result_t; * * - These do not generate fop_read/fop_write (just fop_getpage) * * syscall::aiowait:return * aio_result_t *aiowait(const struct timeval *timeout); * * fsinfo::{various} * - Count metadata ops (do not count rwlock,rwunlock, as these * happen for every read and write) */ #pragma D option quiet #pragma D option dynvarsize=16m /* */ /* #define VREG 1 */ inline int VREG = 1; BEGIN { seconds = $1; start = walltimestamp; printf("Sample interval: %d seconds\n", seconds); self->tracerw = 0; self->counted = 0; self->fsrw = 0; no_fsrw = 0; self->ptr[0] = -1; } /* ======================================================================== * 1: Read/write via fop_read, fop_write */ /* * Start tracing at the syscall - assume we will be the same thread until * entering fbt::fop_read. * * Need to match: read, pread, pread64 and readv (same for *write*) */ syscall::*read:entry,syscall::*read64:entry,syscall::readv:entry, syscall::*write:entry,syscall::*write64:entry,syscall::writev:entry { self->fd = arg0; self->tracerw = 1; self->fsinfo = 0; } /* We can set/reset the pointer on file open & close */ syscall::open*:return { self->ptr[arg0] = 0; } syscall::close*:entry { self->ptr[arg0] = -1; } /* Skip ops on non-filesystem vnodes */ fbt::fop_read:entry,fbt::fop_write:entry /self->tracerw && args[0]->v_type != VREG/ { /* vnode type != VREG */ self->tracerw = 0; } fbt::fop_read:entry,fbt::fop_write:entry /self->tracerw/ { self->rwfunc = probefunc; self->vp = args[0]; self->vtype = self->vp->v_type; self->offset = args[1]->_uio_offset._f; } fsinfo::fop_read:read,fsinfo::fop_write:write /self->tracerw/ { self->fsrw += 1; } fsinfo::: /self->tracerw/ { self->fs = args[0]->fi_mount; } fbt::fop_read:return,fbt::fop_write:return /self->tracerw && self->fsrw == 0/ { /* Something in my logic has failed */ @vreg_no_fsrw[execname, self->fs, arg0] = count(); no_fsrw++; } syscall::*read:return,syscall::*read64:return,syscall::readv:return /self->tracerw && arg0 != -1/ { /* Successful read */ self->mode = (self->offset == self->ptr[self->fd]); @rd_reads[self->fs, self->mode] = count(); @rd_bytes[self->fs, self->mode] = sum(arg0); self->ptr[self->fd] = self->offset + arg0; } syscall::*write:return,syscall::*write64:return,syscall::writev:return /self->tracerw && arg0 != -1/ { /* Successful read */ self->mode = (self->offset == self->ptr[self->fd]); @wr_writes[self->fs, self->mode] = count(); @wr_bytes[self->fs, self->mode] = sum(arg0); self->ptr[self->fd] = self->offset + arg0; } syscall::*read:return,syscall::*read64:return,syscall::readv:return, syscall::*write:return,syscall::*write64:return,syscall::writev:return /self->tracerw/ { self->fd = -1; self->tracerw = 0; self->fsrw = 0; self->vp = 0; self->mode = 0; self->path = 0; self->vtype = 0; self->fs = 0; self->rwfunc = 0; self->bytes = 0; self->pageio = 0; } /* ======================================================================== * 2: fop_getpage(), fop_putpage() * * These two vnode ops usually relate to I/O on mmap()-ed files, assuming * the putpage/getpage is NOT associated with a read/write. */ /* * getpage/putpage - vnode_t * is arg0 * * Intent here is to capture pagein/out activity not related to * read/write. */ fbt::fop_getpage:entry,fbt::fop_putpage:entry /args[0]->v_type == VREG && self->tracerw == 0/ { self->bytes = arg2; self->pageio = 1; } fbt::fop_getpage:entry,fbt::fop_putpage:entry /args[0]->v_type == VREG && self->tracerw/ { /* * This can happen, e.g. - to map pages of a file prior to * modifying them in a write(). */ @rw_with_pageio[self->rwfunc, probefunc] = count(); rw_with_page = 1; } fsinfo::fop_getpage:getpage /self->pageio/ { @rd_pages[args[0]->fi_mount] = count(); @rd_pagebytes[args[0]->fi_mount] = sum(self->bytes); self->pageio = 0; self->bytes = 0; } fsinfo::fop_putpage:putpage /self->pageio/ { @wr_pages[args[0]->fi_mount] = count(); @wr_pagebytes[args[0]->fi_mount] = sum(self->bytes); self->pageio = 0; self->bytes = 0; } /* ======================================================================== * 3: Other FS ops * * These relate mostly to metadata */ fsinfo::fop_create:create,fsinfo::fop_remove:remove /self->tracerw == 0/ { @other_ops[args[0]->fi_mount, "cr/rm"] = count(); self->counted = 1; } fsinfo::fop_access:access,fsinfo::fop_lookup:lookup /self->tracerw == 0/ { @other_ops[args[0]->fi_mount, "acc/look"] = count(); self->counted = 1; } fsinfo::fop_setattr:setattr,fsinfo::fop_getattr:getattr /self->tracerw == 0/ { @other_ops[args[0]->fi_mount, "seta/geta"] = count(); self->counted = 1; } fsinfo::fop_readdir:readdir /self->tracerw == 0/ { @other_ops[args[0]->fi_mount, "readdir"] = count(); self->counted = 1; } fsinfo::: /self->tracerw == 0 && self->counted == 0/ { @other_ops[args[0]->fi_mount, "other"] = count(); } fsinfo::: { self->counted = 0; } tick-1s /--seconds == 0/ { exit(0); } END { this->finish = walltimestamp; printf("End time: %Y\n", this->finish); printf("Elapsed: %ld msec\n", (this->finish - start) / 1000000); printa("READ ops [%s] %d %@lu\n", @rd_reads); printa("WRITE ops [%s] %d %@lu\n", @wr_writes); printa("READ ops [%s] 2 %@lu\n", @rd_pages); printa("WRITE ops [%s] 2 %@lu\n", @wr_pages); printa("READ bytes [%s] %d %@ld\n", @rd_bytes); printa("WRITE bytes [%s] %d %@ld\n", @wr_bytes); printa("READ bytes [%s] 2 %@ld\n", @rd_pagebytes); printa("WRITE bytes [%s] 2 %@ld\n", @wr_pagebytes); printa("OTHER ops [%s] %s %@lu\n", @other_ops); } END /no_fsrw/ { printf("\nWARNING: RW ops on VREG vnodes with no fsinfo read/write:\n"); printa("execname = %s, path = %s, arg0=%d, count=%@lu\n", @vreg_no_fsrw); } END /rw_with_page/ { printf("\nFYI: Page I/O on VREG vnodes after read/write:"); printa(@rw_with_pageio); }' $ELAPSED | perl -e ' # fsmodepost.pl - Post-process output of fsmode3.d # # Tim Cook, 23 Mar 2011 use strict; use warnings; #use diagnostics; our (%Stats, %Other); our $Elapsed = 0.0; #------------------------------------------------------------------------ sub fmtdec { my $dec = shift; my $width = shift; my ($rw, $metric, $mount, $mode, $value, $index, $op); # Special case for zero if ($dec == 0.0) { return (sprintf("%*d", $width, 0)); } my $intdigits = int(log($dec) / log(10.0)) + 1; $intdigits = 1 if ($intdigits <= 0); my $fracdigits = 0; if (($intdigits + 2) <= $width) { $fracdigits = $width - $intdigits - 1; } return (sprintf("%*.*f", $width, $fracdigits, $dec)); } #------------------------------------------------------------------------ sub process_input { my ($rw, $metric, $mount, $mode, $value, $index, $op); my @s; our (%Stats, %Other); while (<>) { if (/^Elapsed:\s* (\d+) msec/) { $Elapsed = $1 / 1000.0; next; } if (/(READ|WRITE)\s+(ops|bytes)\s+\[(.*)\]\s+(\d+)\s+(\d+)/) { ($rw, $metric, $mount, $mode, $value) = ($1, $2, $3, $4, $5); $mount =~ s/\s+$//; $index = 0; if ($rw eq "WRITE") { # 6 stats each for READ $ WRITE $index += 6; } if ($metric eq "bytes") { # 3 each for ops & bytes $index += 3; } $index += $mode; if (defined($Stats{$mount})) { @s = @{$Stats{$mount}}; } else { @s = (); } $s[$index] = $value; @{$Stats{$mount}} = @s; next; } if (/OTHER\s+ops\s+\[(.*)\]\s+(\S+)\s+(\d+)/) { ($mount, $op, $value) = ($1, $2, $3); $Other{$mount}{$op} = $value; next; } print; } } # READ WRITE # OPS BYTES OPS BYTES # RND SEQ PAG RND SEQ PAG RND SEQ PAG RND SEQ PAG # 0 1 2 3 4 5 6 7 8 9 10 11 #------------------------------------------------------------------------ sub output_stats { my ($mount, $value, @s, $k, $i); my ($totRd, $totWr, $totRdK, $totWrK); our (%Stats, %Other); foreach $mount (sort(keys(%main::Stats))) { @s = @{$Stats{$mount}}; for ($i = 0; $i < 12; $i++) { if (defined($s[$i])) { $value = $s[$i]; } else { $value = 0.0; } if (($i % 6) > 2 && ($value > 0.0)) { $value = $value / 1024.0; } $s[$i] = $value / $Elapsed; } foreach $k (("acc/look", "readdir", "geta/seta", "cr/rm", "other")) { if (! defined($Other{$mount}{$k})) { $Other{$mount}{$k} = 0; } else { $Other{$mount}{$k} = $Other{$mount}{$k} / $Elapsed; } } printf("\n%-26s rd/s wr/s rdKB/s wrKB/s acc/look: %s\n", $mount, fmtdec($Other{$mount}{"acc/look"}, 7)); printf(" sequential %s %s %s %s readdir: %s\n", fmtdec($s[1], 7), fmtdec($s[7], 7), fmtdec($s[4], 8), fmtdec($s[10], 8), fmtdec($Other{$mount}{"readdir"}, 7)); printf(" random %s %s %s %s geta/seta: %s\n", fmtdec($s[0], 7), fmtdec($s[6], 7), fmtdec($s[3], 8), fmtdec($s[9], 8), fmtdec($Other{$mount}{"geta/seta"}, 7)); printf(" page %s %s %s %s cr/rm: %s\n", fmtdec($s[2], 7), fmtdec($s[8], 7), fmtdec($s[5], 8), fmtdec($s[11], 8), fmtdec($Other{$mount}{"cr/rm"}, 7)); $totRd = $s[2] + $s[0] + $s[1]; $totWr = $s[7] + $s[6] + $s[8]; $totRdK = $s[4] + $s[3] + $s[5]; $totWrK = $s[10] + $s[9] + $s[11]; printf(" TOTAL %s %s %s %s other: %s\n", fmtdec($totRd, 7), fmtdec($totWr, 7), fmtdec($totRdK, 8), fmtdec($totWrK, 8), fmtdec($Other{$mount}{"other"}, 7)); # foreach $key (keys(%{ $Other{$mount} })) { # printf(" %-25s %s -> %ld\n", $mount, $key, $Other{$mount}{$key}); # } } } #======================================================================== # MAIN process_input(); output_stats(); '