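# Report I/O sizes seen at several points in the block layer: submit_bio(),
# bio_iov_iter_get_pages(), request dispatch, and bio_split().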
from bcc import BPF
import glob
import os
import re
import time
import argparse
from time import sleep
import signal
import ctypes as ct

debug = 0

bpf_text = """
#include <uapi/linux/ptrace.h>
#include <linux/blkdev.h>
#include <linux/genhd.h>
#include <linux/device.h>
#include <linux/kdev_t.h>
#include <linux/uio.h>

typedef struct request_size_s {
    u64 size;
    u64 read;
} request_size_t;

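// One perf ring per probe point so userspace can label each event.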
BPF_PERF_OUTPUT(bio_events);
BPF_PERF_OUTPUT(iter_events);
BPF_PERF_OUTPUT(req_events);
BPF_PERF_OUTPUT(split_events);

// This sucks, but we have no better solution
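// part0 is an embedded hd_struct on the kernels this was written for, so the
// dev_t has to be pulled out of its embedded struct device by hand.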
static dev_t get_devt(struct request *req)
{
    struct gendisk *disk = req->rq_disk;
    return disk->part0.__dev.devt;
}

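// Size of each request as it is dispatched to the driver (attached to both
// blk_start_request() and blk_mq_start_request() below).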
int trace_req_start(struct pt_regs *ctx, struct request *req)
{
    dev_t device = get_devt(req);
    int major = MAJOR(device);
    int minor = MINOR(device);

    if (!(CONDITIONALS))
        return 0;
    request_size_t data = {
        .size = req->__data_len,
        .read = !(req->cmd_flags & 1),
    };
    req_events.perf_submit(ctx, &data, sizeof(data));
    return 0;
}

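// bio_split(): nr_sectors is the size of the front split in sectors, so
// shift by 9 to report bytes.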
int trace_bio_split(struct pt_regs *ctx, struct bio *bio, int nr_sectors)
{
    dev_t device = bio->bi_bdev->bd_disk->part0.__dev.devt;
    int major = MAJOR(device);
    int minor = MINOR(device);

    if (!(CONDITIONALS))
        return 0;
    request_size_t data = {
        .size = nr_sectors << 9,
        .read = !(bio->bi_opf & 1),
    };
    split_events.perf_submit(ctx, &data, sizeof(data));
    return 0;
}

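// Size of every bio as it enters the block layer through submit_bio().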
int trace_submit_bio(struct pt_regs *ctx, struct bio *bio)
{
    dev_t device = bio->bi_bdev->bd_disk->part0.__dev.devt;
    int major = MAJOR(device);
    int minor = MINOR(device);
    u64 count = bio->bi_iter.bi_size;

    if (!(CONDITIONALS))
        return 0;
    request_size_t data = {
        .size = count,
        .read = !(bio->bi_opf & 1),
    };
    bio_events.perf_submit(ctx, &data, sizeof(data));
    return 0;
}

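// bi_iter.bi_size is only final once bio_iov_iter_get_pages() has mapped the
// user pages, so stash the bio pointer on entry (keyed by pid/tgid) and
// report its size from the kretprobe.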
typedef struct bio_storage_s {
    struct bio *bio;
} bio_storage_t;

BPF_HASH(bios, u64, bio_storage_t);

int trace_bio_iov_iter_get_pages(struct pt_regs *ctx, struct bio *bio)
{
    u64 pid = bpf_get_current_pid_tgid();
    bio_storage_t data = {
        .bio = bio,
    };
    bios.update(&pid, &data);
    return 0;
}

int trace_bio_iov_iter_get_pages_ret(struct pt_regs *ctx)
{
    u64 pid = bpf_get_current_pid_tgid();
    bio_storage_t *data;

    data = bios.lookup(&pid);
    if (!data)
        return 0;

    // bi_iter.bi_size and bi_opf are 32-bit fields, so read them at their
    // real width instead of over-reading 8 bytes into a u64.
    u32 size = 0;
    u32 opf = 0;
    request_size_t req = {};
    bpf_probe_read(&size, sizeof(size), &data->bio->bi_iter.bi_size);
    bpf_probe_read(&opf, sizeof(opf), &data->bio->bi_opf);
    req.size = size;
    req.read = !(opf & 1);
    iter_events.perf_submit(ctx, &req, sizeof(req));
    bios.delete(&pid);
    return 0;
}

"""

parser = argparse.ArgumentParser()
parser.add_argument("-d", "--device",
                    help="Trace this device only")
args = parser.parse_args()

disks = []
if args.device:
    disks.append({'name': os.path.basename(args.device)})
else:
    dev_patterns = ['sd.*', 'nvme.*', 'nbd.*', 'md.*', "fio*", "etherd*"]
    for device in glob.glob("/sys/block/*"):
        for pattern in dev_patterns:
            if re.compile(pattern).match(os.path.basename(device)):
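                # sysfs spells '/' in a device name as '!' (e.g. etherd!e0.0),
                # so map it back for the /dev path used below.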
                if pattern == "etherd*":
                    disks.append({'name': os.path.basename(device).replace('!', '/')})
                else:
                    disks.append({'name': os.path.basename(device)})
if debug:
    print(disks)

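# Build the filter spliced into the BPF text in place of CONDITIONALS.  With,
# say, sda at 8:0 and nvme0n1 at 259:0 it would read
# "(major == 8 && minor == 0) || (major == 259 && minor == 0)".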
first = True
conditional_template = "(major == MAJOR && minor == MINOR)"
conditionals = ""
for disk in disks:
    stinfo = os.stat('/dev/{}'.format(disk['name']))
    disk['major'] = os.major(stinfo.st_rdev)
    disk['minor'] = os.minor(stinfo.st_rdev)
    tmp = conditional_template.replace('MAJOR', "{}".format(disk['major']))
    tmp = tmp.replace('MINOR', "{}".format(disk['minor']))
    if not first:
        conditionals += " || "
    first = False
    conditionals += tmp

if conditionals == "":
    conditionals = "1"
bpf_text = bpf_text.replace('CONDITIONALS', conditionals)

# load BPF program
b = BPF(text=bpf_text)
b.attach_kprobe(event="submit_bio", fn_name="trace_submit_bio")
b.attach_kprobe(event="bio_iov_iter_get_pages", fn_name="trace_bio_iov_iter_get_pages")
b.attach_kretprobe(event="bio_iov_iter_get_pages", fn_name="trace_bio_iov_iter_get_pages_ret")
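# blk_start_request() only exists on kernels that still have the legacy
# request path; on blk-mq-only kernels expect this attach to fail.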
| 167 | +b.attach_kprobe(event="blk_start_request", fn_name="trace_req_start") |
| 168 | +b.attach_kprobe(event="blk_mq_start_request", fn_name="trace_req_start") |
| 169 | +b.attach_kprobe(event="bio_split", fn_name="trace_bio_split") |
| 170 | + |
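# Userspace mirror of request_size_t in the BPF program.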
class RequestSize(ct.Structure):
    _fields_ = [
        ("size", ct.c_ulonglong),
        ("read", ct.c_ulonglong),
    ]

def print_size(prestr, event):
    iostr = "write"
    if event.read == 1:
        iostr = "read"
    print("{} {}: {}".format(prestr, iostr, event.size))

def print_bio_size(cpu, data, size):
    event = ct.cast(data, ct.POINTER(RequestSize)).contents
    print_size("bio", event)

def print_iter_size(cpu, data, size):
    event = ct.cast(data, ct.POINTER(RequestSize)).contents
    print_size("iter", event)

def print_req_size(cpu, data, size):
    event = ct.cast(data, ct.POINTER(RequestSize)).contents
    print_size("req", event)

def print_split_size(cpu, data, size):
    event = ct.cast(data, ct.POINTER(RequestSize)).contents
    print_size("split", event)

b["bio_events"].open_perf_buffer(print_bio_size)
b["iter_events"].open_perf_buffer(print_iter_size)
b["req_events"].open_perf_buffer(print_req_size)
b["split_events"].open_perf_buffer(print_split_size)
while 1:
    try:
        b.kprobe_poll()
    except KeyboardInterrupt:
        exit()