// SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2019 Facebook * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public * License as published by the Free Software Foundation. * * Example program for Host Bandwidth Managment * * This program loads a cgroup skb BPF program to enforce cgroup output * (egress) or input (ingress) bandwidth limits. * * USAGE: hbm [-d] [-l] [-n <id>] [-r <rate>] [-s] [-t <secs>] [-w] [-h] [prog] * Where: * -d Print BPF trace debug buffer * -l Also limit flows doing loopback * -n <#> To create cgroup \"/hbm#\" and attach prog * Default is /hbm1 * --no_cn Do not return cn notifications * -r <rate> Rate limit in Mbps * -s Get HBM stats (marked, dropped, etc.) * -t <time> Exit after specified seconds (default is 0) * -w Work conserving flag. cgroup can increase its bandwidth * beyond the rate limit specified while there is available * bandwidth. Current implementation assumes there is only * NIC (eth0), but can be extended to support multiple NICs. * Currrently only supported for egress. * -h Print this info * prog BPF program file name. Name defaults to hbm_out_kern.o */ #define _GNU_SOURCE #include <stdio.h> #include <stdlib.h> #include <assert.h> #include <sys/time.h> #include <unistd.h> #include <errno.h> #include <fcntl.h> #include <linux/unistd.h> #include <linux/compiler.h> #include <linux/bpf.h> #include <bpf/bpf.h> #include <getopt.h> #include "cgroup_helpers.h" #include "hbm.h" #include "bpf_util.h" #include <bpf/libbpf.h> bool outFlag = true; int minRate = 1000; /* cgroup rate limit in Mbps */ int rate = 1000; /* can grow if rate conserving is enabled */ int dur = 1; bool stats_flag; bool loopback_flag; bool debugFlag; bool work_conserving_flag; bool no_cn_flag; bool edt_flag; static void Usage(void); static void read_trace_pipe2(void); static void do_error(char *msg, bool errno_flag); #define TRACEFS "/sys/kernel/tracing/" static struct bpf_program *bpf_prog; static struct bpf_object *obj; static int queue_stats_fd; static void read_trace_pipe2(void) { int trace_fd; FILE *outf; char *outFname = "hbm_out.log"; trace_fd = open(TRACEFS "trace_pipe", O_RDONLY, 0); if (trace_fd < 0) { printf("Error opening trace_pipe\n"); return; } // Future support of ingress // if (!outFlag) // outFname = "hbm_in.log"; outf = fopen(outFname, "w"); if (outf == NULL) printf("Error creating %s\n", outFname); while (1) { static char buf[4097]; ssize_t sz; sz = read(trace_fd, buf, sizeof(buf) - 1); if (sz > 0) { buf[sz] = 0; puts(buf); if (outf != NULL) { fprintf(outf, "%s\n", buf); fflush(outf); } } } } static void do_error(char *msg, bool errno_flag) { if (errno_flag) printf("ERROR: %s, errno: %d\n", msg, errno); else printf("ERROR: %s\n", msg); exit(1); } static int prog_load(char *prog) { struct bpf_program *pos; const char *sec_name; obj = bpf_object__open_file(prog, NULL); if (libbpf_get_error(obj)) { printf("ERROR: opening BPF object file failed\n"); return 1; } /* load BPF program */ if (bpf_object__load(obj)) { printf("ERROR: loading BPF object file failed\n"); goto err; } bpf_object__for_each_program(pos, obj) { sec_name = bpf_program__section_name(pos); if (sec_name && !strcmp(sec_name, "cgroup_skb/egress")) { bpf_prog = pos; break; } } if (!bpf_prog) { printf("ERROR: finding a prog in obj file failed\n"); goto err; } queue_stats_fd = bpf_object__find_map_fd_by_name(obj, "queue_stats"); if (queue_stats_fd < 0) { printf("ERROR: finding a map in obj file failed\n"); goto err; } return 0; err: bpf_object__close(obj); return 1; } static int run_bpf_prog(char *prog, int cg_id) { struct hbm_queue_stats qstats = {0}; char cg_dir[100], cg_pin_path[100]; struct bpf_link *link = NULL; int key = 0; int cg1 = 0; int rc = 0; sprintf(cg_dir, "/hbm%d", cg_id); rc = prog_load(prog); if (rc != 0) return rc; if (setup_cgroup_environment()) { printf("ERROR: setting cgroup environment\n"); goto err; } cg1 = create_and_get_cgroup(cg_dir); if (!cg1) { printf("ERROR: create_and_get_cgroup\n"); goto err; } if (join_cgroup(cg_dir)) { printf("ERROR: join_cgroup\n"); goto err; } qstats.rate = rate; qstats.stats = stats_flag ? 1 : 0; qstats.loopback = loopback_flag ? 1 : 0; qstats.no_cn = no_cn_flag ? 1 : 0; if (bpf_map_update_elem(queue_stats_fd, &key, &qstats, BPF_ANY)) { printf("ERROR: Could not update map element\n"); goto err; } if (!outFlag) bpf_program__set_expected_attach_type(bpf_prog, BPF_CGROUP_INET_INGRESS); link = bpf_program__attach_cgroup(bpf_prog, cg1); if (libbpf_get_error(link)) { fprintf(stderr, "ERROR: bpf_program__attach_cgroup failed\n"); goto err; } sprintf(cg_pin_path, "/sys/fs/bpf/hbm%d", cg_id); rc = bpf_link__pin(link, cg_pin_path); if (rc < 0) { printf("ERROR: bpf_link__pin failed: %d\n", rc); goto err; } if (work_conserving_flag) { struct timeval t0, t_last, t_new; FILE *fin; unsigned long long last_eth_tx_bytes, new_eth_tx_bytes; signed long long last_cg_tx_bytes, new_cg_tx_bytes; signed long long delta_time, delta_bytes, delta_rate; int delta_ms; #define DELTA_RATE_CHECK 10000 /* in us */ #define RATE_THRESHOLD 9500000000 /* 9.5 Gbps */ bpf_map_lookup_elem(queue_stats_fd, &key, &qstats); if (gettimeofday(&t0, NULL) < 0) do_error("gettimeofday failed", true); t_last = t0; fin = fopen("/sys/class/net/eth0/statistics/tx_bytes", "r"); if (fscanf(fin, "%llu", &last_eth_tx_bytes) != 1) do_error("fscanf fails", false); fclose(fin); last_cg_tx_bytes = qstats.bytes_total; while (true) { usleep(DELTA_RATE_CHECK); if (gettimeofday(&t_new, NULL) < 0) do_error("gettimeofday failed", true); delta_ms = (t_new.tv_sec - t0.tv_sec) * 1000 + (t_new.tv_usec - t0.tv_usec)/1000; if (delta_ms > dur * 1000) break; delta_time = (t_new.tv_sec - t_last.tv_sec) * 1000000 + (t_new.tv_usec - t_last.tv_usec); if (delta_time == 0) continue; t_last = t_new; fin = fopen("/sys/class/net/eth0/statistics/tx_bytes", "r"); if (fscanf(fin, "%llu", &new_eth_tx_bytes) != 1) do_error("fscanf fails", false); fclose(fin); printf(" new_eth_tx_bytes:%llu\n", new_eth_tx_bytes); bpf_map_lookup_elem(queue_stats_fd, &key, &qstats); new_cg_tx_bytes = qstats.bytes_total; delta_bytes = new_eth_tx_bytes - last_eth_tx_bytes; last_eth_tx_bytes = new_eth_tx_bytes; delta_rate = (delta_bytes * 8000000) / delta_time; printf("%5d - eth_rate:%.1fGbps cg_rate:%.3fGbps", delta_ms, delta_rate/1000000000.0, rate/1000.0); if (delta_rate < RATE_THRESHOLD) { /* can increase cgroup rate limit, but first * check if we are using the current limit. * Currently increasing by 6.25%, unknown * if that is the optimal rate. */ int rate_diff100; delta_bytes = new_cg_tx_bytes - last_cg_tx_bytes; last_cg_tx_bytes = new_cg_tx_bytes; delta_rate = (delta_bytes * 8000000) / delta_time; printf(" rate:%.3fGbps", delta_rate/1000000000.0); rate_diff100 = (((long long)rate)*1000000 - delta_rate) * 100 / (((long long) rate) * 1000000); printf(" rdiff:%d", rate_diff100); if (rate_diff100 <= 3) { rate += (rate >> 4); if (rate > RATE_THRESHOLD / 1000000) rate = RATE_THRESHOLD / 1000000; qstats.rate = rate; printf(" INC\n"); } else { printf("\n"); } } else { /* Need to decrease cgroup rate limit. * Currently decreasing by 12.5%, unknown * if that is optimal */ printf(" DEC\n"); rate -= (rate >> 3); if (rate < minRate) rate = minRate; qstats.rate = rate; } if (bpf_map_update_elem(queue_stats_fd, &key, &qstats, BPF_ANY)) do_error("update map element fails", false); } } else { sleep(dur); } // Get stats! if (stats_flag && bpf_map_lookup_elem(queue_stats_fd, &key, &qstats)) { char fname[100]; FILE *fout; if (!outFlag) sprintf(fname, "hbm.%d.in", cg_id); else sprintf(fname, "hbm.%d.out", cg_id); fout = fopen(fname, "w"); fprintf(fout, "id:%d\n", cg_id); fprintf(fout, "ERROR: Could not lookup queue_stats\n"); fclose(fout); } else if (stats_flag && qstats.lastPacketTime > qstats.firstPacketTime) { long long delta_us = (qstats.lastPacketTime - qstats.firstPacketTime)/1000; unsigned int rate_mbps = ((qstats.bytes_total - qstats.bytes_dropped) * 8 / delta_us); double percent_pkts, percent_bytes; char fname[100]; FILE *fout; int k; static const char *returnValNames[] = { "DROP_PKT", "ALLOW_PKT", "DROP_PKT_CWR", "ALLOW_PKT_CWR" }; #define RET_VAL_COUNT 4 // Future support of ingress // if (!outFlag) // sprintf(fname, "hbm.%d.in", cg_id); // else sprintf(fname, "hbm.%d.out", cg_id); fout = fopen(fname, "w"); fprintf(fout, "id:%d\n", cg_id); fprintf(fout, "rate_mbps:%d\n", rate_mbps); fprintf(fout, "duration:%.1f secs\n", (qstats.lastPacketTime - qstats.firstPacketTime) / 1000000000.0); fprintf(fout, "packets:%d\n", (int)qstats.pkts_total); fprintf(fout, "bytes_MB:%d\n", (int)(qstats.bytes_total / 1000000)); fprintf(fout, "pkts_dropped:%d\n", (int)qstats.pkts_dropped); fprintf(fout, "bytes_dropped_MB:%d\n", (int)(qstats.bytes_dropped / 1000000)); // Marked Pkts and Bytes percent_pkts = (qstats.pkts_marked * 100.0) / (qstats.pkts_total + 1); percent_bytes = (qstats.bytes_marked * 100.0) / (qstats.bytes_total + 1); fprintf(fout, "pkts_marked_percent:%6.2f\n", percent_pkts); fprintf(fout, "bytes_marked_percent:%6.2f\n", percent_bytes); // Dropped Pkts and Bytes percent_pkts = (qstats.pkts_dropped * 100.0) / (qstats.pkts_total + 1); percent_bytes = (qstats.bytes_dropped * 100.0) / (qstats.bytes_total + 1); fprintf(fout, "pkts_dropped_percent:%6.2f\n", percent_pkts); fprintf(fout, "bytes_dropped_percent:%6.2f\n", percent_bytes); // ECN CE markings percent_pkts = (qstats.pkts_ecn_ce * 100.0) / (qstats.pkts_total + 1); fprintf(fout, "pkts_ecn_ce:%6.2f (%d)\n", percent_pkts, (int)qstats.pkts_ecn_ce); // Average cwnd fprintf(fout, "avg cwnd:%d\n", (int)(qstats.sum_cwnd / (qstats.sum_cwnd_cnt + 1))); // Average rtt fprintf(fout, "avg rtt:%d\n", (int)(qstats.sum_rtt / (qstats.pkts_total + 1))); // Average credit if (edt_flag) fprintf(fout, "avg credit_ms:%.03f\n", (qstats.sum_credit / (qstats.pkts_total + 1.0)) / 1000000.0); else fprintf(fout, "avg credit:%d\n", (int)(qstats.sum_credit / (1500 * ((int)qstats.pkts_total ) + 1))); // Return values stats for (k = 0; k < RET_VAL_COUNT; k++) { percent_pkts = (qstats.returnValCount[k] * 100.0) / (qstats.pkts_total + 1); fprintf(fout, "%s:%6.2f (%d)\n", returnValNames[k], percent_pkts, (int)qstats.returnValCount[k]); } fclose(fout); } if (debugFlag) read_trace_pipe2(); goto cleanup; err: rc = 1; cleanup: bpf_link__destroy(link); bpf_object__close(obj); if (cg1 != -1) close(cg1); if (rc != 0) cleanup_cgroup_environment(); return rc; } static void Usage(void) { printf("This program loads a cgroup skb BPF program to enforce\n" "cgroup output (egress) bandwidth limits.\n\n" "USAGE: hbm [-o] [-d] [-l] [-n <id>] [--no_cn] [-r <rate>]\n" " [-s] [-t <secs>] [-w] [-h] [prog]\n" " Where:\n" " -o indicates egress direction (default)\n" " -d print BPF trace debug buffer\n" " --edt use fq's Earliest Departure Time\n" " -l also limit flows using loopback\n" " -n <#> to create cgroup \"/hbm#\" and attach prog\n" " Default is /hbm1\n" " --no_cn disable CN notifications\n" " -r <rate> Rate in Mbps\n" " -s Update HBM stats\n" " -t <time> Exit after specified seconds (default is 0)\n" " -w Work conserving flag. cgroup can increase\n" " bandwidth beyond the rate limit specified\n" " while there is available bandwidth. Current\n" " implementation assumes there is only eth0\n" " but can be extended to support multiple NICs\n" " -h print this info\n" " prog BPF program file name. Name defaults to\n" " hbm_out_kern.o\n"); } int main(int argc, char **argv) { char *prog = "hbm_out_kern.o"; int k; int cg_id = 1; char *optstring = "iodln:r:st:wh"; struct option loptions[] = { {"no_cn", 0, NULL, 1}, {"edt", 0, NULL, 2}, {NULL, 0, NULL, 0} }; while ((k = getopt_long(argc, argv, optstring, loptions, NULL)) != -1) { switch (k) { case 1: no_cn_flag = true; break; case 2: prog = "hbm_edt_kern.o"; edt_flag = true; break; case'o': break; case 'd': debugFlag = true; break; case 'l': loopback_flag = true; break; case 'n': cg_id = atoi(optarg); break; case 'r': minRate = atoi(optarg) * 1.024; rate = minRate; break; case 's': stats_flag = true; break; case 't': dur = atoi(optarg); break; case 'w': work_conserving_flag = true; break; case '?': if (optopt == 'n' || optopt == 'r' || optopt == 't') fprintf(stderr, "Option -%c requires an argument.\n\n", optopt); case 'h': default: Usage(); return 0; } } if (optind < argc) prog = argv[optind]; printf("HBM prog: %s\n", prog != NULL ? prog : "NULL"); /* Use libbpf 1.0 API mode */ libbpf_set_strict_mode(LIBBPF_STRICT_ALL); return run_bpf_prog(prog, cg_id); }