// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
// Copyright (c) 2019, 2020 Cloudflare

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#include <linux/bpf.h>
#include <linux/icmp.h>
#include <linux/icmpv6.h>
#include <linux/if_ether.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/pkt_cls.h>
#include <linux/tcp.h>
#include <linux/udp.h>

#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

#include "test_cls_redirect.h"

#ifdef SUBPROGS
#define INLINING __noinline
#else
#define INLINING __always_inline
#endif

#define offsetofend(TYPE, MEMBER) \
	(offsetof(TYPE, MEMBER) + sizeof((((TYPE *)0)->MEMBER)))

#define IP_OFFSET_MASK (0x1FFF)
#define IP_MF (0x2000)

char _license[] SEC("license") = "Dual BSD/GPL";

/**
 * Destination port and IP used for UDP encapsulation.
 */
volatile const __be16 ENCAPSULATION_PORT;
volatile const __be32 ENCAPSULATION_IP;

typedef struct {
	uint64_t processed_packets_total;
	uint64_t l3_protocol_packets_total_ipv4;
	uint64_t l3_protocol_packets_total_ipv6;
	uint64_t l4_protocol_packets_total_tcp;
	uint64_t l4_protocol_packets_total_udp;
	uint64_t accepted_packets_total_syn;
	uint64_t accepted_packets_total_syn_cookies;
	uint64_t accepted_packets_total_last_hop;
	uint64_t accepted_packets_total_icmp_echo_request;
	uint64_t accepted_packets_total_established;
	uint64_t forwarded_packets_total_gue;
	uint64_t forwarded_packets_total_gre;

	uint64_t errors_total_unknown_l3_proto;
	uint64_t errors_total_unknown_l4_proto;
	uint64_t errors_total_malformed_ip;
	uint64_t errors_total_fragmented_ip;
	uint64_t errors_total_malformed_icmp;
	uint64_t errors_total_unwanted_icmp;
	uint64_t errors_total_malformed_icmp_pkt_too_big;
	uint64_t errors_total_malformed_tcp;
	uint64_t errors_total_malformed_udp;
	uint64_t errors_total_icmp_echo_replies;
	uint64_t errors_total_malformed_encapsulation;
	uint64_t errors_total_encap_adjust_failed;
	uint64_t errors_total_encap_buffer_too_small;
	uint64_t errors_total_redirect_loop;
	uint64_t errors_total_encap_mtu_violate;
} metrics_t;

typedef enum {
	INVALID = 0,
	UNKNOWN,
	ECHO_REQUEST,
	SYN,
	SYN_COOKIE,
	ESTABLISHED,
} verdict_t;

typedef struct {
	uint16_t src, dst;
} flow_ports_t;

_Static_assert(
	sizeof(flow_ports_t) !=
		offsetofend(struct bpf_sock_tuple, ipv4.dport) -
			offsetof(struct bpf_sock_tuple, ipv4.sport) - 1,
	"flow_ports_t must match sport and dport in struct bpf_sock_tuple");
_Static_assert(
	sizeof(flow_ports_t) !=
		offsetofend(struct bpf_sock_tuple, ipv6.dport) -
			offsetof(struct bpf_sock_tuple, ipv6.sport) - 1,
	"flow_ports_t must match sport and dport in struct bpf_sock_tuple");

typedef int ret_t;

/* This is a bit of a hack. We need a return value which allows us to
 * indicate that the regular flow of the program should continue,
 * while allowing functions to use XDP_PASS and XDP_DROP, etc.
 */
static const ret_t CONTINUE_PROCESSING = -1;

/* Convenience macro to call functions which return ret_t. */
#define MAYBE_RETURN(x)                           \
	do {                                      \
		ret_t __ret = x;                  \
		if (__ret != CONTINUE_PROCESSING) \
			return __ret;             \
	} while (0)
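/* Illustrative usage (editor's note, not part of the original flow): a
 * helper that may either settle the packet's fate or defer to its caller
 * returns ret_t, and the caller wraps it like
 *
 *	MAYBE_RETURN(get_next_hop(&pkt, encap, &next_hop));
 *
 * which returns the helper's TC_ACT_* verdict immediately, but falls
 * through when the helper yields CONTINUE_PROCESSING.
 */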
/* Linux packet pointers are either aligned to NET_IP_ALIGN (aka 2 bytes),
 * or not aligned if the arch supports efficient unaligned access.
 *
 * Since the verifier ensures that eBPF packet accesses follow these rules,
 * we can tell LLVM to emit code as if we always had a larger alignment.
 * It will yell at us if we end up on a platform where this is not valid.
 */
typedef uint8_t *net_ptr __attribute__((align_value(8)));

typedef struct buf {
	struct __sk_buff *skb;
	net_ptr head;
	/* NB: tail mustn't have alignment other than 1, otherwise
	 * LLVM will go and eliminate code, e.g. when checking packet lengths.
	 */
	uint8_t *const tail;
} buf_t;

static __always_inline size_t buf_off(const buf_t *buf)
{
	/* Clang seems to optimize constructs like
	 *    a - b + c
	 * if c is known:
	 *    r? = c
	 *    r? -= b
	 *    r? += a
	 *
	 * This is a problem if a and b are packet pointers,
	 * since the verifier allows subtracting two pointers to
	 * get a scalar, but not a scalar and a pointer.
	 *
	 * Use inline asm to break this optimization.
	 */
	size_t off = (size_t)buf->head;
	asm("%0 -= %1" : "+r"(off) : "r"(buf->skb->data));
	return off;
}

static __always_inline bool buf_copy(buf_t *buf, void *dst, size_t len)
{
	if (bpf_skb_load_bytes(buf->skb, buf_off(buf), dst, len)) {
		return false;
	}

	buf->head += len;
	return true;
}

static __always_inline bool buf_skip(buf_t *buf, const size_t len)
{
	/* Check whether off + len is valid in the non-linear part. */
	if (buf_off(buf) + len > buf->skb->len) {
		return false;
	}

	buf->head += len;
	return true;
}

/* Returns a pointer to the start of buf, or NULL if len is
 * larger than the remaining data. Consumes len bytes on a successful
 * call.
 *
 * If scratch is not NULL, the function will attempt to load non-linear
 * data via bpf_skb_load_bytes. On success, scratch is returned.
 */
static __always_inline void *buf_assign(buf_t *buf, const size_t len, void *scratch)
{
	if (buf->head + len > buf->tail) {
		if (scratch == NULL) {
			return NULL;
		}

		return buf_copy(buf, scratch, len) ? scratch : NULL;
	}

	void *ptr = buf->head;
	buf->head += len;
	return ptr;
}

static INLINING bool pkt_skip_ipv4_options(buf_t *buf, const struct iphdr *ipv4)
{
	if (ipv4->ihl <= 5) {
		return true;
	}

	return buf_skip(buf, (ipv4->ihl - 5) * 4);
}

static INLINING bool ipv4_is_fragment(const struct iphdr *ip)
{
	uint16_t frag_off = ip->frag_off & bpf_htons(IP_OFFSET_MASK);
	return (ip->frag_off & bpf_htons(IP_MF)) != 0 || frag_off > 0;
}

static __always_inline struct iphdr *pkt_parse_ipv4(buf_t *pkt, struct iphdr *scratch)
{
	struct iphdr *ipv4 = buf_assign(pkt, sizeof(*ipv4), scratch);
	if (ipv4 == NULL) {
		return NULL;
	}

	if (ipv4->ihl < 5) {
		return NULL;
	}

	if (!pkt_skip_ipv4_options(pkt, ipv4)) {
		return NULL;
	}

	return ipv4;
}

/* Parse the L4 ports from a packet, assuming a layout like TCP or UDP. */
static INLINING bool pkt_parse_icmp_l4_ports(buf_t *pkt, flow_ports_t *ports)
{
	if (!buf_copy(pkt, ports, sizeof(*ports))) {
		return false;
	}

	/* Ports in the L4 headers are reversed, since we are parsing an ICMP
	 * payload which is going towards the eyeball.
	 */
	uint16_t dst = ports->src;
	ports->src = ports->dst;
	ports->dst = dst;
	return true;
}

static INLINING uint16_t pkt_checksum_fold(uint32_t csum)
{
	/* The highest reasonable value for an IPv4 header
	 * checksum requires two folds, so we just do that always.
	 */
	csum = (csum & 0xffff) + (csum >> 16);
	csum = (csum & 0xffff) + (csum >> 16);
	return (uint16_t)~csum;
}
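/* Worked example (illustrative): folding csum = 0x2345a gives
 * (0x345a + 0x2) = 0x345c after the first fold, which the second fold
 * leaves unchanged, and ~0x345c = 0xcba3 is returned. Two folds suffice
 * because the accumulated value for a 20-byte header stays well within
 * 32 bits (see the bound in pkt_ipv4_checksum below).
 */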
static INLINING void pkt_ipv4_checksum(struct iphdr *iph)
{
	iph->check = 0;

	/* An IP header without options is 20 bytes. Two of those
	 * are the checksum, which we always set to zero. Hence,
	 * the maximum accumulated value is 18 / 2 * 0xffff = 0x8fff7,
	 * which fits in 32 bits.
	 */
	_Static_assert(sizeof(struct iphdr) == 20, "iphdr must be 20 bytes");
	uint32_t acc = 0;
	uint16_t *ipw = (uint16_t *)iph;

#pragma clang loop unroll(full)
	for (size_t i = 0; i < sizeof(struct iphdr) / 2; i++) {
		acc += ipw[i];
	}

	iph->check = pkt_checksum_fold(acc);
}

static INLINING bool pkt_skip_ipv6_extension_headers(buf_t *pkt,
						     const struct ipv6hdr *ipv6,
						     uint8_t *upper_proto,
						     bool *is_fragment)
{
	/* We understand five extension headers.
	 * https://tools.ietf.org/html/rfc8200#section-4.1 states that all
	 * headers should occur once, except Destination Options, which may
	 * occur twice. Hence we give up after 6 headers.
	 */
	struct {
		uint8_t next;
		uint8_t len;
	} exthdr = {
		.next = ipv6->nexthdr,
	};
	*is_fragment = false;

#pragma clang loop unroll(full)
	for (int i = 0; i < 6; i++) {
		switch (exthdr.next) {
		case IPPROTO_FRAGMENT:
			*is_fragment = true;
			/* NB: We don't check that hdrlen == 0 as per spec. */
			/* fallthrough; */
		case IPPROTO_HOPOPTS:
		case IPPROTO_ROUTING:
		case IPPROTO_DSTOPTS:
		case IPPROTO_MH:
			if (!buf_copy(pkt, &exthdr, sizeof(exthdr))) {
				return false;
			}

			/* hdrlen is in 8-octet units, and excludes the first 8 octets. */
			if (!buf_skip(pkt,
				      (exthdr.len + 1) * 8 - sizeof(exthdr))) {
				return false;
			}

			/* Decode next header */
			break;

		default:
			/* The next header is not one of the known extension
			 * headers, treat it as the upper layer header.
			 *
			 * This handles IPPROTO_NONE.
			 *
			 * Encapsulating Security Payload (50) and Authentication
			 * Header (51) also end up here (and will trigger an
			 * unknown proto error later). They have a custom header
			 * format and seem too esoteric to care about.
			 */
			*upper_proto = exthdr.next;
			return true;
		}
	}

	/* We never found an upper layer header. */
	return false;
}
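/* Worked example (illustrative): a Routing header with len = 1 occupies
 * (1 + 1) * 8 = 16 octets. The buf_copy() above has already consumed the
 * two-octet {next, len} prefix, so buf_skip() advances the remaining
 * 16 - sizeof(exthdr) = 14 octets to land on the following header.
 */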
/* This function has to be inlined, because the verifier otherwise rejects it
 * due to returning a pointer to the stack. This is technically correct, since
 * scratch is allocated on the stack. However, this usage should be safe since
 * it's the caller's stack after all.
 */
static __always_inline struct ipv6hdr *
pkt_parse_ipv6(buf_t *pkt, struct ipv6hdr *scratch, uint8_t *proto,
	       bool *is_fragment)
{
	struct ipv6hdr *ipv6 = buf_assign(pkt, sizeof(*ipv6), scratch);
	if (ipv6 == NULL) {
		return NULL;
	}

	if (!pkt_skip_ipv6_extension_headers(pkt, ipv6, proto, is_fragment)) {
		return NULL;
	}

	return ipv6;
}

/* Global metrics, per CPU */
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(max_entries, 1);
	__type(key, unsigned int);
	__type(value, metrics_t);
} metrics_map SEC(".maps");

static INLINING metrics_t *get_global_metrics(void)
{
	uint64_t key = 0;
	return bpf_map_lookup_elem(&metrics_map, &key);
}

static INLINING ret_t accept_locally(struct __sk_buff *skb, encap_headers_t *encap)
{
	const int payload_off =
		sizeof(*encap) +
		sizeof(struct in_addr) * encap->unigue.hop_count;
	int32_t encap_overhead = payload_off - sizeof(struct ethhdr);

	// Changing the ethertype if the encapsulated packet is ipv6
	if (encap->gue.proto_ctype == IPPROTO_IPV6) {
		encap->eth.h_proto = bpf_htons(ETH_P_IPV6);
	}

	if (bpf_skb_adjust_room(skb, -encap_overhead, BPF_ADJ_ROOM_MAC,
				BPF_F_ADJ_ROOM_FIXED_GSO |
				BPF_F_ADJ_ROOM_NO_CSUM_RESET) ||
	    bpf_csum_level(skb, BPF_CSUM_LEVEL_DEC))
		return TC_ACT_SHOT;

	return bpf_redirect(skb->ifindex, BPF_F_INGRESS);
}
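/* Rough sketch of what follows (descriptive only): forward_with_gre()
 * decrements the inner packet's TTL / hop limit as loop protection, then
 * resizes the headroom so that the outer Ethernet header is followed by a
 * fresh IPv4 + GRE header (encap_gre_t) addressed to next_hop, and finally
 * redirects the packet back out of the receiving interface.
 */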
static INLINING ret_t forward_with_gre(struct __sk_buff *skb, encap_headers_t *encap,
				       struct in_addr *next_hop, metrics_t *metrics)
{
	metrics->forwarded_packets_total_gre++;

	const int payload_off =
		sizeof(*encap) +
		sizeof(struct in_addr) * encap->unigue.hop_count;
	int32_t encap_overhead =
		payload_off - sizeof(struct ethhdr) - sizeof(struct iphdr);
	int32_t delta = sizeof(struct gre_base_hdr) - encap_overhead;
	uint16_t proto = ETH_P_IP;
	uint32_t mtu_len = 0;

	/* Loop protection: the inner packet's TTL is decremented as a safeguard
	 * against any forwarding loop. As the only interesting field is the TTL
	 * hop limit for IPv6, it is easier to use bpf_skb_load_bytes/bpf_skb_store_bytes
	 * as they handle the split packets if needed (no need for the data to be
	 * in the linear section).
	 */
	if (encap->gue.proto_ctype == IPPROTO_IPV6) {
		proto = ETH_P_IPV6;
		uint8_t ttl;
		int rc;

		rc = bpf_skb_load_bytes(
			skb, payload_off + offsetof(struct ipv6hdr, hop_limit),
			&ttl, 1);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}

		if (ttl == 0) {
			metrics->errors_total_redirect_loop++;
			return TC_ACT_SHOT;
		}

		ttl--;
		rc = bpf_skb_store_bytes(
			skb, payload_off + offsetof(struct ipv6hdr, hop_limit),
			&ttl, 1, 0);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}
	} else {
		uint8_t ttl;
		int rc;

		rc = bpf_skb_load_bytes(
			skb, payload_off + offsetof(struct iphdr, ttl), &ttl,
			1);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}

		if (ttl == 0) {
			metrics->errors_total_redirect_loop++;
			return TC_ACT_SHOT;
		}

		/* IPv4 also has a checksum to patch. While the TTL is only one byte,
		 * this function only works for 2 and 4 bytes arguments (the result is
		 * the same).
		 */
		rc = bpf_l3_csum_replace(
			skb, payload_off + offsetof(struct iphdr, check), ttl,
			ttl - 1, 2);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}

		ttl--;
		rc = bpf_skb_store_bytes(
			skb, payload_off + offsetof(struct iphdr, ttl), &ttl, 1,
			0);
		if (rc != 0) {
			metrics->errors_total_malformed_encapsulation++;
			return TC_ACT_SHOT;
		}
	}

	if (bpf_check_mtu(skb, skb->ifindex, &mtu_len, delta, 0)) {
		metrics->errors_total_encap_mtu_violate++;
		return TC_ACT_SHOT;
	}

	if (bpf_skb_adjust_room(skb, delta, BPF_ADJ_ROOM_NET,
				BPF_F_ADJ_ROOM_FIXED_GSO |
				BPF_F_ADJ_ROOM_NO_CSUM_RESET) ||
	    bpf_csum_level(skb, BPF_CSUM_LEVEL_INC)) {
		metrics->errors_total_encap_adjust_failed++;
		return TC_ACT_SHOT;
	}

	if (bpf_skb_pull_data(skb, sizeof(encap_gre_t))) {
		metrics->errors_total_encap_buffer_too_small++;
		return TC_ACT_SHOT;
	}

	buf_t pkt = {
		.skb = skb,
		.head = (uint8_t *)(long)skb->data,
		.tail = (uint8_t *)(long)skb->data_end,
	};

	encap_gre_t *encap_gre = buf_assign(&pkt, sizeof(encap_gre_t), NULL);
	if (encap_gre == NULL) {
		metrics->errors_total_encap_buffer_too_small++;
		return TC_ACT_SHOT;
	}

	encap_gre->ip.protocol = IPPROTO_GRE;
	encap_gre->ip.daddr = next_hop->s_addr;
	encap_gre->ip.saddr = ENCAPSULATION_IP;
	encap_gre->ip.tot_len =
		bpf_htons(bpf_ntohs(encap_gre->ip.tot_len) + delta);
	encap_gre->gre.flags = 0;
	encap_gre->gre.protocol = bpf_htons(proto);
	pkt_ipv4_checksum((void *)&encap_gre->ip);

	return bpf_redirect(skb->ifindex, 0);
}

static INLINING ret_t forward_to_next_hop(struct __sk_buff *skb, encap_headers_t *encap,
					  struct in_addr *next_hop, metrics_t *metrics)
{
	/* swap L2 addresses */
	/* This assumes that packets are received from a router.
	 * So just swapping the MAC addresses here will make the packet go back to
	 * the router, which will send it to the appropriate machine.
	 */
	unsigned char temp[ETH_ALEN];
	memcpy(temp, encap->eth.h_dest, sizeof(temp));
	memcpy(encap->eth.h_dest, encap->eth.h_source,
	       sizeof(encap->eth.h_dest));
	memcpy(encap->eth.h_source, temp, sizeof(encap->eth.h_source));

	if (encap->unigue.next_hop == encap->unigue.hop_count - 1 &&
	    encap->unigue.last_hop_gre) {
		return forward_with_gre(skb, encap, next_hop, metrics);
	}

	metrics->forwarded_packets_total_gue++;
	uint32_t old_saddr = encap->ip.saddr;
	encap->ip.saddr = encap->ip.daddr;
	encap->ip.daddr = next_hop->s_addr;
	if (encap->unigue.next_hop < encap->unigue.hop_count) {
		encap->unigue.next_hop++;
	}

	/* Remove ip->saddr, add next_hop->s_addr */
	const uint64_t off = offsetof(typeof(*encap), ip.check);
	int ret = bpf_l3_csum_replace(skb, off, old_saddr, next_hop->s_addr, 4);
	if (ret < 0) {
		return TC_ACT_SHOT;
	}

	return bpf_redirect(skb->ifindex, 0);
}

static INLINING ret_t skip_next_hops(buf_t *pkt, int n)
{
	switch (n) {
	case 1:
		if (!buf_skip(pkt, sizeof(struct in_addr)))
			return TC_ACT_SHOT;
	case 0:
		return CONTINUE_PROCESSING;

	default:
		return TC_ACT_SHOT;
	}
}
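/* Worked example (illustrative): with hop_count = 2 and next_hop = 1,
 * get_next_hop() below skips the one already-used entry, copies the second
 * entry into *next_hop, and then skips hop_count - next_hop - 1 = 0
 * trailing entries, leaving pkt positioned just after the GLB header.
 */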
/* Get the next hop from the GLB header.
 *
 * Sets next_hop->s_addr to 0 if there are no more hops left.
 * pkt is positioned just after the variable length GLB header
 * iff the call is successful.
 */
static INLINING ret_t get_next_hop(buf_t *pkt, encap_headers_t *encap,
				   struct in_addr *next_hop)
{
	if (encap->unigue.next_hop > encap->unigue.hop_count) {
		return TC_ACT_SHOT;
	}

	/* Skip "used" next hops. */
	MAYBE_RETURN(skip_next_hops(pkt, encap->unigue.next_hop));

	if (encap->unigue.next_hop == encap->unigue.hop_count) {
		/* No more next hops, we are at the end of the GLB header. */
		next_hop->s_addr = 0;
		return CONTINUE_PROCESSING;
	}

	if (!buf_copy(pkt, next_hop, sizeof(*next_hop))) {
		return TC_ACT_SHOT;
	}

	/* Skip the remaining next hops (may be zero). */
	return skip_next_hops(pkt, encap->unigue.hop_count -
					   encap->unigue.next_hop - 1);
}

/* Fill a bpf_sock_tuple to be used with the socket lookup functions.
 * This is a kludge that lets us work around verifier limitations:
 *
 *    fill_tuple(&t, foo, sizeof(struct iphdr), 123, 321)
 *
 * clang will substitute a constant for sizeof, which allows the verifier
 * to track its value. Based on this, it can figure out the constant
 * return value, and calling code works while still being "generic" to
 * IPv4 and IPv6.
 */
static INLINING uint64_t fill_tuple(struct bpf_sock_tuple *tuple, void *iph,
				    uint64_t iphlen, uint16_t sport, uint16_t dport)
{
	switch (iphlen) {
	case sizeof(struct iphdr): {
		struct iphdr *ipv4 = (struct iphdr *)iph;
		tuple->ipv4.daddr = ipv4->daddr;
		tuple->ipv4.saddr = ipv4->saddr;
		tuple->ipv4.sport = sport;
		tuple->ipv4.dport = dport;
		return sizeof(tuple->ipv4);
	}

	case sizeof(struct ipv6hdr): {
		struct ipv6hdr *ipv6 = (struct ipv6hdr *)iph;
		memcpy(&tuple->ipv6.daddr, &ipv6->daddr,
		       sizeof(tuple->ipv6.daddr));
		memcpy(&tuple->ipv6.saddr, &ipv6->saddr,
		       sizeof(tuple->ipv6.saddr));
		tuple->ipv6.sport = sport;
		tuple->ipv6.dport = dport;
		return sizeof(tuple->ipv6);
	}

	default:
		return 0;
	}
}

static INLINING verdict_t classify_tcp(struct __sk_buff *skb, struct bpf_sock_tuple *tuple,
				       uint64_t tuplen, void *iph, struct tcphdr *tcp)
{
	struct bpf_sock *sk =
		bpf_skc_lookup_tcp(skb, tuple, tuplen, BPF_F_CURRENT_NETNS, 0);
	if (sk == NULL) {
		return UNKNOWN;
	}

	if (sk->state != BPF_TCP_LISTEN) {
		bpf_sk_release(sk);
		return ESTABLISHED;
	}

	if (iph != NULL && tcp != NULL) {
		/* Kludge: we've run out of arguments, but need the length of the ip header. */
		uint64_t iphlen = sizeof(struct iphdr);
		if (tuplen == sizeof(tuple->ipv6)) {
			iphlen = sizeof(struct ipv6hdr);
		}

		if (bpf_tcp_check_syncookie(sk, iph, iphlen, tcp,
					    sizeof(*tcp)) == 0) {
			bpf_sk_release(sk);
			return SYN_COOKIE;
		}
	}

	bpf_sk_release(sk);
	return UNKNOWN;
}

static INLINING verdict_t classify_udp(struct __sk_buff *skb,
				       struct bpf_sock_tuple *tuple, uint64_t tuplen)
{
	struct bpf_sock *sk =
		bpf_sk_lookup_udp(skb, tuple, tuplen, BPF_F_CURRENT_NETNS, 0);
	if (sk == NULL) {
		return UNKNOWN;
	}

	if (sk->state == BPF_TCP_ESTABLISHED) {
		bpf_sk_release(sk);
		return ESTABLISHED;
	}

	bpf_sk_release(sk);
	return UNKNOWN;
}

static INLINING verdict_t classify_icmp(struct __sk_buff *skb, uint8_t proto,
					struct bpf_sock_tuple *tuple, uint64_t tuplen,
					metrics_t *metrics)
{
	switch (proto) {
	case IPPROTO_TCP:
		return classify_tcp(skb, tuple, tuplen, NULL, NULL);

	case IPPROTO_UDP:
		return classify_udp(skb, tuple, tuplen);

	default:
		metrics->errors_total_malformed_icmp++;
		return INVALID;
	}
}
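/* Note (descriptive only): the ICMP error handlers below parse the IP
 * header embedded in the error payload, which belongs to a packet that was
 * heading towards the eyeball. Addresses are therefore mirrored when
 * filling the tuple, and pkt_parse_icmp_l4_ports() swaps the ports, so the
 * socket lookup sees the flow from the local side.
 */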
static INLINING verdict_t process_icmpv4(buf_t *pkt, metrics_t *metrics)
{
	struct icmphdr icmp;
	if (!buf_copy(pkt, &icmp, sizeof(icmp))) {
		metrics->errors_total_malformed_icmp++;
		return INVALID;
	}

	/* We should never receive encapsulated echo replies. */
	if (icmp.type == ICMP_ECHOREPLY) {
		metrics->errors_total_icmp_echo_replies++;
		return INVALID;
	}

	if (icmp.type == ICMP_ECHO) {
		return ECHO_REQUEST;
	}

	if (icmp.type != ICMP_DEST_UNREACH || icmp.code != ICMP_FRAG_NEEDED) {
		metrics->errors_total_unwanted_icmp++;
		return INVALID;
	}

	struct iphdr _ip4;
	const struct iphdr *ipv4 = pkt_parse_ipv4(pkt, &_ip4);
	if (ipv4 == NULL) {
		metrics->errors_total_malformed_icmp_pkt_too_big++;
		return INVALID;
	}

	/* The source address in the outer IP header is from the entity that
	 * originated the ICMP message. Use the original IP header to restore
	 * the correct flow tuple.
	 */
	struct bpf_sock_tuple tuple;
	tuple.ipv4.saddr = ipv4->daddr;
	tuple.ipv4.daddr = ipv4->saddr;

	if (!pkt_parse_icmp_l4_ports(pkt, (flow_ports_t *)&tuple.ipv4.sport)) {
		metrics->errors_total_malformed_icmp_pkt_too_big++;
		return INVALID;
	}

	return classify_icmp(pkt->skb, ipv4->protocol, &tuple,
			     sizeof(tuple.ipv4), metrics);
}
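/* Note (descriptive only): process_icmpv6() below mirrors the IPv4 handler
 * above: echo replies are counted as errors, echo requests are accepted
 * locally, and only "Packet Too Big" errors are classified against the
 * embedded flow; anything else is unwanted ICMP.
 */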
static INLINING verdict_t process_icmpv6(buf_t *pkt, metrics_t *metrics)
{
	struct icmp6hdr icmp6;
	if (!buf_copy(pkt, &icmp6, sizeof(icmp6))) {
		metrics->errors_total_malformed_icmp++;
		return INVALID;
	}

	/* We should never receive encapsulated echo replies. */
	if (icmp6.icmp6_type == ICMPV6_ECHO_REPLY) {
		metrics->errors_total_icmp_echo_replies++;
		return INVALID;
	}

	if (icmp6.icmp6_type == ICMPV6_ECHO_REQUEST) {
		return ECHO_REQUEST;
	}

	if (icmp6.icmp6_type != ICMPV6_PKT_TOOBIG) {
		metrics->errors_total_unwanted_icmp++;
		return INVALID;
	}

	bool is_fragment;
	uint8_t l4_proto;
	struct ipv6hdr _ipv6;
	const struct ipv6hdr *ipv6 =
		pkt_parse_ipv6(pkt, &_ipv6, &l4_proto, &is_fragment);
	if (ipv6 == NULL) {
		metrics->errors_total_malformed_icmp_pkt_too_big++;
		return INVALID;
	}

	if (is_fragment) {
		metrics->errors_total_fragmented_ip++;
		return INVALID;
	}

	/* Swap source and dest addresses. */
	struct bpf_sock_tuple tuple;
	memcpy(&tuple.ipv6.saddr, &ipv6->daddr, sizeof(tuple.ipv6.saddr));
	memcpy(&tuple.ipv6.daddr, &ipv6->saddr, sizeof(tuple.ipv6.daddr));

	if (!pkt_parse_icmp_l4_ports(pkt, (flow_ports_t *)&tuple.ipv6.sport)) {
		metrics->errors_total_malformed_icmp_pkt_too_big++;
		return INVALID;
	}

	return classify_icmp(pkt->skb, l4_proto, &tuple, sizeof(tuple.ipv6),
			     metrics);
}

static INLINING verdict_t process_tcp(buf_t *pkt, void *iph, uint64_t iphlen,
				      metrics_t *metrics)
{
	metrics->l4_protocol_packets_total_tcp++;

	struct tcphdr _tcp;
	struct tcphdr *tcp = buf_assign(pkt, sizeof(_tcp), &_tcp);
	if (tcp == NULL) {
		metrics->errors_total_malformed_tcp++;
		return INVALID;
	}

	if (tcp->syn) {
		return SYN;
	}

	struct bpf_sock_tuple tuple;
	uint64_t tuplen =
		fill_tuple(&tuple, iph, iphlen, tcp->source, tcp->dest);
	return classify_tcp(pkt->skb, &tuple, tuplen, iph, tcp);
}

static INLINING verdict_t process_udp(buf_t *pkt, void *iph, uint64_t iphlen,
				      metrics_t *metrics)
{
	metrics->l4_protocol_packets_total_udp++;

	struct udphdr _udp;
	struct udphdr *udph = buf_assign(pkt, sizeof(_udp), &_udp);
	if (udph == NULL) {
		metrics->errors_total_malformed_udp++;
		return INVALID;
	}

	struct bpf_sock_tuple tuple;
	uint64_t tuplen =
		fill_tuple(&tuple, iph, iphlen, udph->source, udph->dest);
	return classify_udp(pkt->skb, &tuple, tuplen);
}

static INLINING verdict_t process_ipv4(buf_t *pkt, metrics_t *metrics)
{
	metrics->l3_protocol_packets_total_ipv4++;

	struct iphdr _ip4;
	struct iphdr *ipv4 = pkt_parse_ipv4(pkt, &_ip4);
	if (ipv4 == NULL) {
		metrics->errors_total_malformed_ip++;
		return INVALID;
	}

	if (ipv4->version != 4) {
		metrics->errors_total_malformed_ip++;
		return INVALID;
	}

	if (ipv4_is_fragment(ipv4)) {
		metrics->errors_total_fragmented_ip++;
		return INVALID;
	}

	switch (ipv4->protocol) {
	case IPPROTO_ICMP:
		return process_icmpv4(pkt, metrics);

	case IPPROTO_TCP:
		return process_tcp(pkt, ipv4, sizeof(*ipv4), metrics);

	case IPPROTO_UDP:
		return process_udp(pkt, ipv4, sizeof(*ipv4), metrics);

	default:
		metrics->errors_total_unknown_l4_proto++;
		return INVALID;
	}
}

static INLINING verdict_t process_ipv6(buf_t *pkt, metrics_t *metrics)
{
	metrics->l3_protocol_packets_total_ipv6++;

	uint8_t l4_proto;
	bool is_fragment;
	struct ipv6hdr _ipv6;
	struct ipv6hdr *ipv6 =
		pkt_parse_ipv6(pkt, &_ipv6, &l4_proto, &is_fragment);
	if (ipv6 == NULL) {
		metrics->errors_total_malformed_ip++;
		return INVALID;
	}

	if (ipv6->version != 6) {
		metrics->errors_total_malformed_ip++;
		return INVALID;
	}

	if (is_fragment) {
		metrics->errors_total_fragmented_ip++;
		return INVALID;
	}

	switch (l4_proto) {
	case IPPROTO_ICMPV6:
		return process_icmpv6(pkt, metrics);

	case IPPROTO_TCP:
		return process_tcp(pkt, ipv6, sizeof(*ipv6), metrics);

	case IPPROTO_UDP:
		return process_udp(pkt, ipv6, sizeof(*ipv6), metrics);

	default:
		metrics->errors_total_unknown_l4_proto++;
		return INVALID;
	}
}
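/* Expected wire format, roughly (sketch based on the checks below and on
 * encap_headers_t from test_cls_redirect.h; exact field sizes live there):
 *
 *	Ethernet
 *	IPv4	daddr == ENCAPSULATION_IP, protocol == UDP, no options
 *	UDP	dest == ENCAPSULATION_PORT
 *	GUE	variant == control == flags == 0
 *	unigue	version == 0, followed by hop_count IPv4 next hops
 *	payload	inner IPv4 or IPv6 packet, per gue.proto_ctype
 *
 * Packets that don't look like this are passed to the stack (TC_ACT_OK)
 * before the "destined to us" checks, and dropped afterwards.
 */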
SEC("tc")
int cls_redirect(struct __sk_buff *skb)
{
	metrics_t *metrics = get_global_metrics();
	if (metrics == NULL) {
		return TC_ACT_SHOT;
	}

	metrics->processed_packets_total++;

	/* Pass bogus packets as long as we're not sure they're
	 * destined for us.
	 */
	if (skb->protocol != bpf_htons(ETH_P_IP)) {
		return TC_ACT_OK;
	}

	encap_headers_t *encap;

	/* Make sure that all encapsulation headers are available in
	 * the linear portion of the skb. This makes it easy to manipulate them.
	 */
	if (bpf_skb_pull_data(skb, sizeof(*encap))) {
		return TC_ACT_OK;
	}

	buf_t pkt = {
		.skb = skb,
		.head = (uint8_t *)(long)skb->data,
		.tail = (uint8_t *)(long)skb->data_end,
	};

	encap = buf_assign(&pkt, sizeof(*encap), NULL);
	if (encap == NULL) {
		return TC_ACT_OK;
	}

	if (encap->ip.ihl != 5) {
		/* We never have any options. */
		return TC_ACT_OK;
	}

	if (encap->ip.daddr != ENCAPSULATION_IP ||
	    encap->ip.protocol != IPPROTO_UDP) {
		return TC_ACT_OK;
	}

	/* TODO Check UDP length? */
	if (encap->udp.dest != ENCAPSULATION_PORT) {
		return TC_ACT_OK;
	}

	/* We now know that the packet is destined to us, we can
	 * drop bogus ones.
	 */
	if (ipv4_is_fragment((void *)&encap->ip)) {
		metrics->errors_total_fragmented_ip++;
		return TC_ACT_SHOT;
	}

	if (encap->gue.variant != 0) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

	if (encap->gue.control != 0) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

	if (encap->gue.flags != 0) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

	if (encap->gue.hlen !=
	    sizeof(encap->unigue) / 4 + encap->unigue.hop_count) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

	if (encap->unigue.version != 0) {
		metrics->errors_total_malformed_encapsulation++;
		return TC_ACT_SHOT;
	}

	if (encap->unigue.reserved != 0) {
		return TC_ACT_SHOT;
	}

	struct in_addr next_hop;
	MAYBE_RETURN(get_next_hop(&pkt, encap, &next_hop));
	if (next_hop.s_addr == 0) {
		metrics->accepted_packets_total_last_hop++;
		return accept_locally(skb, encap);
	}

	verdict_t verdict;
	switch (encap->gue.proto_ctype) {
	case IPPROTO_IPIP:
		verdict = process_ipv4(&pkt, metrics);
		break;

	case IPPROTO_IPV6:
		verdict = process_ipv6(&pkt, metrics);
		break;

	default:
		metrics->errors_total_unknown_l3_proto++;
		return TC_ACT_SHOT;
	}

	switch (verdict) {
	case INVALID:
		/* metrics have already been bumped */
		return TC_ACT_SHOT;

	case UNKNOWN:
		return forward_to_next_hop(skb, encap, &next_hop, metrics);

	case ECHO_REQUEST:
		metrics->accepted_packets_total_icmp_echo_request++;
		break;

	case SYN:
		if (encap->unigue.forward_syn) {
			return forward_to_next_hop(skb, encap, &next_hop,
						   metrics);
		}

		metrics->accepted_packets_total_syn++;
		break;

	case SYN_COOKIE:
		metrics->accepted_packets_total_syn_cookies++;
		break;

	case ESTABLISHED:
		metrics->accepted_packets_total_established++;
		break;
	}

	return accept_locally(skb, encap);
}
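/* Illustrative only (editor's note): outside the selftest harness, a
 * classifier like this would typically be attached to a clsact qdisc,
 * e.g. roughly:
 *
 *	tc qdisc add dev eth0 clsact
 *	tc filter add dev eth0 ingress bpf da obj test_cls_redirect.bpf.o sec tc
 *
 * The object file name depends on the build; ENCAPSULATION_IP and
 * ENCAPSULATION_PORT are read-only globals that the loader is expected to
 * set before the program is loaded.
 */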