关于ebpf 的co-re
迪丽瓦拉
2024-02-05 19:01:17
0

前言

由于linux内核更新很快,linux内核底层的结构体千变万化,字段时有变更,如何让我们的ebpf程序一次编译到处执行到不同的linux系统上是我们需要关注的点。由此我们需要关注co-re,不做特别深的研究,因为研发层面我们去关注co-re数据结构和算法我认为是没有必要的,只要保证可以用到项目上就可以,保证我们的ebpf一次编译以后在各种linux机器上能够跑起来就行,跟同事经过很长时间的学习,总结了一下笔记

一、co-re介绍

底层库依靠的是libbpf

https://github.com/libbpf/libbpf

co-re的运行主要依赖btf格式,这个东西还是比较复杂,在这里不需要关注,btf相关介绍:

https://www.kernel.org/doc/html/latest/bpf/btf.html

kernel代码include/uapi/linux/bpf.h解释了`CO-RE`的原理。

clang有内置标记`__attribute__((preserve_access_index))`(等效于`__builtin_preserve_access_index`)。ebpf.c代码这样标记所有它需要访问的结构体。clang在对象ELF文件ebpf.o中为每个这样的访问生成一个`bpf_core_relo`。libbpf将在加载ebpf.o时按照ELF中的bpf_core_relo修改指令段。

/*
 * "struct bpf_core_relo" is used to pass relocation data from LLVM to libbpf
 * and from libbpf to the kernel.
 *
 * CO-RE relocation captures the following data:
 * - insn_off - instruction offset (in bytes) within a BPF program that needs
 *   its insn->imm field to be relocated with actual field info;
 * - type_id - BTF type ID of the "root" (containing) entity of a relocatable
 *   type or field;
 * - access_str_off - offset into corresponding .BTF string section. String
 *   interpretation depends on specific relocation kind:
 *     - for field-based relocations, string encodes an accessed field using
 *       a sequence of field and array indices, separated by colon (:). It's
 *       conceptually very close to LLVM's getelementptr ([0]) instruction's
 *       arguments for identifying offset to a field.
 *     - for type-based relocations, strings is expected to be just "0";
 *     - for enum value-based relocations, string contains an index of enum
 *       value within its enum type;
 * - kind - one of enum bpf_core_relo_kind;
 *
 * Example:
 *   struct sample {
 *       int a;
 *       struct {
 *           int b[10];
 *       };
 *   };
 *
 *   struct sample *s = ...;
 *   int *x = &s->a;     // encoded as "0:0" (a is field #0)
 *   int *y = &s->b[5];  // encoded as "0:1:0:5" (anon struct is field #1,
 *                       // b is field #0 inside anon struct, accessing elem #5)
 *   int *z = &s[10]->b; // encoded as "10:1" (ptr is used as an array)
 *
 * type_id for all relocs in this example will capture BTF type id of
 * `struct sample`.
 *
 * Such relocation is emitted when using __builtin_preserve_access_index()
 * Clang built-in, passing expression that captures field address, e.g.:
 *
 * bpf_probe_read(&dst, sizeof(dst),
 *		  __builtin_preserve_access_index(&src->a.b.c));
 *
 * In this case Clang will emit field relocation recording necessary data to
 * be able to find offset of embedded `a.b.c` field within `src` struct.
 *
 * [0] https://llvm.org/docs/LangRef.html#getelementptr-instruction
 */
struct bpf_core_relo {
	__u32 insn_off;               /* byte offset of the instruction to patch */
	__u32 type_id;                /* BTF type ID of the root (containing) type */
	__u32 access_str_off;         /* offset of the access string in the .BTF string section */
	enum bpf_core_relo_kind kind; /* which kind of relocation this record describes */
};

vmlinux.h文件开头的如下片段,为内核所有结构体加上了标记

/*
 * Unless BPF_NO_PRESERVE_ACCESS_INDEX is defined, push the
 * preserve_access_index attribute so that it is applied to every record
 * (apply_to = record) declared after this point.
 */
#ifndef BPF_NO_PRESERVE_ACCESS_INDEX
#pragma clang attribute push (__attribute__((preserve_access_index)), apply_to = record)
#endif

BTF 类型信息:用于获取内核、BPF 程序类型及 BPF 代码的关键信息

__builtin_preserve_access_index

__builtin_preserve_access_index specifies a code section where array subscript access and structure/union member access are relocatable under bpf compile-once run-everywhere framework. Debuginfo (typically with -g) is needed, otherwise, the compiler will exit with an error. The return type for the intrinsic is the same as the type of the argument.

使用案例:

/* Sample record used to demonstrate relocatable member and array accesses. */
struct t {
    int i;
    int j;
    union {
        int a;
        int b;
    } c[4];
};
struct t *v = ...;
/* Array subscript plus union member access; the result has the argument's type (int *). */
int *pb = __builtin_preserve_access_index(&v->c[3].b);
/* A plain member access expression can be wrapped as well. */
__builtin_preserve_access_index(v->j);

二、c程序需要做的兼容

1.读取内核结构体

c程序需要使用bpf_core_read,它基于clang支持的__builtin_preserve_access_index实现

bpf_core_read:

应用读取结构体:

/*
 * bpf_core_read(dst, sz, src): read sz bytes from kernel address src into dst.
 * src is wrapped in __builtin_preserve_access_index() so the access is
 * relocatable; the read itself is done by bpf_probe_read_kernel().
 */
#define bpf_core_read(dst, sz, src)					    \
	bpf_probe_read_kernel(dst, sz, (const void *)__builtin_preserve_access_index(src))

这样elf文件里就有了 access_index, bpf程序启动的时候会读取当前机器的

/sys/kernel/btf/vmlinux

libbpf会对比这里的btf和之前.o文件里的btf,重新计算结构体字段的偏移量,然后对指令进行修正

简单来说就是使用bpf_core_read代替bpf_probe_read

实战案例kernel部分:

/*
 * Return the `status` field of the conntrack entry `ct`, converted to u32.
 *
 * BPF_CORE_READ already copies the field into a local value, so the value is
 * returned directly instead of probe-reading that stack copy a second time.
 */
static __always_inline u32 ct_status(const struct nf_conn *ct) {
    return BPF_CORE_READ(ct, status);
}

当我们读取内核netlink中的nf_conn结构体的成员时,不再使用bpf_probe_read,而是使用bpf_core_read;如果需要连续读取多级成员,可以使用BPF_CORE_READ宏,其定义如下:

/*
 * BPF_CORE_READ(src, a, ...): declare a temporary of the accessed field's
 * type, fill it with BPF_CORE_READ_INTO() and yield it as the value of the
 * statement expression.
 */
#define BPF_CORE_READ(src, a, ...) ({					    \
	___type((src), a, ##__VA_ARGS__) __r;				    \
	BPF_CORE_READ_INTO(&__r, (src), a, ##__VA_ARGS__);		    \
	__r;								    \
})

读取结构体嵌套:

/*
 * Fill the conntrack_tuple_t `t` from the kernel tuple `ct` using CO-RE reads.
 *
 * `t` is zeroed first. Ports are converted with bpf_ntohs(). Only TCP and UDP
 * are handled; IPv6 is handled only when FEATURE_IPV6_ENABLED is defined.
 *
 * Returns 1 on success, 0 when the protocol is unknown or a port/address is
 * not set.
 */
static __always_inline int nf_conntrack_tuple_to_conntrack_tuple(conntrack_tuple_t *t, const struct nf_conntrack_tuple *ct) {
    memset(t, 0, sizeof(conntrack_tuple_t));

    // ct->dst.protonum, read once so the error log does not dereference ct directly
    u32 protonum = BPF_CORE_READ(ct, dst.protonum);
    switch (protonum) {
    case IPPROTO_TCP:
        t->metadata = CONN_TYPE_TCP;
        t->sport = BPF_CORE_READ(ct, src.u.tcp.port);
        t->dport = BPF_CORE_READ(ct, dst.u.tcp.port);
        break;
    case IPPROTO_UDP:
        t->metadata = CONN_TYPE_UDP;
        t->sport = BPF_CORE_READ(ct, src.u.udp.port);
        t->dport = BPF_CORE_READ(ct, dst.u.udp.port);
        break;
    default:
        log_debug("ERR(to_conn_tuple): unknown protocol number: %u\n", protonum);
        return 0;
    }

    t->sport = bpf_ntohs(t->sport);
    t->dport = bpf_ntohs(t->dport);
    if (t->sport == 0 || t->dport == 0) {
        log_debug("ERR(to_conn_tuple): src/dst port not set: src: %u, dst: %u\n", t->sport, t->dport);
        return 0;
    }

    // ct->src.l3num, shared by the IPv4 and IPv6 branches
    u32 l3num = BPF_CORE_READ(ct, src.l3num);
    if (l3num == AF_INET) {
        t->metadata |= CONN_V4;
        t->saddr_l = BPF_CORE_READ(ct, src.u3.ip);
        t->daddr_l = BPF_CORE_READ(ct, dst.u3.ip);
        if (!t->saddr_l || !t->daddr_l) {
            log_debug("ERR(to_conn_tuple.v4): src/dst addr not set src:%u, dst:%u\n", t->saddr_l, t->daddr_l);
            return 0;
        }
    }
#ifdef FEATURE_IPV6_ENABLED
    else if (l3num == AF_INET6) {
        t->metadata |= CONN_V6;
        // BPF_CORE_READ yields an rvalue whose address cannot be taken,
        // so copy each address into a local before passing a pointer to it.
        struct in6_addr saddr6 = BPF_CORE_READ(ct, src.u3.in6);
        struct in6_addr daddr6 = BPF_CORE_READ(ct, dst.u3.in6);
        read_in6_addr(&t->saddr_h, &t->saddr_l, &saddr6);
        read_in6_addr(&t->daddr_h, &t->daddr_l, &daddr6);

        if (!(t->saddr_h || t->saddr_l)) {
            log_debug("ERR(to_conn_tuple.v6): src addr not set: src_l: %llu, src_h: %llu\n",
                t->saddr_l, t->saddr_h);
            return 0;
        }
        if (!(t->daddr_h || t->daddr_l)) {
            log_debug("ERR(to_conn_tuple.v6): dst addr not set: dst_l: %llu, dst_h: %llu\n",
                t->daddr_l, t->daddr_h);
            return 0;
        }
    }
#endif

    return 1;
}

有一篇文章说了,我们不一定总是需要去显式调用bpf_core_read一类的函数

https://nakryiko.com/posts/bpf-core-reference-guide/#btf-enabled-bpf-program-types-with-direct-memory-reads

这篇文章里有介绍

2.关于检查字段是否存在

非co-re的读取情况,我们严重依赖头文件

// depending on the kernel version p_net may be a struct net** or possible_net_t*
//
// Non-CO-RE variant: which inode-number field is read is decided at compile
// time from the kernel headers. Returns 0 when CONFIG_NET_NS is not defined.
static __always_inline u32 get_netns(void *p_net) {
    u32 net_ns_inum = 0;
#ifdef CONFIG_NET_NS
    struct net *ct_net = NULL;
    bpf_probe_read_kernel_with_telemetry(&ct_net, sizeof(ct_net), p_net);
#ifdef _LINUX_NS_COMMON_H
    bpf_probe_read_kernel_with_telemetry(&net_ns_inum, sizeof(net_ns_inum), &ct_net->ns.inum);
#else
    bpf_probe_read_kernel_with_telemetry(&net_ns_inum, sizeof(net_ns_inum), &ct_net->proc_inum);
#endif
#endif
    return net_ns_inum;
}

当我们使用libbpf之后,就拥有了强大的co-re能力

我们重铸指针:

struct ct_net___old {unsigned int proc_inum;
} __attribute__((preserve_access_index));

co-re后的代码,我们使用bpf_core_field_exists判断字段是否存在,而不去依赖宏

// depending on the kernel version p_net may be a struct net** or possible_net_t*
//
// CO-RE variant: reads the struct net pointer stored at p_net, then returns
// ns.inum when that field exists and falls back to proc_inum through the
// ct_net___old view otherwise.
static __always_inline u32 get_netns(void *p_net) {
    struct net *ct_net = NULL;
    bpf_probe_read_kernel_with_telemetry(&ct_net, sizeof(ct_net), p_net);

    // BPF_CORE_READ already yields a local copy of the field, so it is
    // returned directly instead of being probe-read from the stack again.
    if (bpf_core_field_exists(ct_net->ns.inum)) {
        return BPF_CORE_READ(ct_net, ns.inum);
    }

    struct ct_net___old *ct_net_old = (void *)ct_net;
    return BPF_CORE_READ(ct_net_old, proc_inum);
}

三、golang加载部分:

读取btf:

var btfData *btf.SpecbtfData, telemetry = ddebpf.GetBTF(cfg.BTFPath, cfg.BPFDir)

GetBTF实现:

// LoadKernelSpec returns the current kernel's BTF information.
//
// Defaults to /sys/kernel/btf/vmlinux and falls back to scanning the file system
// for vmlinux ELFs. Returns an error wrapping ErrNotSupported if BTF is not enabled.
func LoadKernelSpec() (*Spec, error) {fh, err := os.Open("/sys/kernel/btf/vmlinux")if err == nil {defer fh.Close()return loadRawSpec(fh, internal.NativeEndian, nil, nil)}file, err := findVMLinux()if err != nil {return nil, err}defer file.Close()return loadSpecFromELF(file)
}

加载进入kernel

       // Hand the BTF spec to the loader as KernelTypes only when one was obtained.
       if btfData != nil {
           opts.VerifierOptions = ebpf.CollectionOptions{
               Programs: ebpf.ProgramOptions{
                   KernelTypes: btfData,
               },
           }
       }
       // Initialise the manager from the object buffer with the options above.
       err = mgr.InitWithOptions(buf, opts)
       if err != nil {
           return nil, err
       }
       return mgr, nil

简单来说就是读取当前主机的vmlinux的btf信息

然后通过libbpf对结构体偏移量进行修正,然后加载入内核

至此co-re介绍结束.

相关内容