eBPF/sockmap: offloading socket forwarding
Forwarding data between two TCP connections through a proxy server is a very common requirement, yet the proxy is also the bottleneck on the whole path: its layer-7 forwarding consumes a large share of single-machine performance. Optimizing the proxy's layer-7 forwarding is therefore something that has to be done.
*Can eBPF offload the proxy's data forwarding into the kernel?* If it can, this offload could achieve an effect comparable to an XDP offload.
Enforcing L7 policies through a proxy in a Kubernetes environment
A proxy (such as Envoy) enforces additional L7 policies (health checks, service discovery, load balancing, mutual TLS). With the traditional approach shown in the figure below, the packet path is redundant and carries considerable overhead.
With BPF support, Envoy can accelerate the packet delivery path, implementing policy enforcement, inspection and redirection on top of BPF, as shown below:
Introducing sockmap
Workflow (a minimal code sketch of these steps follows the list):
- Use BPF_PROG_LOAD to load the BPF program; after this, a BPF program of type BPF_PROG_TYPE_SK_MSG can be referenced through its file descriptor;
- Create a BPF map of type BPF_MAP_TYPE_SOCKMAP or BPF_MAP_TYPE_SOCKHASH. The first type is an array, the second a hash table; the map values are socket handles;
- The SK_MSG BPF program is attached to the above BPF map via the bpf syscall; from then on, the BPF program runs on every socket added to that map;
- Sockets are added to the sockmap or sockhash with the BPF syscall BPF_MAP_UPDATE_ELEM, or with the bpf_sock_map_update / bpf_sock_hash_update helper functions from inside a BPF program;
- Once a socket handle has been added to a BPF map that has an SK_MSG program attached, the BPF program is executed whenever sendmsg/sendfile is called on that socket;
- The SK_MSG BPF program can be detached from the map with BPF_PROG_DETACH; in addition, when the map is destroyed, the program is removed automatically.
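To make the workflow above concrete, here is a minimal sketch of the same steps at the libbpf level, assuming a reasonably recent libbpf; the object name sk_msg_kern.o, the program name sk_msg_prog and the map name sock_map are illustrative assumptions, not taken from this article, and error handling is kept minimal.
// sockmap_setup_sketch.c -- illustrative only
#include <bpf/bpf.h>
#include <bpf/libbpf.h>

int setup_sk_msg_offload(int sockfd)
{
    struct bpf_object *obj;
    struct bpf_program *prog;
    int map_fd, prog_fd, key = 0;

    // 1. Load the object containing a BPF_PROG_TYPE_SK_MSG program.
    obj = bpf_object__open_file("sk_msg_kern.o", NULL);
    if (libbpf_get_error(obj) || bpf_object__load(obj))
        return -1;

    // 2. Locate the BPF_MAP_TYPE_SOCKMAP (created during load) and the SK_MSG program.
    map_fd = bpf_object__find_map_fd_by_name(obj, "sock_map");
    prog = bpf_object__find_program_by_name(obj, "sk_msg_prog");
    prog_fd = bpf_program__fd(prog);

    // 3. Attach the SK_MSG program to the sockmap.
    if (bpf_prog_attach(prog_fd, map_fd, BPF_SK_MSG_VERDICT, 0))
        return -1;

    // 4. Add the socket: from now on the program runs on its sendmsg()/sendfile().
    return bpf_map_update_elem(map_fd, &key, &sockfd, BPF_ANY);
}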
Some rules:
- Although a socket may exist in multiple maps, no more than one of those maps may contain an SK_MSG program;
- When an SK_MSG program is detached from a map, it is not removed from the sockets already running in that map. A socket must be explicitly removed from the map for the SK_MSG program to stop being invoked on its sendmsg/sendfile calls, regardless of the attach/detach state of the BPF program. This is easily done with the BPF syscall BPF_MAP_DELETE_ELEM (see the short sketch after this list); alternatively, if the map is removed, all sockets are first removed from it and the SK_MSG programs are therefore removed as well;
- The SK_MSG program only applies to sockets added after the program has been attached.
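As a concrete illustration of the second rule, stopping the SK_MSG behavior on a socket is just an ordinary map delete; a minimal sketch, assuming map_fd refers to the sockmap and the socket was stored at index 0:
int key = 0;

// Removing the entry is what stops the attached SK_MSG program from
// running on this socket; detaching the program alone would not.
bpf_map_delete_elem(map_fd, &key);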
A sockmap example
An ordinary proxy program
The figure below shows two programs exchanging messages through an intermediate proxy. app1 writes a message to its socket; the proxy receives it on the socket behind proxysd1 (and can inspect the message); if the message is legitimate, the proxy forwards it to app2 through the socket behind proxysd2, which is what makes it a proxy. Sending a message from app2 back to app1 works the same way in reverse.
Here is the experimental code for this simple proxy:
// proxy.c
// gcc proxy.c -o proxy
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h> // bzero()
#include <unistd.h>
#include <sys/types.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <sys/select.h>
#include <netdb.h>
#include <signal.h>
#define MAXSIZE 100
char buf[MAXSIZE];
int proxysd1, proxysd2;
// Close both proxy sockets and exit on Ctrl-C.
static void int_handler(int a)
{
    close(proxysd1);
    close(proxysd2);
    exit(0);
}
int main(int argc, char *argv[])
{
    int ret;
    struct sockaddr_in proxyaddr1, proxyaddr2;
    struct hostent *proxy1, *proxy2;
    unsigned short port1, port2;
    fd_set rset;
    int maxfd = 10, n; // nfds for select(); the handful of fds used here stays below 10

    if (argc != 5) {
        exit(1);
    }
    signal(SIGINT, int_handler);
    FD_ZERO(&rset);

    proxysd1 = socket(AF_INET, SOCK_STREAM, 0);
    proxysd2 = socket(AF_INET, SOCK_STREAM, 0);

    // Resolve the two endpoints (app1 and app2) from the command line.
    proxy1 = gethostbyname(argv[1]);
    port1 = atoi(argv[2]);
    proxy2 = gethostbyname(argv[3]);
    port2 = atoi(argv[4]);

    bzero(&proxyaddr1, sizeof(struct sockaddr_in));
    proxyaddr1.sin_family = AF_INET;
    proxyaddr1.sin_port = htons(port1);
    proxyaddr1.sin_addr = *((struct in_addr *)proxy1->h_addr);

    bzero(&proxyaddr2, sizeof(struct sockaddr_in));
    proxyaddr2.sin_family = AF_INET;
    proxyaddr2.sin_port = htons(port2);
    proxyaddr2.sin_addr = *((struct in_addr *)proxy2->h_addr);

    // The proxy actively connects to both applications.
    connect(proxysd1, (struct sockaddr *)&proxyaddr1, sizeof(struct sockaddr));
    connect(proxysd2, (struct sockaddr *)&proxyaddr2, sizeof(struct sockaddr));

    // Forwarding loop: whatever arrives on one socket is copied to the other.
    while (1) {
        FD_SET(proxysd1, &rset);
        FD_SET(proxysd2, &rset);
        select(maxfd, &rset, NULL, NULL, NULL);

        memset(buf, 0, MAXSIZE);
        if (FD_ISSET(proxysd1, &rset)) {
            ret = recv(proxysd1, buf, MAXSIZE, 0);
            printf("%d --> %d proxy string:%s\n", proxysd1, proxysd2, buf);
            send(proxysd2, buf, ret, 0);
        }
        if (FD_ISSET(proxysd2, &rset)) {
            ret = recv(proxysd2, buf, MAXSIZE, 0);
            printf("%d --> %d proxy string:%s\n", proxysd2, proxysd1, buf);
            send(proxysd1, buf, ret, 0);
        }
    }
    return 0;
}
First start the intermediate proxy program (the plain proxy-forwarding setup):
Then use netcat listening on local port 8086 as app1 and netcat listening on local port 8888 as app2. (Typing "msg from app1" on the left shows up on the right; typing "msg from app2" on the right shows up on the left.)
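The exact commands depend on the local setup, but the experiment can be reproduced roughly as follows; the listeners need to be up before the proxy starts, since the proxy calls connect(), and the flags vary between netcat variants:
nc -l -p 8086                              # terminal 1: app1 ("nc -l 8086" for BSD netcat)
nc -l -p 8888                              # terminal 2: app2
./proxy 127.0.0.1 8086 127.0.0.1 8888      # terminal 3: the proxy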
The eBPF-accelerated proxy program
Recall how a BPF sample is structured: it is split into *_kern.c and *_user.c, corresponding to the BPF program loaded into the kernel and the user-space program that loads it and exchanges data with it through BPF maps. Unlike an XDP-type BPF program, whose hook attaches to a specific NIC, the hooks here attach to a BPF map of type BPF_MAP_TYPE_SOCKMAP or BPF_MAP_TYPE_SOCKHASH; in this example two eBPF programs of type BPF_PROG_TYPE_SK_SKB, a parser and a verdict program, are attached to the map. Socket handles are added to the sockmap-type BPF map with the bpf_map_update_elem helper, and once a message is sent on one of those sockets it passes through the parser hook and then the verdict hook. In the sample code seen so far, the parser usually just returns information about the packet, such as the skb length, while the verdict program calls bpf_sk_redirect_map(skb, &sock_map, idx, 0) to forward the packet directly from one socket's send queue to another socket's receive queue.
SEC("prog_parser")
int _prog_parser(struct __sk_buff *skb)
{
return skb->len;
}
SEC("prog_verdict")
int _prog_verdict(struct __sk_buff *skb)
{
uint32_t idx = 0;
return bpf_sk_redirect_map(skb, &sock_map, idx, 0);
}
Let's look at sockmap_kern.c first:
// sockmap_kern.c
#include <uapi/linux/bpf.h>
#include "bpf_helpers.h"
#include "bpf_endian.h"

// Maps a peer port number to an index into sock_map.
struct bpf_map_def SEC("maps") proxy_map = {
    .type = BPF_MAP_TYPE_HASH,
    .key_size = sizeof(unsigned short), // port number
    .value_size = sizeof(int),          // index into sock_map
    .max_entries = 2,
};

// Holds the two proxy sockets; filled in from user space.
struct bpf_map_def SEC("maps") sock_map = {
    .type = BPF_MAP_TYPE_SOCKMAP,
    .key_size = sizeof(int),
    .value_size = sizeof(int),          // socket handle
    .max_entries = 2,
};

SEC("prog_parser")
int bpf_prog1(struct __sk_buff *skb)
{
    return skb->len;
}

SEC("prog_verdict")
int bpf_prog2(struct __sk_buff *skb)
{
    __u32 *index = 0;
    __u16 port = (__u16)bpf_ntohl(skb->remote_port);
    char info_fmt[] = "data to port [%d]\n";

    bpf_trace_printk(info_fmt, sizeof(info_fmt), port);

    // Look up which sockmap slot the peer port maps to and
    // redirect the skb straight to that socket.
    index = bpf_map_lookup_elem(&proxy_map, &port);
    if (index == NULL)
        return 0;

    return bpf_sk_redirect_map(skb, &sock_map, *index, 0);
}

char _license[] SEC("license") = "GPL";
The bpf_sk_redirect_map helper is described in the documentation as follows:
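Paraphrasing the bpf-helpers man page rather than quoting it verbatim:
long bpf_sk_redirect_map(struct sk_buff *skb, struct bpf_map *map,
                         u32 key, u64 flags);
// Redirect the packet to the socket referenced by map (of type
// BPF_MAP_TYPE_SOCKMAP) at index key. The BPF_F_INGRESS flag in flags
// selects the ingress path of the target socket; otherwise the egress
// path is used. Returns SK_PASS on success or SK_DROP on error.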
The user-space program sockmap_user.c looks like this:
// sockmap_user.c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>      // memset()
#include <strings.h>     // bzero()
#include <sys/socket.h>
#include <sys/select.h>
#include <netinet/in.h>  // struct sockaddr_in, htons()
#include <unistd.h>
#include <netdb.h>
#include <signal.h>
#include <bpf/bpf.h>     // bpf_prog_attach(), bpf_map_update_elem(), ...
#include <bpf/libbpf.h>  // bpf_prog_load_xattr(), bpf_object__*()
#include "bpf_load.h"
#include "bpf_util.h"
#define MAXSIZE 1024
char buf[MAXSIZE];
static int proxysd1, proxysd2;
static int sockmap_fd, proxymap_fd, bpf_prog_fd;
static int progs_fd[2];
static int key, val;
static unsigned short key16;
static int ctrl = 0;
// Close both proxy sockets and exit on Ctrl-C.
static void int_handler(int a)
{
    close(proxysd1);
    close(proxysd2);
    exit(0);
}
// Sending SIGHUP toggles the sockmap offload on and off.
static void hup_handler(int a)
{
    if (ctrl == 1) {
        // Re-enable the offload: put both proxy sockets back into the sockmap.
        key = 0;
        bpf_map_update_elem(sockmap_fd, &key, &proxysd1, BPF_ANY);
        key = 1;
        bpf_map_update_elem(sockmap_fd, &key, &proxysd2, BPF_ANY);
        ctrl = 0;
    } else if (ctrl == 0) {
        // Disable the offload: remove the sockets so forwarding falls back to user space.
        key = 0;
        bpf_map_delete_elem(sockmap_fd, &key);
        key = 1;
        bpf_map_delete_elem(sockmap_fd, &key);
        ctrl = 1;
    }
}
int main(int argc, char **argv)
{
    char filename[256];
    snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
    struct bpf_object *obj;
    struct bpf_program *prog;
    struct bpf_prog_load_attr prog_load_attr = {
        // Stream parser/verdict programs are of type BPF_PROG_TYPE_SK_SKB.
        .prog_type = BPF_PROG_TYPE_SK_SKB,
    };
    int ret;
    struct sockaddr_in proxyaddr1, proxyaddr2;
    struct hostent *proxy1, *proxy2;
    unsigned short port1, port2;
    fd_set rset;
    int maxfd = 10;

    if (argc != 5) {
        printf("Usage: sockmap ip1 port1 ip2 port2!\n");
        exit(1);
    }
    prog_load_attr.file = filename;
    signal(SIGINT, int_handler);
    signal(SIGHUP, hup_handler);

    // Compared with the plain proxy, this added section brings in the eBPF/sockmap
    // logic: load the BPF object, look up its maps, and attach the parser/verdict
    // programs to the sockmap.
    bpf_prog_load_xattr(&prog_load_attr, &obj, &bpf_prog_fd);
    sockmap_fd = bpf_object__find_map_fd_by_name(obj, "sock_map");
    proxymap_fd = bpf_object__find_map_fd_by_name(obj, "proxy_map");

    prog = bpf_object__find_program_by_title(obj, "prog_parser");
    progs_fd[0] = bpf_program__fd(prog);
    bpf_prog_attach(progs_fd[0], sockmap_fd, BPF_SK_SKB_STREAM_PARSER, 0);

    prog = bpf_object__find_program_by_title(obj, "prog_verdict");
    progs_fd[1] = bpf_program__fd(prog);
    bpf_prog_attach(progs_fd[1], sockmap_fd, BPF_SK_SKB_STREAM_VERDICT, 0);
    proxysd1 = socket(AF_INET, SOCK_STREAM, 0);
    proxysd2 = socket(AF_INET, SOCK_STREAM, 0);

    proxy1 = gethostbyname(argv[1]);
    port1 = atoi(argv[2]);
    proxy2 = gethostbyname(argv[3]);
    port2 = atoi(argv[4]);

    bzero(&proxyaddr1, sizeof(struct sockaddr_in));
    proxyaddr1.sin_family = AF_INET;
    proxyaddr1.sin_port = htons(port1);
    proxyaddr1.sin_addr = *((struct in_addr *)proxy1->h_addr);

    bzero(&proxyaddr2, sizeof(struct sockaddr_in));
    proxyaddr2.sin_family = AF_INET;
    proxyaddr2.sin_port = htons(port2);
    proxyaddr2.sin_addr = *((struct in_addr *)proxy2->h_addr);

    connect(proxysd1, (struct sockaddr *)&proxyaddr1, sizeof(struct sockaddr));
    connect(proxysd2, (struct sockaddr *)&proxyaddr2, sizeof(struct sockaddr));

    // BPF loading opens several file descriptors, so the proxy sockets may end up
    // above the hard-coded nfds of 10; compute it from the actual fds instead.
    maxfd = (proxysd1 > proxysd2 ? proxysd1 : proxysd2) + 1;
    // Add the two proxy sockets to the BPF sockmap.
    key = 0;
    bpf_map_update_elem(sockmap_fd, &key, &proxysd1, BPF_ANY);
    key = 1;
    bpf_map_update_elem(sockmap_fd, &key, &proxysd2, BPF_ANY);

    // When a packet reaches proxysd1's receive queue, the hook programs attached to the
    // BPF map run: they read remote_port from the __sk_buff metadata (the packet's source
    // port, 8086), look it up in proxy_map, find that it maps to sockmap index 1 (proxysd2),
    // and call bpf_sk_redirect_map to send the packet out of that socket's egress path
    // (the BPF_F_INGRESS flag is not set).
    key16 = port1;
    val = 1;
    bpf_map_update_elem(proxymap_fd, &key16, &val, BPF_ANY);
    key16 = port2;
    val = 0;
    bpf_map_update_elem(proxymap_fd, &key16, &val, BPF_ANY);
    // The remaining proxy forwarding code is unchanged; once the sockmap offload
    // is enabled, this path is no longer exercised.
    FD_ZERO(&rset);
    while (1) {
        FD_SET(proxysd1, &rset);
        FD_SET(proxysd2, &rset);
        select(maxfd, &rset, NULL, NULL, NULL);

        memset(buf, 0, MAXSIZE);
        if (FD_ISSET(proxysd1, &rset)) {
            ret = recv(proxysd1, buf, MAXSIZE, 0);
            printf("%d --> %d proxy string:%s\n", proxysd1, proxysd2, buf);
            send(proxysd2, buf, ret, 0);
        }
        if (FD_ISSET(proxysd2, &rset)) {
            ret = recv(proxysd2, buf, MAXSIZE, 0);
            printf("%d --> %d proxy string:%s\n", proxysd2, proxysd1, buf);
            send(proxysd1, buf, ret, 0);
        }
    }
    return 0;
}
The program builds successfully in the samples/bpf directory:
Running it gives the following result:
The figure above shows that the BPF program offloads the user-space proxy's data forwarding to the kernel protocol stack.
Next, run kill -SIGHUP SOCKMAP_PID to send SIGHUP and remove the socket handles from the sockmap. User-space proxy forwarding then takes over again, and the effect is the same as in the previous experiment:
Finally, capture the traffic with tcpdump; the relevant packets are still visible on the kernel protocol stack.
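A capture along these lines should show that traffic, assuming everything runs on localhost with the ports used above:
tcpdump -i lo -nn 'port 8086 or port 8888'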