IPVS之Bypass转发模式

程序员文章站 2024-03-13 07:59:33

...

不同于NAT/DR/Tunnel转发模式，bypass模式不能通过ipvsadm命令行显式地指定，而是在调度失败之后可能进入的一种转发模式，由内核自动决定。

对于UDP、TCP和SCTP协议，在调度过程中，如果连接创建失败（例如未找到合适的目的服务器、内存不足等原因），ignored变量的值将小于等于零。其中ignored小于零，表明连接创建失败是由内存分配失败导致的，此时verdict设置为NF_DROP，结束执行。

如果ignored等于零，表明未找到合适的目的服务器，此种情况下由函数ip_vs_leave进行进一步处理。

static int udp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb,
          struct ip_vs_proto_data *pd, int *verdict, struct ip_vs_conn **cpp, struct ip_vs_iphdr *iph)
{

        /* Let the virtual server select a real server for the incoming connection, and create a connection entry.
         */
        *cpp = ip_vs_schedule(svc, skb, pd, &ignored, iph);
        if (!*cpp && ignored <= 0) {
            if (!ignored)
                *verdict = ip_vs_leave(svc, skb, pd, iph);
            else
                *verdict = NF_DROP;
            return 0;

以下TCP协议的调度处理函数tcp_conn_schedule，涉及连接创建失败的处理与以上UDP协议的处理相同。

 34 static int tcp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb,
 36           struct ip_vs_proto_data *pd, int *verdict, struct ip_vs_conn **cpp, struct ip_vs_iphdr *iph)
 39 {
 90         *cpp = ip_vs_schedule(svc, skb, pd, &ignored, iph);
 91         if (!*cpp && ignored <= 0) {
 92             if (!ignored)
 93                 *verdict = ip_vs_leave(svc, skb, pd, iph);
 94             else
 95                 *verdict = NF_DROP;
 96             return 0;

以下SCTP协议的调度处理函数sctp_conn_schedule，涉及连接创建失败的处理与以上UDP协议的处理相同。

static int sctp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb,
           struct ip_vs_proto_data *pd, int *verdict, struct ip_vs_conn **cpp, struct ip_vs_iphdr *iph)
{
        *cpp = ip_vs_schedule(svc, skb, pd, &ignored, iph);
        if (!*cpp && ignored <= 0) {
            if (!ignored)
                *verdict = ip_vs_leave(svc, skb, pd, iph);
            else
                *verdict = NF_DROP;
            return 0;

未调度报文处理

如上所述，ip_vs_leave函数负责处理未成功调度的报文。首先通过函数frag_safe_skb_hp安全地获取目的端口号：四层协议（如UDP、TCP等）的端口号信息位于IP头部之后，即偏移offset为iph->len。此函数确保无论该偏移位于skb缓存的线性区，还是共享缓存区，都可安全地获取到端口号所在缓存的起始指针pptr。

/*
 *  Pass or drop the packet.
 *  Called by ip_vs_in, when the virtual service is available but
 *  no destination is available for a new connection.
 */
int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, struct ip_vs_proto_data *pd, struct ip_vs_iphdr *iph)
{
    __be16 _ports[2], *pptr, dport;
    struct netns_ipvs *ipvs = svc->ipvs;
    struct net *net = ipvs->net;

    pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports, iph);
    if (!pptr)
        return NF_DROP;
    dport = likely(!ip_vs_iph_inverse(iph)) ? pptr[1] : pptr[0];

如果内核开启了cache_bypass功能，IPVS虚拟服务是使用防火墙标记fwmark设定的，报文既不是ICMP报文，也不是回复方向的报文（即为原始请求报文），并且报文的目的地址为非本机的单播地址，则在满足以上所有条件时，尝试将报文按照其IP头部的目的地址进行转发。

可通过PROC文件/proc/sys/net/ipv4/vs/cache_bypass修改sysctl_cache_bypass的值，默认情况下为零，即丢弃未找到目的服务器的报文。

    /* if it is fwmark-based service, the cache_bypass sysctl is up and the destination is a non-local unicast, then create
       a cache_bypass connection entry */
    if (sysctl_cache_bypass(ipvs) && svc->fwmark &&
        !(iph->hdr_flags & (IP_VS_HDR_INVERSE | IP_VS_HDR_ICMP)) &&
        ip_vs_addr_is_unicast(net, svc->af, &iph->daddr)) {
        int ret;
        struct ip_vs_conn *cp;
        unsigned int flags = (svc->flags & IP_VS_SVC_F_ONEPACKET && iph->protocol == IPPROTO_UDP) ? IP_VS_CONN_F_ONE_PACKET : 0;
        union nf_inet_addr daddr =  { .all = { 0, 0, 0, 0 } };

如果cache_bypass开启，以下创建一个新的连接结构，其中指定连接标志IP_VS_CONN_F_BYPASS，表示此连接为一个bypass类型连接，并为此连接绑定IP_VS_CONN_F_BYPASS类型的传输函数ip_vs_bypass_xmit。

        /* create a new connection entry */
        {
            struct ip_vs_conn_param p;
            ip_vs_conn_fill_param(svc->ipvs, svc->af, iph->protocol, &iph->saddr, pptr[0], &iph->daddr, pptr[1], &p);
            cp = ip_vs_conn_new(&p, svc->af, &daddr, 0, IP_VS_CONN_F_BYPASS | flags, NULL, skb->mark);
            if (!cp)
                return NF_DROP;
        }

以下调用连接绑定的packet_xmit发送函数，即ip_vs_bypass_xmit函数发送报文。

        /* set state */
        ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd);

        /* transmit the first SYN packet */
        ret = cp->packet_xmit(skb, cp, pd->pp, iph);
        /* do not touch skb anymore */

        if ((cp->flags & IP_VS_CONN_F_ONE_PACKET) && cp->control)
            atomic_inc(&cp->control->in_pkts);
        else
            atomic_inc(&cp->in_pkts);
        ip_vs_conn_put(cp);
        return ret;
    }

如果以上条件未满足，并且IPVS虚拟服务的端口为FTP服务端口（21），但当前报文的目的端口不等于21，表明报文是发往其它服务的，返回NF_ACCEPT，交由Linux内核协议栈进行处理。此外，如果当前报文为ICMP报文，返回NF_DROP。

    /*
     * When the virtual ftp service is presented, packets destined for other services on the VIP may get here (except services
     * listed in the ipvs table), pass the packets, because it is not ipvs job to decide to drop the packets.
     */
    if (svc->port == FTPPORT && dport != FTPPORT)
        return NF_ACCEPT;

    if (unlikely(ip_vs_iph_icmp(iph)))
        return NF_DROP;

如果执行到ip_vs_leave函数最后，将向客户端发送ICMP的端口不可达消息报文。

    /*
     * Notify the client that the destination is unreachable, and release the socket buffer.
     * Since it is in IP layer, the TCP socket is not actually created, the TCP RST packet cannot be sent, instead that
     * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ
     */
#ifdef CONFIG_IP_VS_IPV6
    if (svc->af == AF_INET6) {
        if (!skb->dev)
            skb->dev = net->loopback_dev;
        icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);
    } else
#endif
        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);

    return NF_DROP;

Bypass模式绑定发送函数

在创建bypass类型的连接时，函数ip_vs_conn_new调用ip_vs_bind_xmit为新创建的连接绑定发送函数。

struct ip_vs_conn *ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af, const union nf_inet_addr *daddr, __be16 dport, ...)
{
    /* Bind its packet transmitter */
#ifdef CONFIG_IP_VS_IPV6
    if (p->af == AF_INET6)
        ip_vs_bind_xmit_v6(cp);
    else
#endif
        ip_vs_bind_xmit(cp);

如下，根据连接的转发模式，函数ip_vs_bind_xmit为bypass类型连接绑定了发送函数ip_vs_bypass_xmit。

static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp)
{
    switch (IP_VS_FWD_METHOD(cp)) {
    case IP_VS_CONN_F_BYPASS:
        cp->packet_xmit = ip_vs_bypass_xmit;
        break;

Bypass发送函数

如同所有的转发模式发送函数，首先查找出口路由。与其它模式不同的是，在调用路由查找函数__ip_vs_get_out_rt时，bypass模式指定了IP_VS_RT_MODE_NON_LOCAL标志，即要求查询到的出口路由必须指向非本机的目的地址。

/*  
 * Bypass transmitter
 * Let packets bypass the destination when the destination is not available, it may be only used in transparent cache cluster.
 */     
int ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
    struct iphdr  *iph = ip_hdr(skb);

    if (__ip_vs_get_out_rt(cp->ipvs, cp->af, skb, NULL, iph->daddr, IP_VS_RT_MODE_NON_LOCAL, NULL, ipvsh) < 0)
        goto tx_error;

    ip_send_check(iph);

在报文发送之前，设置忽略禁止分片标志ignore_df，以避免在随后的分片函数ip_fragment中，当报文的IP报头设置有DF标志且其长度大于MTU时，触发icmp_send函数发送代码为ICMP_FRAG_NEEDED的ICMP报文。最后由函数ip_vs_send_or_cont发送报文。

    /* Another hack: avoid icmp_send in ip_fragment */
    skb->ignore_df = 1;

    ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0);

    return NF_STOLEN;

内核版本 4.15

IPVS之Bypass转发模式

未调度报文处理

Bypass模式绑定发送函数

Bypass发送函数

IPVS之Bypass转发模式

IPVS之NAT转发模式

IPVS之隧道转发模式

IPVS之路由转发模式