欢迎您访问程序员文章站本站旨在为大家提供分享程序员计算机编程知识!
您现在的位置是: 首页

IPVS之Bypass转发模式

程序员文章站 2024-03-13 07:59:33
...

不同于NAT/DR/Tunnel转发模式,bypass模式不能通过ipvsadm命令行显式地指定,而是在调度失败之后可能进入的一种转发模式,是否进入由内核自动决定。

对于UDP、TCP和SCTP协议,在调度过程中,如果连接创建失败(例如未找到合适的目的服务器、内存不足等),调度函数ip_vs_schedule返回空指针,并且ignored变量小于等于零。其中,ignored小于零表明连接创建失败是由内存分配失败导致的,此时verdict设置为NF_DROP,结束处理。

如果ignored等于零,表明未找到合适的目的服务器,此种情况下由函数ip_vs_leave进行进一步处理。

static int udp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb,
          struct ip_vs_proto_data *pd, int *verdict, struct ip_vs_conn **cpp, struct ip_vs_iphdr *iph)
{

        /* Let the virtual server select a real server for the incoming connection, and create a connection entry.
         */
        *cpp = ip_vs_schedule(svc, skb, pd, &ignored, iph);
        if (!*cpp && ignored <= 0) {
            if (!ignored)
                *verdict = ip_vs_leave(svc, skb, pd, iph);
            else
                *verdict = NF_DROP;
            return 0;

以下TCP协议的调度处理函数tcp_conn_schedule,涉及连接创建失败的处理与以上UDP协议的处理相同。

 34 static int tcp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb,
 36           struct ip_vs_proto_data *pd, int *verdict, struct ip_vs_conn **cpp, struct ip_vs_iphdr *iph)
 39 {
 90         *cpp = ip_vs_schedule(svc, skb, pd, &ignored, iph);
 91         if (!*cpp && ignored <= 0) {
 92             if (!ignored)
 93                 *verdict = ip_vs_leave(svc, skb, pd, iph);
 94             else
 95                 *verdict = NF_DROP;
 96             return 0;

以下SCTP协议的调度处理函数sctp_conn_schedule,涉及连接创建失败的处理与以上UDP协议的处理相同。

static int sctp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb,
           struct ip_vs_proto_data *pd, int *verdict, struct ip_vs_conn **cpp, struct ip_vs_iphdr *iph)
{
        *cpp = ip_vs_schedule(svc, skb, pd, &ignored, iph);
        if (!*cpp && ignored <= 0) {
            if (!ignored)
                *verdict = ip_vs_leave(svc, skb, pd, iph);
            else
                *verdict = NF_DROP;
            return 0;

未调度报文处理

如上所述,ip_vs_leave函数负责处理未成功调度的报文。首先通过函数frag_safe_skb_hp安全地获取端口号:四层协议(如UDP、TCP等)的端口号位于IP头部之后,即偏移offset为iph->len。此函数确保无论该偏移位于skb缓存的线性区,还是位于非线性区(共享信息区中的分片缓存),都可以安全地获取到端口号所在缓存的起始指针pptr;随后根据报文方向从pptr中取出目的端口dport。

/*
 *  Pass or drop the packet.
 *  Called by ip_vs_in, when the virtual service is available but
 *  no destination is available for a new connection.
 */
int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, struct ip_vs_proto_data *pd, struct ip_vs_iphdr *iph)
{
    __be16 _ports[2], *pptr, dport;
    struct netns_ipvs *ipvs = svc->ipvs;
    struct net *net = ipvs->net;

    pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports, iph);
    if (!pptr)
        return NF_DROP;
    dport = likely(!ip_vs_iph_inverse(iph)) ? pptr[1] : pptr[0];

如果内核开启了cache_bypass功能,IPVS虚拟服务是使用防火墙标记fwmark设定的,报文既不是ICMP报文、也不是回复方向的报文(即为原始请求报文),并且报文的目的地址为单播地址,则在以上条件全部满足时,尝试将报文按照其IP头部的目的地址进行转发。

可通过PROC文件/proc/sys/net/ipv4/vs/cache_bypass修改sysctl_cache_bypass的值,默认情况下为零,即丢弃未找到目的服务器的报文。

    /* if it is fwmark-based service, the cache_bypass sysctl is up and the destination is a non-local unicast, then create
       a cache_bypass connection entry */
    if (sysctl_cache_bypass(ipvs) && svc->fwmark &&
        !(iph->hdr_flags & (IP_VS_HDR_INVERSE | IP_VS_HDR_ICMP)) &&
        ip_vs_addr_is_unicast(net, svc->af, &iph->daddr)) {
        int ret;
        struct ip_vs_conn *cp;
        unsigned int flags = (svc->flags & IP_VS_SVC_F_ONEPACKET && iph->protocol == IPPROTO_UDP) ? IP_VS_CONN_F_ONE_PACKET : 0;
        union nf_inet_addr daddr =  { .all = { 0, 0, 0, 0 } };

如果以上条件满足,则创建一个新的连接结构,并为其指定连接标志IP_VS_CONN_F_BYPASS,表示此连接为bypass类型的连接;ip_vs_conn_new会据此为连接绑定bypass类型的发送函数ip_vs_bypass_xmit。

        /* create a new connection entry */
        {
            struct ip_vs_conn_param p;
            ip_vs_conn_fill_param(svc->ipvs, svc->af, iph->protocol, &iph->saddr, pptr[0], &iph->daddr, pptr[1], &p);
            cp = ip_vs_conn_new(&p, svc->af, &daddr, 0, IP_VS_CONN_F_BYPASS | flags, NULL, skb->mark);
            if (!cp)
                return NF_DROP;
        }

以下调用连接绑定的packet_xmit发送函数,即ip_vs_bypass_xmit函数发送报文。

        /* set state */
        ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd);

        /* transmit the first SYN packet */
        ret = cp->packet_xmit(skb, cp, pd->pp, iph);
        /* do not touch skb anymore */

        if ((cp->flags & IP_VS_CONN_F_ONE_PACKET) && cp->control)
            atomic_inc(&cp->control->in_pkts);
        else
            atomic_inc(&cp->in_pkts);
        ip_vs_conn_put(cp);
        return ret;
    }

以上的条件未满足,如果IPVS虚拟服务指定为FTP服务端口(21),但是当前报文的目的端口不等于21,表明为其它的服务,交由linux内核协议栈进行处理。再者,如果当前报文为ICMP报文,返回NF_DROP。

    /*
     * When the virtual ftp service is presented, packets destined for other services on the VIP may get here (except services
     * listed in the ipvs table), pass the packets, because it is not ipvs job to decide to drop the packets.
     */
    if (svc->port == FTPPORT && dport != FTPPORT)
        return NF_ACCEPT;

    if (unlikely(ip_vs_iph_icmp(iph)))
        return NF_DROP;

如果执行到ip_vs_leave函数最后,将向客户端发送ICMP的端口不可达消息报文。

    /*
     * Notify the client that the destination is unreachable, and release the socket buffer.
     * Since it is in IP layer, the TCP socket is not actually created, the TCP RST packet cannot be sent, instead that
     * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ
     */
#ifdef CONFIG_IP_VS_IPV6
    if (svc->af == AF_INET6) {
        if (!skb->dev)
            skb->dev = net->loopback_dev;
        icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);
    } else
#endif
        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);

    return NF_DROP;

Bypass模式绑定发送函数

在创建bypass类型的连接时,函数ip_vs_conn_new调用ip_vs_bind_xmit为新创建的连接绑定发送函数。

struct ip_vs_conn *ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af, const union nf_inet_addr *daddr, __be16 dport, ...)
{
    /* Bind its packet transmitter */
#ifdef CONFIG_IP_VS_IPV6
    if (p->af == AF_INET6)
        ip_vs_bind_xmit_v6(cp);
    else
#endif
        ip_vs_bind_xmit(cp); 

如下,根据连接的转发模式,函数ip_vs_bind_xmit为bypass类型连接绑定了发送函数ip_vs_bypass_xmit。

static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp)
{
    switch (IP_VS_FWD_METHOD(cp)) {
    case IP_VS_CONN_F_BYPASS:
        cp->packet_xmit = ip_vs_bypass_xmit;
        break;

Bypass发送函数

如同所有的转发模式发送函数,首先查找出口路由。与其它模式不同的是,在调用路由查找函数__ip_vs_get_out_rt时,bypass模式仅指定了IP_VS_RT_MODE_NON_LOCAL标志,即要求查找到的出口路由必须指向非本机(外部)的目的地址;查找所用的目的地址取自报文IP头部的daddr。

/*  
 * Bypass transmitter
 * Let packets bypass the destination when the destination is not available, it may be only used in transparent cache cluster.
 */     
int ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
    struct iphdr  *iph = ip_hdr(skb);

    if (__ip_vs_get_out_rt(cp->ipvs, cp->af, skb, NULL, iph->daddr, IP_VS_RT_MODE_NON_LOCAL, NULL, ipvsh) < 0)
        goto tx_error;

    ip_send_check(iph);

在报文发送之前,将skb的ignore_df置为1,即忽略IP头部的禁止分片(DF)标志。这样,即使报文设置了DF标志且长度大于MTU,随后的分片函数ip_fragment也不会调用icmp_send发送代码为ICMP_FRAG_NEEDED的ICMP报文。最后由函数ip_vs_send_or_cont发送报文。

    /* Another hack: avoid icmp_send in ip_fragment */
    skb->ignore_df = 1;

    ip_vs_send_or_cont(NFPROTO_IPV4, skb, cp, 0);

    return NF_STOLEN;

内核版本 4.15

相关标签: ipvs