Rate Calculation for IPVS Send/Receive Statistics
This article describes how IPVS computes the rate estimates derived from its send and receive statistics.
Enabling the Estimator
The ip_vs_start_estimator function attaches the estimator embedded in its second parameter, stats, to the estimator list of the IPVS network namespace.
void ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats)
{
    struct ip_vs_estimator *est = &stats->est;

    INIT_LIST_HEAD(&est->list);

    spin_lock_bh(&ipvs->est_lock);
    list_add(&est->list, &ipvs->est_list);
    spin_unlock_bh(&ipvs->est_lock);
}
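For reference, the teardown counterpart ip_vs_stop_estimator removes the entry from the same list under the same lock. The 4.15 version is essentially the following (quoted from memory of net/netfilter/ipvs/ip_vs_est.c, so treat the exact shape as indicative):

void ip_vs_stop_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats)
{
    struct ip_vs_estimator *est = &stats->est;

    spin_lock_bh(&ipvs->est_lock);
    list_del(&est->list);
    spin_unlock_bh(&ipvs->est_lock);
}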
The following functions call ip_vs_start_estimator to start an estimator. The call in __ip_vs_update_dest sets up rate estimation for each real server; the call in ip_vs_add_service does the same for each virtual service; and the call in ip_vs_control_net_init_sysctl covers the namespace-wide totals (tot_stats).
static void __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
                                struct ip_vs_dest_user_kern *udest, int add)
{
    struct netns_ipvs *ipvs = svc->ipvs;
    ...
    if (add) {
        ip_vs_start_estimator(svc->ipvs, &dest->stats);
        ...
    }
    ...
}

static int ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u,
                             struct ip_vs_service **svc_p)
{
    ...
    ip_vs_start_estimator(ipvs, &svc->stats);
    ...
}

static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs)
{
    ...
    ip_vs_start_estimator(ipvs, &ipvs->tot_stats);
    ...
}
The IPVS estimator is initialized by ip_vs_estimator_net_init. The key step is setting up an estimation timer with a 2-second period whose expiry handler is estimation_timer. This setup is per network namespace: each namespace gets its own IPVS estimator.
int __net_init ip_vs_estimator_net_init(struct netns_ipvs *ipvs)
{
    INIT_LIST_HEAD(&ipvs->est_list);
    spin_lock_init(&ipvs->est_lock);
    timer_setup(&ipvs->est_timer, estimation_timer, 0);
    mod_timer(&ipvs->est_timer, jiffies + 2 * HZ);
    return 0;
}
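The matching per-namespace cleanup simply stops the timer; in 4.15 it is essentially the following (again quoted from memory, for orientation):

void __net_exit ip_vs_estimator_net_cleanup(struct netns_ipvs *ipvs)
{
    del_timer_sync(&ipvs->est_timer);
}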
The timeout handler estimation_timer computes five rates: the connection rate, inbound packet rate, outbound packet rate, inbound byte rate, and outbound byte rate. All five are refreshed every 2 seconds. The estimate is an exponentially weighted moving average covering roughly the last 8 seconds: every 2 seconds the rate over the most recent 2-second interval is computed and folded into the running average with a weight of 1/4, per the formula:

    avgrate = avgrate * (1 - W) + rate * W
    where W = 2^(-2)

Taking connections per second (cps) as an example: e->cps = e->cps + (rate - e->cps) * 1/4 = e->cps * (1 - 1/4) + rate * 1/4. To keep the right shift by 2 from discarding low bits, the kernel scales the values up first: the connection and packet deltas are shifted left by 9 bits, and the inbytes/outbytes deltas by 4 bits. Because each delta covers a 2-second interval, e->cps ends up holding the smoothed connection count per 2 seconds scaled by 2^9, i.e. connections per second scaled by 2^10 (hence the "scaled by 2^10, but divided 2 seconds" comment in the code below).
static void estimation_timer(struct timer_list *t)
{
    struct ip_vs_estimator *e;
    struct ip_vs_stats *s;
    u64 rate;
    struct netns_ipvs *ipvs = from_timer(ipvs, t, est_timer);

    spin_lock(&ipvs->est_lock);
    list_for_each_entry(e, &ipvs->est_list, list) {
        s = container_of(e, struct ip_vs_stats, est);

        spin_lock(&s->lock);
        ip_vs_read_cpu_stats(&s->kstats, s->cpustats);

        /* scaled by 2^10, but divided 2 seconds */
        rate = (s->kstats.conns - e->last_conns) << 9;
        e->last_conns = s->kstats.conns;
        e->cps += ((s64)rate - (s64)e->cps) >> 2;

        rate = (s->kstats.inpkts - e->last_inpkts) << 9;
        e->last_inpkts = s->kstats.inpkts;
        e->inpps += ((s64)rate - (s64)e->inpps) >> 2;

        rate = (s->kstats.outpkts - e->last_outpkts) << 9;
        e->last_outpkts = s->kstats.outpkts;
        e->outpps += ((s64)rate - (s64)e->outpps) >> 2;

        /* scaled by 2^5, but divided 2 seconds */
        rate = (s->kstats.inbytes - e->last_inbytes) << 4;
        e->last_inbytes = s->kstats.inbytes;
        e->inbps += ((s64)rate - (s64)e->inbps) >> 2;

        rate = (s->kstats.outbytes - e->last_outbytes) << 4;
        e->last_outbytes = s->kstats.outbytes;
        e->outbps += ((s64)rate - (s64)e->outbps) >> 2;
        spin_unlock(&s->lock);
    }
    spin_unlock(&ipvs->est_lock);
    mod_timer(&ipvs->est_timer, jiffies + 2*HZ);
}
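To make the fixed-point arithmetic concrete, here is a minimal standalone sketch (not kernel code) that replays the cps update rule for a constant load of 100 new connections per 2-second tick; the read-side conversion from ip_vs_read_estimator below is included so the convergence toward 50 conns/s is visible:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
    int64_t cps = 0;               /* e->cps: conns/s scaled by 2^10 */
    uint64_t conns = 0, last = 0;  /* cumulative and last-seen counts */

    for (int tick = 1; tick <= 10; tick++) {
        conns += 100;                          /* 100 new conns per 2 s */
        int64_t rate = (int64_t)((conns - last) << 9);
        last = conns;
        cps += (rate - cps) >> 2;              /* avg = avg*3/4 + rate*1/4 */
        printf("tick %2d: e->cps=%6lld -> read %lld conns/s\n",
               tick, (long long)cps, (long long)((cps + 0x1FF) >> 10));
    }
    return 0;
}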
The left shifts applied in estimation_timer are undone by right shifts in the read function ip_vs_read_estimator. Taking cps as an example: since e->cps holds the per-2-second connection count shifted left by 9 bits, the read shifts right by 10 bits rather than 9, which removes the scaling and divides by the 2-second interval in one step, yielding connections per second.
void ip_vs_read_estimator(struct ip_vs_kstats *dst, struct ip_vs_stats *stats)
{
    struct ip_vs_estimator *e = &stats->est;

    dst->cps = (e->cps + 0x1FF) >> 10;
    dst->inpps = (e->inpps + 0x1FF) >> 10;
    dst->outpps = (e->outpps + 0x1FF) >> 10;
    dst->inbps = (e->inbps + 0xF) >> 5;
    dst->outbps = (e->outbps + 0xF) >> 5;
}
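As a quick sanity check of the scaling (a worked example, not from the source): with a steady 100 new connections per 2-second interval, rate = 100 << 9 = 51200 and e->cps converges to 51200, so ip_vs_read_estimator reports (51200 + 0x1FF) >> 10 = 50 connections per second, as expected. The 0x1FF and 0xF biases make the truncating right shift round approximately to the nearest integer instead of always rounding down.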
ip_vs_read_cpu_stats reads the statistics protected by the sequence counter and accumulates the per-CPU values across all possible CPUs. Note that the first CPU's snapshot is assigned to the sum rather than added, which avoids having to zero the sum structure first.
static void ip_vs_read_cpu_stats(struct ip_vs_kstats *sum,
                                 struct ip_vs_cpu_stats __percpu *stats)
{
    int i;
    bool add = false;

    for_each_possible_cpu(i) {
        struct ip_vs_cpu_stats *s = per_cpu_ptr(stats, i);
        unsigned int start;
        u64 conns, inpkts, outpkts, inbytes, outbytes;

        if (add) {
            do {
                start = u64_stats_fetch_begin(&s->syncp);
                conns = s->cnt.conns;
                inpkts = s->cnt.inpkts;
                outpkts = s->cnt.outpkts;
                inbytes = s->cnt.inbytes;
                outbytes = s->cnt.outbytes;
            } while (u64_stats_fetch_retry(&s->syncp, start));
            sum->conns += conns;
            sum->inpkts += inpkts;
            sum->outpkts += outpkts;
            sum->inbytes += inbytes;
            sum->outbytes += outbytes;
        } else {
            add = true;
            do {
                start = u64_stats_fetch_begin(&s->syncp);
                sum->conns = s->cnt.conns;
                sum->inpkts = s->cnt.inpkts;
                sum->outpkts = s->cnt.outpkts;
                sum->inbytes = s->cnt.inbytes;
                sum->outbytes = s->cnt.outbytes;
            } while (u64_stats_fetch_retry(&s->syncp, start));
        }
    }
}
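The u64_stats_* helpers implement a lightweight sequence counter (on 64-bit SMP kernels they compile away entirely, since 64-bit loads are atomic there). The retry pattern itself can be sketched in userspace C11 as follows; this is a simplified illustration that assumes a single writer per counter and glosses over the memory-barrier details the kernel primitives take care of:

#include <stdatomic.h>
#include <stdint.h>

struct cpu_stats {
    atomic_uint seq;            /* even: stable, odd: write in progress */
    uint64_t inpkts;
    uint64_t inbytes;
};

/* writer side, analogous to u64_stats_update_begin/end */
static void stats_add(struct cpu_stats *s, uint64_t len)
{
    atomic_fetch_add_explicit(&s->seq, 1, memory_order_release);  /* -> odd */
    s->inpkts++;
    s->inbytes += len;
    atomic_fetch_add_explicit(&s->seq, 1, memory_order_release);  /* -> even */
}

/* reader side, analogous to the u64_stats_fetch_begin/retry loop above */
static void stats_read(struct cpu_stats *s, uint64_t *pkts, uint64_t *bytes)
{
    unsigned int start;

    do {
        do {                    /* wait until no write is in progress */
            start = atomic_load_explicit(&s->seq, memory_order_acquire);
        } while (start & 1);
        *pkts = s->inpkts;      /* copy out a tentative snapshot */
        *bytes = s->inbytes;
    } while (atomic_load_explicit(&s->seq, memory_order_acquire) != start);
}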
Input Statistics
The function ip_vs_in_stats accounts inbound traffic. As the code shows, after receiving a packet the kernel increments the inbound packet and byte counters of the packet's real server, then those of the virtual service it matched, and finally the inbound totals of the IPVS network namespace. Each counter update is protected by the sequence counter.
static inline void ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
{
    struct ip_vs_dest *dest = cp->dest;
    struct netns_ipvs *ipvs = cp->ipvs;

    if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
        struct ip_vs_cpu_stats *s;
        struct ip_vs_service *svc;

        /* real-server counters */
        s = this_cpu_ptr(dest->stats.cpustats);
        u64_stats_update_begin(&s->syncp);
        s->cnt.inpkts++;
        s->cnt.inbytes += skb->len;
        u64_stats_update_end(&s->syncp);

        /* virtual-service counters */
        rcu_read_lock();
        svc = rcu_dereference(dest->svc);
        s = this_cpu_ptr(svc->stats.cpustats);
        u64_stats_update_begin(&s->syncp);
        s->cnt.inpkts++;
        s->cnt.inbytes += skb->len;
        u64_stats_update_end(&s->syncp);
        rcu_read_unlock();

        /* namespace-wide totals */
        s = this_cpu_ptr(ipvs->tot_stats.cpustats);
        u64_stats_update_begin(&s->syncp);
        s->cnt.inpkts++;
        s->cnt.inbytes += skb->len;
        u64_stats_update_end(&s->syncp);
    }
}
The input-statistics function above is called from ip_vs_in, and ip_vs_in is hooked at both NF_INET_LOCAL_IN and NF_INET_LOCAL_OUT, so it accounts traffic entering the IPVS system from outside the host as well as from local applications. In addition, when scheduling fails, for example in tcp_conn_schedule below, if ignored is not set, ip_vs_leave may also call ip_vs_in_stats to update the counters.
static int tcp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb,
                             struct ip_vs_proto_data *pd, int *verdict,
                             struct ip_vs_conn **cpp, struct ip_vs_iphdr *iph)
{
    ...
    if (svc) {
        int ignored;

        /*
         * Let the virtual server select a real server for the
         * incoming connection, and create a connection entry.
         */
        *cpp = ip_vs_schedule(svc, skb, pd, &ignored, iph);
        if (!*cpp && ignored <= 0) {
            if (!ignored)
                *verdict = ip_vs_leave(svc, skb, pd, iph);
            ...
        }
        ...
    }
    ...
}
Output Statistics
The function ip_vs_out_stats accounts outbound traffic. Like the input-statistics function ip_vs_in_stats above, it increments the outbound counters at all three levels: the real server, the virtual service, and the IPVS network namespace. Counter updates are protected by the sequence counter.
static inline void ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
{
    struct ip_vs_dest *dest = cp->dest;
    struct netns_ipvs *ipvs = cp->ipvs;

    if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
        struct ip_vs_cpu_stats *s;
        struct ip_vs_service *svc;

        /* real-server counters */
        s = this_cpu_ptr(dest->stats.cpustats);
        u64_stats_update_begin(&s->syncp);
        s->cnt.outpkts++;
        s->cnt.outbytes += skb->len;
        u64_stats_update_end(&s->syncp);

        /* virtual-service counters */
        rcu_read_lock();
        svc = rcu_dereference(dest->svc);
        s = this_cpu_ptr(svc->stats.cpustats);
        u64_stats_update_begin(&s->syncp);
        s->cnt.outpkts++;
        s->cnt.outbytes += skb->len;
        u64_stats_update_end(&s->syncp);
        rcu_read_unlock();

        /* namespace-wide totals */
        s = this_cpu_ptr(ipvs->tot_stats.cpustats);
        u64_stats_update_begin(&s->syncp);
        s->cnt.outpkts++;
        s->cnt.outbytes += skb->len;
        u64_stats_update_end(&s->syncp);
    }
}
The output-statistics function above is reached from the NF_INET_LOCAL_IN, NF_INET_FORWARD, and NF_INET_LOCAL_OUT hooks. At each of these hooks, in NAT forwarding mode, finding a matching connection means the packet is a reply, and the output counters are updated.
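For orientation, the reply-direction call site sits in handle_response, which ip_vs_out invokes once it has matched the packet to an existing connection; abridged from memory of the 4.15 source, so treat the surrounding details as indicative:

static unsigned int handle_response(int af, struct sk_buff *skb,
                                    struct ip_vs_proto_data *pd,
                                    struct ip_vs_conn *cp,
                                    struct ip_vs_iphdr *iph,
                                    unsigned int hooknum)
{
    ...
    ip_vs_out_stats(cp, skb);
    ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pd);
    ...
}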
Connection Statistics
The connection-statistics function ip_vs_conn_stats increments the connection counters of the real server, the virtual service, and the IPVS network namespace. Counter updates are protected by the sequence counter.
static inline void ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc)
{
    struct netns_ipvs *ipvs = svc->ipvs;
    struct ip_vs_cpu_stats *s;

    s = this_cpu_ptr(cp->dest->stats.cpustats);
    u64_stats_update_begin(&s->syncp);
    s->cnt.conns++;
    u64_stats_update_end(&s->syncp);

    s = this_cpu_ptr(svc->stats.cpustats);
    u64_stats_update_begin(&s->syncp);
    s->cnt.conns++;
    u64_stats_update_end(&s->syncp);

    s = this_cpu_ptr(ipvs->tot_stats.cpustats);
    u64_stats_update_begin(&s->syncp);
    s->cnt.conns++;
    u64_stats_update_end(&s->syncp);
}
The connection-statistics function above is called from ip_vs_sched_persist, ip_vs_new_conn_out, and ip_vs_schedule. Note that in each case it is invoked only after the new IPVS connection has been created, as in ip_vs_new_conn_out below:
struct ip_vs_conn *ip_vs_new_conn_out(struct ip_vs_service *svc, struct ip_vs_dest *dest,
                                      struct sk_buff *skb, const struct ip_vs_iphdr *iph,
                                      __be16 dport, __be16 cport)
{
    ...
    cp = ip_vs_conn_new(&param, dest->af, daddr, dport, flags, dest, 0);
    if (!cp) {
        if (ct)
            ip_vs_conn_put(ct);
        return NULL;
    }
    if (ct) {
        ip_vs_control_add(cp, ct);
        ip_vs_conn_put(ct);
    }
    ip_vs_conn_stats(cp, svc);
    ...
    return cp;
}
Linux kernel version: 4.15