ovs vlan tag管理
1、背景
当使用ovs管理虚机的网络时,ovs会为虚机的tap设备添加一个内部的vlan tag。如下图所示,存在两个bridge:br0-ovs和br-int,其中br0-ovs挂载物理网卡设备eth2,br-int挂载虚机tap设备,br0-ovs与br-int之间通过一对patch口连接,ovs为tap设备分配了内部vlan 2和内部vlan 3。本文主要描述ovs是如何完成这些内部vlan与外部vlan之间的转换的。
2、流表规则
首先看一下br-int的流表规则,可以看到从int-br0-ovs这个patch口进来的外部报文分成两种情况:如果收到的报文没有vlan,则添加一个内部vlan 2,然后执行normal动作;如果收到的报文带vlan 1049,则转换成内部vlan 3,然后执行normal动作;
然后看一下br0-ovs的流表规则,从phy-br0-ovs这个patch口收到的报文,在发出去之前判断:如果带内部vlan 2,则把vlan信息剥离;如果带内部vlan 3,则转换成外部vlan 1049,然后执行normal动作;
从这两个桥的流表规则大致可以看出,br-int在收到外部报文时,将其vlan信息转成内部vlan;br0-ovs在收到虚机往外发的报文时,将内部vlan转化成外部vlan。既然br-int收到的发往虚机的报文,无论是不是有vlan,都会添加一个内部vlan,那为什么在虚机内部抓包的时候又没有vlan信息呢?这个mod_vlan_vid究竟是做什么用的呢?
3、mod_vlan_vid
首先看一下ovs的mod_vlan_vid动作。在ovs的upcall流程里,mod_vlan_vid会修改ctx->xin->flow的vlan信息:当ovs触发upcall时,ovs会将skb的流信息保存到ctx->xin->flow里(在recv_upcalls->dpif_recv->odp_flow_key_to_flow流程里赋值),然后通过用户态的流表规则修改flow信息,最终匹配好action动作后再通知给datapath处理。从这个mod_vlan_vid的处理逻辑看,它修改的是flow的vlan信息;
/* Excerpt of do_xlate_actions(): only the OFPACT_SET_VLAN_VID case is shown.
 * The "..." lines mark code that is elided from this listing. */
do_xlate_actions(const struct ofpact *ofpacts, size_t ofpacts_len,
struct xlate_ctx *ctx, bool is_last_action,
bool group_bucket_action)
{
struct flow_wildcards *wc = ctx->wc;
/* The case below edits this flow in place. */
struct flow *flow = &ctx->xin->flow;
const struct ofpact *a;
...
case OFPACT_SET_VLAN_VID:
/* Un-wildcard the VID and CFI bits of the outermost VLAN TCI. */
wc->masks.vlans[0].tci |= htons(VLAN_VID_MASK | VLAN_CFI);
/* Rewrite the VID only if the flow already carries a VLAN header (CFI bit
 * set) or the action asks for one to be pushed when missing. */
if (flow->vlans[0].tci & htons(VLAN_CFI) ||
ofpact_get_SET_VLAN_VID(a)->push_vlan_if_needed) {
/* No TPID recorded yet: default to ETH_TYPE_VLAN. */
if (!flow->vlans[0].tpid) {
flow->vlans[0].tpid = htons(ETH_TYPE_VLAN);
}
/* Clear the old VID bits, then install the new VID and set CFI. */
flow->vlans[0].tci &= ~htons(VLAN_VID_MASK);
flow->vlans[0].tci |=
(htons(ofpact_get_SET_VLAN_VID(a)->vlan_vid) |
htons(VLAN_CFI));
}
break;
...
}
4、xlate_normal
以br-int的流表规则为例,mod_vlan_vid之后就是normal查询:
/* Translates the NORMAL action for the flow held in 'ctx'.
 *
 * In order, this validates the VLAN of the flow against the input bundle,
 * maps it to the VLAN used for the rest of the translation, feeds the MAC
 * learning table (directly and through the translation cache), and finally
 * picks the output: multicast snooping handling, a learned port, or a flood.
 *
 * This listing is an excerpt: the declarations of 'flow', 'wc', 'in_port',
 * 'in_xbundle', 'in_xvlan', 'xvlan', 'vlan', 'mac' and 'mac_port' are not
 * shown here. */
static void
xlate_normal(struct xlate_ctx *ctx)
{
    /* Check VLAN: make sure the flow's VLAN is acceptable on the input
     * bundle, otherwise drop. */
    xvlan_extract(flow, &in_xvlan);
    if (!input_vid_is_valid(ctx, in_xvlan.v[0].vid, in_xbundle)) {
        xlate_report(ctx, OFT_WARN,
                     "disallowed VLAN VID for this input port, dropping");
        return;
    }

    /* Translate the input VLAN according to the input bundle's VLAN mode. */
    xvlan_input_translate(in_xbundle, &in_xvlan, &xvlan);
    vlan = xvlan.v[0].vid;

    /* Check other admissibility requirements. */
    if (in_port && !is_admissible(ctx, in_port, vlan)) {
        return;
    }

    /* Learn source MAC.
     *
     * 'in_port' is treated as possibly null by the admissibility check
     * above, so it must be tested before 'pt_mode' is read. */
    bool is_grat_arp = is_gratuitous_arp(flow, wc);
    if (ctx->xin->allow_side_effects
        && flow->packet_type == htonl(PT_ETH)
        && in_port && in_port->pt_mode != NETDEV_PT_LEGACY_L3
    ) {
        update_learning_table(ctx, in_xbundle, flow->dl_src, vlan,
                              is_grat_arp);
    }
    if (ctx->xin->xcache && in_xbundle != &ofpp_none_bundle) {
        struct xc_entry *entry;

        /* Save just enough info to update mac learning table later. */
        entry = xlate_cache_add_entry(ctx->xin->xcache, XC_NORMAL);
        entry->normal.ofproto = ctx->xbridge->ofproto;
        entry->normal.in_port = flow->in_port.ofp_port;
        entry->normal.dl_src = flow->dl_src;
        entry->normal.vlan = vlan;
        entry->normal.is_gratuitous_arp = is_grat_arp;
    }

    /* Determine output bundle and send the packet there. */
    if (mcast_snooping_enabled(ctx->xbridge->ms)
        && !eth_addr_is_broadcast(flow->dl_dst)
        && eth_addr_is_multicast(flow->dl_dst)
        && is_ip_any(flow)) {
        struct mcast_snooping *ms = ctx->xbridge->ms;
        struct mcast_group *grp = NULL;
        struct dp_packet *p = CONST_CAST(struct dp_packet *,
                                         ctx->xin->packet);

        /* We will need the whole data for processing the packet below */
        if (p && !dp_packet_is_linear(p)) {
            dp_packet_linearize(p);
        }

        if (is_igmp(flow, wc)) {
            /*
             * IGMP packets need to take the slow path, in order to be
             * processed for mdb updates. That will prevent expires
             * firing off even after hosts have sent reports.
             */
            ctx->xout->slow |= SLOW_ACTION;

            memset(&wc->masks.tp_src, 0xff, sizeof wc->masks.tp_src);
            if (mcast_snooping_is_membership(flow->tp_src) ||
                mcast_snooping_is_query(flow->tp_src)) {
                if (ctx->xin->allow_side_effects && ctx->xin->packet) {
                    update_mcast_snooping_table(ctx, flow, vlan,
                                                in_xbundle, ctx->xin->packet);
                }
            }

            if (mcast_snooping_is_membership(flow->tp_src)) {
                struct mcast_output out = MCAST_OUTPUT_INIT;

                ovs_rwlock_rdlock(&ms->rwlock);
                xlate_normal_mcast_send_mrouters(ctx, ms, in_xbundle, &xvlan,
                                                 &out);
                /* RFC4541: section 2.1.1, item 1: A snooping switch should
                 * forward IGMP Membership Reports only to those ports where
                 * multicast routers are attached. Alternatively stated: a
                 * snooping switch should not forward IGMP Membership Reports
                 * to ports on which only hosts are attached.
                 * An administrative control may be provided to override this
                 * restriction, allowing the report messages to be flooded to
                 * other ports. */
                xlate_normal_mcast_send_rports(ctx, ms, in_xbundle, &out);
                ovs_rwlock_unlock(&ms->rwlock);

                mcast_output_finish(ctx, &out, in_xbundle, &xvlan);
            } else {
                xlate_report(ctx, OFT_DETAIL, "multicast traffic, flooding");
                xlate_normal_flood(ctx, in_xbundle, &xvlan);
            }
            return;
        } else if (is_mld(flow, wc)) {
            ctx->xout->slow |= SLOW_ACTION;
            if (ctx->xin->allow_side_effects && ctx->xin->packet) {
                update_mcast_snooping_table(ctx, flow, vlan,
                                            in_xbundle, ctx->xin->packet);
            }
            if (is_mld_report(flow, wc)) {
                struct mcast_output out = MCAST_OUTPUT_INIT;

                ovs_rwlock_rdlock(&ms->rwlock);
                xlate_normal_mcast_send_mrouters(ctx, ms, in_xbundle, &xvlan,
                                                 &out);
                xlate_normal_mcast_send_rports(ctx, ms, in_xbundle, &out);
                ovs_rwlock_unlock(&ms->rwlock);

                mcast_output_finish(ctx, &out, in_xbundle, &xvlan);
            } else {
                xlate_report(ctx, OFT_DETAIL, "MLD query, flooding");
                xlate_normal_flood(ctx, in_xbundle, &xvlan);
            }
            /* NOTE(review): unlike the IGMP branch, this branch does not
             * return, so control continues into the group lookup below.
             * Confirm that this is intended. */
        } else {
            if (is_ip_local_multicast(flow, wc)) {
                /* RFC4541: section 2.1.2, item 2: Packets with a dst IP
                 * address in the 224.0.0.x range which are not IGMP must
                 * be forwarded on all ports */
                xlate_report(ctx, OFT_DETAIL,
                             "RFC4541: section 2.1.2, item 2, flooding");
                xlate_normal_flood(ctx, in_xbundle, &xvlan);
                return;
            }
        }

        /* forwarding to group base ports */
        struct mcast_output out = MCAST_OUTPUT_INIT;

        ovs_rwlock_rdlock(&ms->rwlock);
        if (flow->dl_type == htons(ETH_TYPE_IP)) {
            grp = mcast_snooping_lookup4(ms, flow->nw_dst, vlan);
        } else if (flow->dl_type == htons(ETH_TYPE_IPV6)) {
            grp = mcast_snooping_lookup(ms, &flow->ipv6_dst, vlan);
        }
        if (grp) {
            xlate_normal_mcast_send_group(ctx, ms, grp, in_xbundle, &out);
            xlate_normal_mcast_send_fports(ctx, ms, in_xbundle, &out);
            xlate_normal_mcast_send_mrouters(ctx, ms, in_xbundle, &xvlan,
                                             &out);
        } else {
            if (mcast_snooping_flood_unreg(ms)) {
                xlate_report(ctx, OFT_DETAIL,
                             "unregistered multicast, flooding");
                out.flood = true;
            } else {
                xlate_normal_mcast_send_mrouters(ctx, ms, in_xbundle, &xvlan,
                                                 &out);
                xlate_normal_mcast_send_fports(ctx, ms, in_xbundle, &out);
            }
        }
        ovs_rwlock_unlock(&ms->rwlock);

        mcast_output_finish(ctx, &out, in_xbundle, &xvlan);
    } else {
        /* Unicast (or snooping disabled): look the destination MAC up in
         * the learning table for this VLAN. */
        ovs_rwlock_rdlock(&ctx->xbridge->ml->rwlock);
        mac = mac_learning_lookup(ctx->xbridge->ml, flow->dl_dst, vlan);
        mac_port = mac ? mac_entry_get_port(ctx->xbridge->ml, mac) : NULL;
        ovs_rwlock_unlock(&ctx->xbridge->ml->rwlock);

        if (mac_port) {
            struct xbundle *mac_xbundle = xbundle_lookup(ctx->xcfg, mac_port);

            if (mac_xbundle
                && mac_xbundle != in_xbundle
                && mac_xbundle->ofbundle != in_xbundle->ofbundle) {
                xlate_report(ctx, OFT_DETAIL, "forwarding to learned port");
                output_normal(ctx, mac_xbundle, &xvlan);
            } else if (!mac_xbundle) {
                xlate_report(ctx, OFT_WARN,
                             "learned port is unknown, dropping");
            } else {
                xlate_report(ctx, OFT_DETAIL,
                             "learned port is input port, dropping");
            }
        } else {
            xlate_report(ctx, OFT_DETAIL,
                         "no learned MAC for destination, flooding");
            xlate_normal_flood(ctx, in_xbundle, &xvlan);
        }
    }
}
从以上的normal处理流程,可以看出ovs的normal逻辑主要包含以下几个动作:
1)、检查flow的vlan与in_xbundle是否匹配;
这一步的校验主要就是获取flow的vlan信息,然后看跟in_xbundle的类型(vlan_mode)是否匹配,比如access口不能接收带vlan的报文。ovs为每个port都分配了一个vlan mode类型,默认都是trunk类型,如果port带tag信息,则port为access口(可通过ovs-vsctl list port xxx查询);
/* Clears '*xvlan', then copies the VLAN headers of 'flow' into it in host
 * byte order.  Copying stops at the first header whose TPID is not a VLAN
 * ethertype or whose TCI lacks the CFI bit. */
static void
xvlan_extract(const struct flow *flow, struct xvlan *xvlan)
{
    int idx = 0;

    memset(xvlan, 0, sizeof *xvlan);

    while (idx < FLOW_MAX_VLAN_HEADERS
           && eth_type_vlan(flow->vlans[idx].tpid)
           && (flow->vlans[idx].tci & htons(VLAN_CFI))) {
        xvlan->v[idx].tpid = ntohs(flow->vlans[idx].tpid);
        xvlan->v[idx].vid = vlan_tci_to_vid(flow->vlans[idx].tci);
        xvlan->v[idx].pcp = ntohs(flow->vlans[idx].tci) & VLAN_PCP_MASK;
        idx++;
    }
}
/* Returns true if a packet carrying VLAN 'vid' (0 for no VLAN) may be
 * received on 'in_xbundle' given its VLAN mode.  Otherwise reports an error
 * through 'ctx' and returns false. */
static bool
input_vid_is_valid(const struct xlate_ctx *ctx,
                   uint16_t vid, struct xbundle *in_xbundle)
{
    /* The OFPP_NONE bundle accepts every VID. */
    if (in_xbundle == &ofpp_none_bundle) {
        return true;
    }

    switch (in_xbundle->vlan_mode) {
    case PORT_VLAN_ACCESS:
        /* Only packets without a VID are accepted. */
        if (!vid) {
            return true;
        }
        xlate_report_error(ctx, "dropping VLAN %"PRIu16" tagged "
                           "packet received on port %s configured as VLAN "
                           "%d access port", vid, in_xbundle->name,
                           in_xbundle->vlan);
        return false;

    case PORT_VLAN_NATIVE_UNTAGGED:
    case PORT_VLAN_NATIVE_TAGGED:
        /* A packet without a VID is always accepted (native VLAN). */
        if (!vid) {
            return true;
        }
        /* Fall through. */
    case PORT_VLAN_TRUNK:
        if (xbundle_trunks_vlan(in_xbundle, vid)) {
            return true;
        }
        xlate_report_error(ctx, "dropping VLAN %"PRIu16" packet "
                           "received on port %s not configured for "
                           "trunking VLAN %"PRIu16,
                           vid, in_xbundle->name, vid);
        return false;

    case PORT_VLAN_DOT1Q_TUNNEL:
        if (xbundle_allows_cvlan(in_xbundle, vid)) {
            return true;
        }
        xlate_report_error(ctx, "dropping VLAN %"PRIu16" packet received "
                           "on dot1q-tunnel port %s that excludes this "
                           "VLAN", vid, in_xbundle->name);
        return false;

    default:
        OVS_NOT_REACHED();
    }
}
2)、根据in_xbundle类型,做内部vlan转换;
这个流程主要是根据in_xbundle口的类型,设置内部vlan信息(xvlan):如果报文是从access口来的,则内部vlan为in_xbundle的内部vlan值;如果in_xbundle口为trunk口,则内部vlan为从flow里解析得到的vlan值。这里计算得到的内部vlan值主要作用有两个:更新mac学习表,以及normal flood时保证只在同一个vlan域内flood;
/* Computes into '*xvlan' the VLAN stack used for translation, starting from
 * the VLANs parsed from the flow ('in_xvlan') and the VLAN mode of the input
 * bundle 'in_xbundle'. */
static void
xvlan_input_translate(const struct xbundle *in_xbundle,
                      const struct xvlan *in_xvlan, struct xvlan *xvlan)
{
    switch (in_xbundle->vlan_mode) {
    case PORT_VLAN_ACCESS:
        /* Access port: the VID is the one configured on the input bundle. */
        memset(xvlan, 0, sizeof *xvlan);
        if (in_xvlan->v[0].tpid) {
            xvlan->v[0].tpid = in_xvlan->v[0].tpid;
        } else {
            xvlan->v[0].tpid = ETH_TYPE_VLAN_8021Q;
        }
        xvlan->v[0].vid = in_xbundle->vlan;
        xvlan->v[0].pcp = in_xvlan->v[0].pcp;
        return;

    case PORT_VLAN_TRUNK:
        /* Trunk port: keep the VLANs that xvlan_extract() parsed from the
         * flow. */
        xvlan_copy(xvlan, in_xvlan);
        return;

    case PORT_VLAN_NATIVE_UNTAGGED:
    case PORT_VLAN_NATIVE_TAGGED:
        /* Like a trunk, except that a packet without a VID gets the VID
         * configured on the input bundle. */
        xvlan_copy(xvlan, in_xvlan);
        if (in_xvlan->v[0].vid == 0) {
            if (in_xvlan->v[0].tpid) {
                xvlan->v[0].tpid = in_xvlan->v[0].tpid;
            } else {
                xvlan->v[0].tpid = ETH_TYPE_VLAN_8021Q;
            }
            xvlan->v[0].vid = in_xbundle->vlan;
            xvlan->v[0].pcp = in_xvlan->v[0].pcp;
        }
        return;

    case PORT_VLAN_DOT1Q_TUNNEL:
        /* Push a new outermost entry holding the bundle's VLAN and QinQ
         * ethertype on top of the VLANs from the flow. */
        xvlan_copy(xvlan, in_xvlan);
        xvlan_push_uninit(xvlan);
        xvlan->v[0].tpid = in_xbundle->qinq_ethtype;
        xvlan->v[0].vid = in_xbundle->vlan;
        xvlan->v[0].pcp = 0;
        return;

    default:
        OVS_NOT_REACHED();
    }
}
3)、更新mac表信息;
这里生成entry cache信息,后面revalidate时使用该entry信息更新mac表。mac表记录了某个mac地址是从哪个port来的,以及该port所属的内部vlan信息;
/* Snippet from xlate_normal(): when a translation cache is attached and the
 * input bundle is not ofpp_none_bundle, add an XC_NORMAL cache entry that
 * carries the bridge's ofproto, input port, source MAC, VLAN, and the
 * gratuitous-ARP flag. */
if (ctx->xin->xcache && in_xbundle != &ofpp_none_bundle) {
struct xc_entry *entry;
/* Save just enough info to update mac learning table later. */
entry = xlate_cache_add_entry(ctx->xin->xcache, XC_NORMAL);
entry->normal.ofproto = ctx->xbridge->ofproto;
entry->normal.in_port = flow->in_port.ofp_port;
entry->normal.dl_src = flow->dl_src;
entry->normal.vlan = vlan;
entry->normal.is_gratuitous_arp = is_grat_arp;
}
4)、根据目的mac,找到对应的out_xbundle口;
如果normal的时候根据目的mac找不到out_xbundle,就需要flood。flood的时候会使用第(2)步计算得到的内部vlan,遍历bridge上的所有port,只有当port的vlan与计算得到的内部vlan匹配时,才会转发到对应的口;
/* Sends the packet, via output_normal(), to every bundle of the bridge in
 * 'ctx' that is eligible for flooding, then marks the NetFlow output
 * interface as NF_OUT_FLOOD.
 *
 * 'xvlan' is the VLAN computed earlier by xvlan_input_translate(). */
static void
xlate_normal_flood(struct xlate_ctx *ctx, struct xbundle *in_xbundle,
                   struct xvlan *xvlan)
{
    struct xbundle *candidate;

    LIST_FOR_EACH (candidate, list_node, &ctx->xbridge->xbundles) {
        /* Never send back out of the bundle the packet arrived on. */
        if (candidate == in_xbundle
            || candidate->ofbundle == in_xbundle->ofbundle) {
            continue;
        }
        /* The bundle must include the VLAN in 'xvlan'. */
        if (!xbundle_includes_vlan(candidate, xvlan)) {
            continue;
        }
        if (!candidate->floodable) {
            continue;
        }
        /* Skip bundles for which xbundle_mirror_out() returns nonzero. */
        if (xbundle_mirror_out(ctx->xbridge, candidate)) {
            continue;
        }

        output_normal(ctx, candidate, xvlan);
    }

    ctx->nf_output_iface = NF_OUT_FLOOD;
}
5)、output_normal
当找到匹配的出口时,ovs会调用output_normal。在output_normal里,ovs还会根据实际out_xbundle口的类型计算out_xvlan(见下面的xvlan_output_translate),这个out_xvlan最终会通过output_normal->compose_output_action->xlate_commit_actions->commit_odp_actions设置到真正要下发给datapath的缓存流表里。也就是说,一开始mod_vlan_vid所修改的vlan信息并不一定真的会修改实际数据包的vlan信息,在normal流程里,是否修改实际数据包的vlan信息取决于out_xbundle的类型。当out_xbundle为access口时,out_xvlan为0,这时候就不去修改实际数据包的vlan信息;当out_xbundle为trunk口时,out_xvlan为xvlan的值,这里的xvlan即为第(2)步根据in_xbundle计算得到的内部vlan,也就是说当数据包从access口发给trunk口时,数据包到了trunk口就会添加上access口的内部vlan信息。
/* Computes into '*out_xvlan' the VLAN stack a packet should carry when it
 * leaves through 'out_xbundle', given the VLAN stack 'xvlan' used during
 * translation. */
static void
xvlan_output_translate(const struct xbundle *out_xbundle,
                       const struct xvlan *xvlan, struct xvlan *out_xvlan)
{
    switch (out_xbundle->vlan_mode) {
    case PORT_VLAN_ACCESS:
        /* Output to an access port carries no VLAN. */
        memset(out_xvlan, 0, sizeof *out_xvlan);
        return;

    case PORT_VLAN_TRUNK:
    case PORT_VLAN_NATIVE_TAGGED:
        /* Output to a trunk port keeps the translated VLAN as is. */
        xvlan_copy(out_xvlan, xvlan);
        return;

    case PORT_VLAN_NATIVE_UNTAGGED:
        /* Pop the outermost entry when its VID is the bundle's own VLAN. */
        xvlan_copy(out_xvlan, xvlan);
        if (xvlan->v[0].vid == out_xbundle->vlan) {
            xvlan_pop(out_xvlan);
        }
        return;

    case PORT_VLAN_DOT1Q_TUNNEL:
        /* Always pop the outermost entry. */
        xvlan_copy(out_xvlan, xvlan);
        xvlan_pop(out_xvlan);
        return;

    default:
        OVS_NOT_REACHED();
    }
}
总结:
1)、流表规则normal前的mod_vlan_vid并不一定会真正修改数据包的vlan信息(是否修改取决于出端口的类型),这里的mod_vlan_vid主要作用是保证从trunk口来的数据包,当需要做normal flood的时候,只在同一个vlan域内flood;
2)、当数据包从access口发往trunk口时,会带上access口的vlan信息;
3)、流表规则output前的mod_vlan_vid会修改数据包的vlan信息;
推荐阅读