
inet_csk_get_port(...)


1. How the TCP protocol hooks up tcp_hashinfo

struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.recvmsg		= tcp_recvmsg,
	.sendmsg		= tcp_sendmsg,
	.sendpage		= tcp_sendpage,
	.backlog_rcv		= tcp_v4_do_rcv,
	.release_cb		= tcp_release_cb,
	.mtu_reduced		= tcp_v4_mtu_reduced,
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	.get_port		= inet_csk_get_port,
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_wmem		= sysctl_tcp_wmem,
	.sysctl_rmem		= sysctl_tcp_rmem,
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.slab_flags		= SLAB_DESTROY_BY_RCU,
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	.h.hashinfo		= &tcp_hashinfo,
	.no_autobind		= true,
#ifdef CONFIG_COMPAT
	.compat_setsockopt	= compat_tcp_setsockopt,
	.compat_getsockopt	= compat_tcp_getsockopt,
#endif
#ifdef CONFIG_MEMCG_KMEM
	.init_cgroup		= tcp_init_cgroup,
	.destroy_cgroup		= tcp_destroy_cgroup,
	.proto_cgroup		= tcp_proto_cgroup,
#endif
};
EXPORT_SYMBOL(tcp_prot);

Among its members, note the following initialization:

.h.hashinfo		= &tcp_hashinfo,

tcp_hashinfo itself is a global variable of type struct inet_hashinfo:

struct inet_hashinfo tcp_hashinfo;
EXPORT_SYMBOL(tcp_hashinfo);
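For context, .get_port is what the generic bind() path invokes: inet_bind() in net/ipv4/af_inet.c dispatches through sk->sk_prot->get_port, so for TCP sockets it lands in inet_csk_get_port(). The relevant excerpt (3.10-era, shown roughly and abridged) looks like this:

/* net/ipv4/af_inet.c, inet_bind(), abridged */
	/* Make sure we are allowed to bind here. */
	if (sk->sk_prot->get_port(sk, snum)) {
		inet->inet_saddr = inet->inet_rcv_saddr = 0;
		err = -EADDRINUSE;
		goto out_release_sock;
	}

A non-zero return from get_port therefore surfaces to the application as bind() failing with EADDRINUSE.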

2. Definition of struct inet_hashinfo

struct inet_hashinfo {
	/* This is for sockets with full identity only.  Sockets here will
	 * always be without wildcards and will have the following invariant:
	 *
	 *          TCP_ESTABLISHED <= sk->sk_state < TCP_CLOSE
	 *
	 * TIME_WAIT sockets use a separate chain (twchain).
	 */
	struct inet_ehash_bucket	*ehash; //the established hash table
	spinlock_t			*ehash_locks;
	unsigned int			ehash_mask; //number of ehash buckets - 1; the kernel log below reports 512 buckets, so the mask is 511
	unsigned int			ehash_locks_mask;

	/* Ok, let's try this, I give up, we do need a local binding
	 * TCP hash as well as the others for fast bind/connect.
	 */
	struct inet_bind_hashbucket	*bhash; //the bind hash bucket table

	unsigned int			bhash_size; //number of buckets in bhash above; the kernel log reports 512
	/* 4 bytes hole on 64 bit */

	struct kmem_cache		*bind_bucket_cachep;

	/* All the above members are written once at bootup and
	 * never written again _or_ are predominantly read-access.
	 *
	 * Now align to a new cache line as all the following members
	 * might be often dirty.
	 */
	/* All sockets in TCP_LISTEN state will be in here.  This is the only
	 * table where wildcard'd TCP sockets can exist.  Hash function here
	 * is just local port number.
	 */
	struct inet_listen_hashbucket	listening_hash[INET_LHTABLE_SIZE]
					____cacheline_aligned_in_smp;

	atomic_t			bsockets;
};

The main members defined in this structure are:

a. The established hash table and related fields

	struct inet_ehash_bucket	*ehash; //the established hash table
	spinlock_t			*ehash_locks; //locks protecting the ehash chains
	unsigned int			ehash_mask; //number of buckets - 1 (511 here, since 512 buckets are allocated)
	unsigned int			ehash_locks_mask; //mask used to index the ehash_locks array

inet_ehash_bucket is defined as follows:

struct inet_ehash_bucket {
	struct hlist_nulls_head chain; 
	struct hlist_nulls_head twchain;
};
struct hlist_nulls_head {
	struct hlist_nulls_node *first; //first node of the chain
};
struct hlist_nulls_node {
	struct hlist_nulls_node *next, **pprev; //linkage pointers within the hash chain
};
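Note that the ehash chains are "nulls" lists: the end-of-list marker is not a plain NULL but an odd-valued pointer encoding a number (the bucket index passed in by tcp_init() below), which lets lockless RCU lookups detect that an entry migrated to another chain mid-walk. The relevant helpers from include/linux/list_nulls.h look roughly like this (abridged):

/* The low bit marks a "nulls" end-of-list value; the remaining bits
 * carry the value given to INIT_HLIST_NULLS_HEAD. */
#define INIT_HLIST_NULLS_HEAD(ptr, nulls) \
	((ptr)->first = (struct hlist_nulls_node *) (1UL | (((long)nulls) << 1)))

static inline int is_a_nulls(const struct hlist_nulls_node *ptr)
{
	return ((unsigned long)ptr & 1);
}

static inline unsigned long get_nulls_value(const struct hlist_nulls_node *ptr)
{
	return ((unsigned long)ptr) >> 1;
}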

b. The bind hash table and its kmem_cache pointer

	struct inet_bind_hashbucket	*bhash; //the bind hash bucket table

	unsigned int			bhash_size; //number of buckets in bhash; 512 here according to the kernel log
	/* 4 bytes hole on 64 bit */

	struct kmem_cache		*bind_bucket_cachep; //kmem_cache from which struct inet_bind_bucket is allocated

inet_bind_hashbucket is defined as follows:

struct inet_bind_hashbucket {
	spinlock_t		lock;
	struct hlist_head	chain;
};
struct hlist_head {
	struct hlist_node *first;
};
struct hlist_node {
	struct hlist_node *next, **pprev;
};
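Both the bind table and the listening table are indexed essentially by the local port number. The hash functions (from include/net/inet_hashtables.h in this era of kernels, shown roughly, abridged) are:

/* Bucket index into bhash for a local port. */
static inline int inet_bhashfn(struct net *net, const __u16 lport,
			       const int bhash_size)
{
	return (lport + net_hash_mix(net)) & (bhash_size - 1);
}

/* Bucket index into listening_hash; INET_LHTABLE_SIZE is 32 in this kernel version. */
static inline int inet_lhashfn(struct net *net, const unsigned short num)
{
	return (num + net_hash_mix(net)) & (INET_LHTABLE_SIZE - 1);
}

This is why inet_csk_get_port() below can go from a port number straight to the inet_bind_hashbucket that may already hold it.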

3. Initialization of the struct inet_hashinfo members

Source file: linux-3.10.x/net/ipv4/tcp.c

Call path: inet_init() --> tcp_init()

void __init tcp_init(void)
{
	struct sk_buff *skb = NULL;
	unsigned long limit;
	int max_rshare, max_wshare, cnt;
	unsigned int i;

	BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb));

	percpu_counter_init(&tcp_sockets_allocated, 0); //counter of allocated TCP sockets
	percpu_counter_init(&tcp_orphan_count, 0); //counter of orphaned TCP sockets

	//create the kmem_cache backing tcp_hashinfo.bind_bucket_cachep
	tcp_hashinfo.bind_bucket_cachep =
		kmem_cache_create("tcp_bind_bucket",
				  sizeof(struct inet_bind_bucket), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	/* Size and allocate the main established and bind bucket
	 * hash tables.
	 *
	 * The methodology is similar to that of the buffer cache.
	 */
	 
	//allocate the established hash table
	tcp_hashinfo.ehash =
		alloc_large_system_hash("TCP established",
					sizeof(struct inet_ehash_bucket),
					thash_entries,
					17, /* one slot per 128 KB of memory */
					0,
					NULL,
					//ehash_mask (number of buckets - 1) is returned through this pointer
					&tcp_hashinfo.ehash_mask, 
					0,
					thash_entries ? 0 : 512 * 1024);

	//walk every established bucket (0 .. ehash_mask)
	for (i = 0; i <= tcp_hashinfo.ehash_mask; i++) { 
		//initialize the nulls list heads, encoding the bucket index i
		INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i); 
		INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].twchain, i); 
	}

	//allocate and initialize the per-chain ehash locks
	if (inet_ehash_locks_alloc(&tcp_hashinfo)) //ehash lock allocation
		panic("TCP: failed to alloc ehash_locks");

	//---------------------------------------------------------------------

	//allocate the bind hash table
	tcp_hashinfo.bhash =
		alloc_large_system_hash("TCP bind",
					sizeof(struct inet_bind_hashbucket),
					tcp_hashinfo.ehash_mask + 1, //= 512 here (mask 511 + 1)
					17, /* one slot per 128 KB of memory */
					0,
					//the table's log2 size (shift) is stored here; the shift below turns it into 512
					&tcp_hashinfo.bhash_size,
					NULL,
					0,
					64 * 1024);
	tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size;
	for (i = 0; i < tcp_hashinfo.bhash_size; i++) { //walk every bind bucket
		spin_lock_init(&tcp_hashinfo.bhash[i].lock); //initialize the per-bucket lock
		INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain); //initialize the per-bucket chain
	}


	cnt = tcp_hashinfo.ehash_mask + 1;

	tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
	sysctl_tcp_max_orphans = cnt / 2;
	sysctl_max_syn_backlog = max(128, cnt / 256);

	tcp_init_mem(&init_net);
	/* Set per-socket limits to no more than 1/128 the pressure threshold */
	limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7);
	max_wshare = min(4UL*1024*1024, limit);
	max_rshare = min(6UL*1024*1024, limit);

	sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
	sysctl_tcp_wmem[1] = 16*1024;
	sysctl_tcp_wmem[2] = max(64*1024, max_wshare);

	sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
	sysctl_tcp_rmem[1] = 87380;
	sysctl_tcp_rmem[2] = max(87380, max_rshare);

	pr_info("Hash tables configured (established %u bind %u)\n",
		tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
	//kernel log output from the line above: "TCP: Hash tables configured (established 512 bind 512)"

	tcp_metrics_init();

	tcp_register_congestion_control(&tcp_reno);

	tcp_tasklet_init();
}
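With the 512-bucket established table reported above, the derived defaults at the tail of tcp_init() work out as follows (just re-running the arithmetic; the values are specific to this machine):

	/* ehash_mask = 511, so:                                           */
	cnt = tcp_hashinfo.ehash_mask + 1;              /* = 512           */
	tcp_death_row.sysctl_max_tw_buckets = cnt / 2;  /* = 256           */
	sysctl_tcp_max_orphans = cnt / 2;               /* = 256           */
	sysctl_max_syn_backlog = max(128, cnt / 256);   /* = max(128, 2) = 128 */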

4. inet_csk_get_port()

4.1 Source analysis

Sections 1 to 3 above set the stage for this one; now let's walk through the source of inet_csk_get_port() itself.

int inet_csk_get_port(struct sock *sk, unsigned short snum)
{
	//Get the protocol's global hash container: hashinfo = tcp_hashinfo.
	//The pointer is installed in struct proto tcp_prot, and parts of tcp_hashinfo
	//are initialized in tcp_init(); see sections 1-3 above for how the pieces
	//fit together.
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; 
	struct inet_bind_hashbucket *head;
	struct inet_bind_bucket *tb;
	int ret, attempts = 5;
	struct net *net = sock_net(sk);
	int smallest_size = -1, smallest_rover;
	kuid_t uid = sock_i_uid(sk); //uid that owns this socket

	local_bh_disable();
	//No port specified (the application passed port 0 to bind()), so pick an
	//unused ephemeral port at random
	if (!snum) { //no port given
		int remaining, rover, low, high;

again:
		inet_get_local_port_range(&low, &high); //local port range from /proc/sys/net/ipv4/ip_local_port_range (typically 32768-61000, configurable)
		remaining = (high - low) + 1; //number of ports in the range
		smallest_rover = rover = net_random() % remaining + low; //random starting port within the range

		smallest_size = -1;
		do {
			//skip ports reserved via ip_local_reserved_ports
			if (inet_is_reserved_local_port(rover)) 
				goto next_nolock; //reserved: move on to the next port (++rover)

			//hash the port to find the bind-hash bucket (head) it belongs to
			head = &hashinfo->bhash[inet_bhashfn(net, rover,
					hashinfo->bhash_size)];

			/* lock the bucket */ 
			spin_lock(&head->lock);

			/* Walk the bucket's chain; inet_bind_bucket_for_each uses container_of
			   to recover each struct inet_bind_bucket (tb) from its node member */
			inet_bind_bucket_for_each(tb, &head->chain)
				 /* the port is already in use in this netns: check whether it may be shared */
				if (net_eq(ib_net(tb), net) && tb->port == rover) { 
					if (((tb->fastreuse > 0 && //the bucket allows fast reuse
					      sk->sk_reuse && //and this socket set SO_REUSEADDR
					      sk->sk_state != TCP_LISTEN) || //and it is not listening
					     (tb->fastreuseport > 0 &&
					      sk->sk_reuseport &&
					      uid_eq(tb->fastuid, uid))) && //or SO_REUSEPORT with a matching uid
					    (tb->num_owners < smallest_size || smallest_size == -1)) { 
						smallest_size = tb->num_owners; /* remember how many owners this port has */  
						smallest_rover = rover; /* remember this candidate port */

						/* If most local ports are already bound, check this one for a bind conflict right away */  
						if (atomic_read(&hashinfo->bsockets) > (high - low) + 1 &&
						    !inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) {
							snum = smallest_rover; /* no conflict, use this port */  
							goto tb_found;
						}
					}

					/* check for a bind conflict: can this port be reused? */ 
					if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) {
						snum = rover;
						goto tb_found;
					}
					goto next; /* this port cannot be reused, try the next one */  
				}

			/* Found a port with no bucket at all: leave the loop */  
			break; //no conflict possible; a new inet_bind_bucket is created for it below
		next:
			spin_unlock(&head->lock);
		next_nolock:
			if (++rover > high)
				rover = low;
		} while (--remaining > 0);

		/* Exhausted local port range during search?  It is not
		 * possible for us to be holding one of the bind hash
		 * locks if this test triggers, because if 'remaining'
		 * drops to zero, we broke out of the do/while loop at
		 * the top level, not from the 'break;' statement.
		 */
		ret = 1;
		if (remaining <= 0) {
			if (smallest_size != -1) {
				snum = smallest_rover;
				goto have_snum;
			}
			goto fail;
		}
		/* OK, here is the one we will use.  HEAD is
		 * non-NULL and we hold it's mutex.
		 */
		snum = rover; /* the automatically chosen port */
	} else { /* the application specified a port */ 
have_snum: //we arrive here with a concrete port number
		/* The caller supplied a port (or we settled on one above):
		 1. inet_bhashfn(net, snum, hashinfo->bhash_size): compute the index into the bind hash table
		 2. head = &hashinfo->bhash[*]: the struct inet_bind_hashbucket this port hashes to
		 3. inet_bind_bucket_for_each(tb, &head->chain): walk the bucket's chain (an hlist) of
		 	already-bound ports; container_of recovers each struct inet_bind_bucket (tb) from its
		 	node member. See inet_bind_bucket_create() below for how ports get added to this chain.
		 4. net_eq(ib_net(tb), net) && tb->port == snum: same network namespace (the one the socket
		 	was created in) and the same port number?
		*/
		head = &hashinfo->bhash[inet_bhashfn(net, snum,
				hashinfo->bhash_size)];
		spin_lock(&head->lock);
		inet_bind_bucket_for_each(tb, &head->chain)
			if (net_eq(ib_net(tb), net) && tb->port == snum) //does this bucket hold the requested port in our netns?
				goto tb_found; /* the port is already in use */ 
	}
	tb = NULL;
	goto tb_not_found;
tb_found:
	 /* there are sockets already bound to this port */  
	if (!hlist_empty(&tb->owners)) { //an empty owners list would mean the bucket is unused
		/* forced reuse: bind regardless of any conflict! */ 
		if (sk->sk_reuse == SK_FORCE_REUSE)
			goto success;

		//decide from the socket options whether the port can be fast-reused
		if (((tb->fastreuse > 0 &&
		      sk->sk_reuse && sk->sk_state != TCP_LISTEN) ||
		     (tb->fastreuseport > 0 &&
		      sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
		    smallest_size == -1) {  /* the case where the port was explicitly specified */
			goto success;
		} else {
			ret = 1;
			if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) {  /* bind conflict */
				/* An automatically chosen port turned out to conflict: retry, at most 5 times.
                 * (Arguably this if is unnecessary, since the auto-selection path already
                 * checked for conflicts before jumping to tb_found.)
                 */
				if (((sk->sk_reuse && sk->sk_state != TCP_LISTEN) ||
				     (tb->fastreuseport > 0 &&
				      sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
				    smallest_size != -1 && --attempts >= 0) {
					spin_unlock(&head->lock);
					goto again;
				}

				goto fail_unlock;
			}
		}
	}
tb_not_found: //no bucket for this port exists yet in the hash table
	ret = 1;
	/* allocate and initialize an inet_bind_bucket for this port */  
	if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep,
					net, head, snum)) == NULL)
		goto fail_unlock;
	if (hlist_empty(&tb->owners)) { //owners was just initialized empty in inet_bind_bucket_create()
		if (sk->sk_reuse && sk->sk_state != TCP_LISTEN) //sk->sk_reuse is set in inet_create() / via SO_REUSEADDR
			tb->fastreuse = 1;
		else
			tb->fastreuse = 0;
		if (sk->sk_reuseport) { //SO_REUSEPORT is set
			tb->fastreuseport = 1; 
			tb->fastuid = uid; //remember the owning uid
		} else
			tb->fastreuseport = 0;
	} else {
		if (tb->fastreuse && //fast reuse was allowed so far
		    (!sk->sk_reuse || sk->sk_state == TCP_LISTEN)) //but this socket forbids reuse or is listening
			tb->fastreuse = 0; //so disable it
		if (tb->fastreuseport &&
		    (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid))) //no SO_REUSEPORT or a different uid
			tb->fastreuseport = 0;
	}
success:
	/* record the inet_bind_bucket in the icsk */ 
	if (!inet_csk(sk)->icsk_bind_hash) //not attached to a bucket yet; inet_bind_hash() below does it
		inet_bind_hash(sk, tb, snum); //important: adds sk to tb->owners and sets icsk_bind_hash = tb
	WARN_ON(inet_csk(sk)->icsk_bind_hash != tb);
	ret = 0;

fail_unlock:
	spin_unlock(&head->lock);
fail:
	local_bh_enable();
	return ret;
}
EXPORT_SYMBOL_GPL(inet_csk_get_port);

The function's main job is to take the local port handed down from the protocol layer's bind() and first decide whether a port was actually specified.

a. If no port was given (port 0), pick one automatically from the range returned by inet_get_local_port_range(); otherwise use the port the application passed in through the system call (a user-space sketch of both cases follows point f below).

b. With a concrete port (chosen automatically or supplied by the caller), look it up in the bind hash table to see whether it is already in use and, if so, whether the requesting uid matches; if the port is unused go to d., otherwise go to c.

		head = &hashinfo->bhash[inet_bhashfn(net, snum,
				hashinfo->bhash_size)];
		spin_lock(&head->lock);
		inet_bind_bucket_for_each(tb, &head->chain)
			if (net_eq(ib_net(tb), net) && tb->port == snum) //does this bucket hold the requested port in our netns?
				goto tb_found; /* the port is already in use */ 

c. If the port's tb (struct inet_bind_bucket) is in use, decide whether the port may be reused and then run the bind-conflict check:

	if (!hlist_empty(&tb->owners)) { //an empty owners list would mean the bucket is unused
		/* forced reuse: bind regardless of any conflict! */ 
		if (sk->sk_reuse == SK_FORCE_REUSE)
			goto success;

d. If the port is not in use yet, allocate an inet_bind_bucket, store the requested port in it, and link it onto the bucket's chain (this is the structure pictured in section 4.2 below):

	/* allocate and initialize an inet_bind_bucket for this port */  
	if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep,
					net, head, snum)) == NULL)
struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
						 struct net *net,
						 struct inet_bind_hashbucket *head,
						 const unsigned short snum)
{
	struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC);

	if (tb != NULL) {
		write_pnet(&tb->ib_net, hold_net(net)); //record the network namespace
		tb->port      = snum; //the bound port
		tb->fastreuse = 0;
		tb->fastreuseport = 0;
		tb->num_owners = 0;
		INIT_HLIST_HEAD(&tb->owners); //owners starts out empty
		hlist_add_head(&tb->node, &head->chain); //link the new bucket into the hash chain head->chain
	}
	return tb;
}

e. Once the bucket exists, inet_bind_hash(sk, tb, snum) links the sock onto tb->owners (using sk->sk_bind_node as the list node) and records tb in inet_csk(sk)->icsk_bind_hash, as sketched below.
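For reference, the 3.10-era helper in net/ipv4/inet_hashtables.c looks roughly like this (abridged):

void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
		    const unsigned short snum)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;

	atomic_inc(&hashinfo->bsockets);	/* one more bound socket overall */

	inet_sk(sk)->inet_num = snum;		/* remember the local port in the sock */
	sk_add_bind_node(sk, &tb->owners);	/* link sk->sk_bind_node onto tb->owners */
	tb->num_owners++;
	inet_csk(sk)->icsk_bind_hash = tb;	/* the sock now points back at its bucket */
}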

f. At this point the binding is complete: the port lives in the bind hash table and the sock is attached to it.
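From user space, the two cases in point a. correspond simply to binding port 0 versus a concrete port. A minimal illustration (a hypothetical test program, not part of the kernel source):

#include <arpa/inet.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	struct sockaddr_in addr;
	socklen_t len = sizeof(addr);
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_addr.s_addr = htonl(INADDR_ANY);
	addr.sin_port = 0;	/* port 0: the !snum branch picks an unused ephemeral port */

	if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) == 0) {
		getsockname(fd, (struct sockaddr *)&addr, &len);
		printf("kernel chose port %u\n", ntohs(addr.sin_port));
	}
	/* Setting a non-zero sin_port instead takes the have_snum branch and
	 * can fail with EADDRINUSE if inet_csk_get_port() reports a conflict. */
	close(fd);
	return 0;
}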

4.2 Relationship between sock, inet_bind_hashbucket and inet_bind_bucket

[Figure: sock, inet_bind_hashbucket and inet_bind_bucket as linked by inet_csk_get_port()]

The key mechanism here is recovering a structure's address from a pointer to one of its members:

inet_bind_bucket_for_each(tb, &head->chain)
struct inet_bind_hashbucket *head;
struct inet_bind_bucket *tb;
struct inet_bind_hashbucket {
	spinlock_t		lock;
	struct hlist_head	chain;
};
struct inet_bind_bucket {
#ifdef CONFIG_NET_NS
	struct net		*ib_net; //network namespace this binding belongs to
#endif
	unsigned short		port; //the bound port number
	signed char		fastreuse; //fast-reuse flag, initialized to 0
	signed char		fastreuseport; //fast SO_REUSEPORT flag, initialized to 0
	kuid_t			fastuid;
	int			num_owners; //number of owning socks, initialized to 0
	struct hlist_node	node;
	struct hlist_head	owners;
};
inet_bind_bucket_for_each(tb, &head->chain)
#define inet_bind_bucket_for_each(tb, head) \
	hlist_for_each_entry(tb, head, node)
#define hlist_for_each_entry(pos, head, member)				\
	for (pos = hlist_entry_safe((head)->first, typeof(*(pos)), member);\
	     pos;							\
	     pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member))
#define hlist_entry_safe(ptr, type, member) \
	({ typeof(ptr) ____ptr = (ptr); \
	   ____ptr ? hlist_entry(____ptr, type, member) : NULL; \
	})
#define hlist_entry(ptr, type, member) container_of(ptr,type,member)
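hlist_entry() ultimately expands to container_of(), which subtracts the member's offset within the structure from the member pointer to get back to the enclosing structure. Roughly as defined in include/linux/kernel.h:

#define container_of(ptr, type, member) ({			\
	const typeof(((type *)0)->member) *__mptr = (ptr);	\
	(type *)((char *)__mptr - offsetof(type, member)); })

So for each node stored on head->chain, container_of(node, struct inet_bind_bucket, node) yields the tb whose port inet_csk_get_port() compares against.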



