欢迎您访问程序员文章站本站旨在为大家提供分享程序员计算机编程知识!
您现在的位置是: 首页

应用层发送一个数据包的时候,是如何到达网卡的(上)

程序员文章站 2022-04-24 15:07:39
...

数据包首先从tcp层进行处理,对应的函数是tcp_write

/*
 *	This routine copies from a user buffer into a socket,
 *	and starts the transmit system.
 */

static int tcp_write(struct sock *sk, unsigned char *from,
	  int len, int nonblock, unsigned flags)
{
	int copied = 0;
	int copy;
	int tmp;
	struct sk_buff *skb;
	struct sk_buff *send_tmp;
	unsigned char *buff;
	struct proto *prot;
	struct device *dev = NULL;

	sk->inuse=1;
	prot = sk->prot;
	while(len > 0) 
	{
		if (sk->err) 
		{			/* Stop on an error */
			release_sock(sk);
			if (copied) 
				return(copied);
			tmp = -sk->err;
			sk->err = 0;
			return(tmp);
		}

		/*
		 *	First thing we do is make sure that we are established. 
		 */
		// 关闭了只能读不能写
		if (sk->shutdown & SEND_SHUTDOWN) 
		{
			release_sock(sk);
			sk->err = EPIPE;
			if (copied) 
				return(copied);
			sk->err = 0;
			return(-EPIPE);
		}

		/* 
		 *	Wait for a connection to finish.
		 */
		// 处于不能写状态,close_wait是可写不可读,因为对端已经关闭了写		
		while(sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT) 
		{
			if (sk->err) 
			{
				release_sock(sk);
				if (copied) 
					return(copied);
				tmp = -sk->err;
				sk->err = 0;
				return(tmp);
			}
			// syn和syn_recv状态的时候可以写,重复发包,否则是出错状态
			if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV) 
			{
				release_sock(sk);
				if (copied) 
					return(copied);

				if (sk->err) 
				{
					tmp = -sk->err;
					sk->err = 0;
					return(tmp);
				}
				// 长连接 
				if (sk->keepopen) 
				{
					send_sig(SIGPIPE, current, 0);
				}
				return(-EPIPE);
			}

			if (nonblock || copied) 
			{
				release_sock(sk);
				if (copied) 
					return(copied);
				return(-EAGAIN);
			}

			release_sock(sk);
			cli();
		
			if (sk->state != TCP_ESTABLISHED &&
		    		sk->state != TCP_CLOSE_WAIT && sk->err == 0) 
		    	{
				interruptible_sleep_on(sk->sleep);
				if (current->signal & ~current->blocked) 
				{
					sti();
					if (copied) 
						return(copied);
					return(-ERESTARTSYS);
				}
			}
			sk->inuse = 1;
			sti();
		}

	/*
	 * The following code can result in copy <= if sk->mss is ever
	 * decreased.  It shouldn't be.  sk->mss is min(sk->mtu, sk->max_window).
	 * sk->mtu is constant once SYN processing is finished.  I.e. we
	 * had better not get here until we've seen his SYN and at least one
	 * valid ack.  (The SYN sets sk->mtu and the ack sets sk->max_window.)
	 * But ESTABLISHED should guarantee that.  sk->max_window is by definition
	 * non-decreasing.  Note that any ioctl to set user_mss must be done
	 * before the exchange of SYN's.  If the initial ack from the other
	 * end has a window of 0, max_window and thus mss will both be 0.
	 */

	/* 
	 *	Now we need to check if we have a half built packet. 
	 */
		// 先看是否有小块的数据被缓存起来,是的话先取出skb,不需要立刻发送的话再入队
		if ((skb = tcp_dequeue_partial(sk)) != NULL) 
		{
		        int hdrlen;

		         /* IP header + TCP header */
			hdrlen = ((unsigned long)skb->h.th - (unsigned long)skb->data)
			         + sizeof(struct tcphdr);
	
			/* Add more stuff to the end of skb->len */
			// 不是紧急数据,则把数据追加到缓存的小包数据后面,是紧急数据则先把小包数据发出去,然后下一个循环再发普通数据
			if (!(flags & MSG_OOB)) 
			{	
				// mss-数据长度等于还可以传多少长度的数据
				copy = min(sk->mss - (skb->len - hdrlen), len);
				/* FIXME: this is really a bug. */
				if (copy <= 0) 
				{
			  		printk("TCP: **bug**: \"copy\" <= 0!!\n");
			  		copy = 0;
				}
	  			// 把用户的数据赋值copy长度个字节到数据包的数据部分
				memcpy_fromfs(skb->data + skb->len, from, copy);
				// 更新skb的data字段使用了多少字节
				skb->len += copy;
				// 下次复制的首地址
				from += copy;
				// 已复制的字节长度
				copied += copy;
				// 还有多少字节需要复制
				len -= copy;
				// 下一个发送的字节的***大小
				sk->write_seq += copy;
			}
			// 数据部分大于等于mss或者是带外数据或者还没有发出去一个数据包则直接发送
			if ((skb->len - hdrlen) >= sk->mss ||
				(flags & MSG_OOB) || !sk->packets_out)
				tcp_send_skb(sk, skb);
			else
				// 继续缓存,满足条件后一起发送
				tcp_enqueue_partial(skb, sk);
			continue;
		}

	/*
	 * We also need to worry about the window.
 	 * If window < 1/2 the maximum window we've seen from this
 	 *   host, don't use it.  This is sender side
 	 *   silly window prevention, as specified in RFC1122.
 	 *   (Note that this is different than earlier versions of
 	 *   SWS prevention, e.g. RFC813.).  What we actually do is 
	 *   use the whole MSS.  Since the results in the right
	 *   edge of the packet being outside the window, it will
	 *   be queued for later rather than sent.
	 */
		// 可发送的序列化最大值 - 下一个可写的序列化值等于可以发送的字节数
		copy = sk->window_seq - sk->write_seq;
		if (copy <= 0 || copy < (sk->max_window >> 1) || copy > sk->mss)
			copy = sk->mss;
		// 能发送的比需要发送的大,则取需要发送的
		if (copy > len)
			copy = len;

	/*
	 *	We should really check the window here also. 
	 */
	 
		send_tmp = NULL;
		// 不是紧急数据并且也小于mss,则需要缓存到partial队列,否则直接发送
		if (copy < sk->mss && !(flags & MSG_OOB)) 
		{
			/*
			 *	We will release the socket in case we sleep here. 
			 */
			release_sock(sk);
			/*
			 *	NB: following must be mtu, because mss can be increased.
			 *	mss is always <= mtu 
			 */
			skb = prot->wmalloc(sk, sk->mtu + 128 + prot->max_header, 0, GFP_KERNEL);
			sk->inuse = 1;
			send_tmp = skb;
		} 
		else 
		{
			/*
			 *	We will release the socket in case we sleep here. 
			 */
			release_sock(sk);
			skb = prot->wmalloc(sk, copy + prot->max_header , 0, GFP_KERNEL);
  			sk->inuse = 1;
		}

		/*
		 *	If we didn't get any memory, we need to sleep. 
		 */
		// 没有写空间了
		if (skb == NULL) 
		{
			sk->socket->flags |= SO_NOSPACE;
			// 非阻塞直接返回已经写入的字节
			if (nonblock) 
			{
				release_sock(sk);
				if (copied) 
					return(copied);
				return(-EAGAIN);
			}

			/*
			 *	FIXME: here is another race condition. 
			 */

			tmp = sk->wmem_alloc;
			// 这个函数会处理收到的数据包,如果收到ack包则会腾出写空间
			release_sock(sk);
			cli();
			/*
			 *	Again we will try to avoid it. 
			 */
			// 处于可写状态但是没有写空间,则阻塞
			if (tmp <= sk->wmem_alloc &&
				  (sk->state == TCP_ESTABLISHED||sk->state == TCP_CLOSE_WAIT)
				&& sk->err == 0) 
			{
				sk->socket->flags &= ~SO_NOSPACE;
				interruptible_sleep_on(sk->sleep);
				if (current->signal & ~current->blocked) 
				{
					sti();
					if (copied) 
						return(copied);
					return(-ERESTARTSYS);
				}
			}
			sk->inuse = 1;
			sti();
			continue;
		}

		skb->len = 0;
		skb->sk = sk;
		skb->free = 0;
		skb->localroute = sk->localroute|(flags&MSG_DONTROUTE);
	
		buff = skb->data;
	
		/*
		 * FIXME: we need to optimize this.
		 * Perhaps some hints here would be good.
		 */
		// 构建ip头和mac头,返回ip头+mac头的长度的大小
		tmp = prot->build_header(skb, sk->saddr, sk->daddr, &dev,
				 IPPROTO_TCP, sk->opt, skb->mem_len,sk->ip_tos,sk->ip_ttl);
		if (tmp < 0 ) 
		{
			prot->wfree(sk, skb->mem_addr, skb->mem_len);
			release_sock(sk);
			if (copied) 
				return(copied);
			return(tmp);
		}
		// 更新data中的数据长度
		skb->len += tmp;
		skb->dev = dev;
		// 指向可写地址,准备写入tcp头
		buff += tmp;
		// skb的tcp头指向data字段的tcp头
		skb->h.th =(struct tcphdr *) buff;
		// 构建tcp头,len-copy表示是否已经传输完len字节的数据,用于设置push标记
		tmp = tcp_build_header((struct tcphdr *)buff, sk, len-copy);
		if (tmp < 0) 
		{
			prot->wfree(sk, skb->mem_addr, skb->mem_len);
			release_sock(sk);
			if (copied) 
				return(copied);
			return(tmp);
		}
		// 带外数据
		if (flags & MSG_OOB) 
		{	// 设置urg标记位,设置紧急指针指向紧急数据的后面一个字节
			((struct tcphdr *)buff)->urg = 1;
			((struct tcphdr *)buff)->urg_ptr = ntohs(copy);
		}
		// 更新skb->data中的数据长度
		skb->len += tmp;
		// 复制copy个字节到tcp头后面成为tcp报文的负载
		memcpy_fromfs(buff+tmp, from, copy);
		// 更新需要复制的数据地址
		from += copy;
		// 复制字节数累加
		copied += copy;
		// 还有多少个字节需要复制
		len -= copy;
		// 更新skb->data的数据长度
		skb->len += copy;
		skb->free = 0;
		// 更新下一个tcp报文的序列化
		sk->write_seq += copy;
		// 数据量太少并且不是紧急数据,并且有待确认的包(nagle算法规则),则先缓存
		if (send_tmp != NULL && sk->packets_out) 
		{
			tcp_enqueue_partial(send_tmp, sk);
			continue;
		}
		// 否则直接发送
		tcp_send_skb(sk, skb);
	}
	sk->err = 0;

/*
 *	Nagle's rule. Turn Nagle off with TCP_NODELAY for highly
 *	interactive fast network servers. It's meant to be on and
 *	it really improves the throughput though not the echo time
 *	on my slow slip link - Alan
 */

/*
 *	Avoid possible race on send_tmp - c/o Johannes Stille 
 */
	// 符合nagle算法条件或者没有开启nagle算法且***合法则发送
	if(sk->partial && ((!sk->packets_out) 
     /* If not nagling we can send on the before case too.. */
	      || (sk->nonagle && before(sk->write_seq , sk->window_seq))
      	))
  		tcp_send_partial(sk);

	release_sock(sk);
	return(copied);
}

ip层发送函数,进行了数据包的缓存处理


/*
 * Queues a packet to be sent, and starts the transmitter
 * if necessary.  if free = 1 then we free the block after
 * transmit, otherwise we don't. If free==2 we not only
 * free the block but also don't assign a new ip seq number.
 * This routine also needs to put in the total length,
 * and compute the checksum
 */

void ip_queue_xmit(struct sock *sk, struct device *dev,
	      struct sk_buff *skb, int free)
{
	struct iphdr *iph;
	unsigned char *ptr;

	/* Sanity check */
	if (dev == NULL)
	{
		printk("IP: ip_queue_xmit dev = NULL\n");
		return;
	}

	IS_SKB(skb);

	/*
	 *	Do some book-keeping in the packet for later
	 */


	skb->dev = dev;
	// 发送时间
	skb->when = jiffies;

	/*
	 *	Find the IP header and set the length. This is bad
	 *	but once we get the skb data handling code in the
	 *	hardware will push its header sensibly and we will
	 *	set skb->ip_hdr to avoid this mess and the fixed
	 *	header length problem
	 */

	ptr = skb->data;
	ptr += dev->hard_header_len;
	iph = (struct iphdr *)ptr;
	skb->ip_hdr = iph;
	// 整个ip头和数据的长度
	iph->tot_len = ntohs(skb->len-dev->hard_header_len);

#ifdef CONFIG_IP_FIREWALL
	if(ip_fw_chk(iph, dev, ip_fw_blk_chain, ip_fw_blk_policy, 0) != 1)
		/* just don't send this packet */
		return;
#endif	

	/*
	 *	No reassigning numbers to fragments...
	 */
	// 用于重组分片的id
	if(free!=2)
		iph->id      = htons(ip_id_count++);
	else
		free=1;

	/* All buffers without an owner socket get freed */
	if (sk == NULL)
		free = 1;

	skb->free = free;

	/*
	 *	Do we need to fragment. Again this is inefficient.
	 *	We need to somehow lock the original buffer and use
	 *	bits of it.
	 */
	// 数据包大小mtu则分片处理
	if(skb->len > dev->mtu + dev->hard_header_len)
	{
		ip_fragment(sk,skb,dev,0);
		IS_SKB(skb);
		kfree_skb(skb,FREE_WRITE);
		return;
	}

	/*
	 *	Add an IP checksum
	 */
	// ip层校验和
	ip_send_check(iph);

	/*
	 *	Print the frame when debugging
	 */

	/*
	 *	More debugging. You cannot queue a packet already on a list
	 *	Spot this and moan loudly.
	 */
	if (skb->next != NULL)
	{
		printk("ip_queue_xmit: next != NULL\n");
		skb_unlink(skb);
	}

	/*
	 *	If a sender wishes the packet to remain unfreed
	 *	we add it to his send queue. This arguably belongs
	 *	in the TCP level since nobody else uses it. BUT
	 *	remember IPng might change all the rules.
	 */
	// free等于0说明这个包要缓存
	if (!free)
	{
		unsigned long flags;
		/* The socket now has more outstanding blocks */
		// 发送但还没收到确认的数据包数量
		sk->packets_out++;

		/* Protect the list for a moment */
		save_flags(flags);
		cli();

		if (skb->link3 != NULL)
		{
			printk("ip.c: link3 != NULL\n");
			skb->link3 = NULL;
		}
		// 插入已发送但未确认队列,用于超时重传
		if (sk->send_head == NULL)
		{
			sk->send_tail = skb;
			sk->send_head = skb;
		}
		else
		{
			sk->send_tail->link3 = skb;
			sk->send_tail = skb;
		}
		/* skb->link3 is NULL */

		/* Interrupt restore */
		restore_flags(flags);
	}
	else
		/* Remember who owns the buffer */
		skb->sk = sk;

	/*
	 *	If the indicated interface is up and running, send the packet.
	 */
	 
	ip_statistics.IpOutRequests++;
#ifdef CONFIG_IP_ACCT
	ip_acct_cnt(iph,dev, ip_acct_chain);
#endif	
	
#ifdef CONFIG_IP_MULTICAST	

	/*
	 *	Multicasts are looped back for other local users
	 */
	 
	if (MULTICAST(iph->daddr) && !(dev->flags&IFF_LOOPBACK))
	{
		if(sk==NULL || sk->ip_mc_loop)
		{
			if(iph->daddr==IGMP_ALL_HOSTS)
				ip_loopback(dev,skb);
			else
			{
				struct ip_mc_list *imc=dev->ip_mc_list;
				while(imc!=NULL)
				{
					if(imc->multiaddr==iph->daddr)
					{
						ip_loopback(dev,skb);
						break;
					}
					imc=imc->next;
				}
			}
		}
		/* Multicasts with ttl 0 must not go beyond the host */
		
		if(skb->ip_hdr->ttl==0)
		{
			kfree_skb(skb, FREE_READ);
			return;
		}
	}
#endif
	if((dev->flags&IFF_BROADCAST) && iph->daddr==dev->pa_brdaddr && !(dev->flags&IFF_LOOPBACK))
		ip_loopback(dev,skb);
		
	if (dev->flags & IFF_UP)
	{
		/*
		 *	If we have an owner use its priority setting,
		 *	otherwise use NORMAL
		 */

		if (sk != NULL)
		{	
			// 调用mac层发送
			dev_queue_xmit(skb, dev, sk->priority);
		}
		else
		{
			dev_queue_xmit(skb, dev, SOPRI_NORMAL);
		}
	}
	else
	{
		ip_statistics.IpOutDiscards++;
		if (free)
			kfree_skb(skb, FREE_WRITE);
	}
}