应用层发送一个数据包的时候,是如何到达网卡的(上)
程序员文章站
2022-04-24 15:07:39
...
数据包首先从tcp层进行处理,对应的函数是tcp_write
/*
* This routine copies from a user buffer into a socket,
* and starts the transmit system.
*
* sk       - socket being written to
* from     - source buffer; it is read with memcpy_fromfs()
* len      - number of bytes the caller wants written
* nonblock - non-zero: return instead of sleeping when we would have to wait
* flags    - MSG_OOB and MSG_DONTROUTE are the bits examined here
*
* Returns the number of bytes copied so far whenever that count is non-zero
* (even if an error or signal cut the write short). When nothing was copied
* before a failure it returns a negative value instead: -sk->err, -EPIPE,
* -EAGAIN, -ERESTARTSYS, or the negative result of one of the header
* builders.
*/
static int tcp_write(struct sock *sk, unsigned char *from,
int len, int nonblock, unsigned flags)
{
int copied = 0;
int copy;
int tmp;
struct sk_buff *skb;
struct sk_buff *send_tmp;
unsigned char *buff;
struct proto *prot;
struct device *dev = NULL;
/* Mark the socket as in use; every exit path below that still holds it calls release_sock(). */
sk->inuse=1;
prot = sk->prot;
/* One iteration per segment built (or per wait), until all of len is consumed. */
while(len > 0)
{
if (sk->err)
{ /* Stop on an error */
release_sock(sk);
if (copied)
return(copied);
/* Report the pending error once and clear it. */
tmp = -sk->err;
sk->err = 0;
return(tmp);
}
/*
* First thing we do is make sure that we are established.
*/
/* The sending side has been shut down: the socket can only be read, not written. */
if (sk->shutdown & SEND_SHUTDOWN)
{
release_sock(sk);
/*
* When a partial count is returned, EPIPE is left pending in sk->err;
* it is only cleared when -EPIPE itself is returned.
*/
sk->err = EPIPE;
if (copied)
return(copied);
sk->err = 0;
return(-EPIPE);
}
/*
* Wait for a connection to finish.
*/
/*
* Not in a writable state yet. CLOSE_WAIT counts as writable: only the
* peer has closed its sending side, so we may still send.
*/
while(sk->state != TCP_ESTABLISHED && sk->state != TCP_CLOSE_WAIT)
{
if (sk->err)
{
release_sock(sk);
if (copied)
return(copied);
tmp = -sk->err;
sk->err = 0;
return(tmp);
}
/*
* Only SYN_SENT and SYN_RECV are worth waiting on (the connection is
* still being set up); any other state is an error.
*/
if (sk->state != TCP_SYN_SENT && sk->state != TCP_SYN_RECV)
{
release_sock(sk);
if (copied)
return(copied);
if (sk->err)
{
tmp = -sk->err;
sk->err = 0;
return(tmp);
}
/* If keepopen is set, also send SIGPIPE to the current task. */
if (sk->keepopen)
{
send_sig(SIGPIPE, current, 0);
}
return(-EPIPE);
}
/* We would have to wait: give up now if non-blocking or if something was already copied. */
if (nonblock || copied)
{
release_sock(sk);
if (copied)
return(copied);
return(-EAGAIN);
}
/* Release the socket before sleeping; the state is re-checked with interrupts off. */
release_sock(sk);
cli();
if (sk->state != TCP_ESTABLISHED &&
sk->state != TCP_CLOSE_WAIT && sk->err == 0)
{
interruptible_sleep_on(sk->sleep);
/* Woken by an unblocked signal: return without re-taking the socket. */
if (current->signal & ~current->blocked)
{
sti();
if (copied)
return(copied);
return(-ERESTARTSYS);
}
}
sk->inuse = 1;
sti();
}
/*
* The following code can result in copy <= 0 if sk->mss is ever
* decreased. It shouldn't be. sk->mss is min(sk->mtu, sk->max_window).
* sk->mtu is constant once SYN processing is finished. I.e. we
* had better not get here until we've seen his SYN and at least one
* valid ack. (The SYN sets sk->mtu and the ack sets sk->max_window.)
* But ESTABLISHED should guarantee that. sk->max_window is by definition
* non-decreasing. Note that any ioctl to set user_mss must be done
* before the exchange of SYN's. If the initial ack from the other
* end has a window of 0, max_window and thus mss will both be 0.
*/
/*
* Now we need to check if we have a half built packet.
*/
/*
* See whether a small, partially filled packet is cached. If so, take its
* skb off the partial queue; it is re-queued below when it does not have
* to be sent right away.
*/
if ((skb = tcp_dequeue_partial(sk)) != NULL)
{
int hdrlen;
/* IP header + TCP header */
hdrlen = ((unsigned long)skb->h.th - (unsigned long)skb->data)
+ sizeof(struct tcphdr);
/* Add more stuff to the end of skb->len */
/*
* Normal data is appended to the cached partial packet. For urgent
* (MSG_OOB) data nothing is appended: the partial packet is sent as
* is, and the urgent bytes are handled by a later loop iteration.
*/
if (!(flags & MSG_OOB))
{
/* Room left in the segment = mss minus the payload already in skb, capped by len. */
copy = min(sk->mss - (skb->len - hdrlen), len);
/* FIXME: this is really a bug. */
if (copy <= 0)
{
printk("TCP: **bug**: \"copy\" <= 0!!\n");
copy = 0;
}
/* Copy `copy` bytes of user data to the end of the packet's data area. */
memcpy_fromfs(skb->data + skb->len, from, copy);
/* Account for the bytes now used in skb->data. */
skb->len += copy;
/* Advance the source pointer for the next copy. */
from += copy;
/* Bytes copied so far. */
copied += copy;
/* Bytes still to be copied. */
len -= copy;
/* Advance the sequence number of the next byte to be written. */
sk->write_seq += copy;
}
/*
* Send right away if the payload has reached mss, if this is an OOB
* write, or if sk->packets_out is zero.
*/
if ((skb->len - hdrlen) >= sk->mss ||
(flags & MSG_OOB) || !sk->packets_out)
tcp_send_skb(sk, skb);
else
/* Otherwise keep it cached so it can be sent together with later data. */
tcp_enqueue_partial(skb, sk);
continue;
}
/*
* We also need to worry about the window.
* If window < 1/2 the maximum window we've seen from this
* host, don't use it. This is sender side
* silly window prevention, as specified in RFC1122.
* (Note that this is different than earlier versions of
* SWS prevention, e.g. RFC813.). What we actually do is
* use the whole MSS. Since the results in the right
* edge of the packet being outside the window, it will
* be queued for later rather than sent.
*/
/*
* Room between the next sequence number to write and window_seq
* (presumably the highest sequence number we may send -- confirm).
*/
copy = sk->window_seq - sk->write_seq;
if (copy <= 0 || copy < (sk->max_window >> 1) || copy > sk->mss)
copy = sk->mss;
/* Never take more than the caller still has to write. */
if (copy > len)
copy = len;
/*
* We should really check the window here also.
*/
send_tmp = NULL;
/*
* Less than a full mss and not urgent: allocate a full-sized buffer and
* remember it in send_tmp as a candidate for the partial queue.
* Otherwise allocate just enough for this one segment.
*/
if (copy < sk->mss && !(flags & MSG_OOB))
{
/*
* We will release the socket in case we sleep here.
*/
release_sock(sk);
/*
* NB: following must be mtu, because mss can be increased.
* mss is always <= mtu
*/
skb = prot->wmalloc(sk, sk->mtu + 128 + prot->max_header, 0, GFP_KERNEL);
sk->inuse = 1;
send_tmp = skb;
}
else
{
/*
* We will release the socket in case we sleep here.
*/
release_sock(sk);
skb = prot->wmalloc(sk, copy + prot->max_header , 0, GFP_KERNEL);
sk->inuse = 1;
}
/*
* If we didn't get any memory, we need to sleep.
*/
/* No write memory was available. */
if (skb == NULL)
{
sk->socket->flags |= SO_NOSPACE;
/* Non-blocking: return what has been written so far, or -EAGAIN if nothing. */
if (nonblock)
{
release_sock(sk);
if (copied)
return(copied);
return(-EAGAIN);
}
/*
* FIXME: here is another race condition.
*/
/* Snapshot wmem_alloc so we can tell below whether any write memory was freed. */
tmp = sk->wmem_alloc;
/*
* NOTE(review): release_sock() presumably processes queued incoming
* packets, so an ACK handled here may free write memory -- confirm.
*/
release_sock(sk);
cli();
/*
* Again we will try to avoid it.
*/
/*
* Still in a writable state, no error, and wmem_alloc has not dropped
* since the snapshot: sleep until woken.
*/
if (tmp <= sk->wmem_alloc &&
(sk->state == TCP_ESTABLISHED||sk->state == TCP_CLOSE_WAIT)
&& sk->err == 0)
{
sk->socket->flags &= ~SO_NOSPACE;
interruptible_sleep_on(sk->sleep);
/* Woken by an unblocked signal: return without re-taking the socket. */
if (current->signal & ~current->blocked)
{
sti();
if (copied)
return(copied);
return(-ERESTARTSYS);
}
}
sk->inuse = 1;
sti();
/* Retry from the top of the loop (errors and state are re-checked there). */
continue;
}
skb->len = 0;
skb->sk = sk;
skb->free = 0;
skb->localroute = sk->localroute|(flags&MSG_DONTROUTE);
buff = skb->data;
/*
* FIXME: we need to optimize this.
* Perhaps some hints here would be good.
*/
/*
* Build the lower-layer headers (IP and MAC, per the original annotation).
* A negative result is an error; otherwise the result is the header length
* that is added to skb->len and skipped in buff below.
*/
tmp = prot->build_header(skb, sk->saddr, sk->daddr, &dev,
IPPROTO_TCP, sk->opt, skb->mem_len,sk->ip_tos,sk->ip_ttl);
if (tmp < 0 )
{
prot->wfree(sk, skb->mem_addr, skb->mem_len);
release_sock(sk);
if (copied)
return(copied);
return(tmp);
}
/* Account for the headers just written into the data area. */
skb->len += tmp;
skb->dev = dev;
/* Move to the first free byte, where the TCP header goes. */
buff += tmp;
/* Record where the TCP header lives inside skb->data. */
skb->h.th =(struct tcphdr *) buff;
/*
* Build the TCP header. len-copy is the number of bytes that will remain
* after this segment (presumably used to decide the push flag -- confirm).
*/
tmp = tcp_build_header((struct tcphdr *)buff, sk, len-copy);
if (tmp < 0)
{
prot->wfree(sk, skb->mem_addr, skb->mem_len);
release_sock(sk);
if (copied)
return(copied);
return(tmp);
}
/* Out-of-band (urgent) data. */
if (flags & MSG_OOB)
{ /* Set the urg flag and store copy (byte-swapped) as the urgent pointer. */
((struct tcphdr *)buff)->urg = 1;
((struct tcphdr *)buff)->urg_ptr = ntohs(copy);
}
/* Account for the TCP header in skb->len. */
skb->len += tmp;
/* Copy `copy` bytes of user data right after the TCP header as the payload. */
memcpy_fromfs(buff+tmp, from, copy);
/* Advance the source pointer. */
from += copy;
/* Accumulate the number of bytes copied. */
copied += copy;
/* Bytes still to be copied. */
len -= copy;
/* Account for the payload in skb->len. */
skb->len += copy;
skb->free = 0;
/* Advance the sequence number for the next byte to be written. */
sk->write_seq += copy;
/*
* A small, non-urgent segment (send_tmp set) while sk->packets_out is
* non-zero: hold it on the partial queue instead of sending (Nagle's rule).
*/
if (send_tmp != NULL && sk->packets_out)
{
tcp_enqueue_partial(send_tmp, sk);
continue;
}
/* Otherwise send it straight away. */
tcp_send_skb(sk, skb);
}
sk->err = 0;
/*
* Nagle's rule. Turn Nagle off with TCP_NODELAY for highly
* interactive fast network servers. It's meant to be on and
* it really improves the throughput though not the echo time
* on my slow slip link - Alan
*/
/*
* Avoid possible race on send_tmp - c/o Johannes Stille
*/
/*
* Flush the pending partial packet if sk->packets_out is zero, or if
* nonagle is set and write_seq is still before window_seq.
*/
if(sk->partial && ((!sk->packets_out)
/* If not nagling we can send on the before case too.. */
|| (sk->nonagle && before(sk->write_seq , sk->window_seq))
))
tcp_send_partial(sk);
release_sock(sk);
return(copied);
}
IP层的发送函数是ip_queue_xmit:它填写IP头的总长度、id和校验和,必要时进行分片;对于需要保留的数据包(free为0),还会把它挂到套接字的发送队列上缓存起来,最后交给设备层发送。
/*
* Queues a packet to be sent, and starts the transmitter
* if necessary. if free = 1 then we free the block after
* transmit, otherwise we don't. If free==2 we not only
* free the block but also don't assign a new ip seq number.
* This routine also needs to put in the total length,
* and compute the checksum
*
* sk   - owning socket, or NULL (in which case free is forced to 1)
* dev  - device to transmit on; must not be NULL
* skb  - the packet; its data area starts with the link-level header,
*        followed by the IP header
* free - 0, 1 or 2, as described above
*/
void ip_queue_xmit(struct sock *sk, struct device *dev,
struct sk_buff *skb, int free)
{
struct iphdr *iph;
unsigned char *ptr;
/* Sanity check */
if (dev == NULL)
{
printk("IP: ip_queue_xmit dev = NULL\n");
return;
}
IS_SKB(skb);
/*
* Do some book-keeping in the packet for later
*/
skb->dev = dev;
/* Time stamp of this send. */
skb->when = jiffies;
/*
* Find the IP header and set the length. This is bad
* but once we get the skb data handling code in the
* hardware will push its header sensibly and we will
* set skb->ip_hdr to avoid this mess and the fixed
* header length problem
*/
ptr = skb->data;
ptr += dev->hard_header_len;
iph = (struct iphdr *)ptr;
skb->ip_hdr = iph;
/*
* Total length of IP header plus data: the frame length minus the
* link-level header.
* NOTE(review): ntohs() is applied to a host-order value here; htons()
* would express the intent -- confirm.
*/
iph->tot_len = ntohs(skb->len-dev->hard_header_len);
#ifdef CONFIG_IP_FIREWALL
if(ip_fw_chk(iph, dev, ip_fw_blk_chain, ip_fw_blk_policy, 0) != 1)
/*
* just don't send this packet
* NOTE(review): skb is not freed on this path -- confirm who owns it.
*/
return;
#endif
/*
* No reassigning numbers to fragments...
*/
/*
* Assign the datagram id (used for fragment reassembly) unless free == 2,
* in which case the id is left alone and free becomes plain 1.
*/
if(free!=2)
iph->id = htons(ip_id_count++);
else
free=1;
/* All buffers without an owner socket get freed */
if (sk == NULL)
free = 1;
skb->free = free;
/*
* Do we need to fragment. Again this is inefficient.
* We need to somehow lock the original buffer and use
* bits of it.
*/
/*
* The frame is larger than the device mtu plus the link-level header:
* hand it to ip_fragment() and free the original buffer.
*/
if(skb->len > dev->mtu + dev->hard_header_len)
{
ip_fragment(sk,skb,dev,0);
IS_SKB(skb);
kfree_skb(skb,FREE_WRITE);
return;
}
/*
* Add an IP checksum
*/
/* IP header checksum. */
ip_send_check(iph);
/*
* Print the frame when debugging
*/
/*
* More debugging. You cannot queue a packet already on a list
* Spot this and moan loudly.
*/
if (skb->next != NULL)
{
printk("ip_queue_xmit: next != NULL\n");
skb_unlink(skb);
}
/*
* If a sender wishes the packet to remain unfreed
* we add it to his send queue. This arguably belongs
* in the TCP level since nobody else uses it. BUT
* remember IPng might change all the rules.
*/
/* free == 0 means the sender wants this packet kept after transmission. */
if (!free)
{
unsigned long flags;
/* The socket now has more outstanding blocks */
/* One more packet sent but, per the original annotation, not yet acknowledged. */
sk->packets_out++;
/* Protect the list for a moment */
save_flags(flags);
cli();
if (skb->link3 != NULL)
{
printk("ip.c: link3 != NULL\n");
skb->link3 = NULL;
}
/*
* Append to the socket's send queue (send_head/send_tail, chained via
* link3); per the original annotation this queue holds sent but
* unacknowledged packets for retransmission on timeout.
*/
if (sk->send_head == NULL)
{
sk->send_tail = skb;
sk->send_head = skb;
}
else
{
sk->send_tail->link3 = skb;
sk->send_tail = skb;
}
/* skb->link3 is NULL */
/* Interrupt restore */
restore_flags(flags);
}
else
/* Remember who owns the buffer */
skb->sk = sk;
/*
* If the indicated interface is up and running, send the packet.
*/
ip_statistics.IpOutRequests++;
#ifdef CONFIG_IP_ACCT
ip_acct_cnt(iph,dev, ip_acct_chain);
#endif
#ifdef CONFIG_IP_MULTICAST
/*
* Multicasts are looped back for other local users
*/
if (MULTICAST(iph->daddr) && !(dev->flags&IFF_LOOPBACK))
{
/* Loop back when there is no socket or the socket has ip_mc_loop set. */
if(sk==NULL || sk->ip_mc_loop)
{
if(iph->daddr==IGMP_ALL_HOSTS)
ip_loopback(dev,skb);
else
{
/* Loop back only if the destination is on the device's multicast list. */
struct ip_mc_list *imc=dev->ip_mc_list;
while(imc!=NULL)
{
if(imc->multiaddr==iph->daddr)
{
ip_loopback(dev,skb);
break;
}
imc=imc->next;
}
}
}
/* Multicasts with ttl 0 must not go beyond the host */
if(skb->ip_hdr->ttl==0)
{
kfree_skb(skb, FREE_READ);
return;
}
}
#endif
/* A broadcast to the device's broadcast address on a non-loopback device is also looped back. */
if((dev->flags&IFF_BROADCAST) && iph->daddr==dev->pa_brdaddr && !(dev->flags&IFF_LOOPBACK))
ip_loopback(dev,skb);
if (dev->flags & IFF_UP)
{
/*
* If we have an owner use its priority setting,
* otherwise use NORMAL
*/
if (sk != NULL)
{
/* Hand the packet to the device (link) layer for transmission. */
dev_queue_xmit(skb, dev, sk->priority);
}
else
{
dev_queue_xmit(skb, dev, SOPRI_NORMAL);
}
}
else
{
/* Interface is down: count the discard and free the buffer if we were asked to free it. */
ip_statistics.IpOutDiscards++;
if (free)
kfree_skb(skb, FREE_WRITE);
}
}