
TCP/IP Study (30): Detailed Packet-Processing Flow at the L2 Data Link Layer

2014-07-17 09:48
Source: http://blog.chinaunix.net/uid-23629988-id-305200.html

The copyleft of this article belongs to gfree.wind@gmail.com. It is released under the GPL and may be freely copied and reposted. When reposting, please keep the document intact and credit the original author and link. Any commercial use is strictly prohibited.

Author: gfree.wind@gmail.com

Blog: linuxfocus.blog.chinaunix.net

In earlier posts I studied the packet flow from L2 up to L5, but owing to limits of time and of my own understanding at the time, that walkthrough did not go into much detail. A couple of days ago I went through the flow again and found quite a few details worth attention. So I decided to revisit, in detail, some of the material I skipped before.

Today's topic is the packet-processing mechanism of the L2 data link layer. In the Linux kernel, the NIC driver does the work of both the L1 physical layer and the L2 data link layer.

First, look at the function net_dev_init:

static int __init net_dev_init(void)
{
    int i, rc = -ENOMEM;

    BUG_ON(!dev_boot_phase);

    /* Create the corresponding /proc files, e.g. /proc/net/dev,
     * /proc/net/softnet_stat, etc.
     */
    if (dev_proc_init())
        goto out;

    /* Initialize the kobject for netdev */
    if (netdev_kobject_init())
        goto out;

    /* Initialize the link layer's tables of upper-layer handler types.
     * Recall from "TCP/IP Study (28): The Complete Packet Receive Flow"
     * that inet_init registers the IP packet type in this table.
     */
    INIT_LIST_HEAD(&ptype_all);
    for (i = 0; i < PTYPE_HASH_SIZE; i++)
        INIT_LIST_HEAD(&ptype_base[i]);

    /* Register the netdev_net_ops subsystem */
    if (register_pernet_subsys(&netdev_net_ops))
        goto out;

    /*
     * Initialise the packet receive queues.
     */
    /* For each CPU, initialize the per-CPU global variable softnet_data,
     * which serves as that CPU's receive backlog.
     */
    for_each_possible_cpu(i) {
        struct softnet_data *sd = &per_cpu(softnet_data, i);
        ...... ......
    }

    dev_boot_phase = 0;

    /* The loopback device is special if any other network devices
     * is present in a network namespace the loopback device must
     * be present. Since we now dynamically allocate and free the
     * loopback device ensure this invariant is maintained by
     * keeping the loopback device as the first device on the
     * list of network devices. Ensuring the loopback devices
     * is the first device that appears and the last network device
     * that disappears.
     */
    if (register_pernet_device(&loopback_net_ops))
        goto out;

    if (register_pernet_device(&default_device_ops))
        goto out;

    /* Enable the softirqs */
    open_softirq(NET_TX_SOFTIRQ, net_tx_action);
    open_softirq(NET_RX_SOFTIRQ, net_rx_action);

    hotcpu_notifier(dev_cpu_callback, 0);
    dst_init();
    dev_mcast_init();
    rc = 0;
out:
    return rc;
}

net_dev_init is called at system startup, before any NICs are registered; its main job is to set up the environment that net devices need.
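As a concrete example of how ptype_base gets populated: inet_init (in net/ipv4/af_inet.c) registers the IPv4 handler with dev_add_pack, roughly as follows (simplified; the real structure also fills in GSO/GRO callbacks):

    /* Simplified from net/ipv4/af_inet.c: the packet_type for IPv4 */
    static struct packet_type ip_packet_type __read_mostly = {
        .type = cpu_to_be16(ETH_P_IP), /* matched against skb->protocol */
        .func = ip_rcv,                /* the L3 entry point for IPv4 */
    };

    /* Called from inet_init(): hashes the entry into ptype_base */
    dev_add_pack(&ip_packet_type);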

As before, take the Intel PRO/1000 NIC driver as the example; e1000_init_module is the driver's entry point. Initialization proceeds through e1000_init_module->pci_register_driver->e1000_probe.
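This chain works because the driver's pci_driver structure names e1000_probe as its probe callback. In outline (fields trimmed):

    /* drivers/net/e1000/e1000_main.c, in outline */
    static struct pci_driver e1000_driver = {
        .name     = e1000_driver_name,
        .id_table = e1000_pci_tbl, /* PCI vendor/device IDs this driver claims */
        .probe    = e1000_probe,   /* called once for each matching NIC */
        ...... ......
    };

    static int __init e1000_init_module(void)
    {
        ...... ......
        return pci_register_driver(&e1000_driver);
    }
    module_init(e1000_init_module);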

In e1000_probe, the following statement binds the driver's operation functions:

netdev->netdev_ops = &e1000_netdev_ops;

static const struct net_device_ops e1000_netdev_ops = {
    .ndo_open = e1000_open,
    ...... ......
};

For today's topic, only e1000_open matters. It is called when the NIC is brought up, and it allocates resources and registers the interrupt handler, e1000_intr.
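The registration itself happens on the e1000_open path through a small e1000_request_irq helper; at its core is a standard request_irq call, roughly:

    /* On the e1000_open path (simplified): hook e1000_intr to the NIC's
     * interrupt line. The IRQ line is shared, and the net_device pointer
     * comes back to the handler as its "data" argument.
     */
    err = request_irq(adapter->pdev->irq, e1000_intr, IRQF_SHARED,
                      netdev->name, netdev);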

static irqreturn_t e1000_intr(int irq, void *data)
{
    ...... ......

    /* Check whether NAPI can be scheduled: if NAPI is not disabled and no
     * NAPI instance for this NIC is already running (guaranteeing that only
     * one NAPI instance per NIC runs at a time), a new NAPI poll can be
     * scheduled. NAPI is a newer packet-processing model for NICs,
     * essentially interrupt + poll. Ask Google for the details.
     */
    if (likely(napi_schedule_prep(&adapter->napi))) {
        /* Clear the per-run statistics.
         * When I first read this, I too wondered why "total" counters were
         * being zeroed. In fact these counters only cover a single NAPI
         * run; they are not the NIC's overall statistics. Those live in
         * netdev->stats, and when the NAPI run finishes, the values below
         * are added to them.
         */
        adapter->total_tx_bytes = 0;
        adapter->total_tx_packets = 0;
        adapter->total_rx_bytes = 0;
        adapter->total_rx_packets = 0;

        /* Request that the corresponding NAPI instance be scheduled */
        __napi_schedule(&adapter->napi);
    } else {
        /* this really should not happen! if it does it is basically a
         * bug, but not a hard error, so enable ints and continue
         */
        if (!test_bit(__E1000_DOWN, &adapter->flags))
            e1000_irq_enable(adapter);
    }

    return IRQ_HANDLED;
}

The above is the key path of the interrupt handler. "Requesting that the corresponding NAPI instance be scheduled" actually raises a softirq via __raise_softirq_irqoff(NET_RX_SOFTIRQ). So the main job of this interrupt handler is simply to ask for a NAPI poll, which differs from the traditional notion of an interrupt handler: by the textbook model, a NIC's interrupt handler should copy the packets from the NIC's buffer into a system buffer and then raise a softirq to do the remaining work.
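For reference, __napi_schedule itself is tiny: it queues the NAPI instance on the current CPU's softnet_data poll list and raises the softirq (slightly simplified here; in some kernel versions the queueing is factored out into a ____napi_schedule helper):

    void __napi_schedule(struct napi_struct *n)
    {
        unsigned long flags;

        local_irq_save(flags);
        /* Queue this NAPI instance on the current CPU's poll list ... */
        list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list);
        /* ... and mark NET_RX_SOFTIRQ pending so net_rx_action will run */
        __raise_softirq_irqoff(NET_RX_SOFTIRQ);
        local_irq_restore(flags);
    }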

Next, look at net_rx_action, the handler for the NET_RX_SOFTIRQ softirq:

static void net_rx_action(struct softirq_action *h)
{
    struct softnet_data *sd = &__get_cpu_var(softnet_data);
    unsigned long time_limit = jiffies + 2;
    int budget = netdev_budget;
    void *have;

    local_irq_disable();

    /* Poll, in order, every NIC that needs polling */
    while (!list_empty(&sd->poll_list)) {
        struct napi_struct *n;
        int work, weight;

        /* If softirq window is exhausted then punt.
         * Allow this to run for 2 jiffies since which will allow
         * an average latency of 1.5/HZ.
         */
        if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
            goto softnet_break;

        local_irq_enable();

        /* Even though interrupts have been re-enabled, this
         * access is safe because interrupts can only add new
         * entries to the tail of this list, and only ->poll()
         * calls can remove this head entry from the list.
         */
        /* Fetch one NIC's NAPI instance */
        n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);

        /* Lock this instance */
        have = netpoll_poll_lock(n);

        weight = n->weight;

        /* This NAPI_STATE_SCHED test is for avoiding a race
         * with netpoll's poll_napi().  Only the entity which
         * obtains the lock and sees NAPI_STATE_SCHED set will
         * actually make the ->poll() call.  Therefore we avoid
         * accidently calling ->poll() when NAPI is not scheduled.
         */
        work = 0;
        if (test_bit(NAPI_STATE_SCHED, &n->state)) {
            /* Poll this NIC */
            work = n->poll(n, weight);
            trace_napi_poll(n);
        }

        WARN_ON_ONCE(work > weight);

        budget -= work;

        local_irq_disable();

        /* Drivers must not modify the NAPI state if they
         * consume the entire weight.  In such cases this code
         * still "owns" the NAPI instance and therefore can
         * move the instance around on the list at-will.
         */
        if (unlikely(work == weight)) {
            /* This NAPI's weight is used up; move on to the next one */
            if (unlikely(napi_disable_pending(n))) {
                local_irq_enable();
                napi_complete(n);
                local_irq_disable();
            } else
                list_move_tail(&n->poll_list, &sd->poll_list);
        }

        netpoll_poll_unlock(have);
    }
out:
    net_rps_action_and_irq_enable(sd);

#ifdef CONFIG_NET_DMA
    /*
     * There may not be any more sk_buffs coming right now, so push
     * any pending DMA copies to hardware
     */
    dma_issue_pending_all();
#endif

    return;

softnet_break:
    sd->time_squeeze++;
    __raise_softirq_irqoff(NET_RX_SOFTIRQ);
    goto out;
}

From this softirq handler we jump back into each NIC's driver to study the corresponding poll function. For the driver discussed in this post, the poll function is e1000_clean->e1000_clean_rx_irq. This is the function that does the real work of processing the packets received by the NIC.
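How does the kernel know that n->poll is e1000_clean? The driver registers it in e1000_probe via netif_napi_add, with 64 as the per-poll weight:

    /* In e1000_probe: register e1000_clean as this adapter's NAPI poll
     * callback, with a weight (per-poll packet budget) of 64.
     */
    netif_napi_add(netdev, &adapter->napi, e1000_clean, 64);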

static bool e1000_clean_rx_irq(struct e1000_adapter *adapter,
                               struct e1000_rx_ring *rx_ring,
                               int *work_done, int work_to_do)
{
    ...... ......

    /* Get the buffer that needs processing now */
    i = rx_ring->next_to_clean;
    rx_desc = E1000_RX_DESC(*rx_ring, i);
    buffer_info = &rx_ring->buffer_info[i];

    while (rx_desc->status & E1000_RXD_STAT_DD) {
        struct sk_buff *skb;
        u8 status;

        /* If enough packets have been polled, break out and return */
        if (*work_done >= work_to_do)
            break;
        (*work_done)++;

        rmb(); /* read descriptor and rx_buffer_info after status DD */

        /* Get the skb associated with this packet buffer */
        status = rx_desc->status;
        skb = buffer_info->skb;
        buffer_info->skb = NULL;

        /* Then some hardware-specific handling and sanity checks */
        ...... ......

        /* Set skb->pkt_type (PACKET_BROADCAST, etc.) and assign
         * skb->protocol, the link-layer protocol type
         */
        skb->protocol = eth_type_trans(skb, netdev);

        /* Hand the packet to the upper layers, with some generic
         * link-layer processing on the way
         */
        e1000_receive_skb(adapter, status, rx_desc->special, skb);

next_desc:
        /* Process the next packet */
        ...... ......
    }

    /* Update statistics, etc. */
    ...... ......

    return cleaned;
}

In this function the packets are actually taken out of the NIC's receive ring, some hardware-specific processing is done, and a few fields of the packet are filled in. This completes the L1 work and the generic L2 header handling; at this point the packet is in the sk_buff form that the TCP/IP stack expects.
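eth_type_trans is where that generic L2 header work happens. A simplified sketch of it, for reference (the real function also handles 802.3/802.2 length-field frames and has a few more checks):

    __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev)
    {
        struct ethhdr *eth;

        skb->dev = dev;
        skb_reset_mac_header(skb);
        skb_pull(skb, ETH_HLEN); /* step past the Ethernet header */
        eth = eth_hdr(skb);

        /* Classify the frame by its destination MAC address */
        if (unlikely(is_multicast_ether_addr(eth->h_dest))) {
            if (!compare_ether_addr(eth->h_dest, dev->broadcast))
                skb->pkt_type = PACKET_BROADCAST;
            else
                skb->pkt_type = PACKET_MULTICAST;
        } else if (compare_ether_addr(eth->h_dest, dev->dev_addr))
            skb->pkt_type = PACKET_OTHERHOST;

        /* Ethernet II frames carry the upper-layer type in h_proto */
        if (ntohs(eth->h_proto) >= 1536)
            return eth->h_proto;

        /* 802.3/802.2 handling omitted */
        ...... ......
    }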

The call chain then continues: e1000_receive_skb->netif_receive_skb->__netif_receive_skb.

static int __netif_receive_skb(struct sk_buff *skb)
{
    struct packet_type *ptype, *pt_prev;
    rx_handler_func_t *rx_handler;
    struct net_device *orig_dev;
    struct net_device *master;
    struct net_device *null_or_orig;
    struct net_device *orig_or_bond;
    int ret = NET_RX_DROP;
    __be16 type;

    /* Timestamp the skb */
    if (!netdev_tstamp_prequeue)
        net_timestamp_check(skb);

    /* Hardware-accelerated VLAN handling */
    if (vlan_tx_tag_present(skb) && vlan_hwaccel_do_receive(skb))
        return NET_RX_SUCCESS;

    /* if we've gotten here through NAPI, check netpoll */
    if (netpoll_receive_skb(skb))
        return NET_RX_DROP;

    /* Set skb->skb_iif to the index of the receiving NIC */
    if (!skb->skb_iif)
        skb->skb_iif = skb->dev->ifindex;

    /*
     * bonding note: skbs received on inactive slaves should only
     * be delivered to pkt handlers that are exact matches.  Also
     * the deliver_no_wcard flag will be set.  If packet handlers
     * are sensitive to duplicate packets these skbs will need to
     * be dropped at the handler.  The vlan accel path may have
     * already set the deliver_no_wcard flag.
     */
    /* Handling for NIC bonding; I only know this feature loosely,
     * so it is skipped here.
     */
    null_or_orig = NULL;
    orig_dev = skb->dev;
    master = ACCESS_ONCE(orig_dev->master);
    if (skb->deliver_no_wcard)
        null_or_orig = orig_dev;
    else if (master) {
        if (skb_bond_should_drop(skb, master)) {
            skb->deliver_no_wcard = 1;
            null_or_orig = orig_dev; /* deliver only exact match */
        } else
            skb->dev = master;
    }

    __this_cpu_inc(softnet_data.processed);

    /* Initialize the L3 and L4 header offsets */
    skb_reset_network_header(skb);
    skb_reset_transport_header(skb);

    /* Get the MAC header length; strictly speaking, the L2 header length */
    skb->mac_len = skb->network_header - skb->mac_header;

    pt_prev = NULL;

    rcu_read_lock();

    /* Some less relevant code omitted */
    ...... ......

    /* Use the L2 protocol type as the key to find the matching hash chain */
    type = skb->protocol;
    list_for_each_entry_rcu(ptype,
            &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
        if (ptype->type == type && (ptype->dev == null_or_orig ||
            ptype->dev == skb->dev || ptype->dev == orig_dev ||
            ptype->dev == orig_or_bond)) {
            /* A matching protocol type was found; hand the skb up to L3 */
            if (pt_prev)
                ret = deliver_skb(skb, pt_prev, orig_dev);
            pt_prev = ptype;
        }
    }

    if (pt_prev) {
        ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
    } else {
        kfree_skb(skb);
        /* Jamal, now you will not able to escape explaining
         * me how you were going to use this. :-)
         */
        ret = NET_RX_DROP;
    }

out:
    rcu_read_unlock();
    return ret;
}

At this point we have studied the L2 packet-processing flow in fair detail. Of course, many, many details remain uncovered; the road ahead is still long.