Linux内核分析 - 网络:netif_receive_skb平台报文入口函数详解
2017-11-10 10:48
661 查看
网络收包流程从网卡驱动开始,一直往上,涉及NAPI、GRO、RPS等特性,但是一般最后都会调用__netif_receive_skb函数:
函数主要有几个处理:
1、vlan报文的处理,主要是循环把vlan头剥掉,如果qinq场景,两个vlan都会被剥掉;
2、交给rx_handler处理,例如OVS、linux bridge等;
3、ptype_all处理,例如抓包程序、raw socket等;
4、ptype_base处理,交给协议栈处理,例如ip、arp、rarp等;
/*
 * __netif_receive_skb() - core RX entry point into the protocol stacks.
 * Processing order: strip VLAN tag(s) (looping via another_round handles
 * QinQ), deliver to ptype_all taps (tcpdump, raw sockets), hand the skb
 * to the device rx_handler (bridge/OVS/bonding), then deliver to the
 * matching ptype_base protocol handler (IP, ARP, ...).
 * Returns a NET_RX_* code from the last delivery, NET_RX_DROP otherwise.
 */
static int __netif_receive_skb(struct sk_buff *skb)
{
struct packet_type *ptype, *pt_prev;
rx_handler_func_t *rx_handler;
struct net_device *orig_dev;
struct net_device *null_or_dev;
bool deliver_exact = false;
int ret = NET_RX_DROP;
__be16 type;
if (!netdev_tstamp_prequeue)
net_timestamp_check(skb);
trace_netif_receive_skb(skb);
if (netpoll_receive_skb(skb))
return NET_RX_DROP;
if (!skb->skb_iif)
skb->skb_iif = skb->dev->ifindex;
orig_dev = skb->dev;
skb_reset_network_header(skb); // point the L3/L4 header offsets at data; the L2 header has already been processed by now
skb_reset_transport_header(skb);
skb_reset_mac_len(skb);
pt_prev = NULL;
rcu_read_lock();
another_round:
__this_cpu_inc(softnet_data.processed);
if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
skb = vlan_untag(skb);
if (unlikely(!skb))
goto out;
}
#ifdef CONFIG_NET_CLS_ACT
if (skb->tc_verd
& TC_NCLS) {
skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
goto ncls;
}
#endif
list_for_each_entry_rcu(ptype, &ptype_all, list) {
// before protocol-specific delivery, run every handler registered on ptype_all
if (!ptype->dev || ptype->dev == skb->dev) {
// most commonly tcpdump: this is where it obtains every received packet
if (pt_prev)
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = ptype; // pt_prev is an optimization: each handler runs only when the next match is found
}
}
#ifdef CONFIG_NET_CLS_ACT
skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
if (!skb)
goto out;
ncls:
#endif
rx_handler = rcu_dereference(skb->dev->rx_handler);
// determined by the specific device setup (e.g. registered when enslaved to a bridge/OVS)
if (rx_handler) {
if (pt_prev) {
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = NULL;
}
switch (rx_handler(&skb)) {
case RX_HANDLER_CONSUMED:
goto out;
case RX_HANDLER_ANOTHER:
goto another_round;
case RX_HANDLER_EXACT:
deliver_exact = true;
/* fallthrough */
case RX_HANDLER_PASS:
break;
default:
BUG();
}
}
if (vlan_tx_tag_present(skb)) {
if (pt_prev) {
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = NULL;
}
if (vlan_do_receive(&skb)) {
ret = __netif_receive_skb(skb);
goto out;
} else if (unlikely(!skb))
goto out;
}
/* deliver only exact match when indicated */
null_or_dev = deliver_exact ? skb->dev : NULL;
type = skb->protocol;
list_for_each_entry_rcu(ptype,
&ptype_base[ntohs(type) &
PTYPE_HASH_MASK], list) {
if (ptype->type == type &&
(ptype->dev == null_or_dev || ptype->dev == skb->dev ||
ptype->dev == orig_dev)) {
if (pt_prev)
ret = deliver_skb(skb, pt_prev, orig_dev);
//atomic_inc(&skb->users);
pt_prev = ptype;
}
}
if (pt_prev) {
ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
// the final delivery takes no extra reference; the handler is called directly
} else {
atomic_long_inc(&skb->dev->rx_dropped);
kfree_skb(skb);
/* Jamal, now you will not able to escape explaining
* me how you were going to use this. :-)
*/
ret = NET_RX_DROP;
}
out:
rcu_read_unlock();
return ret;
}
该函数涉及两个全局变量:
/* Global registries filled by dev_add_pack(): ptype_base is a hash of
 * per-protocol handler lists; ptype_all holds ETH_P_ALL taps. */
static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
static struct list_head ptype_all __read_mostly; /* was missing the trailing semicolon */
看几个常见的packet_type,这些都在相应的协议初始化的时候调用dev_add_pack加入到特性的链表中:
/* IPv4 handler registered via dev_add_pack(): ip_rcv() receives
 * ETH_P_IP frames; the remaining callbacks serve GSO/GRO. */
static struct packet_type ip_packet_type __read_mostly = {
.type = cpu_to_be16(ETH_P_IP),
.func = ip_rcv,
.gso_send_check = inet_gso_send_check,
.gso_segment = inet_gso_segment,
.gro_receive = inet_gro_receive,
.gro_complete = inet_gro_complete,
};
/* ARP handler registered via dev_add_pack(): arp_rcv() receives
 * ETH_P_ARP frames. (Added the missing semicolon after the brace.) */
static struct packet_type arp_packet_type __read_mostly = {
.type = cpu_to_be16(ETH_P_ARP),
.func = arp_rcv,
};
在ip_rcv函数中会对L3头做一些有效性检测:
/*
 * ip_rcv() - IPv4 input handler (the ptype_base entry for ETH_P_IP).
 * Validates the L3 header (minimum length, version, checksum, total
 * length), trims link-layer padding, then passes the skb through the
 * NF_INET_PRE_ROUTING netfilter hook on its way to ip_rcv_finish().
 */
int ip_rcv(struct sk_buff *skb, struct
net_device *dev, struct packet_type *pt, struct
net_device *orig_dev)
{
const struct iphdr *iph;
u32 len;
/* When the interface is in promisc. mode, drop
all the crap
* that it receives, do not try to analyse
it.
*/
if (skb->pkt_type == PACKET_OTHERHOST)
// pkt_type was set by the driver from the destination MAC; frames not addressed to this host are dropped here
goto drop;
IP_UPD_PO_STATS_BH(dev_net(dev), IPSTATS_MIB_IN, skb->len);
if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) {
IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS);
goto out;
}
if (!pskb_may_pull(skb, sizeof(struct
iphdr)))
goto inhdr_error;
iph = ip_hdr(skb);
/*
* RFC1122: 3.2.1.2
MUST silently discard any IP frame that fails the checksum.
*
* Is the datagram acceptable?
*
* 1. Length at least the size of an ip header
* 2. Version of 4
* 3. Checksums correctly. [Speed
optimisation for later, skip loopback checksums]
* 4. Doesn't have a bogus length
*/
if (iph->ihl < 5 || iph->version != 4)
goto inhdr_error;
if (!pskb_may_pull(skb, iph->ihl*4))
goto inhdr_error;
iph = ip_hdr(skb);
if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
// verify the IP header checksum
goto inhdr_error;
len = ntohs(iph->tot_len);
// iph carries the true datagram size; skb->len comes from the driver and may be larger because short frames get padded
if (skb->len < len) {// e.g. r8169: a UDP packet with 1 payload byte has iph size 20+8+1=29, yet skb->len=46=64(min)-14-4(vlan)
IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INTRUNCATEDPKTS);
goto drop;
} else if (len < (iph->ihl*4))
goto inhdr_error;
/* Our transport medium may have padded the buffer out. Now we
know it
* is IP we can trim to the true length
of the frame.
* Note this now means skb->len holds
ntohs(iph->tot_len).
*/
if (pskb_trim_rcsum(skb, len)) {
// trim off the link-layer padding bytes
IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS);
goto drop;
}
/* Remove any debris in the socket control block */
memset(IPCB(skb), 0, sizeof(struct
inet_skb_parm));
/* Must drop socket now because of tproxy. */
skb_orphan(skb);
return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, skb, dev, NULL,
ip_rcv_finish);
inhdr_error:
IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INHDRERRORS);
drop:
kfree_skb(skb);
out:
return NET_RX_DROP;
}
然后调用ip_rcv_finish:
/*
 * ip_rcv_finish() - continuation after the PRE_ROUTING netfilter hook.
 * Performs the route lookup that decides between local delivery and
 * forwarding, processes IP options, updates mcast/bcast statistics,
 * then invokes the route's input function through dst_input().
 */
static int ip_rcv_finish(struct sk_buff *skb)
{
const struct iphdr *iph = ip_hdr(skb);
struct rtable *rt;
/*
* Initialise the virtual path cache for the packet. It describes
* how the packet travels inside Linux networking.
*/
if (skb_dst(skb) == NULL) {
int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,// route lookup: the destination address decides local delivery vs. forwarding (when forwarding is enabled)
iph->tos, skb->dev);
if (unlikely(err)) {
if (err == -EHOSTUNREACH)
IP_INC_STATS_BH(dev_net(skb->dev),
IPSTATS_MIB_INADDRERRORS);
else if (err == -ENETUNREACH)
IP_INC_STATS_BH(dev_net(skb->dev),
IPSTATS_MIB_INNOROUTES);
else if (err == -EXDEV)
NET_INC_STATS_BH(dev_net(skb->dev),
LINUX_MIB_IPRPFILTER);
goto drop;
}
}
#ifdef CONFIG_IP_ROUTE_CLASSID
if (unlikely(skb_dst(skb)->tclassid)) {
struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct);
u32 idx = skb_dst(skb)->tclassid;
st[idx&0xFF].o_packets++;
st[idx&0xFF].o_bytes += skb->len;
st[(idx>>16)&0xFF].i_packets++;
st[(idx>>16)&0xFF].i_bytes += skb->len;
}
#endif
if (iph->ihl > 5
&& ip_rcv_options(skb))
goto drop;
rt = skb_rtable(skb);
if (rt->rt_type == RTN_MULTICAST) {
IP_UPD_PO_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INMCAST,
skb->len);
} else if (rt->rt_type == RTN_BROADCAST)
IP_UPD_PO_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INBCAST,
skb->len);
return dst_input(skb); // skb_dst(skb)->input(skb); assigned during the route lookup — ip_local_deliver for local delivery
drop:
kfree_skb(skb);
return NET_RX_DROP;
}
/*
 * ip_local_deliver() - entry for packets routed to this host.
 * Reassembles fragments if necessary, then passes the skb through the
 * NF_INET_LOCAL_IN netfilter hook to ip_local_deliver_finish().
 */
int ip_local_deliver(struct sk_buff *skb)
{
/*
* Reassemble IP fragments.
*/
if (ip_is_fragment(ip_hdr(skb))) {
if (ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER))
return 0;
}
return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN, skb, skb->dev, NULL,
ip_local_deliver_finish);
}
略过ip defrag流程,直接调用ip_local_deliver_finish,该函数根据L3头指定的L4协议,调用特定的函数:
/*
 * ip_local_deliver_finish() - demultiplex to the L4 handler named in
 * the IP header (looked up in inet_protos, e.g. udp_rcv for UDP),
 * after first offering the packet to raw sockets.
 */
static int ip_local_deliver_finish(struct sk_buff *skb)
{
struct net *net = dev_net(skb->dev);
__skb_pull(skb, ip_hdrlen(skb));
// advance data past the L3 header; data now points at the L4 header
/* Point into the IP datagram, just
past the header. */
skb_reset_transport_header(skb);
rcu_read_lock();
{
int protocol = ip_hdr(skb)->protocol;
// L4 protocol number, e.g. TCP or UDP
int hash, raw;
const struct net_protocol *ipprot;
resubmit:
raw = raw_local_deliver(skb, protocol);
// raw-socket delivery happens first; `raw` records whether anyone matched
hash = protocol & (MAX_INET_PROTOS - 1);
ipprot = rcu_dereference(inet_protos[hash]);
// e.g. udp_protocol for UDP
if (ipprot != NULL) {
int ret;
if (!net_eq(net, &init_net) && !ipprot->netns_ok) {
if (net_ratelimit())
printk("%s: proto %d isn't netns-ready\n",
__func__, protocol);
kfree_skb(skb);
goto out;
}
if (!ipprot->no_policy) {
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
kfree_skb(skb);
goto out;
}
nf_reset(skb);
}
ret = ipprot->handler(skb);
// e.g. udp_rcv for UDP; a negative return requests resubmission as -protocol
if (ret < 0) {
protocol = -ret;
goto resubmit;
}
IP_INC_STATS_BH(net, IPSTATS_MIB_INDELIVERS);
} else {
if (!raw) {
if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
IP_INC_STATS_BH(net, IPSTATS_MIB_INUNKNOWNPROTOS);
icmp_send(skb, ICMP_DEST_UNREACH,
ICMP_PROT_UNREACH, 0);
}
} else
IP_INC_STATS_BH(net, IPSTATS_MIB_INDELIVERS);
kfree_skb(skb);
}
}
out:
rcu_read_unlock();
return 0;
}
udp调用udp_rcv,最后调用__udp4_lib_rcv:
/*
 * __udp4_lib_rcv() - shared UDP/UDP-Lite receive routine.
 * Validates the UDP header and checksum, looks up the destination
 * socket by address/port, queues the skb on it, or answers with an
 * ICMP port-unreachable when no socket matches.
 * NOTE(review): the drop/short_packet/csum_error labels are elided in
 * this excerpt of the kernel source.
 */
int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
int proto)
{
struct sock *sk;
struct udphdr *uh;
unsigned short ulen;
struct rtable *rt = skb_rtable(skb);
__be32 saddr, daddr;
struct net *net = dev_net(skb->dev);
/*
* Validate the packet.
*/
if (!pskb_may_pull(skb, sizeof(struct udphdr)))
goto drop; /* No space for header. */
uh = udp_hdr(skb);
ulen = ntohs(uh->len);
saddr = ip_hdr(skb)->saddr;
daddr = ip_hdr(skb)->daddr;
if (ulen > skb->len)
goto short_packet;
if (proto == IPPROTO_UDP) {
/* UDP validates ulen. */
if (ulen < sizeof(*uh) || pskb_trim_rcsum(skb, ulen))
goto short_packet;
uh = udp_hdr(skb);
}
if (udp4_csum_init(skb, uh, proto))
goto csum_error;
if (rt->rt_flags
& (RTCF_BROADCAST|RTCF_MULTICAST))
return __udp4_lib_mcast_deliver(net, skb, uh,
saddr, daddr, udptable);
sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable);
// look up the sock by IP addresses and port numbers;
// the receiving process sleeps on the socket's wait queue
if (sk != NULL) {
// non-NULL means some socket is waiting for this data
int ret = udp_queue_rcv_skb(sk, skb);
sock_put(sk);
/* a return value > 0 means to resubmit the input, but
* it wants the return to be -protocol, or 0
*/
if (ret > 0)
return -ret;
return 0;
}
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
goto drop;
nf_reset(skb);
/* No socket. Drop packet silently, if checksum
is wrong */
if (udp_lib_checksum_complete(skb))
goto csum_error;
UDP_INC_STATS_BH(net, UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE);
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
/*
* Hmm. We got an UDP packet to a port to which we
* don't wanna listen. Ignore it.
*/
kfree_skb(skb);
return 0;
}
首先看一下sock的hash查找函数:__udp4_lib_lookup_skb,该函数涉及hash表的一些查找,主要看一下具体的匹配函数:
/*
 * compute_score() - UDP socket-match scoring used by the hash lookup.
 * Returns -1 when sk cannot match at all; otherwise a score that grows
 * by 2 for each exactly-matching optional field (bound local address,
 * connected peer address, peer port, bound device), so the most
 * specific socket wins.
 */
static inline int compute_score(struct sock *sk, struct
net *net, __be32 saddr,
unsigned short hnum,
__be16 sport, __be32 daddr, __be16 dport, int dif)
{
int score = -1;
if (net_eq(sock_net(sk), net) && udp_sk(sk)->udp_port_hash == hnum &&
!ipv6_only_sock(sk)) {
struct inet_sock *inet = inet_sk(sk);
score = (sk->sk_family == PF_INET ? 1 : 0);
// usually PF_INET
if (inet->inet_rcv_saddr) {
// set when bind() named a specific address, otherwise INADDR_ANY
if (inet->inet_rcv_saddr != daddr)
return -1;
score += 2;
}
if (inet->inet_daddr) {
// usually 0; see inet_bind()
if (inet->inet_daddr != saddr)
return -1;
score += 2;
}
if (inet->inet_dport) {
// usually 0
if (inet->inet_dport != sport)
return -1;
score += 2;
}
if (sk->sk_bound_dev_if) {
// usually 0
if (sk->sk_bound_dev_if != dif)
return -1;
score += 2;
}
}
return score;
}
该函数使用端口号寻找hash表中项,然后根据各个参数决定score,score大于-1表示找到对应的sock
找到sock后,去掉一些有效性检测,udp_queue_rcv_skb的逻辑如下:
/* Core of udp_queue_rcv_skb() (validity checks elided): either queue
 * directly, or park the skb on the backlog while a user owns the sock.
 * (Added the semicolon missing from the last statement.) */
if (sk_rcvqueues_full(sk, skb))
// over the limit tracked in sk->sk_rmem_alloc
goto drop;
rc = 0;
bh_lock_sock(sk);
if (!sock_owned_by_user(sk))
rc = __udp_queue_rcv_skb(sk, skb);
else if (sk_add_backlog(sk, skb)) {
bh_unlock_sock(sk);
goto drop;
}
bh_unlock_sock(sk);
分成两种情况:
1)sk没有被人占用,则把skb加入sk_receive_queue,然后唤醒等待的进程。
2)如果sk被人占用,则把skb加入backlog链表,释放sk的时候会处理这种流程
先看第一种情况:
/*
 * sock_queue_rcv_skb() - append skb to sk->sk_receive_queue and wake
 * any reader. Returns 0 on success; -ENOMEM/-ENOBUFS when the receive
 * buffer limits are exceeded, or the sk_filter() error code.
 */
int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
int err;
int skb_len;
unsigned long flags;
struct sk_buff_head *list = &sk->sk_receive_queue;
// head of the socket's receive queue
/* Cast sk->rcvbuf to unsigned... It
is pointless, but reduces
number of warnings when compiling with -W --ANK
*/
if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
(unsigned)sk->sk_rcvbuf) {
atomic_inc(&sk->sk_drops);
trace_sock_rcvqueue_full(sk, skb);
return -ENOMEM;
}
err = sk_filter(sk, skb);
if (err)
return err;
if (!sk_rmem_schedule(sk, skb->truesize)) {
atomic_inc(&sk->sk_drops);
return -ENOBUFS;
}
skb->dev = NULL;
skb_set_owner_r(skb, sk);
/* Cache the SKB length before we tack it onto the receive
* queue. Once it is added it no longer belongs to us and
* may be freed by other threads of control pulling packets
* from the queue.
*/
skb_len = skb->len;
/* we escape from rcu protected region, make sure we dont leak
* a norefcounted dst
*/
skb_dst_force(skb);
spin_lock_irqsave(&list->lock, flags);
skb->dropcount = atomic_read(&sk->sk_drops);
__skb_queue_tail(list, skb);
spin_unlock_irqrestore(&list->lock, flags);
if (!sock_flag(sk, SOCK_DEAD))
sk->sk_data_ready(sk, skb_len);
// assigned in sock_init_data() as sock_def_readable
return 0;
}
/*
 * sock_def_readable() - default sk_data_ready callback: wakes any
 * sleepers on the socket's wait queue and signals async (SIGIO)
 * waiters that data arrived.
 */
static void sock_def_readable(struct sock *sk, int len)
{
struct socket_wq *wq;
rcu_read_lock();
wq = rcu_dereference(sk->sk_wq);
if (wq_has_sleeper(wq))
wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
POLLRDNORM | POLLRDBAND);
sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
rcu_read_unlock();
}
再看第二种情况,加入到对应的链表:
/*
 * __sk_add_backlog() - append skb to the socket backlog (singly linked
 * through skb->next) while a user context owns the socket; the list is
 * drained later by __release_sock().
 */
static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb)
{
/* dont let skb dst not refcounted, we are going to leave rcu lock */
skb_dst_force(skb);
if (!sk->sk_backlog.tail)
sk->sk_backlog.head = skb;
else
sk->sk_backlog.tail->next = skb;
sk->sk_backlog.tail = skb;
skb->next = NULL;
}
释放sock的时候会判断该链表:
/*
 * release_sock() - drop user ownership of the socket: first drain any
 * backlog accumulated while it was owned, then wake lock waiters.
 */
void release_sock(struct sock *sk)
{
/*
* The sk_lock has mutex_unlock() semantics:
*/
mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
spin_lock_bh(&sk->sk_lock.slock);
if (sk->sk_backlog.tail)
__release_sock(sk);
sk->sk_lock.owned = 0;
if (waitqueue_active(&sk->sk_lock.wq))
wake_up(&sk->sk_lock.wq);
spin_unlock_bh(&sk->sk_lock.slock);
}
__release_sock会遍历tail对应链表上的所有skb,分别调用sk_backlog_rcv函数:
/*
 * __release_sock() - drain the socket backlog: repeatedly detach the
 * whole list, drop the bh lock, and feed each skb to sk_backlog_rcv()
 * until no new entries appear.
 */
static void __release_sock(struct sock *sk)
__releases(&sk->sk_lock.slock)
__acquires(&sk->sk_lock.slock)
{
struct sk_buff *skb = sk->sk_backlog.head;
do {
sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
bh_unlock_sock(sk);
do {
struct sk_buff *next = skb->next;
WARN_ON_ONCE(skb_dst_is_noref(skb));
skb->next = NULL;
sk_backlog_rcv(sk, skb);
// sk->sk_backlog_rcv(sk, skb), i.e. sk->sk_prot->backlog_rcv
/*
* We are in process context here with softirqs
* disabled, use cond_resched_softirq() to preempt.
* This is safe to do because
we've taken the backlog
* queue private:
*/
cond_resched_softirq();
skb = next;
} while (skb != NULL);
bh_lock_sock(sk);
} while ((skb = sk->sk_backlog.head) != NULL);
/*
* Doing the zeroing here guarantee we can not loop forever
* while a wild producer attempts to flood us.
*/
sk->sk_backlog.len = 0;
}
对于udp为__udp_queue_rcv_skb:
/*
 * __udp_queue_rcv_skb() - queue skb onto a UDP socket through
 * ip_queue_rcv_skb()/sock_queue_rcv_skb(); on failure it bumps the
 * UDP MIB error counters and frees the skb.
 */
static int __udp_queue_rcv_skb(struct sock *sk, struct
sk_buff *skb)
{
int rc;
if (inet_sk(sk)->inet_daddr)
sock_rps_save_rxhash(sk, skb->rxhash);
rc = ip_queue_rcv_skb(sk, skb);
// ends up in sock_queue_rcv_skb(), i.e. the direct-queue case described earlier
if (rc < 0) {
int is_udplite = IS_UDPLITE(sk);
/* Note that an ENOMEM error is charged
twice */
if (rc == -ENOMEM)
UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS,
is_udplite);
UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
kfree_skb(skb);
trace_udp_fail_queue_rcv_skb(rc, sk);
return -1;
}
return 0;
}
一句话总结,对于udp而言,__netif_receive_skb把底层传上来的skb放到sock对应的sk_receive_queue链表中,然后唤醒等待数据的进程。
ARP报文处理:
在netif_receive_skb()函数中,可以看出处理的是像ARP、IP这些链路层以上的协议,那么,链路层报头是在哪里去掉的呢?答案是网卡驱动中,在调用netif_receive_skb()前,
skb->protocol = eth_type_trans(skb, bp->dev);
该函数处理后,skb->data跳过以太网报头,由mac_header指示以太网报头:
进入netif_receive_skb()函数
list_for_each_entry_rcu(ptype,&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list)
按照协议类型依次由相应的协议模块进行处理,而所有的协议模块处理都会注册在ptype_base中,实际是链表结构。
net/core/dev.c
static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
而相应的协议模块是通过dev_add_pack()函数加入的:
/*
 * dev_add_pack() - register a packet_type handler: ETH_P_ALL handlers
 * go on the ptype_all list, everything else on the ptype_base hash
 * bucket for its protocol type.
 */
void dev_add_pack(struct packet_type *pt)
{
int hash;
spin_lock_bh(&ptype_lock);
if (pt->type == htons(ETH_P_ALL))
list_add_rcu(&pt->list, &ptype_all);
else {
hash = ntohs(pt->type) & PTYPE_HASH_MASK;
list_add_rcu(&pt->list, &ptype_base[hash]);
}
spin_unlock_bh(&ptype_lock);
}
以ARP处理为例
该模块的定义,它会在arp_init()中注册进ptype_base链表中:
/* ARP handler: registered into ptype_base by arp_init() via dev_add_pack(). */
static struct packet_type arp_packet_type __read_mostly = {
.type = cpu_to_be16(ETH_P_ARP),
.func = arp_rcv,
};
然后根据报文的TYPE在ptype_base中查找相应协议模块进行处理时,实际调用arp_rcv()进行接收
arp_rcv() --> arp_process()
arp = arp_hdr(skb);
……
arp_ptr= (unsigned char *)(arp+1);
sha= arp_ptr;
arp_ptr += dev->addr_len;
memcpy(&sip, arp_ptr, 4);
arp_ptr += 4;
arp_ptr += dev->addr_len;
memcpy(&tip, arp_ptr, 4);
操作后这指针位置:
然后判断是ARP请求报文,这时先查询路由表ip_route_input()
if (arp->ar_op == htons(ARPOP_REQUEST) &&
ip_route_input(skb, tip, sip, 0, dev) == 0)
在ip_route_input()函数中,先在cache中查询是否存在相应的路由表项:
hash = rt_hash(daddr, saddr, iif, rt_genid(net));
缓存的路由项在内核中组织成hash表的形式,因此在查询时,先算出hash值,再遍历该项对应的链表rt_hash_table[hash].chain即可。这里可以看到,缓存路由项包括了源IP地址、目的IP地址、网卡号。
如果在缓存中没有查到匹配项,或指定不查询cache,则查询路由表ip_route_input_slow();
进入ip_route_input_slow()函数,最终调用fib_lookup()得到查询结果fib_result
if ((err = fib_lookup(net, &fl, &res)) != 0)
如果结果fib_result合法,则需要更新路由缓存,将此次查询结果写入缓存
hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
err = rt_intern_hash(hash, rth, NULL, skb, fl.iif);
在查找完路由表后,回到arp_process()函数,如果路由项指向本地,则应由本机接收该报文:
if (addr_type == RTN_LOCAL) {
……
if (!dont_send) {
n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
if (n) {
arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha);
neigh_release(n);
}
}
goto out;
}
首先更新邻居表neigh_event_ns(),然后发送ARP响应 –
arp_send。
至此,大致的ARP流程完成。由于ARP部分涉及到路由表以及邻居表,这都是很大的概念。
函数主要有几个处理:
1、vlan报文的处理,主要是循环把vlan头剥掉,如果qinq场景,两个vlan都会被剥掉;
2、交给rx_handler处理,例如OVS、linux bridge等;
3、ptype_all处理,例如抓包程序、raw socket等;
4、ptype_base处理,交给协议栈处理,例如ip、arp、rarp等;
static int __netif_receive_skb(struct sk_buff *skb)
{
struct packet_type *ptype, *pt_prev;
rx_handler_func_t *rx_handler;
struct net_device *orig_dev;
struct net_device *null_or_dev;
bool deliver_exact = false;
int ret = NET_RX_DROP;
__be16 type;
if (!netdev_tstamp_prequeue)
net_timestamp_check(skb);
trace_netif_receive_skb(skb);
if (netpoll_receive_skb(skb))
return NET_RX_DROP;
if (!skb->skb_iif)
skb->skb_iif = skb->dev->ifindex;
orig_dev = skb->dev;
skb_reset_network_header(skb); //把L3、L4的头都指向data数据结构,到这里的时候skb已经处理完L2层的头了
skb_reset_transport_header(skb);
skb_reset_mac_len(skb);
pt_prev = NULL;
rcu_read_lock();
another_round:
__this_cpu_inc(softnet_data.processed);
if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
skb = vlan_untag(skb);
if (unlikely(!skb))
goto out;
}
#ifdef CONFIG_NET_CLS_ACT
if (skb->tc_verd
& TC_NCLS) {
skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
goto ncls;
}
#endif
list_for_each_entry_rcu(ptype, &ptype_all, list) {
//把包交给特定协议相关的处理函数前,先调用ptype_all中注册的函数
if (!ptype->dev || ptype->dev == skb->dev) {
//最常见的为tcpdump,该工具就是从这里拿到所有收到的包的
if (pt_prev)
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = ptype; //pt_prev的加入是为了优化,只有当找到下一个匹配的时候,才执行这一次的回调函数
}
}
#ifdef CONFIG_NET_CLS_ACT
skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
if (!skb)
goto out;
ncls:
#endif
rx_handler = rcu_dereference(skb->dev->rx_handler);
//由具体驱动决定
if (rx_handler) {
if (pt_prev) {
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = NULL;
}
switch (rx_handler(&skb)) {
case RX_HANDLER_CONSUMED:
goto out;
case RX_HANDLER_ANOTHER:
goto another_round;
case RX_HANDLER_EXACT:
deliver_exact = true;
case RX_HANDLER_PASS:
break;
default:
BUG();
}
}
if (vlan_tx_tag_present(skb)) {
if (pt_prev) {
ret = deliver_skb(skb, pt_prev, orig_dev);
pt_prev = NULL;
}
if (vlan_do_receive(&skb)) {
ret = __netif_receive_skb(skb);
goto out;
} else if (unlikely(!skb))
goto out;
}
/* deliver only exact match when indicated */
null_or_dev = deliver_exact ? skb->dev : NULL;
type = skb->protocol;
list_for_each_entry_rcu(ptype,
&ptype_base[ntohs(type) &
PTYPE_HASH_MASK], list) {
if (ptype->type == type &&
(ptype->dev == null_or_dev || ptype->dev == skb->dev ||
ptype->dev == orig_dev)) {
if (pt_prev)
ret = deliver_skb(skb, pt_prev, orig_dev);
//atomic_inc(&skb->users);
pt_prev = ptype;
}
}
if (pt_prev) {
ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
//一般的最后这一次没有引用计数的增加,直接调用函数
} else {
atomic_long_inc(&skb->dev->rx_dropped);
kfree_skb(skb);
/* Jamal, now you will not able to escape explaining
* me how you were going to use this. :-)
*/
ret = NET_RX_DROP;
}
out:
rcu_read_unlock();
return ret;
}
该函数涉及两个全局变量:
/* Global registries filled by dev_add_pack(): ptype_base is a hash of
 * per-protocol handler lists; ptype_all holds ETH_P_ALL taps. */
static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
static struct list_head ptype_all __read_mostly; /* was missing the trailing semicolon */
看几个常见的packet_type,这些都在相应的协议初始化的时候调用dev_add_pack加入到特性的链表中:
static struct packet_type ip_packet_type __read_mostly = {
.type = cpu_to_be16(ETH_P_IP),
.func = ip_rcv,
.gso_send_check = inet_gso_send_check,
.gso_segment = inet_gso_segment,
.gro_receive = inet_gro_receive,
.gro_complete = inet_gro_complete,
};
/* ARP handler registered via dev_add_pack(): arp_rcv() receives
 * ETH_P_ARP frames. (Added the missing semicolon after the brace.) */
static struct packet_type arp_packet_type __read_mostly = {
.type = cpu_to_be16(ETH_P_ARP),
.func = arp_rcv,
};
在ip_rcv函数中会对L3头做一些有效性检测:
int ip_rcv(struct sk_buff *skb, struct
net_device *dev, struct packet_type *pt, struct
net_device *orig_dev)
{
const struct iphdr *iph;
u32 len;
/* When the interface is in promisc. mode, drop
all the crap
* that it receives, do not try to analyse
it.
*/
if (skb->pkt_type == PACKET_OTHERHOST)
//驱动根据MAC地址设置的,如果MAC地址不是本机的话,在这里丢弃。
goto drop;
IP_UPD_PO_STATS_BH(dev_net(dev), IPSTATS_MIB_IN, skb->len);
if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) {
IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS);
goto out;
}
if (!pskb_may_pull(skb, sizeof(struct
iphdr)))
goto inhdr_error;
iph = ip_hdr(skb);
/*
* RFC1122: 3.2.1.2
MUST silently discard any IP frame that fails the checksum.
*
* Is the datagram acceptable?
*
* 1. Length at least the size of an ip header
* 2. Version of 4
* 3. Checksums correctly. [Speed
optimisation for later, skip loopback checksums]
* 4. Doesn't have a bogus length
*/
if (iph->ihl < 5 || iph->version != 4)
goto inhdr_error;
if (!pskb_may_pull(skb, iph->ihl*4))
goto inhdr_error;
iph = ip_hdr(skb);
if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
//校验ip头是否正确
goto inhdr_error;
len = ntohs(iph->tot_len);
//iph中的大小是真正的大小,skb中len的大小是驱动中设置的,当包很小的时候,会进行填充,因此会比iph中的大
if (skb->len < len) {//以r8169为例,如果收到udp的包负载为1,则iph中的大小为20+8+1=29。但是此时skb->len=46=64(min)-14-4(vlan)
IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INTRUNCATEDPKTS);
goto drop;
} else if (len < (iph->ihl*4))
goto inhdr_error;
/* Our transport medium may have padded the buffer out. Now we
know it
* is IP we can trim to the true length
of the frame.
* Note this now means skb->len holds
ntohs(iph->tot_len).
*/
if (pskb_trim_rcsum(skb, len)) {
//去除填充的数据
IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS);
goto drop;
}
/* Remove any debris in the socket control block */
memset(IPCB(skb), 0, sizeof(struct
inet_skb_parm));
/* Must drop socket now because of tproxy. */
skb_orphan(skb);
return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, skb, dev, NULL,
ip_rcv_finish);
inhdr_error:
IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INHDRERRORS);
drop:
kfree_skb(skb);
out:
return NET_RX_DROP;
}
然后调用ip_rcv_finish:
static int ip_rcv_finish(struct sk_buff *skb)
{
const struct iphdr *iph = ip_hdr(skb);
struct rtable *rt;
/*
* Initialise the virtual path cache for the packet. It describes
* how the packet travels inside Linux networking.
*/
if (skb_dst(skb) == NULL) {
int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,//路由寻找,根据目的地址判断是本地接收还是转发(使能forward的话)
iph->tos, skb->dev);
if (unlikely(err)) {
if (err == -EHOSTUNREACH)
IP_INC_STATS_BH(dev_net(skb->dev),
IPSTATS_MIB_INADDRERRORS);
else if (err == -ENETUNREACH)
IP_INC_STATS_BH(dev_net(skb->dev),
IPSTATS_MIB_INNOROUTES);
else if (err == -EXDEV)
NET_INC_STATS_BH(dev_net(skb->dev),
LINUX_MIB_IPRPFILTER);
goto drop;
}
}
#ifdef CONFIG_IP_ROUTE_CLASSID
if (unlikely(skb_dst(skb)->tclassid)) {
struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct);
u32 idx = skb_dst(skb)->tclassid;
st[idx&0xFF].o_packets++;
st[idx&0xFF].o_bytes += skb->len;
st[(idx>>16)&0xFF].i_packets++;
st[(idx>>16)&0xFF].i_bytes += skb->len;
}
#endif
if (iph->ihl > 5
&& ip_rcv_options(skb))
goto drop;
rt = skb_rtable(skb);
if (rt->rt_type == RTN_MULTICAST) {
IP_UPD_PO_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INMCAST,
skb->len);
} else if (rt->rt_type == RTN_BROADCAST)
IP_UPD_PO_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INBCAST,
skb->len);
return dst_input(skb); //skb_dst(skb)->input(skb);路由寻找过程中赋值,本地接收的话为:ip_local_deliver
drop:
kfree_skb(skb);
return NET_RX_DROP;
}
int ip_local_deliver(struct sk_buff *skb)
{
/*
* Reassemble IP fragments.
*/
if (ip_is_fragment(ip_hdr(skb))) {
if (ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER))
return 0;
}
return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN, skb, skb->dev, NULL,
ip_local_deliver_finish);
}
略过ip defrag流程,直接调用ip_local_deliver_finish,该函数根据L3头指定的L4协议,调用特定的函数:
static int ip_local_deliver_finish(struct sk_buff *skb)
{
struct net *net = dev_net(skb->dev);
__skb_pull(skb, ip_hdrlen(skb));
//增加data,略过L3头,此时data指向L4头
/* Point into the IP datagram, just
past the header. */
skb_reset_transport_header(skb);
rcu_read_lock();
{
int protocol = ip_hdr(skb)->protocol;
//L4类型,如TCP或者UDP
int hash, raw;
const struct net_protocol *ipprot;
resubmit:
raw = raw_local_deliver(skb, protocol);
//
hash = protocol & (MAX_INET_PROTOS - 1);
ipprot = rcu_dereference(inet_protos[hash]);
//udp_protocol
if (ipprot != NULL) {
int ret;
if (!net_eq(net, &init_net) && !ipprot->netns_ok) {
if (net_ratelimit())
printk("%s: proto %d isn't netns-ready\n",
__func__, protocol);
kfree_skb(skb);
goto out;
}
if (!ipprot->no_policy) {
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
kfree_skb(skb);
goto out;
}
nf_reset(skb);
}
ret = ipprot->handler(skb);
//udp_rcv
if (ret < 0) {
protocol = -ret;
goto resubmit;
}
IP_INC_STATS_BH(net, IPSTATS_MIB_INDELIVERS);
} else {
if (!raw) {
if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
IP_INC_STATS_BH(net, IPSTATS_MIB_INUNKNOWNPROTOS);
icmp_send(skb, ICMP_DEST_UNREACH,
ICMP_PROT_UNREACH, 0);
}
} else
IP_INC_STATS_BH(net, IPSTATS_MIB_INDELIVERS);
kfree_skb(skb);
}
}
out:
rcu_read_unlock();
return 0;
}
udp调用udp_rcv,最后调用__udp4_lib_rcv:
int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
int proto)
{
struct sock *sk;
struct udphdr *uh;
unsigned short ulen;
struct rtable *rt = skb_rtable(skb);
__be32 saddr, daddr;
struct net *net = dev_net(skb->dev);
/*
* Validate the packet.
*/
if (!pskb_may_pull(skb, sizeof(struct udphdr)))
goto drop; /* No space for header. */
uh = udp_hdr(skb);
ulen = ntohs(uh->len);
saddr = ip_hdr(skb)->saddr;
daddr = ip_hdr(skb)->daddr;
if (ulen > skb->len)
goto short_packet;
if (proto == IPPROTO_UDP) {
/* UDP validates ulen. */
if (ulen < sizeof(*uh) || pskb_trim_rcsum(skb, ulen))
goto short_packet;
uh = udp_hdr(skb);
}
if (udp4_csum_init(skb, uh, proto))
goto csum_error;
if (rt->rt_flags
& (RTCF_BROADCAST|RTCF_MULTICAST))
return __udp4_lib_mcast_deliver(net, skb, uh,
saddr, daddr, udptable);
sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable);
//根据ip地址以及端口号查找对应的sock数据结构
//接收进程在对应的链表中睡眠
if (sk != NULL) {
//不为空说明有对应的进程在等待这数据
int ret = udp_queue_rcv_skb(sk, skb);
sock_put(sk);
/* a return value > 0 means to resubmit the input, but
* it wants the return to be -protocol, or 0
*/
if (ret > 0)
return -ret;
return 0;
}
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
goto drop;
nf_reset(skb);
/* No socket. Drop packet silently, if checksum
is wrong */
if (udp_lib_checksum_complete(skb))
goto csum_error;
UDP_INC_STATS_BH(net, UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE);
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
/*
* Hmm. We got an UDP packet to a port to which we
* don't wanna listen. Ignore it.
*/
kfree_skb(skb);
return 0;
}
首先看一下sock的hash查找函数:__udp4_lib_lookup_skb,该函数涉及hash表的一些查找,主要看一下具体的匹配函数:
static inline int compute_score(struct sock *sk, struct
net *net, __be32 saddr,
unsigned short hnum,
__be16 sport, __be32 daddr, __be16 dport, int dif)
{
int score = -1;
if (net_eq(sock_net(sk), net) && udp_sk(sk)->udp_port_hash == hnum &&
!ipv6_only_sock(sk)) {
struct inet_sock *inet = inet_sk(sk);
score = (sk->sk_family == PF_INET ? 1 : 0);
//一般为PF_INET
if (inet->inet_rcv_saddr) {
//bind指定地址的话有设置,否则为INADDR_ANY
if (inet->inet_rcv_saddr != daddr)
return -1;
score += 2;
}
if (inet->inet_daddr) {
//一般为0,参考inet_bind函数
if (inet->inet_daddr != saddr)
return -1;
score += 2;
}
if (inet->inet_dport) {
//一般为0
if (inet->inet_dport != sport)
return -1;
score += 2;
}
if (sk->sk_bound_dev_if) {
//一般为0
if (sk->sk_bound_dev_if != dif)
return -1;
score += 2;
}
}
return score;
}
该函数使用端口号寻找hash表中项,然后根据各个参数决定score,score大于-1表示找到对应的sock
找到sock后,去掉一些有效性检测,udp_queue_rcv_skb的逻辑如下:
/* Core of udp_queue_rcv_skb() (validity checks elided): either queue
 * directly, or park the skb on the backlog while a user owns the sock.
 * (Added the semicolon missing from the last statement.) */
if (sk_rcvqueues_full(sk, skb))
// over the limit tracked in sk->sk_rmem_alloc
goto drop;
rc = 0;
bh_lock_sock(sk);
if (!sock_owned_by_user(sk))
rc = __udp_queue_rcv_skb(sk, skb);
else if (sk_add_backlog(sk, skb)) {
bh_unlock_sock(sk);
goto drop;
}
bh_unlock_sock(sk);
分成两种情况:
1)sk没有被人占用,则把skb加入sk_receive_queue,然后唤醒等待的进程。
2)如果sk被人占用,则把skb加入backlog链表,释放sk的时候会处理这种流程
先看第一种情况:
int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
int err;
int skb_len;
unsigned long flags;
struct sk_buff_head *list = &sk->sk_receive_queue;
//获取链表头
/* Cast sk->rcvbuf to unsigned... It
is pointless, but reduces
number of warnings when compiling with -W --ANK
*/
if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
(unsigned)sk->sk_rcvbuf) {
atomic_inc(&sk->sk_drops);
trace_sock_rcvqueue_full(sk, skb);
return -ENOMEM;
}
err = sk_filter(sk, skb);
if (err)
return err;
if (!sk_rmem_schedule(sk, skb->truesize)) {
atomic_inc(&sk->sk_drops);
return -ENOBUFS;
}
skb->dev = NULL;
skb_set_owner_r(skb, sk);
/* Cache the SKB length before we tack it onto the receive
* queue. Once it is added it no longer belongs to us and
* may be freed by other threads of control pulling packets
* from the queue.
*/
skb_len = skb->len;
/* we escape from rcu protected region, make sure we dont leak
* a norefcounted dst
*/
skb_dst_force(skb);
spin_lock_irqsave(&list->lock, flags);
skb->dropcount = atomic_read(&sk->sk_drops);
__skb_queue_tail(list, skb);
spin_unlock_irqrestore(&list->lock, flags);
if (!sock_flag(sk, SOCK_DEAD))
sk->sk_data_ready(sk, skb_len);
//sock_init_data初始化的时候赋值为:sock_def_readable
return 0;
}
static void sock_def_readable(struct sock *sk, int len)
{
struct socket_wq *wq;
rcu_read_lock();
wq = rcu_dereference(sk->sk_wq);
if (wq_has_sleeper(wq))
wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
POLLRDNORM | POLLRDBAND);
sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
rcu_read_unlock();
}
再看第二种情况,加入到对应的链表:
static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb)
{
/* dont let skb dst not refcounted, we are going to leave rcu lock */
skb_dst_force(skb);
if (!sk->sk_backlog.tail)
sk->sk_backlog.head = skb;
else
sk->sk_backlog.tail->next = skb;
sk->sk_backlog.tail = skb;
skb->next = NULL;
}
释放sock的时候会判断该链表:
void release_sock(struct sock *sk)
{
/*
* The sk_lock has mutex_unlock() semantics:
*/
mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
spin_lock_bh(&sk->sk_lock.slock);
if (sk->sk_backlog.tail)
__release_sock(sk);
sk->sk_lock.owned = 0;
if (waitqueue_active(&sk->sk_lock.wq))
wake_up(&sk->sk_lock.wq);
spin_unlock_bh(&sk->sk_lock.slock);
}
__release_sock会遍历tail对应链表上的所有skb,分别调用sk_backlog_rcv函数:
static void __release_sock(struct sock *sk)
__releases(&sk->sk_lock.slock)
__acquires(&sk->sk_lock.slock)
{
struct sk_buff *skb = sk->sk_backlog.head;
do {
sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
bh_unlock_sock(sk);
do {
struct sk_buff *next = skb->next;
WARN_ON_ONCE(skb_dst_is_noref(skb));
skb->next = NULL;
sk_backlog_rcv(sk, skb);
//sk->sk_backlog_rcv(sk, skb)=sk->sk_prot->backlog_rcv
/*
* We are in process context here with softirqs
* disabled, use cond_resched_softirq() to preempt.
* This is safe to do because
we've taken the backlog
* queue private:
*/
cond_resched_softirq();
skb = next;
} while (skb != NULL);
bh_lock_sock(sk);
} while ((skb = sk->sk_backlog.head) != NULL);
/*
* Doing the zeroing here guarantee we can not loop forever
* while a wild producer attempts to flood us.
*/
sk->sk_backlog.len = 0;
}
对于udp为__udp_queue_rcv_skb:
static int __udp_queue_rcv_skb(struct sock *sk, struct
sk_buff *skb)
{
int rc;
if (inet_sk(sk)->inet_daddr)
sock_rps_save_rxhash(sk, skb->rxhash);
rc = ip_queue_rcv_skb(sk, skb);
//调用sock_queue_rcv_skb,回到第一种处理情况
if (rc < 0) {
int is_udplite = IS_UDPLITE(sk);
/* Note that an ENOMEM error is charged
twice */
if (rc == -ENOMEM)
UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS,
is_udplite);
UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
kfree_skb(skb);
trace_udp_fail_queue_rcv_skb(rc, sk);
return -1;
}
return 0;
}
一句话总结,对于udp而言,__netif_receive_skb把底层传上来的skb放到sock对应的sk_receive_queue链表中,然后唤醒等待数据的进程。
ARP报文处理:
在netif_receive_skb()函数中,可以看出处理的是像ARP、IP这些链路层以上的协议,那么,链路层报头是在哪里去掉的呢?答案是网卡驱动中,在调用netif_receive_skb()前,
skb->protocol = eth_type_trans(skb, bp->dev);
该函数处理后,skb->data跳过以太网报头,由mac_header指示以太网报头:
进入netif_receive_skb()函数
list_for_each_entry_rcu(ptype,&ptype_base[ntohs(type) & PTYPE_HASH_MASK], list)
按照协议类型依次由相应的协议模块进行处理,而所有的协议模块处理都会注册在ptype_base中,实际是链表结构。
net/core/dev.c
static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
而相应的协议模块是通过dev_add_pack()函数加入的:
void dev_add_pack(struct packet_type *pt)
{
int hash;
spin_lock_bh(&ptype_lock);
if (pt->type == htons(ETH_P_ALL))
list_add_rcu(&pt->list, &ptype_all);
else {
hash = ntohs(pt->type) & PTYPE_HASH_MASK;
list_add_rcu(&pt->list, &ptype_base[hash]);
}
spin_unlock_bh(&ptype_lock);
}
以ARP处理为例
该模块的定义,它会在arp_init()中注册进ptype_base链表中:
static struct packet_type arp_packet_type __read_mostly = {
.type = cpu_to_be16(ETH_P_ARP),
.func = arp_rcv,
};
然后根据报文的TYPE在ptype_base中查找相应协议模块进行处理时,实际调用arp_rcv()进行接收
arp_rcv() --> arp_process()
arp = arp_hdr(skb);
……
arp_ptr= (unsigned char *)(arp+1);
sha= arp_ptr;
arp_ptr += dev->addr_len;
memcpy(&sip, arp_ptr, 4);
arp_ptr += 4;
arp_ptr += dev->addr_len;
memcpy(&tip, arp_ptr, 4);
操作后这指针位置:
然后判断是ARP请求报文,这时先查询路由表ip_route_input()
if (arp->ar_op == htons(ARPOP_REQUEST) &&
ip_route_input(skb, tip, sip, 0, dev) == 0)
在ip_route_input()函数中,先在cache中查询是否存在相应的路由表项:
hash = rt_hash(daddr, saddr, iif, rt_genid(net));
缓存的路由项在内核中组织成hash表的形式,因此在查询时,先算出hash值,再遍历该项对应的链表rt_hash_table[hash].chain即可。这里可以看到,缓存路由项包括了源IP地址、目的IP地址、网卡号。
如果在缓存中没有查到匹配项,或指定不查询cache,则查询路由表ip_route_input_slow();
进入ip_route_input_slow()函数,最终调用fib_lookup()得到查询结果fib_result
if ((err = fib_lookup(net, &fl, &res)) != 0)
如果结果fib_result合法,则需要更新路由缓存,将此次查询结果写入缓存
hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
err = rt_intern_hash(hash, rth, NULL, skb, fl.iif);
在查找完路由表后,回到arp_process()函数,如果路由项指向本地,则应由本机接收该报文:
if (addr_type == RTN_LOCAL) {
……
if (!dont_send) {
n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
if (n) {
arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha);
neigh_release(n);
}
}
goto out;
}
首先更新邻居表neigh_event_ns(),然后发送ARP响应 –
arp_send。
至此,大致的ARP流程完成。由于ARP部分涉及到路由表以及邻居表,这都是很大的概念。
相关文章推荐
- Linux内核分析 - 网络[三]:从netif_receive_skb()说起
- Linux 网络协议栈开发代码分析篇之数据收发(一) —— netif_receive_skb()函数
- Linux内核分析 - 网络[三]:从netif_receive_skb()说起
- Linux内核分析 - 网络[三]:从netif_receive_skb()说起
- Linux内核分析 - 网络[三]:从netif_receive_skb()说起
- Linux内核分析 - 网络[三]:从netif_receive_skb()说起
- netif_receive_skb 函数注解
- netif_receive_skb 函数注解
- netif_receive_skb 函数注解
- netif_receive_skb 函数注解
- netif_receive_skb 函数注解
- netif_receive_skb 函数解析
- netif_receive_skb 函数解析
- 网卡驱动5-做一个与外界交互的虚拟网卡4(netif_receive_skb和非napi分析)
- Linux内核--网络协议栈深入分析(二)--sk_buff的操作函数
- Linux内核分析 - 网络[二]:网卡驱动接收报文
- 网络驱动程序 各个函数详解及图解 DM9000A网卡驱动框架源码分析
- netif_stop_subqueue函数分析
- Linux内核分析 - 网络[二]:网卡驱动接收报文
- Linux内核--网络协议栈深入分析(二)--sk_buff的操作函数