您的位置:首页 > 运维架构 > Linux

Linux内核协议栈对于timewait状态的处理

2013-09-10 18:01 316 查看
最近在做操作系统升级时,发现升级后的系统处于TIME_WAIT状态的连接数明显增多(内核版本 2.6.18 -> 2.6.32)。

原因

2.6.18 与 2.6.32 的 diff 结果

net/ipv4/inet_timewait_sock.c

C

@@ -178,15 +212,14 @@
need_timer = 0;
if (inet_twdr_do_twkill_work(twdr, twdr->slot)) {
twdr->thread_slots |= (1 << twdr->slot);
- mb();
schedule_work(&twdr->twkill_work);
need_timer = 1;
} else {
/* We purged the entire slot, anything left? */
if (twdr->tw_count)
need_timer = 1;
//这句话位置的变动引起TIME_WAIT状态增多
+ twdr->slot = ((twdr->slot + 1) & (INET_TWDR_TWKILL_SLOTS - 1));
}
- twdr->slot = ((twdr->slot + 1) & (INET_TWDR_TWKILL_SLOTS - 1));
if (need_timer)
mod_timer(&twdr->tw_timer, jiffies + twdr->period);
out:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19

    @@
-178,15
+212,14
@@

      need_timer
= 0;
      if
(inet_twdr_do_twkill_work(twdr,
twdr->slot))
{

        twdr->thread_slots
|= (1
<< twdr->slot);
    -  
mb();

        schedule_work(&twdr->twkill_work);
        need_timer
= 1;

      }
else {
        /* We purged the entire slot, anything left?  */  

        if
(twdr->tw_count)
          need_timer
= 1;

        //这句话位置的变动引起TIME_WAIT状态增多
    +  
twdr->slot
= ((twdr->slot
+ 1)
& (INET_TWDR_TWKILL_SLOTS
- 1));

      }  

    -
twdr->slot
= ((twdr->slot
+ 1)
& (INET_TWDR_TWKILL_SLOTS
- 1));

      if
(need_timer)
        mod_timer(&twdr->tw_timer,
jiffies +
twdr->period);

    out:
 

导致TIME_WAIT状态增多,正是由于

twdr->slot = ((twdr->slot + 1) & (INET_TWDR_TWKILL_SLOTS - 1));


位置的改变。

具体分析

1. 关键数据结构

inet_timewait_death_row: 用于管理timewait控制块的数据结构,位置: include/net/inet_timewait_sock.h。

C

/*
 * Bookkeeping structure used to manage timewait blocks.
 *
 * Per the accompanying article it has two halves: the "twcal_*" members
 * serve the short-time (fast recycle) calendar, and the remaining members
 * serve the slower path in which inet_twdr_hangman() walks cells[] one
 * slot at a time.
 */
struct inet_timewait_death_row {
/* Short-time timewait calendar */
int twcal_hand;
int twcal_jiffie;
struct timer_list twcal_timer;
struct hlist_head twcal_row[INET_TWDR_RECYCLE_SLOTS];

/* Taken by inet_twdr_hangman() around its whole body. */
spinlock_t death_lock;
/* inet_twdr_hangman() returns immediately when this is 0. */
int tw_count;
/* Interval used when re-arming tw_timer (jiffies + period). */
int period;
/* Bitmap, one bit per slot: set when a slot could not be fully purged. */
u32 thread_slots;
/* Work item scheduled to finish purging the slots flagged in thread_slots. */
struct work_struct twkill_work;
/* Timer whose handler is inet_twdr_hangman(). */
struct timer_list tw_timer;
/* Index of the cell inet_twdr_hangman() processes next; wraps around. */
int slot;
/* INET_TWDR_TWKILL_SLOTS is 8 (per the article). */
struct hlist_head cells[INET_TWDR_TWKILL_SLOTS];
struct inet_hashinfo *hashinfo;
int sysctl_tw_recycle;
int sysctl_max_tw_buckets;
};

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21

struct
inet_timewait_death_row
{

    /* Short-time timewait calendar */
    int        
twcal_hand;

    int        
twcal_jiffie;
    struct
timer_list   twcal_timer;

    struct
hlist_head   twcal_row[INET_TWDR_RECYCLE_SLOTS];
 

    spinlock_t      death_lock;
    int        
tw_count;

    int        
period;
    u32        
thread_slots;

    struct
work_struct  twkill_work;
    struct
timer_list   tw_timer;

    int        
slot;
    //INET_TWDR_TWKILL_SLOTS 值为 8

    struct
hlist_head   cells[INET_TWDR_TWKILL_SLOTS];
    struct
inet_hashinfo    *hashinfo;

    int        
sysctl_tw_recycle;
    int        
sysctl_max_tw_buckets;

};
 

此数据结构可以分为两部分来看:一部分用于 tw_recycle 开启时 timewait 块的快速回收,另一部分用于未开启时等待时间较长的 timewait 块的回收。由于系统没有开启 tw_recycle,因此我们主要关注等待时间较长的 timewait 块的回收。

用于等待时间较长的主要成员变量:

int period
: tw_timer 定时器的超时时间固定值为 TCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS,其中
TCP_TIMEWAIT_LEN 为 60 * HZ (60s),INET_TWDR_TWKILL_SLOTS 为 8。

u32 thread_slots
: 位图,每个 bit 对应一个 slot,用于标识其中的 timewait 块尚未处理完的 slot。

struct work_struct twkill_work
: 工作项(work),通过 schedule_work() 提交给工作队列,用于分批删除 cells 中的 timewait 块(默认每批删除 100 个)。

struct timer_list tw_timer
: 定时器,每过 period 触发一次 inet_twdr_hangman()。


以下是此数据结构的初始化:

C

/*
 * Static initialisation of the TCP death row.
 *
 * Both timers receive &tcp_death_row as their data argument, and the
 * work item is bound to inet_twdr_twkill_work().
 */
struct inet_timewait_death_row tcp_death_row = {
.sysctl_max_tw_buckets = NR_FILE * 2,
/* Timer interval: the full timewait length divided across the kill slots. */
.period = TCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS,
.death_lock = __SPIN_LOCK_UNLOCKED(tcp_death_row.death_lock),
.hashinfo = &tcp_hashinfo,
/* Slow-path timer: handler is inet_twdr_hangman(). */
.tw_timer = TIMER_INITIALIZER(inet_twdr_hangman, 0,
(unsigned long)&tcp_death_row),
.twkill_work = __WORK_INITIALIZER(tcp_death_row.twkill_work,
inet_twdr_twkill_work,
&tcp_death_row),
/* Short-time timewait calendar */

.twcal_hand = -1,
/* Short-time calendar timer: handler is inet_twdr_twcal_tick(). */
.twcal_timer = TIMER_INITIALIZER(inet_twdr_twcal_tick, 0,
(unsigned long)&tcp_death_row),
};

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17

struct
inet_timewait_death_row tcp_death_row
= {

    .sysctl_max_tw_buckets
= NR_FILE
* 2,
    .period  
= TCP_TIMEWAIT_LEN
/ INET_TWDR_TWKILL_SLOTS,

    .death_lock
= __SPIN_LOCK_UNLOCKED(tcp_death_row.death_lock),
    .hashinfo
= &tcp_hashinfo,

    .tw_timer
= TIMER_INITIALIZER(inet_twdr_hangman,
0,
                (unsigned
long)&tcp_death_row),

    .twkill_work
= __WORK_INITIALIZER(tcp_death_row.twkill_work,
                
inet_twdr_twkill_work,

                 &tcp_death_row),
    /* Short-time timewait calendar */

 
    .twcal_hand
= -1,

    .twcal_timer
= TIMER_INITIALIZER(inet_twdr_twcal_tick,
0,
                (unsigned
long)&tcp_death_row),

};
 

inet_timewait_sock: 用于组成 tcp_timewait_sock 结构。它的第一个成员是 struct sock_common(__tw_common),struct sock 同样使用 sock_common,因此该成员之前不能再添加任何字段。位置: include/net/inet_timewait_sock.h。

C

/*
 * Timewait block. Per the accompanying article, it is used to build
 * tcp_timewait_sock and stands in for the full tcp_sock once a connection
 * reaches FIN_WAIT2 or TIME_WAIT.
 *
 * The tw_* macros below are shorthand aliases for fields of the embedded
 * sock_common (__tw_common).
 */
struct inet_timewait_sock {
/*
* Now struct sock also uses sock_common, so please just
* don't add nothing before this first member (__tw_common) --acme
*/
struct sock_common __tw_common;
#define tw_family __tw_common.skc_family
#define tw_state __tw_common.skc_state
#define tw_reuse __tw_common.skc_reuse
#define tw_bound_dev_if __tw_common.skc_bound_dev_if
#define tw_node __tw_common.skc_node
#define tw_bind_node __tw_common.skc_bind_node
#define tw_refcnt __tw_common.skc_refcnt
#define tw_hash __tw_common.skc_hash
#define tw_prot __tw_common.skc_prot
/*
 * Internal sub-state. Per the article, both FIN_WAIT2 and TIME_WAIT are
 * presented externally as TIME_WAIT; this field tells them apart.
 */
volatile unsigned char tw_substate;
/* 3 bits hole, try to pack */
unsigned char tw_rcv_wscale;
/* Socket demultiplex comparisons on incoming packets. */
/* these five are in inet_sock */
__u16 tw_sport;
__u32 tw_daddr __attribute__((aligned(INET_TIMEWAIT_ADDRCMP_ALIGN_BYTES)));
__u32 tw_rcv_saddr;
__u16 tw_dport;
__u16 tw_num;
/* And these are ours. */
__u8 tw_ipv6only:1;
/* 15 bits hole, try to pack */
__u16 tw_ipv6_offset;
int tw_timeout;
unsigned long tw_ttd;
struct inet_bind_bucket *tw_tb;
/* NOTE(review): presumably the link into the death-row cells[] / twcal_row[] lists -- confirm. */
struct hlist_node tw_death_node;
};

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35

struct
inet_timewait_sock
{

    /*  
     * Now struct sock also uses sock_common, so please just

     * don't add nothing before this first member (__tw_common) --acme
     */

    struct
sock_common  __tw_common;
#define tw_family   __tw_common.skc_family

#define tw_state    __tw_common.skc_state
#define tw_reuse    __tw_common.skc_reuse

#define tw_bound_dev_if   __tw_common.skc_bound_dev_if
#define tw_node     __tw_common.skc_node

#define tw_bind_node    __tw_common.skc_bind_node
#define tw_refcnt   __tw_common.skc_refcnt

#define tw_hash     __tw_common.skc_hash
#define tw_prot     __tw_common.skc_prot

    volatile
unsigned char  tw_substate;
    /* 3 bits hole, try to pack */

    unsigned
char   tw_rcv_wscale;
    /* Socket demultiplex comparisons on incoming packets. */

    /* these five are in inet_sock */
    __u16    
tw_sport;

    __u32     tw_daddr
__attribute__((aligned(INET_TIMEWAIT_ADDRCMP_ALIGN_BYTES)));
    __u32    
tw_rcv_saddr;

    __u16     tw_dport;
    __u16    
tw_num;

    /* And these are ours. */
    __u8      tw_ipv6only:1;

    /* 15 bits hole, try to pack */
    __u16    
tw_ipv6_offset;

    int    
tw_timeout;
    unsigned
long   tw_ttd;

    struct
inet_bind_bucket
*tw_tb;
    struct
hlist_node tw_death_node;

};
 

此数据结构暂时只需要了解 tw_substate 即可。tw_substate: TCP 状态迁移到 FIN_WAIT2 或 TIME_WAIT 时,协议栈会用 timewait 块取代 tcp_sock 块,因为这两种状态都由定时器处理,超时后立即释放。它们对外都表现为 TIME_WAIT 状态,但内部状态仍有区别,通过 tw_substate 进行区分。

2. timewait块释放时的逻辑

inet_twdr_hangman() 此函数是定时器到期时执行的函数,用于释放timewait块。

C

/*
 * inet_twdr_hangman - tw_timer expiry handler (2.6.18 version).
 * @data: the struct inet_timewait_death_row to process, cast to
 *        unsigned long.
 *
 * Under death_lock, purges the current slot via inet_twdr_do_twkill_work().
 * If the slot could not be purged completely, the slot is flagged in
 * thread_slots and twkill_work is scheduled to finish the job. The slot
 * index is then advanced unconditionally, and the timer is re-armed while
 * work remains.
 */
void inet_twdr_hangman(unsigned long data)
{
struct inet_timewait_death_row *twdr;
int unsigned need_timer;

twdr = (struct inet_timewait_death_row *)data;
spin_lock(&twdr->death_lock);

/* Nothing is waiting: leave without re-arming the timer. */
if (twdr->tw_count == 0)
goto out;

need_timer = 0;

/*
 * inet_twdr_do_twkill_work() does the actual release of timewait blocks.
 * Per the article it releases up to 100 per call and returns 0 when the
 * slot has been fully released, 1 otherwise (defined elsewhere; not
 * visible here).
 */
if (inet_twdr_do_twkill_work(twdr, twdr->slot)) {
/*
 * One pass did not remove every timewait block in the slot: hand the
 * remainder to the twkill_work work item.
 * thread_slots records which slots still have unfinished blocks.
 */
twdr->thread_slots |= (1 << twdr->slot);
mb();
schedule_work(&twdr->twkill_work);
/* Not all timewait blocks were removed, so the timer must run again. */
need_timer = 1;
} else {
/* We purged the entire slot, anything left? */
if (twdr->tw_count)
need_timer = 1;
}

/*
 * Key statement (this is the 2.6.18 code): the slot index is advanced
 * whether or not this timer run finished releasing the slot's timewait
 * blocks.
 */
twdr->slot = ((twdr->slot + 1) & (INET_TWDR_TWKILL_SLOTS - 1));
if (need_timer)
mod_timer(&twdr->tw_timer, jiffies + twdr->period);
out:
spin_unlock(&twdr->death_lock);
}

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37

void
inet_twdr_hangman(unsigned
long data)

{
    struct
inet_timewait_death_row
*twdr;

    int
unsigned need_timer;
 

    twdr
= (struct
inet_timewait_death_row
*)data;
    spin_lock(&twdr->death_lock);

 
    if
(twdr->tw_count
== 0)

        goto
out;
 

    need_timer
= 0;
 

    //inet_twdr_do_twkill_work 释放timewait块的具体函数,每次释放100个
    //释放完成返回0, 否则返回1

    if
(inet_twdr_do_twkill_work(twdr,
twdr->slot))
{
        //一次遍历未完全删除timewait块时,剩余的time块放入twkill_work的工作队列中处理。

        //thread_slots标识未完成的timewait块
        twdr->thread_slots
|= (1
<< twdr->slot);

        mb();
        schedule_work(&twdr->twkill_work);

        //未删除所有timewait块,需要重新调度定时器
        need_timer
= 1;

    }
else {
      /* We purged the entire slot, anything left?  */

      if
(twdr->tw_count)
          need_timer
= 1;

    }
 

    //此句是关键,代码出自2.6.18内核,不管定时器例程一次有没有释放完timewait块,都进行 + 1 操作
    twdr->slot
= ((twdr->slot
+ 1)
& (INET_TWDR_TWKILL_SLOTS
- 1));

    if
(need_timer)
        mod_timer(&twdr->tw_timer,
jiffies +
twdr->period);

  out:
      spin_unlock(&twdr->death_lock);

}
 

用于慢timewait块释放的逻辑可参考下图:



3. 结论

按照 2.6.18 中的逻辑,即使一次没有删除完一个 slot 中的全部 timewait 控制块,twdr->slot 仍然会执行 + 1 操作。此时如果有一个 tcp_sock 进入 FIN_WAIT2 状态,它对应的 timewait 块(tw_substate = FIN_WAIT2)会被放入上一个 slot 中,而 twkill_work 的工作线程此时正在处理该 slot 的队列,因此这个处于 FIN_WAIT2 状态的 timewait 块会被提前释放;若随后对端的 FIN 分节到达,协议栈会回复一个 RST 分节。

为了修复此 BUG,2.6.32 协议栈修改了 twdr->slot 执行 + 1 操作的时机:必须把一个 slot 中的 timewait 块全部释放之后,才会进行 + 1 操作。这也就是说,协议栈不再保证每个 TCP_TWKILL_PERIOD(即上文的 period)周期内都移动一个格子,所以当系统繁忙时,timewait 块的等待时间会大于 TCP_TIMEWAIT_LEN。
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: