Linux内核协议栈对于timewait状态的处理
2013-09-10 18:01
316 查看
最近在做操作系统升级时,发现升级后的系统处于TIME_WAIT状态的连接数明显增多(内核版本 2.6.18 -> 2.6.32)。
net/ipv4/inet_timewait_sock.c
C
@@ -178,15 +212,14 @@
need_timer = 0;
if (inet_twdr_do_twkill_work(twdr, twdr->slot)) {
twdr->thread_slots |= (1 << twdr->slot);
- mb();
schedule_work(&twdr->twkill_work);
need_timer = 1;
} else {
/* We purged the entire slot, anything left? */
if (twdr->tw_count)
need_timer = 1;
//这句话位置的变动引起TIME_WAIT状态增多
+ twdr->slot = ((twdr->slot + 1) & (INET_TWDR_TWKILL_SLOTS - 1));
}
- twdr->slot = ((twdr->slot + 1) & (INET_TWDR_TWKILL_SLOTS - 1));
if (need_timer)
mod_timer(&twdr->tw_timer, jiffies + twdr->period);
out:
导致TIME_WAIT状态增多,正是由于
twdr->slot = ((twdr->slot + 1) & (INET_TWDR_TWKILL_SLOTS - 1));
位置的改变。
C
/*
 * Bookkeeping structure used to manage timewait control blocks
 * (declared in include/net/inet_timewait_sock.h).
 *
 * According to the accompanying article it has two halves: the twcal_*
 * members serve fast recycling when tw_recycle is enabled, the remaining
 * members serve timewait blocks that wait for the longer period.
 */
struct inet_timewait_death_row {
/* Short-time timewait calendar */
int twcal_hand;
int twcal_jiffie;
struct timer_list twcal_timer;
struct hlist_head twcal_row[INET_TWDR_RECYCLE_SLOTS];
spinlock_t death_lock;
int tw_count;
/* Timeout of tw_timer; fixed at TCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS. */
int period;
/* Bitmap marking slots whose timewait blocks have not been fully released. */
u32 thread_slots;
/* Work item that deletes timewait blocks from cells in batches. */
struct work_struct twkill_work;
/* Timer that triggers inet_twdr_hangman() once every period. */
struct timer_list tw_timer;
/* Index of the cell that inet_twdr_hangman() hands to inet_twdr_do_twkill_work(). */
int slot;
/* INET_TWDR_TWKILL_SLOTS is 8 */
struct hlist_head cells[INET_TWDR_TWKILL_SLOTS];
struct inet_hashinfo *hashinfo;
int sysctl_tw_recycle;
int sysctl_max_tw_buckets;
};
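此数据结构,可以分为两部分看,一部分处理 tw_recycle 开启时timewait块的快速回收,另一部分为未开启时用于等待时间较长的timewait块的回收。由于系统没有开启 tw_recycle, 因此我们主要关注等待时间较长的timewait块回收。
用于等待时间较长的主要成员变量:
int period: tw_timer 定时器的超时时间固定值为 TCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS,其中 TCP_TIMEWAIT_LEN 为 60 * HZ (60s),INET_TWDR_TWKILL_SLOTS 为 8。
u32 thread_slots: 用于标识未完成的timewait块的位图。
struct work_struct twkill_work: 分批删除(默认值为每次删除100个)cells中timewait块时的工作队列。
struct timer_list tw_timer: 定时器,每过 period,触发一次 inet_twdr_hangman()。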
以下是此数据结构的初始化:
C
/*
 * Static initialisation of the death row instance named tcp_death_row.
 * Both timers and the work item are given the address of this same
 * structure as their argument.
 */
struct inet_timewait_death_row tcp_death_row = {
.sysctl_max_tw_buckets = NR_FILE * 2,
/*
 * Timer period: TCP_TIMEWAIT_LEN divided across the kill slots
 * (the article gives 60 * HZ and 8 slots).
 */
.period = TCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS,
.death_lock = __SPIN_LOCK_UNLOCKED(tcp_death_row.death_lock),
.hashinfo = &tcp_hashinfo,
/* tw_timer runs inet_twdr_hangman(). */
.tw_timer = TIMER_INITIALIZER(inet_twdr_hangman, 0,
(unsigned long)&tcp_death_row),
/* twkill_work runs inet_twdr_twkill_work(). */
.twkill_work = __WORK_INITIALIZER(tcp_death_row.twkill_work,
inet_twdr_twkill_work,
&tcp_death_row),
/* Short-time timewait calendar */
.twcal_hand = -1,
.twcal_timer = TIMER_INITIALIZER(inet_twdr_twcal_tick, 0,
(unsigned long)&tcp_death_row),
};
inet_timewait_sock: 用于组成 tcp_timewait_sock 结构,其前部是
sock_common 的前部。位置: include/net/inet_timewait_sock.h。
C
/*
 * Timewait control block (declared in include/net/inet_timewait_sock.h).
 * Per the accompanying article it is the building block of
 * tcp_timewait_sock and replaces the tcp_sock once a connection reaches
 * FIN_WAIT2 or TIME_WAIT.
 */
struct inet_timewait_sock {
/*
 * Now struct sock also uses sock_common, so please just
 * don't add nothing before this first member (__tw_common) --acme
 */
struct sock_common __tw_common;
/* Shorthand names for the members of the embedded sock_common. */
#define tw_family __tw_common.skc_family
#define tw_state __tw_common.skc_state
#define tw_reuse __tw_common.skc_reuse
#define tw_bound_dev_if __tw_common.skc_bound_dev_if
#define tw_node __tw_common.skc_node
#define tw_bind_node __tw_common.skc_bind_node
#define tw_refcnt __tw_common.skc_refcnt
#define tw_hash __tw_common.skc_hash
#define tw_prot __tw_common.skc_prot
/*
 * Internal sub-state: per the article, FIN_WAIT2 and TIME_WAIT both
 * appear externally as TIME_WAIT and are told apart by this field.
 */
volatile unsigned char tw_substate;
/* 3 bits hole, try to pack */
unsigned char tw_rcv_wscale;
/* Socket demultiplex comparisons on incoming packets. */
/* these five are in inet_sock */
__u16 tw_sport;
__u32 tw_daddr __attribute__((aligned(INET_TIMEWAIT_ADDRCMP_ALIGN_BYTES)));
__u32 tw_rcv_saddr;
__u16 tw_dport;
__u16 tw_num;
/* And these are ours. */
__u8 tw_ipv6only:1;
/* 15 bits hole, try to pack */
__u16 tw_ipv6_offset;
int tw_timeout;
unsigned long tw_ttd;
struct inet_bind_bucket *tw_tb;
struct hlist_node tw_death_node;
};
此数据结构暂时只需要知道 tw_substate 即可。 tw_substate : TCP状态迁移到
FIN_WAIT2 或 TIME_WAIT 状态时,协议栈会用 timewait 块取代 tcp_sock 块,因为这两种状态都需要由定时器处理,超时立即释放。其对外状态都表现为
TIME_WAIT , 但其内部状态还是有分别,通过 tw_substate 进行区分。
C
/*
 * Timer callback that releases timewait blocks from the current slot.
 * This is the 2.6.18 version of the function, as quoted by the article.
 *
 * data: address of the struct inet_timewait_death_row to work on,
 *       passed as an unsigned long.
 */
void inet_twdr_hangman(unsigned long data)
{
struct inet_timewait_death_row *twdr;
int unsigned need_timer;
twdr = (struct inet_timewait_death_row *)data;
spin_lock(&twdr->death_lock);
/* Nothing is queued: leave without re-arming the timer. */
if (twdr->tw_count == 0)
goto out;
need_timer = 0;
/*
 * inet_twdr_do_twkill_work() is the function that actually releases the
 * timewait blocks, 100 per call. It returns 0 when the slot was fully
 * released and 1 otherwise.
 */
if (inet_twdr_do_twkill_work(twdr, twdr->slot)) {
/*
 * One pass did not remove every timewait block: the remaining blocks
 * are handed to the twkill_work work item. thread_slots marks the
 * slots that are still unfinished.
 */
twdr->thread_slots |= (1 << twdr->slot);
mb();
schedule_work(&twdr->twkill_work);
/* Not all timewait blocks were removed, so the timer must be re-armed. */
need_timer = 1;
} else {
/* We purged the entire slot, anything left? */
if (twdr->tw_count)
need_timer = 1;
}
/*
 * This statement is the key point. In this 2.6.18 code the slot index is
 * advanced by one whether or not the timer routine finished releasing the
 * timewait blocks of the slot in this run.
 */
twdr->slot = ((twdr->slot + 1) & (INET_TWDR_TWKILL_SLOTS - 1));
if (need_timer)
mod_timer(&twdr->tw_timer, jiffies + twdr->period);
out:
spin_unlock(&twdr->death_lock);
}
用于慢timewait块释放的逻辑可参考下图:
按照 2.6.18 中的逻辑,如果一次没有全部删除一个slot中的timewait控制块, twdr->slot 仍然会执行 + 1 操作。此时如果有一个tcp_sock进入FIN_WAIT2状态,则此时的timewait(tw_substate = fin_wait2)块会被放在上一个slot中,而此时有一个线程正在处理那个队列,因此会导致处于FIN_WAIT2状态的timewait块被提前释放,若此时对端的FIN分节到达,协议栈会回复一个RST分节。
为了修复此BUG,2.6.32 协议栈中修改了 twdr->slot + 1 的时机,每次必须完全释放一个slot中所有的timewait块后,才会进行
+ 1 操作。这也就是说协议栈不保证在 TCP_TWKILL_PERIOD 周期内,移动一个格子,所以当系统繁忙时,会导致timewait块的等待时间大于
TCP_TIMEWAIT_LEN。
原因
2.6.18 与 2.6.32 的diff结果
net/ipv4/inet_timewait_sock.c
C
@@ -178,15 +212,14 @@
need_timer = 0;
if (inet_twdr_do_twkill_work(twdr, twdr->slot)) {
twdr->thread_slots |= (1 << twdr->slot);
- mb();
schedule_work(&twdr->twkill_work);
need_timer = 1;
} else {
/* We purged the entire slot, anything left? */
if (twdr->tw_count)
need_timer = 1;
//这句话位置的变动引起TIME_WAIT状态增多
+ twdr->slot = ((twdr->slot + 1) & (INET_TWDR_TWKILL_SLOTS - 1));
}
- twdr->slot = ((twdr->slot + 1) & (INET_TWDR_TWKILL_SLOTS - 1));
if (need_timer)
mod_timer(&twdr->tw_timer, jiffies + twdr->period);
out:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 | @@ -178,15 +212,14 @@ need_timer = 0; if (inet_twdr_do_twkill_work(twdr, twdr->slot)) { twdr->thread_slots |= (1 << twdr->slot); - mb(); schedule_work(&twdr->twkill_work); need_timer = 1; } else { /* We purged the entire slot, anything left? */ if (twdr->tw_count) need_timer = 1; //这句话位置的变动引起TIME_WAIT状态增多 + twdr->slot = ((twdr->slot + 1) & (INET_TWDR_TWKILL_SLOTS - 1)); } - twdr->slot = ((twdr->slot + 1) & (INET_TWDR_TWKILL_SLOTS - 1)); if (need_timer) mod_timer(&twdr->tw_timer, jiffies + twdr->period); out: |
导致TIME_WAIT状态增多,正是由于
twdr->slot = ((twdr->slot + 1) & (INET_TWDR_TWKILL_SLOTS - 1));
位置的改变。
具体分析
1. 关键数据结构
inet_timewait_death_row: 用于管理timewait控制块的数据结构,位置: include/net/inet_timewait_sock.h。
C
/*
 * Bookkeeping structure used to manage timewait control blocks
 * (declared in include/net/inet_timewait_sock.h).
 *
 * According to the accompanying article it has two halves: the twcal_*
 * members serve fast recycling when tw_recycle is enabled, the remaining
 * members serve timewait blocks that wait for the longer period.
 */
struct inet_timewait_death_row {
/* Short-time timewait calendar */
int twcal_hand;
int twcal_jiffie;
struct timer_list twcal_timer;
struct hlist_head twcal_row[INET_TWDR_RECYCLE_SLOTS];
spinlock_t death_lock;
int tw_count;
/* Timeout of tw_timer; fixed at TCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS. */
int period;
/* Bitmap marking slots whose timewait blocks have not been fully released. */
u32 thread_slots;
/* Work item that deletes timewait blocks from cells in batches. */
struct work_struct twkill_work;
/* Timer that triggers inet_twdr_hangman() once every period. */
struct timer_list tw_timer;
/* Index of the cell that inet_twdr_hangman() hands to inet_twdr_do_twkill_work(). */
int slot;
/* INET_TWDR_TWKILL_SLOTS is 8 */
struct hlist_head cells[INET_TWDR_TWKILL_SLOTS];
struct inet_hashinfo *hashinfo;
int sysctl_tw_recycle;
int sysctl_max_tw_buckets;
};
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 | struct inet_timewait_death_row { /* Short-time timewait calendar */ int twcal_hand; int twcal_jiffie; struct timer_list twcal_timer; struct hlist_head twcal_row[INET_TWDR_RECYCLE_SLOTS]; spinlock_t death_lock; int tw_count; int period; u32 thread_slots; struct work_struct twkill_work; struct timer_list tw_timer; int slot; //INET_TWDR_TWKILL_SLOTS 值为 8 struct hlist_head cells[INET_TWDR_TWKILL_SLOTS]; struct inet_hashinfo *hashinfo; int sysctl_tw_recycle; int sysctl_max_tw_buckets; }; |
此数据结构,可以分为两部分看,一部分处理 tw_recycle 开启时timewait块的快速回收,另一部分为未开启时用于等待时间较长的timewait块的回收。由于系统没有开启
tw_recycle, 因此我们主要关注等待时间较长的timewait块回收。
用于等待时间较长的主要成员变量:
int period: tw_timer 定时器的超时时间固定值为 TCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS,其中
TCP_TIMEWAIT_LEN 为 60 * HZ (60s),INET_TWDR_TWKILL_SLOTS 为 8。
u32 thread_slots: 用于标识未完成的timewait块的位图。
struct work_struct twkill_work: 分批删除(默认值为每次删除100个)cells中timewait块时的工作队列。
struct timer_list tw_timer: 定时器,每过 period,触发一次
inet_twdr_hangman()。
以下是此数据结构的初始化:
C
/*
 * Static initialisation of the death row instance named tcp_death_row.
 * Both timers and the work item are given the address of this same
 * structure as their argument.
 */
struct inet_timewait_death_row tcp_death_row = {
.sysctl_max_tw_buckets = NR_FILE * 2,
/*
 * Timer period: TCP_TIMEWAIT_LEN divided across the kill slots
 * (the article gives 60 * HZ and 8 slots).
 */
.period = TCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS,
.death_lock = __SPIN_LOCK_UNLOCKED(tcp_death_row.death_lock),
.hashinfo = &tcp_hashinfo,
/* tw_timer runs inet_twdr_hangman(). */
.tw_timer = TIMER_INITIALIZER(inet_twdr_hangman, 0,
(unsigned long)&tcp_death_row),
/* twkill_work runs inet_twdr_twkill_work(). */
.twkill_work = __WORK_INITIALIZER(tcp_death_row.twkill_work,
inet_twdr_twkill_work,
&tcp_death_row),
/* Short-time timewait calendar */
.twcal_hand = -1,
.twcal_timer = TIMER_INITIALIZER(inet_twdr_twcal_tick, 0,
(unsigned long)&tcp_death_row),
};
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 | struct inet_timewait_death_row tcp_death_row = { .sysctl_max_tw_buckets = NR_FILE * 2, .period = TCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS, .death_lock = __SPIN_LOCK_UNLOCKED(tcp_death_row.death_lock), .hashinfo = &tcp_hashinfo, .tw_timer = TIMER_INITIALIZER(inet_twdr_hangman, 0, (unsigned long)&tcp_death_row), .twkill_work = __WORK_INITIALIZER(tcp_death_row.twkill_work, inet_twdr_twkill_work, &tcp_death_row), /* Short-time timewait calendar */ .twcal_hand = -1, .twcal_timer = TIMER_INITIALIZER(inet_twdr_twcal_tick, 0, (unsigned long)&tcp_death_row), }; |
inet_timewait_sock: 用于组成 tcp_timewait_sock 结构,其前部是 sock_common 的前部。位置: include/net/inet_timewait_sock.h。
C
/*
 * Timewait control block (declared in include/net/inet_timewait_sock.h).
 * Per the accompanying article it is the building block of
 * tcp_timewait_sock and replaces the tcp_sock once a connection reaches
 * FIN_WAIT2 or TIME_WAIT.
 */
struct inet_timewait_sock {
/*
 * Now struct sock also uses sock_common, so please just
 * don't add nothing before this first member (__tw_common) --acme
 */
struct sock_common __tw_common;
/* Shorthand names for the members of the embedded sock_common. */
#define tw_family __tw_common.skc_family
#define tw_state __tw_common.skc_state
#define tw_reuse __tw_common.skc_reuse
#define tw_bound_dev_if __tw_common.skc_bound_dev_if
#define tw_node __tw_common.skc_node
#define tw_bind_node __tw_common.skc_bind_node
#define tw_refcnt __tw_common.skc_refcnt
#define tw_hash __tw_common.skc_hash
#define tw_prot __tw_common.skc_prot
/*
 * Internal sub-state: per the article, FIN_WAIT2 and TIME_WAIT both
 * appear externally as TIME_WAIT and are told apart by this field.
 */
volatile unsigned char tw_substate;
/* 3 bits hole, try to pack */
unsigned char tw_rcv_wscale;
/* Socket demultiplex comparisons on incoming packets. */
/* these five are in inet_sock */
__u16 tw_sport;
__u32 tw_daddr __attribute__((aligned(INET_TIMEWAIT_ADDRCMP_ALIGN_BYTES)));
__u32 tw_rcv_saddr;
__u16 tw_dport;
__u16 tw_num;
/* And these are ours. */
__u8 tw_ipv6only:1;
/* 15 bits hole, try to pack */
__u16 tw_ipv6_offset;
int tw_timeout;
unsigned long tw_ttd;
struct inet_bind_bucket *tw_tb;
struct hlist_node tw_death_node;
};
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 | struct inet_timewait_sock { /* * Now struct sock also uses sock_common, so please just * don't add nothing before this first member (__tw_common) --acme */ struct sock_common __tw_common; #define tw_family __tw_common.skc_family #define tw_state __tw_common.skc_state #define tw_reuse __tw_common.skc_reuse #define tw_bound_dev_if __tw_common.skc_bound_dev_if #define tw_node __tw_common.skc_node #define tw_bind_node __tw_common.skc_bind_node #define tw_refcnt __tw_common.skc_refcnt #define tw_hash __tw_common.skc_hash #define tw_prot __tw_common.skc_prot volatile unsigned char tw_substate; /* 3 bits hole, try to pack */ unsigned char tw_rcv_wscale; /* Socket demultiplex comparisons on incoming packets. */ /* these five are in inet_sock */ __u16 tw_sport; __u32 tw_daddr __attribute__((aligned(INET_TIMEWAIT_ADDRCMP_ALIGN_BYTES))); __u32 tw_rcv_saddr; __u16 tw_dport; __u16 tw_num; /* And these are ours. */ __u8 tw_ipv6only:1; /* 15 bits hole, try to pack */ __u16 tw_ipv6_offset; int tw_timeout; unsigned long tw_ttd; struct inet_bind_bucket *tw_tb; struct hlist_node tw_death_node; }; |
此数据结构暂时只需要知道 tw_substate 即可。 tw_substate : TCP状态迁移到
FIN_WAIT2 或 TIME_WAIT 状态时,协议栈会用 timewait 块取代 tcp_sock 块,因为这两种状态都需要由定时器处理,超时立即释放。其对外状态都表现为
TIME_WAIT , 但其内部状态还是有分别,通过 tw_substate 进行区分。
2. timewait块释放时的逻辑
inet_twdr_hangman() 此函数是定时器到期时执行的函数,用于释放timewait块。
C
/*
 * Timer callback that releases timewait blocks from the current slot.
 * This is the 2.6.18 version of the function, as quoted by the article.
 *
 * data: address of the struct inet_timewait_death_row to work on,
 *       passed as an unsigned long.
 */
void inet_twdr_hangman(unsigned long data)
{
struct inet_timewait_death_row *twdr;
int unsigned need_timer;
twdr = (struct inet_timewait_death_row *)data;
spin_lock(&twdr->death_lock);
/* Nothing is queued: leave without re-arming the timer. */
if (twdr->tw_count == 0)
goto out;
need_timer = 0;
/*
 * inet_twdr_do_twkill_work() is the function that actually releases the
 * timewait blocks, 100 per call. It returns 0 when the slot was fully
 * released and 1 otherwise.
 */
if (inet_twdr_do_twkill_work(twdr, twdr->slot)) {
/*
 * One pass did not remove every timewait block: the remaining blocks
 * are handed to the twkill_work work item. thread_slots marks the
 * slots that are still unfinished.
 */
twdr->thread_slots |= (1 << twdr->slot);
mb();
schedule_work(&twdr->twkill_work);
/* Not all timewait blocks were removed, so the timer must be re-armed. */
need_timer = 1;
} else {
/* We purged the entire slot, anything left? */
if (twdr->tw_count)
need_timer = 1;
}
/*
 * This statement is the key point. In this 2.6.18 code the slot index is
 * advanced by one whether or not the timer routine finished releasing the
 * timewait blocks of the slot in this run.
 */
twdr->slot = ((twdr->slot + 1) & (INET_TWDR_TWKILL_SLOTS - 1));
if (need_timer)
mod_timer(&twdr->tw_timer, jiffies + twdr->period);
out:
spin_unlock(&twdr->death_lock);
}
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 | void inet_twdr_hangman(unsigned long data) { struct inet_timewait_death_row *twdr; int unsigned need_timer; twdr = (struct inet_timewait_death_row *)data; spin_lock(&twdr->death_lock); if (twdr->tw_count == 0) goto out; need_timer = 0; //inet_twdr_do_twkill_work 释放timewait块的具体函数,每次释放100个 //释放完成返回0, 否则返回1 if (inet_twdr_do_twkill_work(twdr, twdr->slot)) { //一次遍历未完全删除timewait块时,剩余的time块放入twkill_work的工作队列中处理。 //thread_slots标识未完成的timewait块 twdr->thread_slots |= (1 << twdr->slot); mb(); schedule_work(&twdr->twkill_work); //未删除所有timewait块,需要重新调度定时器 need_timer = 1; } else { /* We purged the entire slot, anything left? */ if (twdr->tw_count) need_timer = 1; } //此句是关键,代码出自2.6.18内核,不管定时器例程一次有没有释放完timewait块,都进行 + 1 操作 twdr->slot = ((twdr->slot + 1) & (INET_TWDR_TWKILL_SLOTS - 1)); if (need_timer) mod_timer(&twdr->tw_timer, jiffies + twdr->period); out: spin_unlock(&twdr->death_lock); } |
3. 结论
按照 2.6.18 中的逻辑,如果一次没有全部删除一个slot中的timewait控制块, twdr->slot 仍然会执行+ 1 操作。此时如果有一个tcp_sock进入FIN_WAIT2状态,则此时的timewait(tw_substate = fin_wait2)块会被放在上一个slot中,而此时有一个线程正在处理那个队列,因此会导致处于FIN_WAIT2状态的timewait块被提前释放,若此时对端的FIN分节到达,协议栈会回复一个RST分节。
为了修复此BUG,2.6.32 协议栈中修改了 twdr->slot + 1 的时机,每次必须完全释放一个slot中所有的timewait块后,才会进行
+ 1 操作。这也就是说协议栈不保证在 TCP_TWKILL_PERIOD 周期内,移动一个格子,所以当系统繁忙时,会导致timewait块的等待时间大于
TCP_TIMEWAIT_LEN。
相关文章推荐
- 唯快不破:TCP/IP详解--如何处理TIME_WAIT状态
- TCP/IP详解--如何处理TIME_WAIT状态
- 内核处理time_wait状态详解
- 7.6 TIME_WAIT状态处理
- c#中的socket中的time_wait状态处理方法
- TCP之 TIME_WAIT和CLOSE_WAIT 状态 的原因分析和处理
- linux服务器出现大量的TIME_WAIT状态的TCP连接的处理办法
- 内核处理time_wait状态详解(转)
- TIME_WAIT状态下对接收到的数据包如何处理
- 当出现大量timewait状态的连接时,该如何处理?
- 网络:TCP通讯之 time_wait 状态
- 端口状态说明 LISTENING、ESTABLISHED、TIME_WAIT及CLOSE_WAIT
- 理解tcp关闭连接中的time_wait状态
- TCP连接状态:CLOSE_WAIT和TIME_WAIT
- 001_TCP/IP TIME_WAIT状态原理
- 一个解除TCP连接的TIME_WAIT状态限制的简便方法
- 端口状态说明 LISTENING、ESTABLISHED、TIME_WAIT及CLOSE_WAIT
- TIME-WAIT状态
- time-wait状态产生的原因
- TCP连接中的TIME_WAIT状态