
Reading notes about low-resolution timer implementation on linux.


Author: Honggang Yang(Joseph) <ganggexiongqi@gmail.com>

Kernel Version: Linux 3.1.1

===================================================================


REF: Professional Linux Kernel Architecture

+ Essential Linux Device Drivers

+ Understanding the Linux Kernel 3

===============================

Contents:

1. Init the base structures
2. Dynamic timer registration, modification and deletion
2.1 Registration and modification
2.2 Dynamic timer deletion
3. Dynamic timer handling
4. A whole view of the low-resolution timer system
5. Demo of how to use a dynamic timer in your modules

--------------------------------------------

Since checking for timer functions is always done by deferrable functions
that may be executed a long time after they have been activated, the kernel
cannot ensure that timer functions will start right at their expiration
times. It can only ensure that they are executed either at the proper time
or after a delay of up to a few hundred milliseconds. For this reason,
timers are not appropriate for real-time applications in which expiration
times must be strictly enforced. [ULK3, Pg 244]

--------- Dynamic timer management structures -------

struct tvec {
	struct list_head vec[TVN_SIZE];
};

struct tvec_root {
	struct list_head vec[TVR_SIZE];
};

struct tvec_base {
	spinlock_t lock;
	struct timer_list *running_timer;	/* the timer being processed now */
	/*
	 * Records the time (in jiffies) up to which all timers of this
	 * base have already been handled.
	 */
	unsigned long timer_jiffies;
	/* expiry time of the next pending, non-deferrable timer */
	unsigned long next_timer;
	struct tvec_root tv1;
	struct tvec tv2;
	struct tvec tv3;
	struct tvec tv4;
	struct tvec tv5;
} ____cacheline_aligned;

struct timer_list {
	/*
	 * All fields that change during normal runtime grouped to the
	 * same cacheline
	 */
	struct list_head entry;
	unsigned long expires;
	struct tvec_base *base;

	void (*function)(unsigned long);
	unsigned long data;

	int slack;

#ifdef CONFIG_TIMER_STATS
	int start_pid;
	void *start_site;
	char start_comm[16];
#endif
#ifdef CONFIG_LOCKDEP
	struct lockdep_map lockdep_map;
#endif
};
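
The slot counts used above are fixed by these constants from kernel/timer.c.
With the default CONFIG_BASE_SMALL=0, tv1 has 256 one-jiffy slots and
tv2..tv5 have 64 slots each:

#define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6)
#define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8)
#define TVN_SIZE (1 << TVN_BITS)
#define TVR_SIZE (1 << TVR_BITS)
#define TVN_MASK (TVN_SIZE - 1)
#define TVR_MASK (TVR_SIZE - 1)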

---------------

1. Init the base structures

Call tree:
-----------------------------
start_kernel
    init_timers
        timer_cpu_notify
            init_timers_cpu
    hrtimers_init
    timekeeping_init
    time_init*
    late_time_init*
-----------------------------

struct tvec_base boot_tvec_bases;	/* Only for the boot CPU */
EXPORT_SYMBOL(boot_tvec_bases);
static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases;

/*
 * Init the per-cpu variable @tvec_bases for each CPU.
 * ------------------
 * Each CPU has a struct tvec_base. Since the number of CPUs in the
 * system is not known at compile time, only @boot_tvec_bases is defined
 * statically, for the boot CPU. The tvec_base structures of the other
 * CPUs are allocated with kmalloc_node().
 * @tvec_bases->timer_jiffies and @tvec_bases->next_timer are initialized
 * to @jiffies.
 * ----------------
 * kernel/timer.c
 */
static int __cpuinit init_timers_cpu(int cpu)
{
	int j;
	struct tvec_base *base;
	static char __cpuinitdata tvec_base_done[NR_CPUS];

	if (!tvec_base_done[cpu]) {
		static char boot_done;

		if (boot_done) {
			/*
			 * The APs use this path later in boot
			 */
			base = kmalloc_node(sizeof(*base),
					    GFP_KERNEL | __GFP_ZERO,
					    cpu_to_node(cpu));
			if (!base)
				return -ENOMEM;

			/* Make sure that tvec_base is 2 byte aligned */
			if (tbase_get_deferrable(base)) {
				WARN_ON(1);
				kfree(base);
				return -ENOMEM;
			}
			per_cpu(tvec_bases, cpu) = base;
		} else {
			/*
			 * This is for the boot CPU - we use compile-time
			 * static initialisation because per-cpu memory isn't
			 * ready yet and because the memory allocators are not
			 * initialised either.
			 */
			boot_done = 1;
			base = &boot_tvec_bases;
		}
		tvec_base_done[cpu] = 1;
	} else {
		base = per_cpu(tvec_bases, cpu);
	}

	spin_lock_init(&base->lock);

	for (j = 0; j < TVN_SIZE; j++) {
		INIT_LIST_HEAD(base->tv5.vec + j);
		INIT_LIST_HEAD(base->tv4.vec + j);
		INIT_LIST_HEAD(base->tv3.vec + j);
		INIT_LIST_HEAD(base->tv2.vec + j);
	}
	for (j = 0; j < TVR_SIZE; j++)
		INIT_LIST_HEAD(base->tv1.vec + j);

	base->timer_jiffies = jiffies;
	base->next_timer = base->timer_jiffies;
	return 0;
}
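
The odd-looking "2 byte aligned" check becomes clear with the helpers below,
also from kernel/timer.c: since a real tvec_base is always at least
pointer-aligned, the lowest bit of timer->base is free and is used to mark
deferrable timers, so a freshly allocated base must never have that bit set.

#define TBASE_DEFERRABLE_FLAG		(0x1)

/* Bit 0 of timer->base encodes "deferrable" ... */
static inline unsigned int tbase_get_deferrable(struct tvec_base *base)
{
	return ((unsigned int)(unsigned long)base & TBASE_DEFERRABLE_FLAG);
}

/* ... and the real base pointer is recovered by masking it off. */
static inline struct tvec_base *tbase_get_base(struct tvec_base *base)
{
	return ((struct tvec_base *)((unsigned long)base &
				     ~TBASE_DEFERRABLE_FLAG));
}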

2. Dynamic timer registration, modification and deletion

2.1 Registration and modification

First, initialize a timer_list object. Then register it with the system by
calling add_timer(). When the timer expires, its callback function will be
called in softirq context.

Related functions:
    init_timer()
    add_timer()

---------------
Call tree:

#define init_timer(timer)\
	init_timer_key((timer), NULL, NULL)

init_timer (init_timer_key)
    __init_timer
add_timer
    mod_timer
        __mod_timer
            internal_add_timer
---------------


static void __init_timer(struct timer_list *timer,
			 const char *name,
			 struct lock_class_key *key)
{
	timer->entry.next = NULL;
	timer->base = __raw_get_cpu_var(tvec_bases);
	timer->slack = -1;
#ifdef CONFIG_TIMER_STATS
	timer->start_site = NULL;
	timer->start_pid = -1;
	memset(timer->start_comm, 0, TASK_COMM_LEN);
#endif
	lockdep_init_map(&timer->lockdep_map, name, key, 0);
}
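
As an aside, kernel 3.1 also provides the setup_timer() convenience macro,
which combines init_timer() with the function/data assignments. A minimal
usage sketch (my_handler and my_dev are hypothetical names):

struct timer_list t;

setup_timer(&t, my_handler, (unsigned long)my_dev);
mod_timer(&t, jiffies + HZ);	/* fire in roughly one second */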

/**
 * add_timer - start a timer
 * @timer: the timer to be added
 *
 * The kernel will do a ->function(->data) callback from the
 * timer interrupt at the ->expires point in the future. The
 * current time is 'jiffies'.
 *
 * The timer's ->expires, ->function (and if the handler uses it, ->data)
 * fields must be set prior calling this function.
 *
 * Timers with an ->expires field in the past will be executed in the next
 * timer tick.
 */
void add_timer(struct timer_list *timer)
{
	BUG_ON(timer_pending(timer));
	mod_timer(timer, timer->expires);
}
EXPORT_SYMBOL(add_timer);

/**
 * mod_timer - modify a timer's timeout
 * @timer: the timer to be modified
 * @expires: new timeout in jiffies
 *
 * mod_timer() is a more efficient way to update the expire field of an
 * active timer (if the timer is inactive it will be activated)
 *
 * mod_timer(timer, expires) is equivalent to:
 *
 *     del_timer(timer); timer->expires = expires; add_timer(timer);
 *
 * Note that if there are multiple unserialized concurrent users of the
 * same timer, then mod_timer() is the only safe way to modify the timeout,
 * since add_timer() cannot modify an already running timer.
 *
 * The function returns whether it has modified a pending timer or not.
 * (ie. mod_timer() of an inactive timer returns 0, mod_timer() of an
 * active timer returns 1.)
 */
int mod_timer(struct timer_list *timer, unsigned long expires)
{
	expires = apply_slack(timer, expires);

	/*
	 * This is a common optimization triggered by the
	 * networking code - if the timer is re-modified
	 * to be the same thing then just return:
	 */
	if (timer_pending(timer) && timer->expires == expires)
		return 1;

	return __mod_timer(timer, expires, false, TIMER_NOT_PINNED);
}
EXPORT_SYMBOL(mod_timer);
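
A remark on apply_slack(): unless the caller set an explicit timer->slack,
the kernel grants itself roughly delta/256 (about 0.4%) of extra delay and
clears the low expiry bits that this allowance covers, so timers with nearby
deadlines coalesce onto the same slot and cause fewer wakeups. The
user-space sketch below illustrates the rounding idea only; it is a
simplification, not the kernel's apply_slack() itself:

#include <stdio.h>

/* Round 'expires' up within a delta/256 allowance by clearing the
 * low bits that the allowance covers (simplified illustration). */
static unsigned long round_with_slack(unsigned long now, unsigned long expires)
{
	unsigned long delta = expires - now;
	unsigned long limit, diff, bit;

	if (delta < 256)
		return expires;		/* too close: keep it exact */

	limit = expires + delta / 256;	/* allow ~0.4% of extra delay */
	diff = expires ^ limit;		/* bits where the two differ */

	bit = 1;
	while (diff >>= 1)		/* find the highest differing bit */
		bit <<= 1;

	return limit & ~(bit - 1);	/* clear everything below it */
}

int main(void)
{
	/* prints 2002: the expiry slides within the allowance */
	printf("%lu\n", round_with_slack(1000, 2000));
	return 0;
}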

static inline int
__mod_timer(struct timer_list *timer, unsigned long expires,
	    bool pending_only, int pinned)
{
	struct tvec_base *base, *new_base;
	unsigned long flags;
	int ret = 0, cpu;

	timer_stats_timer_set_start_info(timer);
	BUG_ON(!timer->function);

	base = lock_timer_base(timer, &flags);

	if (timer_pending(timer)) {
		detach_timer(timer, 0);
		if (timer->expires == base->next_timer &&
		    !tbase_get_deferrable(timer->base))
			base->next_timer = base->timer_jiffies;
		ret = 1;
	} else {
		if (pending_only)
			goto out_unlock;
	}

	debug_activate(timer, expires);

	cpu = smp_processor_id();

#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
	if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu))
		cpu = get_nohz_timer_target();
#endif
	new_base = per_cpu(tvec_bases, cpu);

	if (base != new_base) {
		/*
		 * We are trying to schedule the timer on the local CPU.
		 * However we can't change timer's base while it is running,
		 * otherwise del_timer_sync() can't detect that the timer's
		 * handler yet has not finished. This also guarantees that
		 * the timer is serialized wrt itself.
		 */
		if (likely(base->running_timer != timer)) {
			/* See the comment in lock_timer_base() */
			timer_set_base(timer, NULL);
			spin_unlock(&base->lock);
			base = new_base;
			spin_lock(&base->lock);
			timer_set_base(timer, base);
		}
	}

	timer->expires = expires;
	if (time_before(timer->expires, base->next_timer) &&
	    !tbase_get_deferrable(timer->base))
		base->next_timer = timer->expires;
	internal_add_timer(base, timer);

out_unlock:
	spin_unlock_irqrestore(&base->lock, flags);

	return ret;
}

static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
{
	unsigned long expires = timer->expires;
	unsigned long idx = expires - base->timer_jiffies;
	struct list_head *vec;

	if (idx < TVR_SIZE) {
		int i = expires & TVR_MASK;
		vec = base->tv1.vec + i;
	} else if (idx < 1 << (TVR_BITS + TVN_BITS)) {
		int i = (expires >> TVR_BITS) & TVN_MASK;
		vec = base->tv2.vec + i;
	} else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) {
		int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK;
		vec = base->tv3.vec + i;
	} else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) {
		int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK;
		vec = base->tv4.vec + i;
	} else if ((signed long) idx < 0) {
		/*
		 * Can happen if you add a timer with expires == jiffies,
		 * or you set a timer to go off in the past
		 */
		vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK);
	} else {
		int i;
		/* If the timeout is larger than 0xffffffff on 64-bit
		 * architectures then we use the maximum timeout:
		 */
		if (idx > 0xffffffffUL) {
			idx = 0xffffffffUL;
			expires = idx + base->timer_jiffies;
		}
		i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK;
		vec = base->tv5.vec + i;
	}
	/*
	 * Timers are FIFO:
	 */
	list_add_tail(&timer->entry, vec);
}
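
To make the bucket arithmetic concrete, here is a small user-space sketch of
the same slot selection (the jiffies values are chosen for illustration):
a timer due 300 ticks from now does not fit tv1 (256 slots), so it lands in
tv2 at slot (expires >> 8) & 63.

#include <stdio.h>

/* User-space sketch of internal_add_timer()'s slot selection.
 * The bit widths match the default kernel configuration. */
#define TVR_BITS 8
#define TVN_BITS 6
#define TVR_SIZE (1 << TVR_BITS)
#define TVR_MASK (TVR_SIZE - 1)
#define TVN_MASK ((1 << TVN_BITS) - 1)

int main(void)
{
	unsigned long timer_jiffies = 1000;		/* last handled tick */
	unsigned long expires = timer_jiffies + 300;	/* due 300 ticks ahead */
	unsigned long idx = expires - timer_jiffies;

	if (idx < TVR_SIZE)
		printf("tv1, slot %lu\n", expires & TVR_MASK);
	else if (idx < 1UL << (TVR_BITS + TVN_BITS))
		/* 300 >= 256, so we land here: (1300 >> 8) & 63 = 5 */
		printf("tv2, slot %lu\n", (expires >> TVR_BITS) & TVN_MASK);
	/* tv3..tv5 follow the same pattern with wider shifts */
	return 0;
}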

2.2 Dynamic timer deletion

Because they are activated asynchronously, dynamic timers are prone to race
conditions. For more information, refer to ULK3, page 246.

Call tree:
--------------------------
del_timer
    timer_pending
    detach_timer
--------------------------


/**
 * del_timer - deactivate a timer.
 * @timer: the timer to be deactivated
 *
 * del_timer() deactivates a timer - this works on both active and inactive
 * timers.
 *
 * The function returns whether it has deactivated a pending timer or not.
 * (ie. del_timer() of an inactive timer returns 0, del_timer() of an
 * active timer returns 1.)
 */
int del_timer(struct timer_list *timer)
{
	struct tvec_base *base;
	unsigned long flags;
	int ret = 0;

	timer_stats_timer_clear_start_info(timer);
	if (timer_pending(timer)) {
		base = lock_timer_base(timer, &flags);
		if (timer_pending(timer)) {
			detach_timer(timer, 1);
			if (timer->expires == base->next_timer &&
			    !tbase_get_deferrable(timer->base))
				base->next_timer = base->timer_jiffies;
			ret = 1;
		}
		spin_unlock_irqrestore(&base->lock, flags);
	}

	return ret;
}
EXPORT_SYMBOL(del_timer);
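
Note that del_timer() does not wait for a handler that is already running on
another CPU. In cleanup paths on SMP, del_timer_sync() is the safe choice:
it deactivates the timer and waits until any running handler has finished.
A minimal sketch (my_cleanup is a hypothetical cleanup path; my_timer is the
module's timer from section 5):

static void my_cleanup(void)
{
	/* Waits for a concurrently running handler to complete;
	 * must not be called from interrupt context. */
	del_timer_sync(&my_timer);
}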

/**
 * timer_pending - is a timer pending?
 * @timer: the timer in question
 *
 * timer_pending will tell whether a given timer is currently pending,
 * or not. Callers must ensure serialization wrt. other operations done
 * to this timer, eg. interrupt contexts, or other CPUs on SMP.
 *
 * return value: 1 if the timer is pending, 0 if not.
 */
static inline int timer_pending(const struct timer_list * timer)
{
	return timer->entry.next != NULL;
}

//arch/x86/include/asm/x86_init.h
/**
 * struct x86_init_timers - platform specific timer setup
 * @setup_percpu_clockev:	set up the per cpu clock event device for the
 *				boot cpu
 * @tsc_pre_init:		platform function called before TSC init
 * @timer_init:			initialize the platform timer (default PIT/HPET)
 * @wallclock_init:		init the wallclock device
 */
struct x86_init_timers {
	void (*setup_percpu_clockev)(void);
	void (*tsc_pre_init)(void);
	void (*timer_init)(void);
	void (*wallclock_init)(void);
};

static inline void detach_timer(struct timer_list *timer,
				int clear_pending)
{
	struct list_head *entry = &timer->entry;

	debug_deactivate(timer);

	__list_del(entry->prev, entry->next);
	if (clear_pending)
		entry->next = NULL;
	entry->prev = LIST_POISON2;
}

3. Dynamic timer handling

Despite the clever data structures, handling software timers is a
time-consuming activity that should not be performed by the timer interrupt
handler. In this version of Linux this activity is carried out by a
deferrable function, namely the TIMER_SOFTIRQ softirq. [ULK3, Pg 248]

Call tree:
------------------------
tick_handle_periodic
    tick_periodic

tick_periodic | tick_nohz_handler | tick_sched_timer ...
    update_process_times
        run_local_timers
            raise_softirq(TIMER_SOFTIRQ)	// trigger the timer softirq handler
run_timer_softirq
    __run_timers
------------------------


/*
 * Called from the timer interrupt handler to charge one tick to the current
 * process. user_tick is 1 if the tick is user time, 0 for system.
 */
void update_process_times(int user_tick)
{
	struct task_struct *p = current;
	int cpu = smp_processor_id();

	/* Note: this timer irq context must be accounted for as well. */
	account_process_tick(p, user_tick);
	run_local_timers();	/* **** */
	rcu_check_callbacks(cpu, user_tick);
	printk_tick();
#ifdef CONFIG_IRQ_WORK
	if (in_irq())
		irq_work_run();
#endif
	scheduler_tick();
	run_posix_cpu_timers(p);
}

/*
 * Called by the local, per-CPU timer interrupt on SMP.
 */
void run_local_timers(void)
{
	hrtimer_run_queues();
	raise_softirq(TIMER_SOFTIRQ);	/* *** */
}

/*
 * This function runs timers and the timer-tq in bottom half context.
 */
static void run_timer_softirq(struct softirq_action *h)
{
	struct tvec_base *base = __this_cpu_read(tvec_bases);

	hrtimer_run_pending();

	/* base->timer_jiffies is the tick up to which all timers
	 * have already been handled */
	if (time_after_eq(jiffies, base->timer_jiffies))
		__run_timers(base);
}
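
For reference, time_after_eq() compares jiffies values in a wraparound-safe
way via a signed difference; from include/linux/jiffies.h:

#define time_after_eq(a,b)	\
	(typecheck(unsigned long, a) && \
	 typecheck(unsigned long, b) && \
	 ((long)(a) - (long)(b) >= 0))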

#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK)

/**
 * __run_timers - run all expired timers (if any) on this CPU.
 * @base: the timer vector to be processed.
 *
 * This function cascades all vectors and executes all expired timer
 * vectors.
 */
static inline void __run_timers(struct tvec_base *base)
{
	struct timer_list *timer;

	spin_lock_irq(&base->lock);
	/*
	 * If the kernel has missed a number of ticks in the past, the timers
	 * concerned are dealt with now, by processing all slots that expired
	 * between the last execution point (base->timer_jiffies) and the
	 * current time (jiffies).
	 */
	while (time_after_eq(jiffies, base->timer_jiffies)) {
		struct list_head work_list;
		struct list_head *head = &work_list;
		int index = base->timer_jiffies & TVR_MASK;

		/*
		 * Cascade timers:
		 * The cascade function is used to replenish the timer lists
		 * with timers from the higher groups.
		 */
		if (!index &&
			(!cascade(base, &base->tv2, INDEX(0))) &&
				(!cascade(base, &base->tv3, INDEX(1))) &&
					!cascade(base, &base->tv4, INDEX(2)))
			cascade(base, &base->tv5, INDEX(3));

		/*
		 * All timers located in the first group at the position
		 * corresponding to the timer_jiffies value are copied into a
		 * temporary list and thereby removed from the original data
		 * structures.
		 */
		++base->timer_jiffies;
		list_replace_init(base->tv1.vec + index, &work_list);
		while (!list_empty(head)) {
			void (*fn)(unsigned long);
			unsigned long data;

			timer = list_first_entry(head, struct timer_list, entry);
			fn = timer->function;
			data = timer->data;

			timer_stats_account_timer(timer);

			base->running_timer = timer;
			/* Detach the timer from the temporary list */
			detach_timer(timer, 1);

			spin_unlock_irq(&base->lock);
			/*
			 * Execute the timer's callback function
			 */
			call_timer_fn(timer, fn, data);
			spin_lock_irq(&base->lock);
		}
	}
	base->running_timer = NULL;
	spin_unlock_irq(&base->lock);
}
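
The cascade() helper used above is short. Sketched here from kernel 3.1's
kernel/timer.c, it empties one slot of a higher group and feeds every timer
on it back through internal_add_timer(), which re-sorts them into the lower
groups:

static int cascade(struct tvec_base *base, struct tvec *tv, int index)
{
	/* cascade all the timers from tv up one level */
	struct timer_list *timer, *tmp;
	struct list_head tv_list;

	list_replace_init(tv->vec + index, &tv_list);

	/*
	 * We are removing _all_ timers from the list, so we
	 * don't have to detach them individually.
	 */
	list_for_each_entry_safe(timer, tmp, &tv_list, entry) {
		BUG_ON(tbase_get_base(timer->base) != base);
		internal_add_timer(base, timer);
	}

	return index;
}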

4. A whole view of the low-resolution timer system

The high-resolution timer mechanism is based on clock events, whereas the
low-resolution timer mechanism utilizes periodic events that can either come
directly from a low-resolution clock or from the high-resolution subsystem.
Two important tasks for which low-resolution timers assume responsibility
are:

1> Handle the global jiffies counter. The value is incremented periodically
(or at least it looks periodic to most parts of the kernel) and represents a
particularly simple form of time reference. As we have seen, the dynamic
timer implementation is based on the jiffies counter value.

2> Perform per-process accounting. This also includes handling of classical
low-resolution timers (dynamic timers), which can be associated with any
process.

Overview of periodic low-resolution timer interrupts

Early kernels in the 2.6 series directly hooked into the timer interrupt to
start timer activation and process accounting, but this has been somewhat
complicated by the introduction of the generic clock event framework (we
will talk about this later).



The details differ for other architectures, but the principle is
nevertheless the same. How a particular architecture proceeds is usually set
up in time_init(), which is called at boot time to initialize the
fundamental low-resolution timekeeping. The periodic clock is set up to
operate at HZ ticks per second. IA-32 registers timer_interrupt() as the
interrupt handler, whereas AMD64 uses timer_event_interrupt(). Both
functions notify the generic, architecture-independent time processing
layers of the kernel by calling the event handler of the so-called global
clock. The handler sets the ball rolling for periodic low-resolution
timekeeping by calling the following two functions (a sketch of such a
handler appears after this list).

- do_timer() is responsible for system-wide, global tasks: it updates the
jiffies value and handles process accounting. On a multiprocessor system,
one particular CPU is selected to perform both tasks, and all other CPUs are
not concerned with them.

- update_process_times() needs to be performed by every CPU on SMP systems.
Besides process accounting, it activates and expires all registered
classical low-resolution timers and provides the scheduler with a sense of
time. Timer activation and expiration are triggered by calling
run_local_timers(). That function, in turn, raises the softirq
TIMER_SOFTIRQ, whose handler function is responsible for running the
low-resolution timers.
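
As a concrete example of such a handler chain, the sketch below follows
tick_periodic() as it appears in kernel 3.1 (kernel/time/tick-common.c):
only the CPU designated as tick_do_timer_cpu performs the global do_timer()
work, while update_process_times() runs on every CPU:

static void tick_periodic(int cpu)
{
	if (tick_do_timer_cpu == cpu) {
		write_seqlock(&xtime_lock);

		/* Keep track of the next tick event */
		tick_next_period = ktime_add(tick_next_period, tick_period);

		do_timer(1);
		write_sequnlock(&xtime_lock);
	}

	update_process_times(user_mode(get_irq_regs()));
	profile_tick(CPU_PROFILING);
}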

Before we inspect the details of the two functions mentioned above,
jiffies_64 and jiffies need to be introduced.

The global variable jiffies_64 (a 64-bit integer on all architectures) is
incremented by 1 on each timer tick. It specifies the exact number of timer
interrupts since the system started. Its value is increased with constant
regularity when dynamic ticks are disabled. If dynamic ticks are active,
more than one tick period can have passed since the last update. jiffies is
a variable of the unsigned long type and is therefore only 4 bytes long on
32-bit processors. jiffies and jiffies_64 match in their less significant
bits and therefore refer to the same memory location or the same register;
the two are synonymous on 64-bit machines. This means that the jiffies_64++
operation also increases jiffies by 1.
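
Because a 64-bit read is not atomic on 32-bit machines, readers of
jiffies_64 go through the xtime_lock seqlock; see get_jiffies_64() from
kernel/time/jiffies.c (compiled there only when BITS_PER_LONG < 64):

u64 get_jiffies_64(void)
{
	unsigned long seq;
	u64 ret;

	do {
		seq = read_seqbegin(&xtime_lock);
		ret = jiffies_64;
	} while (read_seqretry(&xtime_lock, seq));
	return ret;
}
EXPORT_SYMBOL(get_jiffies_64);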

- do_timer()

Call tree of do_timer():

do_timer
    jiffies_64 += ticks
    update_wall_time
    calc_global_load


/*
 * The 64-bit jiffies value is not atomic - you MUST NOT read it
 * without sampling the sequence number in xtime_lock.
 * jiffies is defined in the linker script...
 */
void do_timer(unsigned long ticks)
{
	jiffies_64 += ticks;
	/*
	 * Updates the wall time that specifies how long the system has
	 * already been up and running. In contrast to the jiffies mechanism,
	 * the wall clock uses a human-readable format (nanoseconds) to
	 * represent the current time.
	 */
	update_wall_time();
	/*
	 * Update the system load statistics that specify how many tasks
	 * have on average been waiting on the run queue in a ready-to-run
	 * state during the last 1, 5, and 15 minutes.
	 */
	calc_global_load(ticks);
}

- update_process_times()

Call tree:

update_process_times
    account_process_tick
    run_local_timers	// this is what we are concerned with
    rcu_check_callbacks
    scheduler_tick
    run_posix_cpu_timers

struct task_struct {
	...
	cputime_t utime, stime;	/* @utime denotes the ticks spent in User mode,
				 * @stime denotes the ticks spent in Kernel mode
				 */
	...
};

/*
 * Called from the timer interrupt handler to charge one tick to the current
 * process. user_tick is 1 if the tick is user time, 0 for system.
 */
void update_process_times(int user_tick)
{
	struct task_struct *p = current;
	int cpu = smp_processor_id();

	/* Note: this timer irq context must be accounted for as well. */
	/*
	 * We don't want to go into the details of this function. Here you
	 * only need to know that it updates the values for CPU time consumed
	 * in the task structure.
	 */
	account_process_tick(p, user_tick);
	/*
	 * Activates and expires the low-resolution timers. Recall that this
	 * is discussed in detail above.
	 */
	run_local_timers();	/* ********************** */
	rcu_check_callbacks(cpu, user_tick);
	printk_tick();
#ifdef CONFIG_IRQ_WORK
	if (in_irq())
		irq_work_run();
#endif
	scheduler_tick();
	run_posix_cpu_timers(p);
}

5. Demo of how to use a dynamic timer in your modules

#include <linux/timer.h>

struct timer_list my_timer;

init_timer(&my_timer);
my_timer.expires = jiffies + sec * HZ;	/* sec is the timeout in seconds */
my_timer.function = timer_func;		/* callback function of @my_timer */
my_timer.data = can_be_devid;		/* parameter passed to @timer_func */
add_timer(&my_timer);			/* start the timer */

static void timer_func(unsigned long func_parameter)
{
	/* do the work to be done periodically */
	...
	mod_timer(...);			/* re-arm the timer */
}

A complete module built from these pieces is sketched below.
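
Putting the snippet together, here is a minimal, self-contained module
sketch; the 5-second period, the data value 42, and all names are arbitrary
choices for illustration:

#include <linux/module.h>
#include <linux/init.h>
#include <linux/timer.h>
#include <linux/jiffies.h>

static struct timer_list my_timer;

/* Runs in softirq context when the timer expires. */
static void timer_func(unsigned long data)
{
	pr_info("timer fired, data=%lu\n", data);
	/* Re-arm to get periodic behaviour. */
	mod_timer(&my_timer, jiffies + 5 * HZ);
}

static int __init timer_demo_init(void)
{
	init_timer(&my_timer);
	my_timer.expires = jiffies + 5 * HZ;	/* first shot in ~5 s */
	my_timer.function = timer_func;
	my_timer.data = 42;			/* handed to timer_func() */
	add_timer(&my_timer);
	return 0;
}

static void __exit timer_demo_exit(void)
{
	/* Wait for a running handler to finish before unloading. */
	del_timer_sync(&my_timer);
}

module_init(timer_demo_init);
module_exit(timer_demo_exit);
MODULE_LICENSE("GPL");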