linux 调度器负载均衡
2017-02-06 19:02
495 查看
每个cpu上都有一个运行队列rq,运行队列里面又具体再分成rt_rq、cfs_rq等,
在运行的过程中每个cpu上面task数量是不同的,这个用负载来衡量。cpu上rq内task
越多,代表这个cpu 负载越重。
负载均衡就是用来将task均匀分布在每个cpu上,使每个cpu负载大致均衡,提升task响应速度。
负载均衡中有两个主要的操作,pull和push。
当本地rq上task数量较少时,执行pull操作,从其他rq上取task到本地rq运行。
push是本地rq上task数量较多,需要push到其他rq上运行。
RT调度器中,RT task按照优先级依次插入每个cpu的rt_rq 优先级队列中,
task的出队入队操作比较简单,就是队列的操作,
因此它的负载均衡的处理也比较简单,我们先看下RT调度器的负载均衡。
/*
 * pull_rt_task() - pull higher-priority RT tasks from other CPUs onto this_rq.
 *
 * Iterates over every RT-overloaded CPU and tries to migrate its
 * second-highest-priority RT task here, so that this CPU ends up running a
 * higher-priority RT task than it otherwise would.  A candidate task:
 *  1. is the second highest on its source runqueue (the highest-priority one
 *     is already running there and is not migrated);
 *  2. has a higher priority than the highest-priority task queued on this_rq;
 *  3. is allowed by its affinity mask to run on this CPU.
 *
 * Returns 1 if at least one task was pulled, 0 otherwise.
 * NOTE(review): caller is expected to hold this_rq->lock — confirm at call sites.
 */
static int pull_rt_task(struct rq *this_rq)
{
int this_cpu = this_rq->cpu, ret = 0, cpu;
struct task_struct *p;
struct rq *src_rq;
/* Fast path: no CPU in this root domain has more than one runnable RT task. */
if (likely(!rt_overloaded(this_rq)))
return 0;
/*
 * Walk the mask of RT-overloaded CPUs and try to pull the
 * second-highest-priority task from each of them onto this_rq.
 */
for_each_cpu(cpu, this_rq->rd->rto_mask) {
if (this_cpu == cpu)
continue;
src_rq = cpu_rq(cpu);
/*
 * Don't bother taking the src_rq->lock if the next highest
 * task is known to be lower-priority than our current task.
 * This may look racy, but if this value is about to go
 * logically higher, the src_rq will push this task away.
 * And if it's going logically lower, we do not care.
 */
if (src_rq->rt.highest_prio.next >=
this_rq->rt.highest_prio.curr)
continue;
/*
 * We can potentially drop this_rq's lock in
 * double_lock_balance, and another CPU could
 * alter this_rq
 */
double_lock_balance(this_rq, src_rq);
/*
 * Are there still pullable RT tasks?  With only one runnable RT
 * task, src_rq has nothing spare to give away.
 */
if (src_rq->rt.rt_nr_running <= 1)
goto skip;
/*
 * Find the highest-priority queued-but-not-running task on
 * src_rq that may run on this_cpu; it may share the running
 * task's priority or be lower ("second highest").
 */
p = pick_next_highest_task_rt(src_rq, this_cpu);
/*
 * Do we have an RT task that preempts
 * the to-be-scheduled task?
 */
if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
WARN_ON(p == src_rq->curr);
WARN_ON(!p->on_rq);
/*
 * There's a chance that p is higher in priority
 * than what's currently running on its cpu.
 * This is just that p is waking up and hasn't
 * had a chance to schedule. We only pull
 * p if it is lower in priority than the
 * current task on the run queue
 */
if (p->prio < src_rq->curr->prio)
goto skip;
ret = 1;
deactivate_task(src_rq, p, 0); /* dequeue p from src_rq */
set_task_cpu(p, this_cpu);
activate_task(this_rq, p, 0); /* enqueue p on this_rq */
/*
 * We continue with the search, just in
 * case there's an even higher prio task
 * in another runqueue. (low likelihood
 * but possible)
 */
}
skip:
double_unlock_balance(this_rq, src_rq);
}
return ret;
}
/*
 * Return the highest-priority queued task on @rq — other than the one
 * currently running — that is allowed to run on @cpu, i.e. the
 * "second highest" RT task.  Returns NULL if no such task exists.
 */
static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
{
struct task_struct *next = NULL;
struct sched_rt_entity *rt_se;
struct rt_prio_array *array;
struct rt_rq *rt_rq;
int idx;
/* Walk every leaf rt_rq attached to this runqueue (group scheduling). */
for_each_leaf_rt_rq(rt_rq, rq) {
array = &rt_rq->active;
/* First set bit = highest (numerically lowest) occupied priority. */
idx = sched_find_first_bit(array->bitmap);
next_idx:
if (idx >= MAX_RT_PRIO)
continue;
/* A candidate of equal or better priority was already found. */
if (next && next->prio <= idx)
continue;
/* Scan all entities queued at this priority level. */
list_for_each_entry(rt_se, array->queue + idx, run_list) {
struct task_struct *p;
/* Skip group entities; only real tasks can be migrated. */
if (!rt_entity_is_task(rt_se))
continue;
p = rt_task_of(rt_se);
/* Suitable when p is not running and its affinity allows @cpu. */
if (pick_rt_task(rq, p, cpu)) {
next = p;
break;
}
}
/* Nothing usable at this level: advance to the next set priority bit. */
if (!next) {
idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
goto next_idx;
}
}
return next;
}
在运行的过程中每个cpu上面task数量是不同的,这个用负载来衡量。cpu上rq内task
越多,代表这个cpu 负载越重。
负载均衡就是用来将task均匀分布在每个cpu上,使每个cpu负载大致均衡,提升task响应速度。
负载均衡中有两个主要的操作,pull和push。
当本地rq上task数量较少时,执行pull操作,从其他rq上取task到本地rq运行。
push是本地rq上task数量较多,需要push到其他rq上运行。
RT调度器中,RT task按照优先级依次插入每个cpu的rt_rq 优先级队列中,
task的出队入队操作比较简单,就是队列的操作,
因此它的负载均衡的处理也比较简单,我们先看下RT调度器的负载均衡。
/*
 * pull_rt_task() - pull higher-priority RT tasks from other CPUs onto this_rq.
 *
 * Iterates over every RT-overloaded CPU and tries to migrate its
 * second-highest-priority RT task here, so that this CPU ends up running a
 * higher-priority RT task than it otherwise would.  A candidate task:
 *  1. is the second highest on its source runqueue (the highest-priority one
 *     is already running there and is not migrated);
 *  2. has a higher priority than the highest-priority task queued on this_rq;
 *  3. is allowed by its affinity mask to run on this CPU.
 *
 * Returns 1 if at least one task was pulled, 0 otherwise.
 * NOTE(review): caller is expected to hold this_rq->lock — confirm at call sites.
 */
static int pull_rt_task(struct rq *this_rq)
{
int this_cpu = this_rq->cpu, ret = 0, cpu;
struct task_struct *p;
struct rq *src_rq;
/* Fast path: no CPU in this root domain has more than one runnable RT task. */
if (likely(!rt_overloaded(this_rq)))
return 0;
/*
 * Walk the mask of RT-overloaded CPUs and try to pull the
 * second-highest-priority task from each of them onto this_rq.
 */
for_each_cpu(cpu, this_rq->rd->rto_mask) {
if (this_cpu == cpu)
continue;
src_rq = cpu_rq(cpu);
/*
 * Don't bother taking the src_rq->lock if the next highest
 * task is known to be lower-priority than our current task.
 * This may look racy, but if this value is about to go
 * logically higher, the src_rq will push this task away.
 * And if it's going logically lower, we do not care.
 */
if (src_rq->rt.highest_prio.next >=
this_rq->rt.highest_prio.curr)
continue;
/*
 * We can potentially drop this_rq's lock in
 * double_lock_balance, and another CPU could
 * alter this_rq
 */
double_lock_balance(this_rq, src_rq);
/*
 * Are there still pullable RT tasks?  With only one runnable RT
 * task, src_rq has nothing spare to give away.
 */
if (src_rq->rt.rt_nr_running <= 1)
goto skip;
/*
 * Find the highest-priority queued-but-not-running task on
 * src_rq that may run on this_cpu; it may share the running
 * task's priority or be lower ("second highest").
 */
p = pick_next_highest_task_rt(src_rq, this_cpu);
/*
 * Do we have an RT task that preempts
 * the to-be-scheduled task?
 */
if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
WARN_ON(p == src_rq->curr);
WARN_ON(!p->on_rq);
/*
 * There's a chance that p is higher in priority
 * than what's currently running on its cpu.
 * This is just that p is waking up and hasn't
 * had a chance to schedule. We only pull
 * p if it is lower in priority than the
 * current task on the run queue
 */
if (p->prio < src_rq->curr->prio)
goto skip;
ret = 1;
deactivate_task(src_rq, p, 0); /* dequeue p from src_rq */
set_task_cpu(p, this_cpu);
activate_task(this_rq, p, 0); /* enqueue p on this_rq */
/*
 * We continue with the search, just in
 * case there's an even higher prio task
 * in another runqueue. (low likelihood
 * but possible)
 */
}
skip:
double_unlock_balance(this_rq, src_rq);
}
return ret;
}
/*
 * Return the highest-priority queued task on @rq — other than the one
 * currently running — that is allowed to run on @cpu, i.e. the
 * "second highest" RT task.  Returns NULL if no such task exists.
 */
static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
{
struct task_struct *next = NULL;
struct sched_rt_entity *rt_se;
struct rt_prio_array *array;
struct rt_rq *rt_rq;
int idx;
/* Walk every leaf rt_rq attached to this runqueue (group scheduling). */
for_each_leaf_rt_rq(rt_rq, rq) {
array = &rt_rq->active;
/* First set bit = highest (numerically lowest) occupied priority. */
idx = sched_find_first_bit(array->bitmap);
next_idx:
if (idx >= MAX_RT_PRIO)
continue;
/* A candidate of equal or better priority was already found. */
if (next && next->prio <= idx)
continue;
/* Scan all entities queued at this priority level. */
list_for_each_entry(rt_se, array->queue + idx, run_list) {
struct task_struct *p;
/* Skip group entities; only real tasks can be migrated. */
if (!rt_entity_is_task(rt_se))
continue;
p = rt_task_of(rt_se);
/* Suitable when p is not running and its affinity allows @cpu. */
if (pick_rt_task(rq, p, cpu)) {
next = p;
break;
}
}
/* Nothing usable at this level: advance to the next set priority bit. */
if (!next) {
idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
goto next_idx;
}
}
return next;
}
/*
 * If the current CPU has more than one RT task, see if the non
 * running task can migrate over to a CPU that is running a task
 * of lesser priority.
 */
/*
 * Push one surplus RT task from @rq to another runqueue.
 *  1. One task is pushed per call: the second-highest-priority one on @rq
 *     (the highest-priority task is running and is not moved);
 *  2. the target runqueue is running a lower-priority task (possibly a
 *     non-RT task) than the pushed one;
 *  3. the pushed task's affinity mask must allow the target CPU;
 *  4. when several target CPUs qualify, the one closest to this CPU in the
 *     sched_domain topology is preferred (see find_lowest_rq() below).
 * Returns 1 if a task was pushed, 0 otherwise.
 */
static int push_rt_task(struct rq *rq)
{
	struct task_struct *next_task;
	struct rq *lowest_rq;
	int ret = 0;

	/* Nothing to do if this rq is not RT-overloaded. */
	if (!rq->rt.overloaded)
		return 0;

	/* Pick the task on this rq that should be pushed away. */
	next_task = pick_next_pushable_task(rq);
	if (!next_task)
		return 0;

retry:
	if (unlikely(next_task == rq->curr)) {
		WARN_ON(1);
		return 0;
	}

	/*
	 * It's possible that the next_task slipped in of
	 * higher priority than current. If that's the case
	 * just reschedule current.
	 */
	if (unlikely(next_task->prio < rq->curr->prio)) {
		resched_task(rq->curr);
		return 0;
	}

	/* We might release rq lock */
	get_task_struct(next_task);

	/* find_lock_lowest_rq locks the rq if found */
	lowest_rq = find_lock_lowest_rq(next_task, rq);
	if (!lowest_rq) {
		struct task_struct *task;
		/*
		 * find_lock_lowest_rq releases rq->lock
		 * so it is possible that next_task has migrated.
		 *
		 * We need to make sure that the task is still on the same
		 * run-queue and is also still the next task eligible for
		 * pushing.
		 */
		task = pick_next_pushable_task(rq);
		if (task_cpu(next_task) == rq->cpu && task == next_task) {
			/*
			 * The task hasn't migrated, and is still the next
			 * eligible task, but we failed to find a run-queue
			 * to push it to. Do not retry in this case, since
			 * other cpus will pull from us when ready.
			 */
			goto out;
		}
		if (!task)
			/* No more tasks, just exit */
			goto out;
		/*
		 * Something has shifted, try again: the originally chosen
		 * next_task changed, so pick a new pushable task.
		 */
		put_task_struct(next_task);
		next_task = task;
		goto retry;
	}

	deactivate_task(rq, next_task, 0);		/* dequeue next_task from rq */
	set_task_cpu(next_task, lowest_rq->cpu);	/* retarget it at lowest_rq's CPU */
	activate_task(lowest_rq, next_task, 0);		/* enqueue it on lowest_rq */
	ret = 1;

	/* Let the target CPU notice the newly arrived higher-priority task. */
	resched_task(lowest_rq->curr);

	double_unlock_balance(rq, lowest_rq);

out:
	put_task_struct(next_task);
	return ret;
}

/* Will lock the rq it finds */
/*
 * Find and lock the lowest-priority runqueue that @task (the task being
 * pushed from @rq) may migrate to.  Returns the locked target rq, or NULL
 * if none could be found/kept valid within RT_MAX_TRIES attempts.
 */
static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
{
	struct rq *lowest_rq = NULL;
	int tries;
	int cpu;

	for (tries = 0; tries < RT_MAX_TRIES; tries++) {
		cpu = find_lowest_rq(task);

		if ((cpu == -1) || (cpu == rq->cpu))
			break;

		lowest_rq = cpu_rq(cpu);

		/* if the prio of this runqueue changed, try again */
		if (double_lock_balance(rq, lowest_rq)) {
			/*
			 * We had to unlock the run queue. In
			 * the mean time, task could have
			 * migrated already or had its affinity changed.
			 * Also make sure that it wasn't scheduled on its rq.
			 */
			if (unlikely(task_rq(task) != rq ||
				     !cpumask_test_cpu(lowest_rq->cpu, tsk_cpus_allowed(task)) ||
				     task_running(rq, task) ||
				     !task->on_rq)) {
				double_unlock_balance(rq, lowest_rq);
				lowest_rq = NULL;
				break;
			}
		}

		/*
		 * If this rq is still suitable use it: the candidate rq's
		 * highest queued priority is lower than the pushed task's,
		 * so no further searching is needed.
		 */
		if (lowest_rq->rt.highest_prio.curr > task->prio)
			break;

		/* try again */
		double_unlock_balance(rq, lowest_rq);
		lowest_rq = NULL;
	}

	return lowest_rq;
}

/*
 * Pick the best CPU for @task (the task being pushed) to run on: one whose
 * runqueue holds the lowest priority work, preferring cache-hot and
 * topologically close CPUs.  Returns a CPU number, or -1 if none qualifies.
 */
static int find_lowest_rq(struct task_struct *task)
{
	struct sched_domain *sd;
	struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask);
	int this_cpu = smp_processor_id();	/* CPU we are executing on */
	int cpu = task_cpu(task);		/* CPU the task last ran on */

	/* NOTE(review): debug trace added by the article's author, not upstream. */
	if(this_cpu!=cpu){
		printk(KERN_ERR "this_cpu:%d task cpu:%d\n",this_cpu,cpu);
	}

	/* HMP platforms use their own placement policy. */
	if (sched_enable_hmp)
		return find_lowest_rq_hmp(task);

	/* Make sure the mask is initialized first */
	if (unlikely(!lowest_mask))
		return -1;

	/* Task is pinned to a single CPU: it cannot be pushed anywhere. */
	if (task->nr_cpus_allowed == 1)
		return -1; /* No other targets possible */

	/* Build the set of CPUs running the lowest-priority work. */
	if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
		return -1; /* No targets found */

	/*
	 * At this point we have built a mask of cpus representing the
	 * lowest priority tasks in the system. Now we want to elect
	 * the best one based on our affinity and topology.
	 *
	 * We prioritize the last cpu that the task executed on since
	 * it is most likely cache-hot in that location.
	 */
	if (cpumask_test_cpu(cpu, lowest_mask))
		return cpu;

	/*
	 * Otherwise, we consult the sched_domains span maps to figure
	 * out which cpu is logically closest to our hot cache data.
	 */
	if (!cpumask_test_cpu(this_cpu, lowest_mask))
		this_cpu = -1; /* Skip this_cpu opt if not among lowest */

	rcu_read_lock();
	/* Walk outward through the domains containing the task's last CPU. */
	for_each_domain(cpu, sd) {
		if (sd->flags & SD_WAKE_AFFINE) {
			int best_cpu;

			/*
			 * "this_cpu" is cheaper to preempt than a
			 * remote processor.
			 */
			if (this_cpu != -1 &&
			    cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
				rcu_read_unlock();
				return this_cpu;
			}

			best_cpu = cpumask_first_and(lowest_mask,
						     sched_domain_span(sd));
			if (best_cpu < nr_cpu_ids) {
				rcu_read_unlock();
				return best_cpu;
			}
		}
	}
	rcu_read_unlock();

	/*
	 * And finally, if there were no matches within the domains
	 * just give the caller *something* to work with from the compatible
	 * locations.
	 */
	if (this_cpu != -1)
		return this_cpu;

	cpu = cpumask_any(lowest_mask);
	if (cpu < nr_cpu_ids)
		return cpu;
	return -1;
}
相关文章推荐
- 使用Linux实现负载均衡zz
- Linux下双网卡绑定技术实现负载均衡
- 利用Linux架构负载均衡(Load balancer)系统(二)
- apache2与resin 3.1.6 在linux下的整合与负载均衡
- Linux负载均衡专题(个人心得和笔记按照下面的步骤就能建立可用的集群)
- 用LVS构架负载均衡Linux集群实例
- Linux 服务器下多网卡的负载均衡
- Linux 调度器内幕
- 利用Linux架构负载均衡(Load balancer)系统(一)
- Linux 调度器内幕
- Linux 负载均衡 推荐
- 利用Linux架构负载均衡(Load balancer)系统(一)
- Linux 调度器内幕
- 利用Linux架构负载均衡(Load balancer)系统(二)
- Linux 调度器内幕
- RedHat linux+apache+tomcat+mod_jk 负载均衡安装说明
- Linux下配置tomcat负载均衡
- 负载均衡开源网站http://www.linuxvirtualserver.org/
- 利用Linux架构负载均衡(Load balancer)系统