Skip to main content
 首页 » 操作系统

Linux 调度器之负载均衡—load_balance()函数分析

2022年07月19日 · 198 · leader

一、概述

1. 负载均衡的情景包括 tick balance、nohz idle balance 和 new idle balance,最终都会汇聚到 load_balance 函数来完成具体的负载均衡工作。


二、load_balance 相关数据结构

1. struct lb_env

在负载均衡的时候,通过 lb_env 结构来表示本次负载均衡的上下文:

//fair.c 
/* Context describing one load-balancing pass; built in load_balance(). */
struct lb_env {
    /* The sched domain being balanced. */
    struct sched_domain    *sd;

    /* Busiest CPU and its rq within this sd; the pass tries to pull tasks from it. */
    struct rq        *src_rq;
    int            src_cpu;

    /*
     * Destination CPU of this pass. Balancing tries to pull tasks from the
     * busiest CPU's rq in the sd onto the dst CPU's rq. In the first round
     * the dst CPU is normally the CPU that initiated balancing; later rounds
     * may re-target another CPU in the local group if needed.
     */
    int            dst_cpu;
    struct rq        *dst_rq;

    /*
     * CPU mask of the sched group containing the dst CPU. On the MC/DIE
     * topology the article discusses: at MC level it is the dst CPU itself,
     * at DIE level it is the dst CPU's cluster.
     */
    struct cpumask        *dst_grpmask;
    /*
     * Normally the dst CPU is the CPU that initiated balancing. If affinity
     * prevents tasks on src from moving to the dst CPU, a new CPU is picked
     * from the dst CPU's local group and used for a second round.
     */
    int            new_dst_cpu;
    /* Idle state of the dst CPU at balance time; it steers the balancing decisions. */
    enum cpu_idle_type    idle;
    /*
     * Must be read together with migration_type (see calculate_imbalance()):
     * migrate_load:    amount of load to migrate
     * migrate_util:    amount of utilization to migrate
     * migrate_task:    MC: number of tasks to migrate; DIE: number of idle
     *                  CPUs the busiest group needs to gain
     * migrate_misfit:  set to 1, i.e. migrate one task at a time
     * group_imbalanced: set to 1, i.e. migrate one task at a time
     */
    long            imbalance;
    /* The set of CPUs under consideration for load-balancing */
    /*
     * Balancing may take several rounds and each round may involve a
     * different set of CPUs; this mask says which CPUs take part now.
     */
    struct cpumask        *cpus;

    /*
     * Balancing flags (bitmask). LBF_NOHZ_STATS and LBF_NOHZ_AGAIN are
     * mainly used to refresh nohz state during balancing. LBF_ALL_PINNED is
     * set when no task on the chosen busiest CPU can be migrated because of
     * affinity; the next round then looks for the next-busiest CPU.
     * LBF_NEED_BREAK is used to shorten the time spent with interrupts off.
     */
    unsigned int        flags;

    /*
     * Once migration is decided, the cfs task list of the src rq is scanned
     * to pick tasks to move. loop tracks the number of iterations and must
     * not exceed loop_max.
     */
    unsigned int        loop;
    /*
     * When many tasks are being migrated, pause after every
     * sched_nr_migrate_break tasks so the interrupts-off critical section
     * stays short.
     */
    unsigned int        loop_break;
    unsigned int        loop_max;

    enum fbq_type        fbq_type;
    /*
     * What kind of migration is needed to balance the sd: some amount of
     * load, some amount of utilization, some number of tasks, or a misfit
     * task. See the comment on imbalance.
     */
    enum migration_type    migration_type;
    /* Tasks selected for migration are queued on this list. */
    struct list_head    tasks;
    /* Set in load_balance() to the rq_flags used while locking the src rq. */
    struct rq_flags        *src_rq_rf;
};

2、struct sd_lb_stats

在负载均衡的时候,通过 sd_lb_stats 结构来表示 sched domain 的负载统计信息:

/* Load statistics of a sched domain, gathered during load balancing. */
struct sd_lb_stats {
    /* Busiest sched group in this sd (never the local group). */
    struct sched_group *busiest;
    /* The local group of this pass, i.e. the group containing the dst CPU. */
    struct sched_group *local;
    /* Sum of the load of all groups in this sd; "load" here means cfs task load unless noted otherwise. */
    unsigned long total_load;
    /* Sum of the CPU capacity of all groups in this sd (capacity available to cfs tasks). */
    unsigned long total_capacity;
    /* Average load across all groups in this sd. */
    unsigned long avg_load;
    /* Set when tasks should preferably go to sibling CPUs (same cluster) first. */
    unsigned int prefer_sibling;
    /* Load statistics of the busiest group in this sd. */
    struct sg_lb_stats busiest_stat;
    /* Load statistics of the local group (the one containing the dst CPU). */
    struct sg_lb_stats local_stat;
};

3、struct sg_lb_stats

在负载均衡的时候,通过 sg_lb_stats 结构来表示 sched group 的负载统计信息:

/* Load statistics of a single sched group, gathered during load balancing. */
struct sg_lb_stats {
    /*
     * Average load over all CPUs of this group. Only computed while the
     * group is in the group_overloaded state, to help compute how much
     * load to migrate.
     */
    unsigned long avg_load;
    /* Sum of the load of all CPUs in this group. */
    unsigned long group_load;
    /* Sum of the capacity available to cfs tasks over all CPUs in this group. */
    unsigned long group_capacity;
    /* Sum of the utilization of all CPUs in this group. */
    unsigned long group_util;
    /* Sum of the runnable load of all CPUs in this group. */
    unsigned long group_runnable;
    /* Number of tasks in this group, including rt and dl tasks. */
    unsigned int sum_nr_running;
    /* Number of cfs tasks in this group. */
    unsigned int sum_h_nr_running;
    /* Number of idle CPUs in this group. */
    unsigned int idle_cpus;
    /* Number of CPUs in this group. */
    unsigned int group_weight;
    /* State of this group for load-balancing purposes. */
    enum group_type group_type;
    /*
     * Marks that tasks should be migrated to a preferred CPU. Per the
     * original note, update_sg_lb_stats() can only set this when the sd has
     * SD_ASYM_PACKING, so it is never set on the platform the article
     * describes - TODO confirm for your configuration.
     */
    unsigned int group_asym_packing;
    /*
     * At least one CPU in this group has a misfit task; this records the
     * maximum misfit task load over all CPUs of the group.
     */
    unsigned long group_misfit_task_load;
};

4、struct sched_group_capacity

用来描述 sched group 的算力信息:

/* Capacity information of a sched group. */
struct sched_group_capacity {
    /* Reference count; several sds may share one sg and its sgc. */
    atomic_t        ref;
    /* Total capacity available to cfs tasks in this group (roughly the sum over its CPUs). */
    unsigned long        capacity;
    /* Smallest per-CPU capacity available to cfs tasks in this group. */
    unsigned long        min_capacity;
    /* Largest per-CPU capacity available to cfs tasks in this group. */
    unsigned long        max_capacity;
    /* Time (in jiffies) of the next capacity update. */
    unsigned long        next_update;
    /* Whether this group has an imbalance caused by task affinity. */
    int            imbalance;
#ifdef CONFIG_SCHED_DEBUG
    /* MC level: the id of each CPU; DIE level: the id of the first CPU of each cluster. */
    int            id;
#endif
    /* CPUs contained in this group. */
    unsigned long        cpumask[];
};

三、load_balance 函数

先整体看下 load_balance(),之后再对其各个逻辑进行介绍

/*
 * load_balance - balance one sched domain by pulling tasks from its busiest
 * CPU onto a destination CPU.
 *
 * Parameters:
 * this_cpu/this_rq: the CPU that initiated this balance pass and its rq.
 * sd: scope of this pass, i.e. the sched groups of this sd should end up
 *     balanced against each other.
 * idle: state of this_cpu when the pass was initiated; it distinguishes a
 *     new idle balance from a tick balance.
 * continue_balancing: balancing starts at the initiating CPU's base domain
 *     and walks upward to the top-level sd; set to 0 here when balancing of
 *     the higher-level sds should be skipped.
 *
 * Return: ld_moved, the total accumulated from detach_tasks() over all
 * rounds (described by the original author as the number of tasks
 * migrated); 0 when leaving through the out_balanced, out_all_pinned or
 * out_one_pinned paths.
 */
static int load_balance(int this_cpu, struct rq *this_rq,
            struct sched_domain *sd, enum cpu_idle_type idle,
            int *continue_balancing)
{
    int ld_moved, cur_ld_moved, active_balance = 0;
    struct sched_domain *sd_parent = sd->parent; /* parent sd (DIE level when sd is MC) */
    struct sched_group *group;
    struct rq *busiest;
    struct rq_flags rf;
    /*
     * Per-CPU scratch mask. Only the pointer is taken here; the contents
     * are filled in by the cpumask_and() below.
     */
    struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);

    struct lb_env env = {
        .sd        = sd,
        .dst_cpu    = this_cpu, /* the dst CPU is normally the initiating CPU */
        .dst_rq        = this_rq,
        .dst_grpmask    = sched_group_span(sd->groups), /* MC: this CPU only; DIE: CPUs of the same cluster */
        .idle        = idle,
        .loop_break    = sched_nr_migrate_break,
        .cpus        = cpus,
        .fbq_type    = all,
        .tasks        = LIST_HEAD_INIT(env.tasks),
    };

    /*
     * Only balance among active CPUs (the original note describes these as
     * CPUs that are neither isolated nor offline).
     *
     * This is the first round, so every CPU of the sd takes part. If a
     * problem shows up later, e.g. affinity prevents migration, the chosen
     * busiest CPU is cleared from the mask and we jump back to redo for
     * another round.
     *
     * MC: the CPUs of one cluster; DIE: all CPUs. So an MC-level sd only
     * balances inside the dst CPU's cluster, while a DIE-level sd balances
     * across the CPUs of all clusters.
     */
    cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask);

    /* Bump the per-idle-type balance counter (reported via /proc/schedstat). */
    schedstat_inc(sd->lb_count[idle]);

redo:
    /* Restrict which CPUs are allowed to initiate balancing. */
    if (!should_we_balance(&env)) {
        /* Not a suitable balancer: balancing the higher-level sds is not needed either. */
        *continue_balancing = 0;
        goto out_balanced;
    }

    /* Find the busiest group in this sd; leave this level if there is none. */
    group = find_busiest_group(&env);
    if (!group) {
        schedstat_inc(sd->lb_nobusyg[idle]);
        goto out_balanced;
    }

    /* Find the busiest CPU within that group; leave this level if there is none. */
    busiest = find_busiest_queue(&env, group);
    if (!busiest) {
        schedstat_inc(sd->lb_nobusyq[idle]);
        goto out_balanced;
    }

    /*
     * The busiest src CPU is now known and the dst CPU is the initiating
     * CPU, so the first round can start. The busiest rq must not be the
     * dst rq.
     */
    BUG_ON(busiest == env.dst_rq);

    /* Account the computed imbalance in the statistics. */
    schedstat_add(sd->lb_imbalance[idle], env.imbalance);

    /* Record the busiest CPU in the balance context. */
    env.src_cpu = busiest->cpu;
    env.src_rq = busiest;

    /* Pulling from the busiest CPU requires that it has a task to spare. */
    ld_moved = 0;
    if (busiest->nr_running > 1) {
        /*
         * Attempt to move tasks. If find_busiest_group has found
         * an imbalance but busiest->nr_running <= 1, the group is
         * still unbalanced. ld_moved simply stays zero, so it is
         * correctly treated as an imbalance.
         */
        /*
         * Assume "all pinned" before pulling; per the original note,
         * can_migrate_task() clears the flag as soon as it finds at
         * least one task that may run on the dst CPU.
         */
        env.flags |= LBF_ALL_PINNED;
        /*
         * loop_max bounds the number of runnable tasks scanned on the src
         * rq: busiest->nr_running clamped to sysctl_sched_nr_migrate, since
         * a single pass should not migrate too many tasks nor keep
         * interrupts off for too long.
         */
        env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);

        /*
         * Unlike redo, this label neither re-checks whether we should
         * balance nor re-selects the busiest CPU; it only continues
         * scanning the busiest rq's task list for tasks to migrate.
         */
more_balance:
        rq_lock_irqsave(busiest, &rf);
        env.src_rq_rf = &rf;
        /* Refresh busiest->clock. */
        update_rq_clock(busiest);

        /*
         * cur_ld_moved - load moved in current iteration
         * ld_moved     - cumulative load moved across iterations
         */
        /*
         * We have decided to move some load/util/tasks from the busiest
         * rq to the dst rq; load and util ultimately translate into tasks.
         * detach_tasks() removes suitable tasks from the busiest rq and
         * queues them on lb_env->tasks. Because of the interrupts-off
         * time it does not necessarily move everything in one call.
         */
        cur_ld_moved = detach_tasks(&env);

        /*
         * We've detached some tasks from busiest_rq. Every
         * task is masked "TASK_ON_RQ_MIGRATING", so we can safely
         * unlock busiest->lock, and we are able to be sure
         * that nobody can manipulate the tasks in parallel.
         * See task_rq_lock() family for the details.
         */

        rq_unlock(busiest, &rf);

        /*
         * Attach the tasks removed by detach_tasks() to the dst rq.
         * detach/attach may run for several rounds: ld_moved is the
         * running total, cur_ld_moved is this round's count.
         */
        if (cur_ld_moved) {
            attach_tasks(&env);
            ld_moved += cur_ld_moved;
        }

        local_irq_restore(rf.flags);

        /*
         * Interrupts were disabled by rq_lock_irqsave() above and are only
         * restored here, so a large migration is split into chunks to
         * bound the interrupts-off time. Per the original note,
         * detach_tasks() sets LBF_NEED_BREAK and returns once its scan
         * count exceeds env->loop_break.
         */
        if (env.flags & LBF_NEED_BREAK) {
            env.flags &= ~LBF_NEED_BREAK;
            goto more_balance;
        }

        /*
         * Revisit (affine) tasks on src_cpu that couldn't be moved to
         * us and move them to an alternate dst_cpu in our sched_group
         * where they can run. The upper limit on how many times we
         * iterate on same src_cpu is dependent on number of CPUs in our
         * sched_group.
         *
         * This changes load balance semantics a bit on who can move
         * load to a given_cpu. In addition to the given_cpu itself
         * (or a ilb_cpu acting on its behalf where given_cpu is
         * nohz-idle), we now have balance_cpu in a position to move
         * load to given_cpu. In rare situations, this may cause
         * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
         * _independently_ and at _same_ time to move some load to
         * given_cpu) causing exceess load to be moved to given_cpu.
         * This however should not happen so much in practice and
         * moreover subsequent load balance cycles should correct the
         * excess load moved.
         */
        /*
         * The src rq's task list has now been scanned up to loop_max
         * times; decide whether another round is needed.
         *
         * Per the original note, LBF_DST_PINNED is set by
         * can_migrate_task() when the dst CPU is not in a task's CPU
         * affinity, and detach_tasks() keeps looping until
         * env.imbalance <= 0 unless some task cannot move to the dst CPU.
         *
         * If the sd is still unbalanced and affinity kept some task from
         * moving to the dst CPU, keep searching the src rq and migrate to
         * the alternate dst CPU: the context's dst CPU is switched, loop
         * is reset, and scanning restarts.
         */
        if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {

            /* Prevent to re-select dst_cpu via env's CPUs */
            /*
             * Remove the old dst CPU from env.cpus so it cannot be picked
             * as dst again; it no longer takes part in balancing the
             * affinity-restricted tasks.
             */
            __cpumask_clear_cpu(env.dst_cpu, env.cpus);
            /*
             * Per the original note, env.new_dst_cpu is assigned in
             * detach_tasks() -> can_migrate_task(), with LBF_DST_PINNED
             * signalling that a usable new_dst_cpu exists. At MC level the
             * group only contains the dst CPU, so it is never assigned
             * there; only DIE level may assign it.
             */
            env.dst_rq     = cpu_rq(env.new_dst_cpu);
            env.dst_cpu     = env.new_dst_cpu;
            env.flags    &= ~LBF_DST_PINNED;
            env.loop     = 0;
            env.loop_break     = sched_nr_migrate_break;

            /*
             * Go back to "more_balance" rather than "redo" since we
             * need to continue with same src_cpu.
             */
            goto more_balance;
        }

        /*
         * We failed to reach balance because of affinity.
         */
        /* A parent sd exists; on the MC/DIE topology this means the current pass is at MC level. */
        if (sd_parent) {
            /* Points into the parent (DIE-level) sd's group. */
            int *group_imbalance = &sd_parent->groups->sgc->imbalance;
            /*
             * If this (MC-level) sd cannot be balanced because of
             * affinity, flag that in the parent sd's group. When the
             * parent sd is balanced the group is then classified as
             * imbalanced, which makes it more likely to be chosen as the
             * busiest group so the imbalance can be resolved there.
             */
            if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)
                *group_imbalance = 1;
        }

        /* All tasks on this runqueue were pinned by CPU affinity */
        /*
         * Every task on the chosen busiest CPU is pinned there by
         * affinity: drop that CPU from the mask so later rounds ignore
         * it. A new src CPU must be found, hence the jump to redo.
         */
        if (unlikely(env.flags & LBF_ALL_PINNED)) {
            __cpumask_clear_cpu(cpu_of(busiest), cpus);
            /*
             * Attempting to continue load balancing at the current
             * sched_domain level only makes sense if there are
             * active CPUs remaining as possible busiest CPUs to
             * pull load from which are not contained within the
             * destination group that is receiving any migrated
             * load.
             */
            /*
             * Per the original note: MC level always evaluates to 0 here
             * and jumps to redo; DIE level only continues when some
             * remaining CPU is outside the dst CPU's cluster.
             * NOTE(review): confirm the MC-level claim.
             */
            if (!cpumask_subset(cpus, env.dst_grpmask)) {
                env.loop = 0;
                env.loop_break = sched_nr_migrate_break;
                goto redo;
            }
            goto out_all_pinned;
        }
    }

    /*
     * The cfs task list of the src rq has been walked (possibly several
     * times), so the runnable tasks have been covered. If nothing moved,
     * the only remaining option is the running task, handled below.
     */
    if (!ld_moved) {
        schedstat_inc(sd->lb_failed[idle]);
        /*
         * Increment the failure counter only on periodic balance.
         * We do not want newidle balance, which can be very
         * frequent, pollute the failure counter causing
         * excessive cache_hot migrations and active balances.
         */
        /*
         * Nothing was migrated despite all of the above, so count a
         * failure; a high count later triggers more aggressive balancing
         * such as migrating cache-hot tasks or starting active balance.
         * New idle balance is excluded because it happens so often that
         * counting its failures would inflate nr_balance_failed.
         */
        if (idle != CPU_NEWLY_IDLE)
            sd->nr_balance_failed++;

        /*
         * Decide whether to start active balance, i.e. whether to move
         * the task currently running on the src CPU to the dst CPU, since
         * no runnable task could be moved.
         */
        if (need_active_balance(&env)) {
            unsigned long flags;

            raw_spin_lock_irqsave(&busiest->lock, flags);

            /*
             * Don't kick the active_load_balance_cpu_stop,
             * if the curr task on busiest CPU can't be moved to this_cpu:
             */
            /* First check whether affinity forbids the src CPU's running task from moving to this_cpu. */
            if (!cpumask_test_cpu(this_cpu, busiest->curr->cpus_ptr)) {
                raw_spin_unlock_irqrestore(&busiest->lock, flags);
                env.flags |= LBF_ALL_PINNED;
                goto out_one_pinned;
            }

            /*
             * ->active_balance synchronizes accesses to
             * ->active_balance_work.  Once set, it's cleared
             * only after active load balance is finished.
             */
            /* Mark active_balance on the busiest rq. */
            if (!busiest->active_balance) {
                busiest->active_balance = 1;
                busiest->push_cpu = this_cpu;
                active_balance = 1;
            }
            raw_spin_unlock_irqrestore(&busiest->lock, flags);

            if (active_balance) {
                /*
                 * Queue a work item for the busiest CPU's stop-class
                 * "migration/X" thread and wake it. Call chain per the
                 * original note:
                 * per-cpu cpu_stopper.thread --> smpboot_thread_fn --> cpu_stopper_thread --> fn(arg) --> active_load_balance_cpu_stop(busiest rq)
                 */
                stop_one_cpu_nowait(cpu_of(busiest), active_load_balance_cpu_stop, busiest, &busiest->active_balance_work);
            }

            /* We've kicked active balancing, force task migration. */
            sd->nr_balance_failed = sd->cache_nice_tries+1; /* TODO (original author): clarify what this achieves */
        }
    } else {
        /* At least one task was migrated: reset the failure counter. */
        sd->nr_balance_failed = 0;
    }

    if (likely(!active_balance) || voluntary_active_balance(&env)) {
        /* We were unbalanced, so reset the balancing interval */
        sd->balance_interval = sd->min_interval;
    } else {
        /*
         * If we've begun active balancing, start to back off. This
         * case may not be covered by the all_pinned logic if there
         * is only 1 task on the busy runqueue (because we don't call
         * detach_tasks).
         */
        if (sd->balance_interval < sd->max_interval)
            sd->balance_interval *= 2; /* TODO (original author): document exactly how balance_interval is used */
    }

    goto out;

/* Reached when balancing is judged unnecessary or no busiest group/rq was found. */
out_balanced:
    /*
     * We reach balance although we may have faced some affinity
     * constraints. Clear the imbalance flag only if other tasks got
     * a chance to move and fix the imbalance.
     *
     * Per the original note, sd_parent only exists when this pass is at
     * MC level. Note that the direct jumps here from before the task scan
     * happen before LBF_ALL_PINNED has been set.
     */
    if (sd_parent && !(env.flags & LBF_ALL_PINNED)) {
        int *group_imbalance = &sd_parent->groups->sgc->imbalance;
        /* For an MC-level pass that is not all-pinned, the parent's flag is cleared again. */
        if (*group_imbalance)
            *group_imbalance = 0;
    }

/* Reached when affinity prevents every task on the busiest CPU from moving to the dst CPU. */
out_all_pinned:
    /*
     * We reach balance because all tasks are pinned at this level so
     * we can't migrate them. Let the imbalance flag set so parent level
     * can try to migrate them.
     */
    schedstat_inc(sd->lb_balanced[idle]);

    sd->nr_balance_failed = 0;

/* Reached when active balance finds that the src CPU's running task cannot move to the dst CPU either. */
out_one_pinned:
    ld_moved = 0;

    /*
     * newidle_balance() disregards balance intervals, so we could
     * repeatedly reach this code, which would lead to balance_interval
     * skyrocketting in a short amount of time. Skip the balance_interval
     * increase logic to avoid that.
     */
    if (env.idle == CPU_NEWLY_IDLE)
        goto out;

    /* tune up the balancing interval */
    if ((env.flags & LBF_ALL_PINNED && sd->balance_interval < MAX_PINNED_INTERVAL) || sd->balance_interval < sd->max_interval)
        sd->balance_interval *= 2;
out:
    return ld_moved;
}

四、判断是否应该执行均衡操作——should_we_balance()

/*
 * should_we_balance - decide whether env->dst_cpu may run this balance pass.
 *
 * Returns 1 when balancing should proceed and 0 when it should not.
 */
static int should_we_balance(struct lb_env *env)
{
    struct sched_group *local_sg = env->sd->groups;
    int candidate;

    /*
     * Ensure the balancing environment is consistent; can happen
     * when the softirq triggers 'during' hotplug.
     * env->cpus was set up by the caller; the dst CPU must still be in it.
     */
    if (!cpumask_test_cpu(env->dst_cpu, env->cpus))
        return 0;

    /* A newly idle CPU is always allowed to do the newly idle balance. */
    if (env->idle == CPU_NEWLY_IDLE)
        return 1;

    /*
     * Look for the first idle CPU among the balance-mask CPUs of the first
     * group that also take part in this pass. Only that CPU may balance:
     * if it is not the dst CPU, some other idle CPU is expected to do it.
     */
    for_each_cpu_and(candidate, group_balance_mask(local_sg), env->cpus) {
        if (idle_cpu(candidate))
            return candidate == env->dst_cpu;
    }

    /*
     * No idle CPU was found: only the group's designated balance CPU may
     * balance. This keeps non-base domains, whose groups span several
     * CPUs, from having every CPU run the same balance pass.
     */
    return group_balance_cpu(local_sg) == env->dst_cpu;
}

五、查找最繁忙的sg——find_busiest_group()

作用是如果存在 imbalance,就返回此sd中最忙的sg。同时也会计算为了达到均衡需要移动多少runnable load。

/******* find_busiest_group() helpers end here *********************/ 
 
/* 
 * Decision matrix according to the local and busiest group type: 
 * 
 * busiest \ local has_spare fully_busy misfit asym imbalanced overloaded
 * has_spare        nr_idle   balanced   N/A    N/A  balanced   balanced 
 * fully_busy       nr_idle   nr_idle    N/A    N/A  balanced   balanced 
 * misfit_task      force     N/A        N/A    N/A  force      force 
 * asym_packing     force     force      N/A    N/A  force      force 
 * imbalanced       force     force      N/A    N/A  force      force 
 * overloaded       force     force      N/A    N/A  force      avg_load 
 * 
 * N/A :      Not Applicable because already filtered while updating 
 *            statistics. 
 * balanced : The system is balanced for these 2 groups. 
 * force :    Calculate the imbalance as load migration is probably needed. 
 * avg_load : Only if imbalance is significant enough. 
 * nr_idle :  dst_cpu is not busy and the number of idle CPUs is quite 
 *            different in groups. 
 */ 
 
/**
 * find_busiest_group - Returns the busiest group within the sched_domain
 * if there is an imbalance.
 *
 * Also calculates the amount of runnable load which should be moved
 * to restore balance.
 *
 * @env: The load balancing environment.
 *
 * Return:    - The busiest group if imbalance exists.
 *            - NULL when the domain is considered balanced; env->imbalance
 *              is then set to 0.
 */
static struct sched_group *find_busiest_group(struct lb_env *env)
{
    struct sg_lb_stats *local, *busiest;
    struct sd_lb_stats sds;

    init_sd_lb_stats(&sds);

    /*
     * Compute the various statistics relevant for load balancing at
     * this level.
     */
    /*
     * Load changes all the time, so the sd statistics are refreshed before
     * searching for the busiest group. This call updates load and capacity
     * of every group in the sd and yields the statistics of the local
     * group and of the busiest non-local group for the decisions below.
     */
    update_sd_lb_stats(env, &sds);

    /*
     * EAS is in effect until the system becomes overutilized. While EAS is
     * in effect the load may be deliberately uneven (for power reasons), so
     * no load balancing is done and task placement is relied upon instead.
     */
    if (sched_energy_enabled()) {
        struct root_domain *rd = env->dst_rq->rd;
        int out_balance = 1;

        /* Vendor hook; it receives &out_balance and may change it. */
        trace_android_rvh_find_busiest_group(sds.busiest, env->dst_rq, &out_balance);
        /*
         * Skip balancing (goto out_balanced) when perf domains exist and
         * the root domain is not overutilized.
         *
         * out_balance: lets the hook above request balancing even though
         * EAS is enabled and the system is not overutilized.
         */
        if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized) && out_balance)
            goto out_balanced;
    }

    /* The busiest group found still has to be compared with the local group. */
    local = &sds.local_stat;
    busiest = &sds.busiest_stat;

    /* There is no busy sibling group to pull tasks from */
    /*
     * No busiest group was found: no non-local group in this sd has tasks
     * that could be pulled to the local group, so nothing needs balancing.
     */
    if (!sds.busiest)
        goto out_balanced;

    /* Misfit tasks should be dealt with regardless of the avg load */
    /* The busiest group has a misfit task: balance unconditionally to pull it into the local group. */
    if (busiest->group_type == group_misfit_task)
        goto force_balance;

    /* ASYM feature bypasses nice load balance check */
    if (busiest->group_type == group_asym_packing)
        goto force_balance;

    /*
     * If the busiest group is imbalanced the below checks don't
     * work because they assume all things are equal, which typically
     * isn't true due to cpus_ptr constraints and the like.
     */
    /*
     * The busiest group is imbalanced because of CPU affinity; the flag is
     * set by a lower-level (MC) pass that could not reach balance.
     */
    if (busiest->group_type == group_imbalanced)
        goto force_balance;

    /*
     * If the local group is busier than the selected busiest group don't try and pull any tasks.
     */
    /* Balancing only pulls from other groups into the local group, so a busier local group means nothing to do. */
    if (local->group_type > busiest->group_type)
        goto out_balanced;

    /*
     * When groups are overloaded, use the avg_load to ensure fairness between tasks.
     */
    /* With an overloaded local group the decision is made by comparing avg_load values. */
    if (local->group_type == group_overloaded) {
        /*
         * If the local group is more loaded than the selected
         * busiest group don't try to pull any tasks.
         */
        if (local->avg_load >= busiest->avg_load)
            goto out_balanced;

        /* XXX broken for overlapping NUMA groups */
        sds.avg_load = (sds.total_load * SCHED_CAPACITY_SCALE) / sds.total_capacity;

        /*
         * Don't pull any tasks if this group is already above the
         * domain average load.
         */
        if (local->avg_load >= sds.avg_load)
            goto out_balanced;

        /*
         * If the busiest group is more loaded, use imbalance_pct to be
         * conservative.
         */
        /*
         * The busiest group is more loaded than the local one, but if the
         * margin is small balancing is not worth its overhead. The
         * threshold comes from the sd's imbalance_pct.
         *
         * The original note quotes a default of 1.17, i.e. no balancing
         * when busiest->avg_load <= 1.17 * local->avg_load - TODO confirm
         * the imbalance_pct value for your topology level.
         */
        if (100 * busiest->avg_load <= env->sd->imbalance_pct * local->avg_load)
            goto out_balanced;
    }

    /* Try to move all excess tasks to child's sibling domain*/
    if (sds.prefer_sibling && local->group_type == group_has_spare && busiest->sum_nr_running > local->sum_nr_running + 1)
        goto force_balance;

    /*
     * When the busiest group is not overloaded, average load is not
     * consulted. The other groups in the sd have enough capacity for their
     * current tasks, so whether to balance depends mainly on idle CPUs.
     */
    if (busiest->group_type != group_overloaded) {
        /*
         * The busiest group can handle its own tasks, so there is no point
         * in balancing when this CPU is busy: the goal here is to put more
         * idle CPUs to work. A non-idle CPU therefore treats the sd as
         * balanced.
         */
        if (env->idle == CPU_NOT_IDLE)
            /*
             * If the busiest group is not overloaded (and as a
             * result the local one too) but this CPU is already
             * busy, let another idle CPU try to pull task.
             */
            goto out_balanced;

        /*
         * The local group has at most one more idle CPU than the busiest
         * group (which has more than one CPU): no need to balance.
         */
        if (busiest->group_weight > 1 && local->idle_cpus <= (busiest->idle_cpus + 1))
            /*
             * If the busiest group is not overloaded
             * and there is no imbalance between this and busiest
             * group wrt idle CPUs, it is balanced. The imbalance
             * becomes significant if the diff is greater than 1
             * otherwise we might end up to just move the imbalance
             * on another group. Of course this applies only if
             * there is more than 1 CPU per group.
             */
            goto out_balanced;

        /* The busiest group has exactly one cfs task: no need to balance. */
        if (busiest->sum_h_nr_running == 1)
            /* busiest doesn't have any tasks waiting to run */
            goto out_balanced;
    }

force_balance:
    /* Looks like there is an imbalance. Compute it */
    /* calculate_imbalance() computes how unbalanced the sd is and stores it in env. */
    calculate_imbalance(env, &sds);

    return env->imbalance ? sds.busiest : NULL;

out_balanced:
    env->imbalance = 0;
    return NULL;
}

在 EAS 使能的情况下,若 rd 尚未进入 overutilized 状态,默认是不进行负载均衡的;但这里有一个 hook(trace_android_rvh_find_busiest_group),vendor 可以通过它修改 out_balance 来更改此逻辑。

1. init_sd_lb_stats

static inline void init_sd_lb_stats(struct sd_lb_stats *sds) 
{ 
    /* 
     * Skimp on the clearing to avoid duplicate work. We can avoid clearing 
     * local_stat because update_sg_lb_stats() does a full clear/assignment. 
     * We must however set busiest_stat::group_type and 
     * busiest_stat::idle_cpus to the worst busiest group because 
     * update_sd_pick_busiest() reads these before assignment. 
     */ 
    *sds = (struct sd_lb_stats){ 
        .busiest = NULL, 
        .local = NULL, 
        .total_load = 0UL, 
        .total_capacity = 0UL, 
        .busiest_stat = { 
            .idle_cpus = UINT_MAX, 
            .group_type = group_has_spare, 
        }, 
    }; 
}

2. update_sd_lb_stats()

更新 sg 的算力。在base domain(MC domain)上,我们会更新发起均衡所在CPU的算力。注意:这里说的CPU算力指的是该CPU可以用于cfs任务的算力,即需要去掉由于thermal pressure而损失的和去掉RT/DL/IRQ消耗的算力。具体请参考 update_cpu_capacity 函数。在其他non-base domain(DIE domain)上,我们需要对本地 sg(发起均衡的CPU所在的group) 进行算力更新。这个比较简单,就是把child domain(即MC domain)的所有 sg 的算力加起来。更新后的算力保存在 sg 中的 sgc 成员中。

此函数前半段主要是遍历该 sd 所有的group,对其负载统计进行更新。更新完负载之后会选定两个 sg:其一是local group,另外一个是最繁忙的non-local group,然后进行进一步PK。

/**
 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
 * @env: The load balancing environment.
 * @sds: variable to hold the statistics for this sched_domain.
 *
 * Walks every sched group of env->sd, refreshes its statistics, records the
 * local group and the busiest non-local group in @sds, and finally
 * propagates overload/overutilized state to the root domain.
 */
static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
{
    struct sched_domain *child = env->sd->child;
    struct sched_group *sg = env->sd->groups;
    struct sg_lb_stats *local = &sds->local_stat;
    struct sg_lb_stats tmp_sgs;
    int sg_status = 0;

#ifdef CONFIG_NO_HZ_COMMON
    if (env->idle == CPU_NEWLY_IDLE && READ_ONCE(nohz.has_blocked)) /* TODO (original author): find out where nohz.has_blocked is updated */
        env->flags |= LBF_NOHZ_STATS;
#endif

    do {
        struct sg_lb_stats *sgs = &tmp_sgs;
        int local_group;

        /* Is the dst CPU part of this group? (MC: one CPU per group; DIE: one cluster per group.) */
        local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(sg));
        /*
         * Capacity does not need to be refreshed too often, so:
         * 1. only the local group has its capacity updated;
         * 2. for new idle balance the update is rate-limited through
         *    sgc->next_update, whose interval derives from
         *    balance_interval:
         *        jiffies + msecs_to_jiffies(sd->balance_interval);
         * 3. all other idle types update the capacity every time.
         */
        if (local_group) {
            sds->local = sg;
            sgs = local;

            if (env->idle != CPU_NEWLY_IDLE || time_after_eq(jiffies, sg->sgc->next_update))
                /* Refresh the capacity members of sd->sg->sgc; per the original note, a DIE-level call also refreshes the MC-level ones. */
                update_group_capacity(env->sd, env->dst_cpu);
        }

        /* Capacity was handled above; now refresh this group's load statistics. */
        update_sg_lb_stats(env, sg, sgs, &sg_status);

        /*
         * Two groups are needed from this walk: the local group and the
         * busiest non-local group. The local group does not take part in
         * the "busiest" comparison below.
         */
        if (local_group)
            goto next_group;

        /* Compare this non-local group with the busiest one found so far and keep the busier. */
        if (update_sd_pick_busiest(env, sds, sg, sgs)) {
            sds->busiest = sg;
            sds->busiest_stat = *sgs;
        }

next_group:
        /* Now, start updating sd_lb_stats */
        /* Accumulate each group's load and capacity into sds. */
        sds->total_load += sgs->group_load;
        sds->total_capacity += sgs->group_capacity;

        /* MC level walks the CPUs of this cluster; DIE level walks the clusters. */
        sg = sg->next;
    } while (sg != env->sd->groups); /* the walk starts at env->sd->groups; per the original note that is the initiating CPU's group */


    /* Tag domain that child domain prefers tasks go to siblings first */
    /* True only when a child domain exists and has SD_PREFER_SIBLING set. */
    sds->prefer_sibling = child && child->flags & SD_PREFER_SIBLING;

#ifdef CONFIG_NO_HZ_COMMON
    if ((env->flags & LBF_NOHZ_AGAIN) && cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) {
        WRITE_ONCE(nohz.next_blocked, jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD));
    }
#endif

    /* Per the original note, SD_NUMA is not set on the platform discussed, so this is skipped there - TODO confirm. */
    if (env->sd->flags & SD_NUMA)
        env->fbq_type = fbq_classify_group(&sds->busiest_stat);

    /*
     * Update the root domain's overload and overutilized state. For the
     * top-level sd the status collected from all groups is written to the
     * root domain.
     */
    if (!env->sd->parent) {
        /* Top-level (DIE) sd; per the original note the rd is unique system-wide. */
        struct root_domain *rd = env->dst_rq->rd;

        /* update overload indicator if we are at root domain */
        WRITE_ONCE(rd->overload, sg_status & SG_OVERLOAD);

        /* Update over-utilization (tipping point, U >= 0) indicator */
        WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED);
        trace_sched_overutilized_tp(rd, sg_status & SG_OVERUTILIZED);

    } else if (sg_status & SG_OVERUTILIZED) {
        /* Non-top-level (MC) sd: only the overutilized state is written to the rd. */
        struct root_domain *rd = env->dst_rq->rd;

        WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED);
        trace_sched_overutilized_tp(rd, SG_OVERUTILIZED);
    }
}

2.1. update_group_capacity()

更新一个 sg 的算力:

/*
 * update_group_capacity - refresh the capacity figures cached for sd->groups.
 * @sd:  sched_domain whose first group (sd->groups) is refreshed
 * @cpu: CPU forwarded to update_cpu_capacity() when @sd has no child domain
 *
 * Pushes sgc->next_update forward by sd->balance_interval (converted to
 * jiffies and clamped to [1, max_load_balance_interval]), then stores the
 * summed, minimum and maximum capacity of the group in sd->groups->sgc.
 */
void update_group_capacity(struct sched_domain *sd, int cpu)
{
    struct sched_domain *child = sd->child;
    struct sched_group *sdg = sd->groups;
    unsigned long total = 0, lowest = ULONG_MAX, highest = 0;
    unsigned long interval;

    interval = msecs_to_jiffies(sd->balance_interval);
    interval = clamp(interval, 1UL, max_load_balance_interval);
    sdg->sgc->next_update = jiffies + interval;

    /* No child domain: the per-CPU update is all that is needed. */
    if (!child) {
        update_cpu_capacity(sd, cpu);
        return;
    }

    if (!(child->flags & SD_OVERLAP)) {
        /*
         * !SD_OVERLAP domains can assume that child groups span the
         * current group, so fold the child's groups together by walking
         * its circular group list exactly once.
         */
        struct sched_group *sg = child->groups;

        for (;;) {
            struct sched_group_capacity *sgc = sg->sgc;

            total += sgc->capacity;
            lowest = min(sgc->min_capacity, lowest);
            highest = max(sgc->max_capacity, highest);

            sg = sg->next;
            if (sg == child->groups)
                break;
        }
    } else {
        /*
         * SD_OVERLAP domains cannot assume that child groups span the
         * current group, so visit every CPU of the group directly.
         */
        for_each_cpu(cpu, sched_group_span(sdg)) {
            unsigned long cpu_cap = capacity_of(cpu);

            total += cpu_cap;
            lowest = min(cpu_cap, lowest);
            highest = max(cpu_cap, highest);
        }
    }

    sdg->sgc->capacity = total;       /* sum over the group */
    sdg->sgc->min_capacity = lowest;  /* smallest single contribution */
    sdg->sgc->max_capacity = highest; /* largest single contribution */
}

2.1.1 更新 sd->sg->sgc

static void update_cpu_capacity(struct sched_domain *sd, int cpu) 
{ 
    //计算除去rt/dl/irq占用的算力和thermal pressure后还剩余的算力 
    unsigned long capacity = scale_rt_capacity(cpu); 
    struct sched_group *sdg = sd->groups; 
 
    //update_cpu_capacity: return per_cpu(cpu_scale, cpu) 即cat cpu_capacity 
    cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu); 
 
    if (!capacity) 
        capacity = 1; 
 
    trace_android_rvh_update_cpu_capacity(cpu, &capacity); 
    cpu_rq(cpu)->cpu_capacity = capacity; 
    trace_sched_cpu_capacity_tp(cpu_rq(cpu)); 
 
    //原生是三者赋一样的值,MC层级是一样的值,DIE层级的外层函数又会覆盖赋值 
    sdg->sgc->capacity = capacity; 
    sdg->sgc->min_capacity = capacity; 
    sdg->sgc->max_capacity = capacity; 
}

2.2. update_sg_lb_stats()

更新 sg 的负载:

/**
 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
 * @env: The load balancing environment.
 * @group: sched_group whose statistics are to be updated.
 * @sgs: variable to hold the statistics for this group.
 * @sg_status: Holds flag indicating the status of the sched_group
 *
 * Clears @sgs, accumulates per-CPU figures over every CPU that is in both
 * @group's span and env->cpus, then fills in the group capacity, weight,
 * type and (for overloaded groups only) avg_load. SG_OVERLOAD and
 * SG_OVERUTILIZED bits are OR-ed into *@sg_status; existing bits are kept.
 */
static inline void update_sg_lb_stats(struct lb_env *env, struct sched_group *group, struct sg_lb_stats *sgs, int *sg_status)
{
    int i, nr_running, local_group;

    memset(sgs, 0, sizeof(*sgs));

    /*
     * Non-zero when dst_cpu belongs to this group.
     * Author's note: at MC level a group is a single CPU, at DIE level it
     * is a whole cluster.
     */
    local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(group));

    for_each_cpu_and(i, sched_group_span(group), env->cpus) {
        struct rq *rq = cpu_rq(i);

        /* Request another nohz pass when update_nohz_stats() reports so. */
        if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false))
            env->flags |= LBF_NOHZ_AGAIN;

        /*
         * A group's load is tracked three ways: load, runnable and util.
         * Each is the sum of the per-CPU values. The number of CFS tasks
         * is accumulated as well.
         * Author's notes on the helpers (defined elsewhere, verify):
         *   cpu_load()     -> rq->cfs.avg.load_avg
         *   cpu_util()     -> max(util_avg, util_est.enqueued)
         *   cpu_runnable() -> rq->cfs.avg.runnable_avg
         */
        sgs->group_load += cpu_load(rq);
        sgs->group_util += cpu_util(i);
        sgs->group_runnable += cpu_runnable(rq);
        sgs->sum_h_nr_running += rq->cfs.h_nr_running;

        /*
         * Author's note: cfs_rq->nr_running counts the entities queued
         * directly on a cfs_rq, h_nr_running also counts those on child
         * group cfs_rqs. rq->nr_running used here additionally includes
         * rt and dl tasks.
         */
        nr_running = rq->nr_running;
        sgs->sum_nr_running += nr_running;

        /* One CPU with two or more tasks marks the whole group overloaded. */
        if (nr_running > 1)
            *sg_status |= SG_OVERLOAD;

        /*
         * One overutilized CPU marks the whole group overutilized.
         * Author's note: by default "overutilized" means util above 80%
         * of the CPU's current capacity -- verify cpu_overutilized().
         */
        if (cpu_overutilized(i))
            *sg_status |= SG_OVERUTILIZED;

#ifdef CONFIG_NUMA_BALANCING
        sgs->nr_numa_running += rq->nr_numa_running;
        sgs->nr_preferred_running += rq->nr_preferred_running;
#endif
        /*
         * No need to call idle_cpu() if nr_running is not 0
         */
        /* Count the idle CPUs of this group. */
        if (!nr_running && idle_cpu(i)) {
            sgs->idle_cpus++;
            /* Idle cpu can't have misfit task */
            continue;
        }

        /*
         * Misfit detection is skipped for idle CPUs (above) and for the
         * local group. Author's rationale for the latter: CPUs of the
         * same cluster have equal capacity, so pulling a misfit task
         * inside the local group cannot help.
         */
        if (local_group)
            continue;

        /* Check for a misfit task on the cpu */
        /*
         * In asymmetric-capacity domains, record the largest
         * misfit_task_load seen in the group and flag the group as
         * overloaded, even if that CPU runs a single task.
         * Author's note: only the DIE level carries SD_ASYM_CPUCAPACITY on
         * their platform; rq->misfit_task_load describes the task currently
         * running on the rq.
         */
        if (env->sd->flags & SD_ASYM_CPUCAPACITY && sgs->group_misfit_task_load < rq->misfit_task_load) {
            sgs->group_misfit_task_load = rq->misfit_task_load;
            *sg_status |= SG_OVERLOAD;
        }
    }

    /* Check if dst CPU is idle and preferred to this group */
    /*
     * Author's note: neither MC nor DIE sets SD_ASYM_PACKING on their
     * platform, so this branch is not taken there.
     */
    if (env->sd->flags & SD_ASYM_PACKING && env->idle != CPU_NOT_IDLE && sgs->sum_h_nr_running &&
            sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu)) {
        sgs->group_asym_packing = 1;
    }

    /*
     * Copy the group's total capacity and CPU count from the group itself.
     * Author's note: "capacity" here is the capacity usable by CFS tasks.
     */
    sgs->group_capacity = group->sgc->capacity;
    sgs->group_weight = group->group_weight;

    /*
     * Classify how busy the group is.
     * Author's note: sd_init sets imbalance_pct to 117 for both MC and DIE
     * on their kernel -- verify.
     */
    sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs);

    /* Computing avg_load makes sense only when group is overloaded */
    /*
     * avg_load is the group load scaled by SCHED_CAPACITY_SCALE and divided
     * by the group capacity, so overloaded groups of different capacity
     * can be compared against each other.
     */
    if (sgs->group_type == group_overloaded)
        sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) / sgs->group_capacity;
}

2.3. update_sd_pick_busiest()

当前遍历的 sg 和之前选出的 busiest sg 进行PK,谁更忙谁被选中为 busiest sg,设置到 sds->busiest 中。

/**
 * update_sd_pick_busiest - return 1 on busiest group
 * @env: The load balancing environment.
 * @sds: sched_domain statistics
 * @sg: sched_group candidate to be checked for being the busiest
 * @sgs: sched_group statistics
 *
 * Determine if @sg is a busier group than the previously selected busiest group.
 *
 * Return: %true if @sg is a busier group than the previously selected busiest group. %false otherwise.
 */
static bool update_sd_pick_busiest(struct lb_env *env, struct sd_lb_stats *sds,
    struct sched_group *sg, struct sg_lb_stats *sgs)
{
    struct sg_lb_stats *busiest = &sds->busiest_stat;

    /* Make sure that there is at least one task to pull */
    if (!sgs->sum_h_nr_running)
        return false;

    /*
     * Don't try to pull misfit tasks we can't help.
     * We can use max_capacity here as reduction in capacity on some
     * CPUs in the group should either be possible to resolve
     * internally or be covered by avg_load imbalance (eventually).
     */
    if (sgs->group_type == group_misfit_task &&
            (!group_smaller_max_cpu_capacity(sg, sds->local) || sds->local_stat.group_type != group_has_spare))
        return false;

    /* The candidate is in a busier class than the current busiest. */
    if (sgs->group_type > busiest->group_type)
        return true;

    /* The candidate is in a less busy class: keep the current busiest. */
    if (sgs->group_type < busiest->group_type)
        return false;

    /*
     * The candidate and the current busiest group are the same type of
     * group. Let check which one is the busiest according to the type.
     */
    /* Same class from here on; the tie-break rule depends on the class. */
    switch (sgs->group_type) {
    case group_overloaded:
        /* Select the overloaded group with highest avg_load. */
        /*
         * Busiest class: the group with the larger avg_load (load
         * relative to capacity) wins; on a tie the earlier group stays.
         */
        if (sgs->avg_load <= busiest->avg_load)
            return false;
        break;

    case group_imbalanced:
        /*
         * Select the 1st imbalanced group as we don't have any way to
         * choose one more than another.
         */
        return false;

    case group_asym_packing:
        /*
         * Prefer to move from lowest priority CPU's work
         * Author's note: sched_asym_prefer(a, b) is described as true
         * when CPU a has the smaller id than CPU b -- defined elsewhere,
         * verify.
         */
        if (sched_asym_prefer(sg->asym_prefer_cpu, sds->busiest->asym_prefer_cpu))
            return false;
        break;

    case group_misfit_task:
        /*
         * If we have more than one misfit sg go with the biggest misfit.
         * The comparison is on group_misfit_task_load; an equal value
         * lets the later candidate win.
         */
        if (sgs->group_misfit_task_load < busiest->group_misfit_task_load)
            return false;
        break;

    case group_fully_busy:
        /*
         * Select the fully busy group with highest avg_load. In theory
         * there is no need to pull tasks from such a group because its
         * tasks have all the compute capacity they need, but overall
         * throughput can still improve by reducing contention on shared
         * hardware resources.
         *
         * XXX for now avg_load is not computed and always 0 so we select the 1st one.
         */
        if (sgs->avg_load <= busiest->avg_load)
            return false;
        break;

    case group_has_spare:
        /*
         * Select not overloaded group with lowest number of idle cpus
         * and highest number of running tasks. We could also compare
         * the spare capacity which is more stable but it can end up
         * that the group has less spare capacity but finally more idle
         * CPUs which means less opportunity to pull tasks.
         *
         * I.e. fewer idle CPUs means busier; with the same number of
         * idle CPUs, more running tasks means busier.
         */
        if (sgs->idle_cpus > busiest->idle_cpus)
            return false;
        else if ((sgs->idle_cpus == busiest->idle_cpus) && (sgs->sum_nr_running <= busiest->sum_nr_running))
            return false;

        break;
    }

    /*
     * Candidate sg has no more than one task per CPU and has higher
     * per-CPU capacity. Migrating tasks to less capable CPUs may harm
     * throughput. Maximize throughput, power/energy consequences are not
     * considered.
     */
    /* Author's note: only relevant at the DIE level on their platform. */
    if ((env->sd->flags & SD_ASYM_CPUCAPACITY) && (sgs->group_type <= group_fully_busy) &&
            (group_smaller_min_cpu_capacity(sds->local, sg)))
        return false;

    return true;
}
 
/*
 * fits_capacity - true when @cap, inflated by 1280/1024 (i.e. 25%), is still
 * strictly below @max; equivalently, @cap is under 80% of @max.
 */
#define fits_capacity(cap, max)    ((max) * 1024 > (cap) * 1280)
 
/* 
 * group_smaller_min_cpu_capacity: Returns true if sched_group sg has smaller 
 * per-CPU capacity than sched_group ref. 
 */ 
static inline bool group_smaller_min_cpu_capacity(struct sched_group *sg, struct sched_group *ref) 
{ 
    return fits_capacity(sg->sgc->min_capacity, ref->sgc->min_capacity); 
}

按从闲到忙的次序定义的 enum group_type 枚举:

/*
 * 'group_type' describes the group of CPUs at the moment of load balancing.
 *
 * The enum is ordered by pulling priority, with the group with lowest priority
 * first so the group_type can simply be compared when selecting the busiest
 * group. See update_sd_pick_busiest().
 *
 * Do not reorder the enumerators: update_sd_pick_busiest() compares
 * group_type values with '<', '>' and '<=', so a larger value must always
 * mean a busier group.
 */
enum group_type {
    /* The group has spare capacity that can be used to run more tasks.  */
    group_has_spare = 0,
    /*
     * The group is fully used and the tasks don't compete for more CPU
     * cycles. Nevertheless, some tasks might wait before running.
     */
    group_fully_busy,
    /*
     * SD_ASYM_CPUCAPACITY only: One task doesn't fit with CPU's capacity
     * and must be migrated to a more powerful CPU.
     */
    group_misfit_task,
    /*
     * SD_ASYM_PACKING only: One local CPU with higher capacity is available,
     * and the task should be migrated to it instead of running on the
     * current CPU.
     */
    group_asym_packing,
    /*
     * The tasks' affinity constraints previously prevented the scheduler
     * from balancing the load across the system.
     */
    group_imbalanced,
    /*
     * The CPU is overloaded and can't provide expected CPU cycles to all
     * tasks.
     */
    group_overloaded
};

3. calculate_imbalance()

一旦通过local group和busiest group的信息确定 sd 处于不均衡状态,就可以调用 calculate_imbalance 函数来计算通过什么方式(migrate task 还是migrate load/util)来恢复 sd 的负载均衡状态,也就是设定均衡上下文的 env->migration_type 和 env->imbalance 成员。具体迁移的负载量是综合考虑local group、busiest group 和 sd 的平均负载情况,确保迁移负载使 local group、busiest group向 sd 的平均负载靠拢。

/**
 * calculate_imbalance - Calculate the amount of imbalance present within the
 *             groups of a given sched_domain during load balance.
 * @env: load balance environment
 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
 *
 * Chooses how to rebalance (env->migration_type) and how much to move
 * (env->imbalance) from sds->local_stat and sds->busiest_stat. On the
 * "local is at least as loaded as busiest" path only env->imbalance is
 * written (set to 0); env->migration_type is left untouched there.
 */
static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
{
    struct sg_lb_stats *local, *busiest;

    local = &sds->local_stat;
    busiest = &sds->busiest_stat;

    /*
     * A misfit task in the busiest group takes priority: migrate misfit
     * tasks, one at a time.
     */
    if (busiest->group_type == group_misfit_task) {
        /* Set imbalance to allow misfit tasks to be balanced. */
        env->migration_type = migrate_misfit;
        env->imbalance = 1;
        return;
    }

    if (busiest->group_type == group_asym_packing) {
        /*
         * In case of asym capacity, we will try to migrate all load to
         * the preferred CPU.
         */
        env->migration_type = migrate_task;
        env->imbalance = busiest->sum_h_nr_running;
        return;
    }

    /*
     * If the busiest group is imbalanced because of task affinity, balance
     * by migrating tasks, one task at a time.
     */
    if (busiest->group_type == group_imbalanced) {
        /*
         * In the group_imb case we cannot rely on group-wide averages
         * to ensure CPU-load equilibrium, try to move any task to fix
         * the imbalance. The next load balance will take care of
         * balancing back the system.
         */
        env->migration_type = migrate_task;
        env->imbalance = 1;
        return;
    }

    /*
     * Try to use spare capacity of local group without overloading it or emptying busiest.
     */
    /*
     * The code above handles special states of the busiest group. What
     * follows depends on the state of the local group, starting with the
     * case where local has spare capacity (handled in two parts). The idea
     * is to put that spare capacity to use as long as the migration
     * neither overloads local nor leaves busiest with nothing to do.
     */
    if (local->group_type == group_has_spare) {
        /*
         * Author's note: a domain flagged SD_SHARE_PKG_RESOURCES (MC)
         * prefers idle CPUs at task placement, so load balancing follows
         * suit and balances on nr_running rather than spare capacity.
         * Without the flag (DIE), migrate_util is considered instead.
         */
        if ((busiest->group_type > group_fully_busy) && !(env->sd->flags & SD_SHARE_PKG_RESOURCES)) { /* author's note: DIE level */
            /*
             * If busiest is overloaded, try to fill spare
             * capacity. This might end up creating spare capacity
             * in busiest or busiest still being overloaded but
             * there is no simple way to directly compute the
             * amount of load to migrate in order to balance the system.
             */
            /*
             * Balance on utilization: the amount to move is local's
             * currently unused capacity (clamped so it cannot go
             * negative when util exceeds capacity).
             */
            env->migration_type = migrate_util;
            env->imbalance = max(local->group_capacity, local->group_util) - local->group_util;

            /*
             * In some cases, the group's utilization is max or even
             * higher than capacity because of migrations but the
             * local CPU is (newly) idle. There is at least one
             * waiting task in this overloaded busiest group. Let's
             * try to pull it.
             */
            /*
             * I.e. when the clamped value above is 0 but the balancing
             * CPU is idle, still pull one task from busiest.
             */
            if (env->idle != CPU_NOT_IDLE && env->imbalance == 0) {
                env->migration_type = migrate_task;
                env->imbalance = 1;
            }

            return;
        }

        /*
         * Second part of the spare-capacity case: busiest is not beyond
         * fully busy, or the domain shares package resources (author's
         * note: MC level). The goal is now to even out tasks / idle CPUs.
         * For single-CPU groups (or when siblings are preferred) tasks
         * should be spread over the groups, so move half of the
         * difference in task count between busiest and local.
         */
        if (busiest->group_weight == 1 || sds->prefer_sibling) {
            unsigned int nr_diff = busiest->sum_nr_running;
            /*
             * When prefer sibling, evenly spread running tasks on groups.
             * nr_diff = busiest->sum_nr_running - local->sum_nr_running;
             * NOTE(review): lsub_positive() is defined elsewhere and is
             * presumed to clamp the result at 0 rather than take an
             * absolute value -- confirm.
             */
            env->migration_type = migrate_task;
            lsub_positive(&nr_diff, local->sum_nr_running);
            env->imbalance = nr_diff >> 1;
        } else {
            /*
             * Multi-CPU groups (author's note: DIE level): aim for the
             * same number of idle CPUs in local and busiest.
             */
            /*
             * If there is no overload, we just want to even the number of idle cpus.
             */
            env->migration_type = migrate_task;
            env->imbalance = max_t(long, 0, (local->idle_cpus - busiest->idle_cpus) >> 1);
        }

        /* Consider allowing a small imbalance between NUMA groups */
        if (env->sd->flags & SD_NUMA) /* author's note: flag not set on their platform */
            env->imbalance = adjust_numa_imbalance(env->imbalance, busiest->sum_nr_running);

        return;
    }

    /* From here on the local group is not in the group_has_spare state. */

    /*
     * Local is fully busy but has to take more load to relieve the busiest group
     */
    /*
     * Local has no spare capacity but is not overloaded either. Pulling
     * load may push it into overload, so avg_load is computed here and
     * nothing is pulled when local's avg_load is already at or above that
     * of the busiest group.
     */
    if (local->group_type < group_overloaded) {
        /*
         * Local will become overloaded so the avg_load metrics are finally needed.
         */

        local->avg_load = (local->group_load * SCHED_CAPACITY_SCALE) / local->group_capacity;

        sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) / sds->total_capacity;
        /*
         * If the local group is more loaded than the selected
         * busiest group don't try to pull any tasks.
         */
        if (local->avg_load >= busiest->avg_load) {
            env->imbalance = 0;
            return;
        }
    }

    /*
     * Both group are or will become overloaded and we're trying to get all
     * the CPUs to the average_load, so we don't want to push ourselves
     * above the average load, nor do we wish to reduce the max loaded CPU
     * below the average load. At the same time, we also don't want to
     * reduce the group load below the group capacity. Thus we look for
     * the minimum possible imbalance.
     */
    /*
     * Balance by load: move the smaller of "busiest's excess over the
     * domain average" and "local's deficit below the domain average",
     * each weighted by the respective group capacity.
     * NOTE(review): when local is already group_overloaded, sds->avg_load
     * and local->avg_load are not computed in this function; presumably
     * the caller has set them -- confirm.
     */
    env->migration_type = migrate_load;
    env->imbalance = min((busiest->avg_load - sds->avg_load) * busiest->group_capacity,
        (sds->avg_load - local->avg_load) * local->group_capacity) /
        SCHED_CAPACITY_SCALE;
}

六、在最忙的组中查找最繁忙的cpu——find_busiest_queue()

find_busiest_queue 函数用来寻找 busiest group 中最繁忙的cpu。具体方法与前面 calculate_imbalance 为 busiest group 确定的 migration type 相关,不同的 type 使用不同的方法来寻找 busiest cpu:
migrate_load: 最忙cpu是 cpu load/cpu capacity 最大的那个cpu
migrate_util: 最忙cpu是util最大的那个cpu
migrate_task: 最忙cpu是任务最多的那个cpu
migrate_misfit: 最忙cpu是 misfit task load 最重的那个cpu.

一旦找到最忙的CPU,那么任务迁移的目标和源头都确定了,后续就可以通过detach tasks和attach tasks进行任务迁移了。

/*
 * find_busiest_queue - find the busiest runqueue among the CPUs in the group.
 *
 * @env:   load balance environment; env->migration_type selects the metric
 * @group: sched_group to search (restricted to the CPUs also in env->cpus)
 *
 * Returns the selected runqueue, or NULL when no CPU qualifies. The Android
 * vendor hook may supply the result directly by setting 'done'.
 */
static struct rq *find_busiest_queue(struct lb_env *env, struct sched_group *group)
{
    struct rq *busiest = NULL, *rq;
    unsigned long busiest_util = 0, busiest_load = 0, busiest_capacity = 1;
    unsigned int busiest_nr = 0;
    int i, done = 0;

    trace_android_rvh_find_busiest_queue(env->dst_cpu, group, env->cpus, &busiest, &done);
    if (done)
        return busiest;

    for_each_cpu_and(i, sched_group_span(group), env->cpus) {
        unsigned long capacity, load, util;
        unsigned int nr_running;
        enum fbq_type rt;

        rq = cpu_rq(i);
        rt = fbq_classify_rq(rq); /* author's note: simply returns regular (0) on their build */

        /*
         * We classify groups/runqueues into three groups:
         *  - regular: there are !numa tasks
         *  - remote:  there are numa tasks that run on the 'wrong' node
         *  - all:     there is no distinction
         *
         * In order to avoid migrating ideally placed numa tasks,
         * ignore those when there's better options.
         *
         * If we ignore the actual busiest queue to migrate another
         * task, the next balance pass can still reduce the busiest
         * queue by moving tasks around inside the node.
         *
         * If we cannot move enough load due to this classification
         * the next pass will adjust the group classification and
         * allow migration of more tasks.
         *
         * Both cases only affect the total convergence complexity.
         */
        if (rt > env->fbq_type)
            continue;

        capacity = capacity_of(i); /* current capacity of this CPU */
        nr_running = rq->cfs.h_nr_running;

        /*
         * For ASYM_CPUCAPACITY domains, don't pick a CPU that could
         * eventually lead to active_balancing high->low capacity.
         * Higher per-CPU capacity is considered better than balancing
         * average load.
         */
        /*
         * I.e. skip a CPU that has more capacity than dst_cpu and runs
         * exactly one CFS task.
         */
        if (env->sd->flags & SD_ASYM_CPUCAPACITY && capacity_of(env->dst_cpu) < capacity && nr_running == 1)
            continue;

        switch (env->migration_type) {
        case migrate_load:
            /*
             * When comparing with load imbalance, use cpu_load() which is not scaled with the CPU capacity.
             */
            load = cpu_load(rq); /* author's note: returns cfs_rq->avg.load_avg */

            /*
             * Skip a CPU whose single task carries more load than the
             * imbalance, unless check_cpu_capacity() reports that the
             * CPU's capacity is noticeably reduced.
             */
            if (nr_running == 1 && load > env->imbalance && !check_cpu_capacity(rq, env->sd))
                break;

            /*
             * For the load comparisons with the other CPUs, consider the cpu_load() scaled with the CPU
             * capacity, so that the load can be moved away from the CPU that is potentially running at a
             * lower capacity.
             * Thus we're looking for max(load_i / capacity_i), crosswise multiplication to rid ourselves of
             * the division works out to: load_i * capacity_j > load_j * capacity_i;
             * where j is our previous maximum.
             */
            if (load * busiest_capacity > busiest_load * capacity) {
                busiest_load = load;
                busiest_capacity = capacity;
                busiest = rq;
            }
            break;

        case migrate_util:
            util = cpu_util(cpu_of(rq));

            /*
             * Don't try to pull utilization from a CPU with one running task. Whatever its utilization, we will fail
             * detach the task.
             * Author's note: a single task is left to active balance.
             */
            if (nr_running <= 1)
                continue;

            /* The CPU with the highest utilization is the busiest. */
            if (busiest_util < util) {
                busiest_util = util;
                busiest = rq;
            }
            break;

        case migrate_task:
            /* The CPU with the most CFS tasks (h_nr_running) is the busiest. */
            if (busiest_nr < nr_running) {
                busiest_nr = nr_running;
                busiest = rq;
            }
            break;

        case migrate_misfit:
            /*
             * For ASYM_CPUCAPACITY domains with misfit tasks we simply seek the "biggest" misfit task.
             */
            /* The CPU with the largest misfit_task_load is the busiest. */
            if (rq->misfit_task_load > busiest_load) {
                busiest_load = rq->misfit_task_load;
                busiest = rq;
            }

            break;

        }
    }

    return busiest;
}
 
 
/*
 * check_cpu_capacity - report whether @rq's current capacity has dropped
 * noticeably below its original capacity.
 *
 * Returns non-zero when
 *     cpu_capacity < cpu_capacity_orig * 100 / sd->imbalance_pct,
 * evaluated with cross-multiplication so that no division is needed.
 */
static inline int check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
{
    return (rq->cpu_capacity_orig * 100) > (rq->cpu_capacity * sd->imbalance_pct);
}

七、detach_tasks——从busiest rq上摘取若干task

至此,我们已经确定了从busiest cpu的rq中搬移若干 load/util/task 到dst rq。不过无论是load还是util,最后还是要转成任务。此函数用来从 busiest cpu 的rq中摘取适合的任务,并把这些任务挂入 lb_env->tasks 链表中。由于关中断时长的问题,此函数也不会一次性把所有任务迁移到dest cpu上。

/*
 * detach_tasks() -- tries to detach up to imbalance load/util/tasks from
 * busiest_rq, as part of a balancing operation within domain "sd".
 *
 * Detached tasks are linked onto env->tasks and env->imbalance is reduced
 * accordingly. Must be called with env->src_rq->lock held.
 *
 * Returns number of detached tasks if successful and 0 otherwise.
 */
static int detach_tasks(struct lb_env *env)
{
    struct list_head *tasks = &env->src_rq->cfs_tasks;
    unsigned long util, load;
    struct task_struct *p;
    int detached = 0;

    lockdep_assert_held(&env->src_rq->lock);

    /* Nothing (left) to balance. */
    if (env->imbalance <= 0)
        return 0;

    /*
     * src_rq->cfs_tasks is walked from its tail; tasks that are suitable
     * for the destination CPU are moved onto env->tasks, unsuitable ones
     * are rotated to the head of the list.
     *
     * Author's note: a task may be examined more than once, i.e. the list
     * may be scanned several times before balance is reached.
     */
    while (!list_empty(tasks)) {
        /*
         * We don't want to steal all, otherwise we may be treated likewise,
         * which could at worst lead to a livelock crash.
         */
        /*
         * During an idle balance do not pull the only task left on the
         * source rq, otherwise the task could bounce back and forth
         * between the two CPUs.
         */
        if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
            break;

        /*
         * Look at (but do not yet unlink) the task at the tail of the
         * list. Tasks rejected earlier have been moved to the head.
         * Author's note: the head holds the most recently run tasks, so
         * taking from the tail favours cache-cold tasks -- verify.
         */
        p = list_last_entry(tasks, struct task_struct, se.group_node);

        /*
         * Stop once loop_max tasks have been examined. The caller may
         * re-enter this function if more migration is needed, which
         * keeps the interrupts-off critical section on the source CPU
         * short.
         * Author's note: env->loop_max is
         * min(sysctl_sched_nr_migrate, busiest->nr_running), set by the
         * caller -- verify.
         */
        env->loop++;
        /* We've more or less seen every task there is, call it quits */
        /*
         * TODO (author's open question): if env->loop_max equals
         * env->loop_break, this check fires first and LBF_NEED_BREAK is
         * never set -- is that intended?
         */
        if (env->loop > env->loop_max)
            break;

        /* take a breather every nr_migrate tasks */
        /*
         * When many tasks have to be examined, the work is split into
         * chunks of sched_nr_migrate_break iterations so that one long
         * critical section becomes several short ones.
         */
        if (env->loop > env->loop_break) {
            env->loop_break += sched_nr_migrate_break;
            /* Tells the caller (load_balance) to come back and continue detaching. */
            env->flags |= LBF_NEED_BREAK;
            break;
        }

        /* A task that may not be migrated is rotated to the list head. */
        if (!can_migrate_task(p, env))
            goto next; /* give up on this task */

        /*
         * p may be migrated to dst_cpu; now decide, per migration type,
         * whether moving it fits within the remaining imbalance.
         */
        switch (env->migration_type) {
        case migrate_load:
            /*
             * Depending of the number of CPUs and tasks and the
             * cgroup hierarchy, task_h_load() can return a null
             * value. Make sure that env->imbalance decreases
             * otherwise detach_tasks() will stop only after
             * detaching up to loop_max tasks.
             */
            /* Task load, with a floor of 1. */
            load = max_t(unsigned long, task_h_load(p), 1);

            /*
             * With the LB_MIN feature enabled, tasks whose load is below
             * 16 are skipped as long as no balance attempt has failed.
             * Author's note: LB_MIN defaults to false.
             */
            if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
                goto next;

            /*
             * Make sure that we don't migrate too much load.
             * Nevertheless, let relax the constraint if
             * scheduler fails to find a good waiting task to migrate.
             *
             * I.e. the task's load, shifted right by nr_balance_failed,
             * must not exceed env->imbalance; the limit loosens as
             * failed attempts accumulate.
             */
            if (shr_bound(load, env->sd->nr_balance_failed) > env->imbalance)
                goto next;

            env->imbalance -= load;
            break;

        case migrate_util:
            /*
             * Compare the task's estimated utilization with the
             * remaining imbalance.
             * Author's note: task_util_est() is used, uclamp is not
             * taken into account.
             */
            util = task_util_est(p);

            if (util > env->imbalance)
                goto next;

            env->imbalance -= util;
            break;

        case migrate_task:
            /*
             * Only the number of tasks matters here; env->imbalance
             * holds the number of tasks still to move.
             */
            env->imbalance--;
            break;

        case migrate_misfit:
            /* This is not a misfit task */
            /*
             * Skip tasks that fit the source CPU's capacity; the first
             * misfit task found completes the migration.
             */
            if (task_fits_capacity(p, capacity_of(env->src_cpu)))
                goto next;

            env->imbalance = 0;
            break;
        }

        /*
         * p is going to be migrated: detach it from the source rq and
         * link it onto env->tasks.
         */
        detach_task(p, env);
        list_add(&p->se.group_node, &env->tasks); /* inserted at the head */

        detached++;

#ifdef CONFIG_PREEMPTION
        /*
         * NEWIDLE balancing is a source of latency, so preemptible
         * kernels will stop after the first task is detached to minimize
         * the critical section.
         */
        if (env->idle == CPU_NEWLY_IDLE)
            break;
#endif

        /*
         * We only want to steal up to the prescribed amount of load/util/tasks.
         */
        /* Stop scanning once the imbalance has been consumed. */
        if (env->imbalance <= 0)
            break;

        continue;
next:
        /* Rotate the rejected task to the head; scanning proceeds from the tail. */
        list_move(&p->se.group_node, tasks);
    }

    /*
     * Right now, this is one of only two places we collect this stat
     * so we can safely collect detach_one_task() stats here rather
     * than inside detach_one_task().
     */
    schedstat_add(env->sd->lb_gained[env->idle], detached);

    return detached;
}

1. can_migrate_task()

用来判断一个任务是否可以迁移至目标CPU

/* 
 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? 
 */ 
static int can_migrate_task(struct task_struct *p, struct lb_env *env) 
{ 
    int tsk_cache_hot; 
    int can_migrate = 1; 
 
    lockdep_assert_held(&env->src_rq->lock); 
 
    trace_android_rvh_can_migrate_task(p, env->dst_cpu, &can_migrate); 
    if (!can_migrate) 
        return 0; 
 
    /* 
     * We do not migrate tasks that are: 
     * 1) throttled_lb_pair, or 
     * 2) cannot be migrated to this CPU due to cpus_ptr, or 
     * 3) running (obviously), or 
     * 4) are cache-hot on their current CPU. 
     */ 
    /* 
     * 如果任务p所在的task group在src cpu 或 在dest cpu上被限流了,那么不 
     * 能迁移该任务,否者限流的逻辑会有问题. 
     */ 
    if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) 
        return 0; 
 
    /* Disregard per-cpu kthreads; they are where they need to be. */ 
    if ((p->flags & PF_KTHREAD) && kthread_is_per_cpu(p)) 
        return 0; 
 
    //若dst cpu不在任务p的cpu亲和性里面 
    if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) { 
        int cpu; 
 
        //统计由于cpu亲和性不能迁移到dst cpu 
        schedstat_inc(p->se.statistics.nr_failed_migrations_affine); 
 
        /* 
         * 任务由于affinity的原因不能在dest cpu上运行,因此这里设置上 
         * LBF_SOME_PINNED 标志,表示至少有一个任务由于affinity无法迁移 
         */ 
        env->flags |= LBF_SOME_PINNED; 
 
        /* 
         * Remember if this task can be migrated to any other CPU in 
         * our sched_group. We may want to revisit it if we couldn't 
         * meet load balance goals by pulling other tasks on src_cpu. 
         * 
         * Avoid computing new_dst_cpu for NEWLY_IDLE or if we have 
         * already computed one in current iteration. 
         */ 
        /* 
         * 下面的逻辑会尝试选择备选dst cpu,如果是已经设定好了备选dst cpu 
         * 那么直接返回。如果是newidle balance那么也不需要备选CPU,因为它的 
         * 主要目标就是迁移一个任务到本idle的cpu。 
         */ 
        if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED)) 
            return 0; 
 
        /* Prevent to re-select dst_cpu via env's CPUs: */ 
        /* 
         * 设定备选CPU,以便后续第二轮的均衡可以把任务迁移到备选CPU上 
         * MC层级只有dst cpu一个,DIE层级是dst cpu所在cluster的所有cpu 
         */ 
        for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { 
            if (cpumask_test_cpu(cpu, p->cpus_ptr)) { 
                env->flags |= LBF_DST_PINNED; 
                env->new_dst_cpu = cpu; 
                break; 
            } 
        } 
 
        return 0; 
    } 
 
    /* 下面就是dst cpu 在 p->cpus_ptr 中了 */ 
 
    /* Record that we found atleast one task that could run on dst_cpu */ 
    /*至少有一个任务是可以运行在dest cpu上(从affinity角度),因此清除all pinned标记*/ 
    env->flags &= ~LBF_ALL_PINNED; 
 
    /*正处于运行状态的任务不参与迁移,迁移running task是后续 active migration 的逻辑。*/ 
    if (task_running(env->src_rq, p)) { //return p->on_cpu, TODO: 但是被抢占的任务其p->on_cpu也是为真的 
        schedstat_inc(p->se.statistics.nr_failed_migrations_running); 
        return 0; 
    } 
 
    /* 
     * Aggressive migration if: 
     * 1) destination numa is preferred 
     * 2) task is cache cold, or 
     * 3) too many balance attempts have failed. 
     */ 
    /* 
     * 判断该任务是否是cache-hot的,这主要从近期在src cpu上的执行时间点来判断,如果上 
     * 次任务在src cpu上开始执行的时间比较久远(sysctl_sched_migration_cost 是门限,默认0.5ms), 
     * 那么其在cache中的内容大概率是被刷掉了,可以认为是cache-cold的。此外如果任务p是 
     * src cpu上的next buddy或者last buddy,那么任务是cache hot的。 
     */ 
    tsk_cache_hot = migrate_degrades_locality(p, env); //没有配置 CONFIG_NUMA_BALANCING 的话直接返回-1 
    if (tsk_cache_hot == -1) 
        tsk_cache_hot = task_hot(p, env); 
 
    /* 
     * 一般而言,我们只迁移cache cold的任务。但是如果进行了太多轮的尝试仍然未能让负 
     * 载达到均衡,那么cache hot的任务也一样迁移。 
     * sd_init()中MC和DIE的cache_nice_tries都初始化为1。 
     * nr_balance_failed:load_balance中判断非new idle balance且一个任务都没迁移就加1 
     */ 
    if (tsk_cache_hot <= 0 || env->sd->nr_balance_failed > env->sd->cache_nice_tries) { 
        if (tsk_cache_hot == 1) { 
            //由于上两次尝试一个任务都没迁移成功,这次cache_hot的也迁移 
            schedstat_inc(env->sd->lb_hot_gained[env->idle]); 
            schedstat_inc(p->se.statistics.nr_forced_migrations); 
        } 
        return 1; 
    } 
 
    schedstat_inc(p->se.statistics.nr_failed_migrations_hot); 
    return 0; 
}

八、attach_tasks——将从busiest rq上取下来的任务挂到dst cpu 上

attach_tasks主要的逻辑就是遍历 env->tasks 链表,摘下任务挂入dst cpu的队列

/*
 * attach_tasks() - attach every task that detach_tasks() left on
 * env->tasks to its new runqueue, env->dst_rq.
 *
 * The destination runqueue lock is held, and its clock updated once,
 * for the whole drain of the list.
 */
static void attach_tasks(struct lb_env *env)
{
    struct list_head *pending = &env->tasks;
    struct rq *dst = env->dst_rq;
    struct rq_flags rf;

    rq_lock(dst, &rf);
    update_rq_clock(dst);

    /* Pop from the head until nothing is left on the private list. */
    while (!list_empty(pending)) {
        struct task_struct *p;

        p = list_first_entry(pending, struct task_struct, se.group_node);
        list_del_init(&p->se.group_node);

        attach_task(dst, p);
    }

    rq_unlock(dst, &rf);
}

九、need_active_balance() 判断是否需要主动均衡

判断是否需要启动 active balance,就是判断是否需要将src cpu当前正在running的任务迁移到dst cpu,因为前面一番折腾后发现无法迁移runnable的任务,那么就再考虑一下running的任务。

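满足以下任一条件即需要 active balance(SD_ASYM_PACKING 相关的条件在本文平台上未使能,此处不列):

(1) dst cpu 不处于 CPU_NOT_IDLE 状态且 busiest cpu 上只有一个CFS任务时,busiest cpu的算力被非CFS任务(其它调度类或IRQ)占用的比较多,且dst cpu的剩余算力比busiest cpu多出一定比例(sd->imbalance_pct)
(2) migration_type == migrate_misfit
(3) 该sd迁移runnable任务的失败次数 sd->nr_balance_failed 大于 sd->cache_nice_tries + 2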

/*
 * need_active_balance() - decide whether active balancing should be
 * started for this balance attempt.
 *
 * Returns non-zero when voluntary_active_balance() asks for it, or when
 * the domain's nr_balance_failed counter is greater than
 * cache_nice_tries + 2.
 */
static int need_active_balance(struct lb_env *env)
{
    /*
     * Without a voluntary reason, fall back to the failure counter.
     * Per the article, the counter is bumped when a non-newidle balance
     * could not move a single runnable task, and cache_nice_tries is 1
     * for MC and DIE - neither is verifiable from this function alone.
     */
    if (!voluntary_active_balance(env))
        return unlikely(env->sd->nr_balance_failed > env->sd->cache_nice_tries+2);

    return 1;
}
 
 
/*
 * voluntary_active_balance() - is there a reason to start active
 * balancing that does not depend on the domain's failure counter?
 *
 * True when any of the following holds, checked in this order:
 *  1) asym_active_balance() says so;
 *  2) dst_cpu is not CPU_NOT_IDLE, the source rq has exactly one CFS
 *     task, check_cpu_capacity() reports the source rq's capacity as
 *     reduced, and dst_cpu's capacity exceeds src_cpu's by the domain's
 *     imbalance_pct;
 *  3) the migration type is migrate_misfit.
 */
static inline bool voluntary_active_balance(struct lb_env *env)
{
    struct sched_domain *sd = env->sd;

    if (asym_active_balance(env))
        return 1;

    /*
     * The dst_cpu is idle and the src_cpu has only one CFS task. Moving
     * that task is worthwhile if src_cpu's capacity is eaten by other
     * sched classes or IRQs while more capacity is available on dst_cpu.
     *
     * The article reads this as roughly: src capacity < 85.5% of its
     * original capacity, and dst capacity > 1.17 x src capacity; that
     * presumes imbalance_pct == 117 - TODO confirm.
     */
    if (env->idle != CPU_NOT_IDLE &&
        env->src_rq->cfs.h_nr_running == 1 &&
        check_cpu_capacity(env->src_rq, sd) &&
        capacity_of(env->src_cpu) * sd->imbalance_pct <
            capacity_of(env->dst_cpu) * 100)
        return 1;

    return env->migration_type == migrate_misfit;
}
 
/*
 * asym_active_balance() - does asymmetric packing call for active
 * balancing from src_cpu to dst_cpu?
 *
 * ASYM_PACKING needs to force-migrate tasks away from busy but lower
 * priority CPUs so that all tasks get packed onto the highest priority
 * CPUs.
 *
 * NOTE: the article states that neither the MC nor the DIE domain on the
 * examined platform sets SD_ASYM_PACKING, so this returns false there.
 */
static inline bool asym_active_balance(struct lb_env *env)
{
    /* Only a destination that is not CPU_NOT_IDLE qualifies. */
    if (env->idle == CPU_NOT_IDLE)
        return 0;

    /* The domain must have asymmetric packing enabled. */
    if (!(env->sd->flags & SD_ASYM_PACKING))
        return 0;

    return sched_asym_prefer(env->dst_cpu, env->src_cpu);
}

十、active_load_balance_cpu_stop()——主动迁移

stop_one_cpu_nowait() 中发起主动迁移,就是向 busiest cpu 的stop调度类的 "migration/X" 线程queue一个work,然后唤醒它,执行流程为:

per-cpu的cpu_stopper.thread --> smpboot_thread_fn --> cpu_stopper_thread --> fn(arg) --> active_load_balance_cpu_stop(busiest rq)

也就是说主动均衡函数运行在 stop 调度类的线程中,最高优先级的线程。

/*
 * active_load_balance_cpu_stop is run by the CPU stopper. It pushes
 * running tasks off the busiest CPU onto idle CPUs. It requires at
 * least 1 task to be running on each physical CPU where possible, and
 * avoids physical / logical imbalances.
 *
 * @data: the busiest runqueue (struct rq *); its push_cpu field names
 *        the CPU the task should be pushed to.
 *
 * At most one task is moved: it is detached from the busiest rq under
 * that rq's lock and attached to the target rq after the lock has been
 * dropped. busiest_rq->active_balance is cleared on every exit path.
 *
 * Always returns 0.
 */
/* Defined in fair.c. */
static int active_load_balance_cpu_stop(void *data)
{
    struct rq *busiest_rq = data;
    int busiest_cpu = cpu_of(busiest_rq);
    int target_cpu = busiest_rq->push_cpu; /* the destination (dst) CPU */
    struct rq *target_rq = cpu_rq(target_cpu);
    struct sched_domain *sd;
    struct task_struct *p = NULL;
    struct rq_flags rf;

    rq_lock_irq(busiest_rq, &rf); /* lock the source rq, disabling IRQs on this CPU */
    /*
     * Between queueing the stop-work and running it is a hole in which
     * CPUs can become inactive. We should not move tasks from or to
     * inactive CPUs.
     */
    if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu))
        goto out_unlock;

    /* Make sure the requested CPU hasn't gone down in the meantime: */
    /*
     * Per the article, busiest_rq->active_balance is set to 1 by
     * load_balance() before it triggers active balancing, and
     * busiest_cpu == smp_processor_id() is expected to always hold
     * because this runs in busiest_cpu's per-CPU "migration/X" thread.
     */
    if (unlikely(busiest_cpu != smp_processor_id() || !busiest_rq->active_balance))
        goto out_unlock;

    /* Is there any task to move? */
    /*
     * nr_running <= 1 means only the stop-class "migration/X" thread is
     * here and nothing else is left to push. Per the article,
     * rq->nr_running also counts preempted tasks.
     */
    if (busiest_rq->nr_running <= 1)
        goto out_unlock;

    /*
     * This condition is "impossible", if it occurs we need to fix it. Originally reported by
     * Bjorn Helgaas on a 128-CPU setup.
     */
    BUG_ON(busiest_rq == target_rq);

    /* Search for an sd spanning us and the target CPU. */
    /* RCU is entered only past the early exits, so out_unlock need not leave it. */
    rcu_read_lock();
    /*
     * Walk target_cpu's domains upwards and stop at the first one whose
     * span contains busiest_cpu. Per the article, a hit at the MC level
     * means both CPUs are in the same cluster.
     */
    for_each_domain(target_cpu, sd) {
        if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
            break;
    }

    if (likely(sd)) {
        struct lb_env env = {
            .sd            = sd,
            .dst_cpu    = target_cpu,
            .dst_rq        = target_rq,
            .src_cpu    = busiest_rq->cpu,
            .src_rq        = busiest_rq,
            .idle        = CPU_IDLE,
            /*
             * can_migrate_task() doesn't need to compute new_dst_cpu
             * for active balancing. Since we have CPU_IDLE, but no
             * @dst_grpmask we need to make that test go away with lying
             * about DST_PINNED.
             */
            .flags        = LBF_DST_PINNED,
            .src_rq_rf    = &rf,
        };

        /* Count this active-balance attempt. */
        schedstat_inc(sd->alb_count);
        update_rq_clock(busiest_rq);

        p = detach_one_task(&env);
        if (p) {
            schedstat_inc(sd->alb_pushed);
            /* Active balancing done, reset the failure counter. */
            sd->nr_balance_failed = 0;
        } else {
            schedstat_inc(sd->alb_failed);
        }
    }
    rcu_read_unlock();
out_unlock:
    /* Clear the request flag whether or not a task was moved. */
    busiest_rq->active_balance = 0;
    /* Drops the rq lock only; IRQs are re-enabled by local_irq_enable() below. */
    rq_unlock(busiest_rq, &rf);

    /* Attach after the source lock is released; attach_one_task() takes target_rq's lock. */
    if (p)
        attach_one_task(target_rq, p);

    local_irq_enable();

    return 0;
}

1. detach_one_task()

该函数供 active balance 使用,只从 src rq 上dequeue一个任务。

/*
 * detach_one_task() - try to dequeue exactly one task from env->src_rq
 * as part of an active balancing operation within env->sd.
 *
 * The caller must hold env->src_rq->lock.
 *
 * Returns the detached task, or NULL if no task on the source runqueue
 * passed can_migrate_task().
 */
static struct task_struct *detach_one_task(struct lb_env *env)
{
    struct task_struct *candidate;

    lockdep_assert_held(&env->src_rq->lock);

    /*
     * Scan cfs_tasks from tail to head. Per the article, the regular
     * (runnable-task) balance path moves tasks it could not migrate to
     * the head of this list.
     */
    list_for_each_entry_reverse(candidate, &env->src_rq->cfs_tasks, se.group_node) {
        if (can_migrate_task(candidate, env)) {
            detach_task(candidate, env);

            /*
             * lb_gained[env->idle] is only updated here and in
             * detach_tasks(), so the stat can be collected at this
             * point rather than inside detach_task().
             */
            schedstat_inc(env->sd->lb_gained[env->idle]);
            return candidate;
        }
    }

    return NULL;
}

2. attach_one_task()

active balance时使用,用于将一个任务attach到dst cpu上。

/* 
 * attach_one_task() -- attaches the task returned from detach_one_task() to 
 * its new rq. 
 */ 
static void attach_one_task(struct rq *rq, struct task_struct *p) 
{ 
    struct rq_flags rf; 
 
    rq_lock(rq, &rf); 
    update_rq_clock(rq); 
    attach_task(rq, p); 
    rq_unlock(rq, &rf); 
}

十一、总结

1. load_balance()函数中不但会迁移runnable任务,若runnable任务迁移失败还会尝试进行active balance,也就是迁移running的任务。

2. 在使能 CONFIG_PREEMPTION 的内核上,new idle balance 一次只迁移一个runnable的任务。

3. overload分等级,使用 sgs->group_type 表示,迁移也分为多种类型,使用 env->migration_type 表示,但最终都会转换为迁移任务。

4. active balance运行在busiest cpu的stop调度类的 migration/X 线程中。

十二、补充

1. rq->misfit_task_load 的更新逻辑

/*
 * update_misfit_status() - refresh rq->misfit_task_load for task @p.
 *
 * @p:  the task being evaluated; may be NULL.
 * @rq: the runqueue whose misfit_task_load is updated.
 *
 * Nothing is touched unless sched_asym_cpucapacity is enabled and the
 * vendor hook leaves need_update set. Otherwise misfit_task_load becomes
 * 0 when there is no task, the task is restricted to a single CPU, or
 * the task fits this CPU's capacity; in every other case it becomes the
 * task's hierarchical load, with a minimum of 1.
 */
static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
{
    bool need_update = true;

    /* The vendor hook may clear need_update to suppress the update. */
    trace_android_rvh_update_misfit_status(p, rq, &need_update);
    if (!(static_branch_unlikely(&sched_asym_cpucapacity) && need_update))
        return;

    /*
     * Not a misfit: no task, a task pinned to one CPU, or a task that
     * fits this CPU's capacity.
     * NOTE(review): the article describes the fit test as using the
     * est/clamped util against 80% of the CPU's current capacity -
     * confirm against task_fits_capacity().
     */
    if (!p || p->nr_cpus_allowed == 1 ||
        task_fits_capacity(p, capacity_of(cpu_of(rq)))) {
        rq->misfit_task_load = 0;
        return;
    }

    /*
     * Make sure misfit_task_load is never zero for a misfit task, even
     * if task_h_load() returns 0.
     *
     * Per the article, task_h_load() is p->se.avg.load_avg without CFS
     * group scheduling, and
     * p->se.avg.load_avg * cfs_rq->h_load / (cfs_rq->avg.load_avg + 1)
     * with it - not verifiable from this function.
     */
    rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1);
}

调用路径:

pick_next_task_fair //fair.c pick到任务return前更新 
newidle_balance //fair.c 入口清0 rq->misfit_task_load 
task_tick_fair //fair.c 在tick中判断curr任务的util进行更新 
    update_misfit_status

结论:rq->misfit_task_load 是对rq上正在运行的任务的评估。

2. sched domain 的 flags

# cat /proc/sys/kernel/sched_domain/cpu0/domain0/flags //MC 
SD_BALANCE_NEWIDLE SD_BALANCE_EXEC SD_BALANCE_FORK SD_WAKE_AFFINE SD_SHARE_PKG_RESOURCES 
# cat /proc/sys/kernel/sched_domain/cpu0/domain1/flags //DIE 
SD_BALANCE_NEWIDLE SD_BALANCE_EXEC SD_BALANCE_FORK SD_WAKE_AFFINE SD_ASYM_CPUCAPACITY SD_PREFER_SIBLING

3. sched domain 相关参数

/proc/sys/kernel/sched_domain/cpu0/domain1 # ls -l 
-rw-r--r--    busy_factor 
-rw-r--r--    cache_nice_tries 
-r--r--r--    flags 
-rw-r--r--    imbalance_pct 
-rw-r--r--    max_interval 
-rw-r--r--    max_newidle_lb_cost 
-rw-r--r--    min_interval 
-r--r--r--    name

大都是可调节(可写)的参数。

4. EAS的使能关闭和使用

在 /proc/sys/kernel/sched_energy_aware 的设置执行路径中会 enable/disable EAS。在使能EAS的情况下,fair选核首先尝试EAS;并且只要 rd->overutilized 为0(即系统没有处于overutilized状态),原生均衡逻辑就不会进行负载均衡。

/proc/sys/kernel/sched_energy_aware 
    sched_energy_aware_handler 
        rebuild_sched_domains 
            partition_sched_domains_locked 
                has_eas |= build_perf_domains 
                    sched_energy_set(has_eas) 
                        static_branch_enable_cpuslocked(&sched_energy_present); //eas enable 
                        static_branch_disable_cpuslocked(&sched_energy_present); //eas disable

5. rd->overutilized 的更新逻辑

load_balance //fair.c 负载均衡路径(1) 
    find_busiest_group 
        update_sd_lb_stats 
            update_sg_lb_stats //fair.c 对sg中的每个cpu都调用,只要有一个CPU处于overutilized(util>80%*cap),那么就 rd->overutilized=SG_OVERUTILIZED 
        enqueue_task_fair //rq中插入一个非新fork的任务(2) 
        task_tick_fair //tick中更新(3) 
            update_overutilized_status //fair.c 原生逻辑util>80%*cap 就 rq->rd->overutilized=SG_OVERUTILIZED 
                cpu_overutilized //fair.c 
                    trace_android_rvh_cpu_overutilized(cpu, &overutilized) 
                        mtk_cpu_overutilized 
                            trace_sched_cpu_overutilized(cpu, perf_domain_span(pd), sum_util, sum_cap, *overutilized); 
//打印: 
<...>-1033    [000] d..1 168362.136618: sched_cpu_overutilized: cpu=5 mask=0x70 sum_util=138 sum_cap=2202 overutilized=0

本文参考链接:https://www.cnblogs.com/hellokitty2/p/15673073.html