Skip to main content
 首页 » 操作系统

Linux 调度器之walt的cpu_array和cluster组织

2022年07月19日198cloudgamer

1. sched_cluster 变量的初始化

static bool walt_clusters_parsed; 
__read_mostly cpumask_t **cpu_array; //指针数组 
 
void walt_update_cluster_topology(void) //walt.c 
{ 
    struct cpumask cpus = *cpu_possible_mask; 
    struct cpumask cluster_cpus; 
    struct walt_sched_cluster *cluster; 
    struct list_head new_head; 
    cpumask_t **tmp; 
    int i; 
 
    INIT_LIST_HEAD(&new_head); 
 
    for_each_cpu(i, &cpus) { //对于每一个possible的cpu 
        cpumask_clear(&cluster_cpus); //清空 cluster_cpus 结构备用 
        walt_get_possible_siblings(i, &cluster_cpus); //获取i的所有兄弟cpu并将其mask设置进cluster_cpus中 
        if (cpumask_empty(&cluster_cpus)) { 
            WARN(1, "WALT: Invalid cpu topology!!"); 
            cleanup_clusters(&new_head); 
            return; 
        } 
        cpumask_andnot(&cpus, &cpus, &cluster_cpus); //从cpus中清理掉已经解析出来的cpu 
        add_cluster(&cluster_cpus, &new_head); //new一个cluster结构出来,然后将其挂在new_head链表上 
    } 
 
    assign_cluster_ids(&new_head); //解析后的cluster  传递给sched_cluster[] 
    ... 
    /* Ensure cluster ids are visible to all CPUs before making cluster_head visible. */ 
    move_list(&cluster_head, &new_head, false); //cluster_head指向也指向cluster链表 
 
    for_each_sched_cluster(cluster) { 
        if (cpumask_weight(&cluster->cpus) == 1) 
            /* *arg1 = *arg2 | *arg3,asym_cap_sibling_cpus 的唯一赋值位置 */ 
            cpumask_or(&asym_cap_sibling_cpus, &asym_cap_sibling_cpus, &cluster->cpus); 
    } 
    /* 4+3+1 这里会清0,6+2 根本不会赋值,因此此成员恒为0 */ 
    if (cpumask_weight(&asym_cap_sibling_cpus) == 1) 
        cpumask_clear(&asym_cap_sibling_cpus); 
 
    /* 这里面对 cpu_array 结构进行的初始化 */ 
    tmp = build_cpu_array(); 
    if (!tmp) { 
        BUG_ON(1); 
        return; 
    } 
    /*就是 cpu_array = tmp; */ 
    smp_store_release(&cpu_array, tmp); /*WRITE_ONCE(*p, v);    */ 
    walt_clusters_parsed = true; 
}

调用路径:

init_cpu_capacity_notifier.notifier_call //每个cluster的cpufreq策略ok后都会调用 
    init_cpu_capacity_callback //arch_topology.c 只有所有cluster都调用后才只执行一次 
        walt_update_cluster_topology();

build_cpu_array 函数:

//这个函数对 cpu_array 二维数组进行初始化。 
static cpumask_t **build_cpu_array(void) 
{ 
    int i; 
    cpumask_t **tmp_array = init_cpu_array(); //分配一个数组指针 
 
    if (!tmp_array) 
        return NULL; 
 
    /*Construct cpu_array row by row*/ 
    for (i = 0; i < num_sched_clusters; i++) { 
        int j, k = 1; 
 
        /* Fill out first column with appropriate cpu arrays*/ 
        cpumask_copy(&tmp_array[i][0], &sched_cluster[i]->cpus); 
 
        /* 
         * k starts from column 1 because 0 is filled 
         * Fill clusters for the rest of the row, 
         * above i in ascending order 
         */ 
        for (j = i + 1; j < num_sched_clusters; j++) { 
            cpumask_copy(&tmp_array[i][k], &sched_cluster[j]->cpus); 
            k++; 
        } 
 
        /* 
         * k starts from where we left off above. 
         * Fill clusters below i in descending order. 
         */ 
        for (j = i - 1; j >= 0; j--) { 
            cpumask_copy(&tmp_array[i][k], &sched_cluster[j]->cpus); 
            k++; 
        } 
    } 
    return tmp_array; 
} 
 
static cpumask_t **init_cpu_array(void) 
{ 
    int i; 
    cpumask_t **tmp_array; 
 
    tmp_array = kcalloc(num_sched_clusters, sizeof(cpumask_t *), GFP_ATOMIC); 
    if (!tmp_array) 
        return NULL; 
    for (i = 0; i < num_sched_clusters; i++) { 
        tmp_array[i] = kcalloc(num_sched_clusters, sizeof(cpumask_t), GFP_ATOMIC); 
        if (!tmp_array[i]) 
            return NULL; 
    } 
 
    return tmp_array; 
}

2. 应用代码模拟build_cpu_array的实现

build_cpu_array() 对 cpu_array 的初始化赋值不是很好理解,写应用代码模拟器操作,然后打印 cpu_array 的值:

#include <stdio.h> 
#include <stdlib.h> 
#include <string.h> 
 
#define NR_CPUS 32 
 
#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d)) 
#define BITS_PER_BYTE        8 
 
//include/linux/bitops.h 
#define BITS_PER_TYPE(type) (sizeof(type) * BITS_PER_BYTE) 
#define BITS_TO_LONGS(nr)    DIV_ROUND_UP(nr, BITS_PER_TYPE(long)) 
 
//include/linux/types.h 
#define DECLARE_BITMAP(name,bits) unsigned long name[BITS_TO_LONGS(bits)] 
 
//include/linux/cpumask.h 
typedef struct cpumask { DECLARE_BITMAP(bits_name, NR_CPUS); } cpumask_t; 
 
static int num_sched_clusters; 
 
struct cluster { 
    struct cpumask cpus; 
}; 
 
struct cluster sched_cluster[NR_CPUS]; 
 
static void cpumask_copy(struct cpumask *dstp, struct cpumask *srcp) 
{ 
    memcpy(dstp, srcp, sizeof(*dstp));     
} 
 
static void set_bit(int nr, unsigned long *addr) 
{ 
    addr[nr / BITS_PER_TYPE(long)] |= 1UL << (nr % BITS_PER_TYPE(long)); 
} 
 
static int test_bit(int nr, unsigned long *addr) 
{ 
    return 1UL & (addr[nr / BITS_PER_TYPE(long)] >> (nr & (BITS_PER_TYPE(long)-1))); 
} 
 
static void cpumask_set_cpu(int cpu, struct cpumask *dstp) 
{ 
    set_bit(cpu, dstp->bits_name); 
} 
 
static int cpumask_test_cpu(int cpu, struct cpumask *dstp) 
{ 
    return test_bit(cpu, dstp->bits_name); 
} 
 
static cpumask_t **init_cpu_array(void) 
{ 
    int i; 
    cpumask_t **tmp_array; 
 
    tmp_array = calloc(num_sched_clusters, sizeof(cpumask_t *)); 
    if (!tmp_array) { 
        printf("no memory 1\n"); 
        return NULL; 
    } 
    for (i = 0; i < num_sched_clusters; i++) { 
        tmp_array[i] = calloc(num_sched_clusters, sizeof(cpumask_t)); 
        if (!tmp_array[i]) { 
            printf("no memory 2\n"); 
            return NULL; 
        } 
    } 
 
    return tmp_array; 
} 
 
static cpumask_t **build_cpu_array(void) 
{ 
    int i; 
    cpumask_t **tmp_array = init_cpu_array(); 
 
    if (!tmp_array) 
        return NULL; 
 
    /*Construct cpu_array row by row*/ 
    for (i = 0; i < num_sched_clusters; i++) { 
        int j, k = 1; 
 
        /* Fill out first column with appropriate cpu arrays*/ 
        cpumask_copy(&tmp_array[i][0], &sched_cluster[i].cpus); 
 
        /* 
         * k starts from column 1 because 0 is filled 
         * Fill clusters for the rest of the row, 
         * above i in ascending order 
         */ 
        for (j = i + 1; j < num_sched_clusters; j++) { 
            cpumask_copy(&tmp_array[i][k], &sched_cluster[j].cpus); 
            k++; 
        } 
 
        /* 
         * k starts from where we left off above. 
         * Fill clusters below i in descending order. 
         */ 
        for (j = i - 1; j >= 0; j--) { 
            cpumask_copy(&tmp_array[i][k], &sched_cluster[j].cpus); 
            k++; 
        } 
    } 
    return tmp_array; 
} 
 
#if 0 
// 4+3+1 
static int num_sched_clusters = 3; 
static void test_init(void) 
{ 
    cpumask_set_cpu(0, &sched_cluster[0].cpus); 
    cpumask_set_cpu(1, &sched_cluster[0].cpus); 
    cpumask_set_cpu(2, &sched_cluster[0].cpus); 
    cpumask_set_cpu(3, &sched_cluster[0].cpus); 
 
    cpumask_set_cpu(4, &sched_cluster[1].cpus); 
    cpumask_set_cpu(5, &sched_cluster[1].cpus); 
    cpumask_set_cpu(6, &sched_cluster[1].cpus); 
 
    cpumask_set_cpu(7, &sched_cluster[2].cpus); 
} 
#endif 
 
#if 1 
// 6+2 
static int num_sched_clusters = 2; 
static void test_init(void) 
{ 
    cpumask_set_cpu(0, &sched_cluster[0].cpus); 
    cpumask_set_cpu(1, &sched_cluster[0].cpus); 
    cpumask_set_cpu(2, &sched_cluster[0].cpus); 
    cpumask_set_cpu(3, &sched_cluster[0].cpus); 
    cpumask_set_cpu(4, &sched_cluster[0].cpus); 
    cpumask_set_cpu(5, &sched_cluster[0].cpus); 
 
    cpumask_set_cpu(6, &sched_cluster[1].cpus); 
    cpumask_set_cpu(7, &sched_cluster[1].cpus); 
} 
#endif 
 
static void cpumask_print(cpumask_t *cpus) 
{ 
    int i; 
 
    for (i = 0; i < 8; i++) { 
        if (cpumask_test_cpu(i, cpus)) { 
            printf("CPU%d ", i); 
        } 
    } 
    printf("\n"); 
} 
 
int main() 
{ 
    int i, j; 
    cpumask_t **cpu_array; 
 
    test_init(); 
    cpu_array = build_cpu_array(); 
 
    for (i = 0; i < num_sched_clusters; i++) { 
        for (j = 0; j < num_sched_clusters; j++) { 
            printf("cpu_array[%d][%d]: ", i, j); 
            cpumask_print(&cpu_array[i][j]); 
        } 
    } 
 
    return 0; 
}

执行结果:

对于 6+2 架构: 
cpu_array[0][0]: CPU0 CPU1 CPU2 CPU3 CPU4 CPU5  
cpu_array[0][1]: CPU6 CPU7  
cpu_array[1][0]: CPU6 CPU7  
cpu_array[1][1]: CPU0 CPU1 CPU2 CPU3 CPU4 CPU5  
 
 
对于 4+3+1 架构: 
cpu_array[0][0]: CPU0 CPU1 CPU2 CPU3  
cpu_array[0][1]: CPU4 CPU5 CPU6  
cpu_array[0][2]: CPU7  
cpu_array[1][0]: CPU4 CPU5 CPU6  
cpu_array[1][1]: CPU7  
cpu_array[1][2]: CPU0 CPU1 CPU2 CPU3  
cpu_array[2][0]: CPU7  
cpu_array[2][1]: CPU4 CPU5 CPU6  
cpu_array[2][2]: CPU0 CPU1 CPU2 CPU3 

从 4+3+1 架构中可以看出,开始选核的cluster不同的情况下三个cluster是怎么选核的。MTK的选核逻辑一致。

3. walt选核流程中的使用

int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu, int sync, int sibling_count_hint) //fair.c 
{ 
    int start_cpu, order_index, end_index; 
    ... 
    walt_get_indicies(p, &order_index, &end_index, task_boost, boosted); 
 
    /* start_cpu 控制从哪个核开始选 */ 
    start_cpu = cpumask_first(&cpu_array[order_index][0]); 
     
    fbt_env.start_cpu = start_cpu; 
    fbt_env.order_index = order_index; /*选核cluster id*/ 
    fbt_env.end_index = end_index; 
    fbt_env.strict_max = is_rtg && (task_boost == TASK_BOOST_STRICT_MAX); //walt_get_indicies中指定从大核开始选了 
 
    /*walt进行遍历算核,将选出的放在candidates中,最多只往里面set 2个cpu*/ 
    walt_find_best_target(NULL, candidates, p, &fbt_env); 
    ... 
} 
 
/* 设置 order_index 和 end_index 的值 */ 
static void walt_get_indicies(struct task_struct *p, int *order_index, int *end_index, int task_boost, bool boosted) //fair.c 
{ 
    int i = 0; 
 
    *order_index = 0;//设置为0 
    *end_index = 0; //设置为0 
 
    if (num_sched_clusters <= 1) 
        return; 
 
    if (task_boost > TASK_BOOST_ON_MID) { 
        *order_index = num_sched_clusters - 1; //减一后表示大核 
        return; 
    } 
 
    if (is_full_throttle_boost()) { 
        *order_index = num_sched_clusters - 1; //选大核 
        if ((*order_index > 1) && task_demand_fits(p, cpumask_first(&cpu_array[*order_index][1]))) //大核的[1][1]是中核(4+3+1)或小核(6+2) 
            *end_index = 1; //表述允许从中核或小核上选核 
        return; 
    } 
 
    if (boosted || task_boost_policy(p) == SCHED_BOOST_ON_BIG || walt_task_skip_min_cpu(p)) 
        *order_index = 1; //跳过小核 
 
    /* 若是没有boost操作,就根据算力从 order_index 开始选 */ 
    for (i = *order_index ; i < num_sched_clusters - 1; i++) { 
        if (task_demand_fits(p, cpumask_first(&cpu_array[i][0]))) 
            break; 
    } 
    *order_index = i; 
} 
 
 
static void walt_find_best_target(struct sched_domain *sd, cpumask_t *cpus, struct task_struct *p, struct find_best_target_env *fbt_env) //fair.c 
{ 
    int i, start_cpu = fbt_env->start_cpu; 
    int order_index = fbt_env->order_index, end_index = fbt_env->end_index; 
    ... 
    /* order_index>0 表示跳过小核开始选核,并且任务是在等待I/O,只能选大核(6+2),或只能选中大核(4+3+1) */ 
    if (order_index > 0 && p->wts.iowaited) { 
        stop_index = num_sched_clusters - 2; //0或1 
        most_spare_wake_cap = LONG_MIN; 
    } 
 
    /*  
     * strict_max=is_rtg && BOOST_MAX, walt_get_indicies中对BOOST_MAX指定从大核开始选核了,这里stop_index=0表示只能选大核了, 
     * 无论是6+2还是4+3+1架构 
     */ 
    if (fbt_env->strict_max) { 
        stop_index = 0; 
        most_spare_wake_cap = LONG_MIN; 
    } 
    ... 
     
    /* 慢速路径 */ 
    for (cluster = 0; cluster < num_sched_clusters; cluster++) { 
        /* 
         * 从任务p的亲和性cpu和此cluster的cpu的交集中选择。从order_index这个cluster开始选核, 
         * 它不同,一级数组中的大小核排布就不同。 
         */ 
        cpumask_and(&visit_cpus, &p->cpus_mask, &cpu_array[order_index][cluster]); 
        for_each_cpu(i, &visit_cpus) { 
            ... //选核逻辑,没有使用到相关成员 
        } 
 
        /*  
         * end_index 是结束选核的cluster的下标,也就是根据boost情况和算力,到end_index就应该结束选核了, 
         * 并且此时已经选到了target_cpu,且 初始选核的cluster是大核簇 或 开始选核的这个cluster中不只有1 
         * 个CPU 或 target_cpu不是开始选核的这个cluster的首个CPU, 就不再在下一个cluster中继续选核心了。 
         */ 
        if ((cluster >= end_index) && (target_cpu != -1) && walt_target_ok(target_cpu, order_index)) 
            break; 
 
        /* 找到了最大空余算力的cpu 且 达到了停止选核的stop_index阈值,退出继续遍历其它cluster进行选核的流程 */ 
        if (most_spare_cap_cpu != -1 && cluster >= stop_index) 
            break; 
     
    } 
    ... 
done: 
        /* 
         * trace log: 
         * cat-32758 [001] d..4  5235.529561: sched_find_best_target: pid=18090 comm=kworker/u16:25 
         * start_cpu=0 best_idle=-1 most_spare_cap=-1 target=0 order_index=0 end_index=0 skip=-1 running=0 
         */ 
        trace_sched_find_best_target(p, min_util, start_cpu, best_idle_cpu, most_spare_cap_cpu, 
                 target_cpu, order_index, end_index, fbt_env->skip_cpu, p->state == TASK_RUNNING); 
}

order_index 越大表示越从算力高的cluster开始选核,但是 end_index 需要参考我们写的应用程序配和来看。


本文参考链接:https://www.cnblogs.com/hellokitty2/p/15438824.html