// vim kernel/sched/topology.c +1333
/*
 * A system can have three types of NUMA topology:
 * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system
 * NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes
 * NUMA_BACKPLANE: nodes can reach other nodes through a backplane
 *
 * The difference between a glueless mesh topology and a backplane
 * topology lies in whether communication between not directly
 * connected nodes goes through intermediary nodes (where programs
 * could run), or through backplane controllers. This affects
 * placement of programs.
 *
 * The type of topology can be discerned with the following tests:
 * - If the maximum distance between any nodes is 1 hop, the system
 *   is directly connected.
 * - If for two nodes A and B, located N > 1 hops away from each other,
 *   there is an intermediary node C, which is < N hops away from both
 *   nodes A and B, the system is a glueless mesh.
 */
static void init_numa_topology_type(void)
{
    int a, b, c, n;

    n = sched_max_numa_distance;

    if (sched_domains_numa_levels <= 2) {
        sched_numa_topology_type = NUMA_DIRECT;
        return;
    }

    for_each_online_node(a) {
        for_each_online_node(b) {
            /* Find two nodes furthest removed from each other. */
            if (node_distance(a, b) < n)
                continue;

            /* Is there an intermediary node between a and b? */
            for_each_online_node(c) {
                if (node_distance(a, c) < n &&
                    node_distance(b, c) < n) {
                    sched_numa_topology_type =
                            NUMA_GLUELESS_MESH;
                    return;
                }
            }

            sched_numa_topology_type = NUMA_BACKPLANE;
            return;
        }
    }
}
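To make the classification tests concrete, here is an illustrative sketch: three hypothetical 4-node distance tables (invented values, loosely following ACPI SLIT conventions where 10 means local), one per topology type. These arrays are not kernel code, just data that each branch of init_numa_topology_type() would match:

/* Illustrative only: hypothetical SLIT-style node_distance() tables. */
static const int direct_dist[4][4] = {      /* every remote node is 1 hop away
                                             * -> NUMA_DIRECT */
    { 10, 20, 20, 20 },
    { 20, 10, 20, 20 },
    { 20, 20, 10, 20 },
    { 20, 20, 20, 10 },
};

static const int mesh_dist[4][4] = {        /* nodes 0 and 3 are 30 apart, but
                                             * node 1 is only 20 from both, so a
                                             * program-capable intermediary exists
                                             * -> NUMA_GLUELESS_MESH */
    { 10, 20, 20, 30 },
    { 20, 10, 20, 20 },
    { 20, 20, 10, 20 },
    { 30, 20, 20, 10 },
};

static const int backplane_dist[4][4] = {   /* nodes 0 and 2 are 30 apart and no
                                             * third node is closer than 30 to
                                             * both -> NUMA_BACKPLANE */
    { 10, 20, 30, 30 },
    { 20, 10, 30, 30 },
    { 30, 30, 10, 20 },
    { 30, 30, 20, 10 },
};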
// vim CTKernel/include/linux/sched/topology.h +20
#define SD_LOAD_BALANCE         0x0001  /* Do load balancing on this domain. */
#define SD_BALANCE_NEWIDLE      0x0002  /* Balance when about to become idle */
#define SD_BALANCE_EXEC         0x0004  /* Balance on exec */
#define SD_BALANCE_FORK         0x0008  /* Balance on fork, clone */
#define SD_BALANCE_WAKE         0x0010  /* Balance on wakeup */
#define SD_WAKE_AFFINE          0x0020  /* Wake task to waking CPU */
#define SD_ASYM_CPUCAPACITY     0x0040  /* Groups have different max cpu capacities */
#define SD_SHARE_CPUCAPACITY    0x0080  /* Domain members share cpu capacity */
#define SD_SHARE_POWERDOMAIN    0x0100  /* Domain members share power domain */
#define SD_SHARE_PKG_RESOURCES  0x0200  /* Domain members share cpu pkg resources */
#define SD_SERIALIZE            0x0400  /* Only a single load balancing instance */
#define SD_ASYM_PACKING         0x0800  /* Place busy groups earlier in the domain */
#define SD_PREFER_SIBLING       0x1000  /* Prefer to place tasks in a sibling domain */
#define SD_OVERLAP              0x2000  /* sched_domains of this level overlap */
#define SD_NUMA                 0x4000  /* cross-node balancing */
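Each of these is a single bit OR-ed into a sched_domain's sd->flags, and balancing code tests the bits individually; the wakeup path, for instance, hands SD_BALANCE_WAKE down to the CPU-selection code (see try_to_wake_up() further down). A minimal sketch with a hypothetical helper, just to show the shape of such a test:

#include <linux/sched/topology.h>

/* Hypothetical helper, for illustration only: sched_domain flag tests
 * are plain bitmask checks against sd->flags.
 */
static inline bool sd_balances_on_wake(struct sched_domain *sd)
{
    return sd->flags & SD_BALANCE_WAKE; /* set when this domain balances on wakeup */
}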
// vim mm/ksm.c +3172
static int __init ksm_init(void)
{
    struct task_struct *ksm_thread;
    int err;

    /* The correct value depends on page size and endianness */
    zero_checksum = calc_checksum(ZERO_PAGE(0));
    /* Default to false for backwards compatibility */
    ksm_use_zero_pages = false;

    err = ksm_slab_init();
    if (err)
        goto out;

    ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd");
    if (IS_ERR(ksm_thread)) {
        pr_err("ksm: creating kthread failed\n");
        err = PTR_ERR(ksm_thread);
        goto out_free;
    }

#ifdef CONFIG_SYSFS
    err = sysfs_create_group(mm_kobj, &ksm_attr_group);
    if (err) {
        pr_err("ksm: register sysfs failed\n");
        kthread_stop(ksm_thread);
        goto out_free;
    }
#else
    ksm_run = KSM_RUN_MERGE;    /* no way for user to start it */

#endif /* CONFIG_SYSFS */

#ifdef CONFIG_MEMORY_HOTREMOVE
    /* There is no significance to this priority 100 */
    hotplug_memory_notifier(ksm_memory_callback, 100);
#endif
    return 0;

out_free:
    ksm_slab_free();
out:
    return err;
}
// vim include/linux/kthread.h +34
/**
 * kthread_run - create and wake a thread.
 * @threadfn: the function to run until signal_pending(current).
 * @data: data ptr for @threadfn.
 * @namefmt: printf-style name for the thread.
 *
 * Description: Convenient wrapper for kthread_create() followed by
 * wake_up_process(). Returns the kthread or ERR_PTR(-ENOMEM).
 */
#define kthread_run(threadfn, data, namefmt, ...)                          \
({                                                                         \
    struct task_struct *__k                                                \
        = kthread_create(threadfn, data, namefmt, ## __VA_ARGS__);        \
    if (!IS_ERR(__k))                                                      \
        wake_up_process(__k);                                              \
    __k;                                                                   \
})
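A minimal usage sketch (the function and thread names are illustrative, not from any source above): kthread_run() hands back an already-running thread, and the conventional shutdown handshake is kthread_stop() on one side and kthread_should_stop() in the loop:

#include <linux/kthread.h>
#include <linux/delay.h>
#include <linux/err.h>

static struct task_struct *demo_task;

static int demo_threadfn(void *data)    /* hypothetical worker */
{
    while (!kthread_should_stop()) {    /* true once kthread_stop() is called */
        /* ... do periodic work ... */
        msleep(1000);
    }
    return 0;   /* this value is returned by kthread_stop() */
}

static int demo_start(void)
{
    demo_task = kthread_run(demo_threadfn, NULL, "demo_kthread");
    if (IS_ERR(demo_task))
        return PTR_ERR(demo_task);  /* thread was never created */
    return 0;
}

static void demo_stop(void)
{
    kthread_stop(demo_task);    /* blocks until demo_threadfn() returns */
}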
kthread_create
// vim include/linux/kthread.h +14
/**
 * kthread_create - create a kthread on the current node
 * @threadfn: the function to run in the thread
 * @data: data pointer for @threadfn()
 * @namefmt: printf-style format string for the thread name
 * @arg...: arguments for @namefmt.
 *
 * This macro will create a kthread on the current node, leaving it in
 * the stopped state. This is just a helper for kthread_create_on_node();
 * see the documentation there for more details.
 */
#define kthread_create(threadfn, data, namefmt, arg...) \
    kthread_create_on_node(threadfn, data, NUMA_NO_NODE, namefmt, ##arg)
// vim kernel/kthread.c +357
/**
 * kthread_create_on_node - create a kthread.
 * @threadfn: the function to run until signal_pending(current).
 * @data: data ptr for @threadfn.
 * @node: task and thread structures for the thread are allocated on this node
 * @namefmt: printf-style name for the thread.
 *
 * Description: This helper function creates and names a kernel
 * thread. The thread will be stopped: use wake_up_process() to start
 * it. See also kthread_run(). The new thread has SCHED_NORMAL policy and
 * is affine to all CPUs.
 *
 * If thread is going to be bound on a particular cpu, give its node
 * in @node, to get NUMA affinity for kthread stack, or else give NUMA_NO_NODE.
 * When woken, the thread will run @threadfn() with @data as its
 * argument. @threadfn() can either call do_exit() directly if it is a
 * standalone thread for which no one will call kthread_stop(), or
 * return when 'kthread_should_stop()' is true (which means
 * kthread_stop() has been called). The return value should be zero
 * or a negative error number; it will be passed to kthread_stop().
 *
 * Returns a task_struct or ERR_PTR(-ENOMEM) or ERR_PTR(-EINTR).
 */
struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
                                           void *data, int node,
                                           const char namefmt[],
                                           ...)
{
    struct task_struct *task;
    va_list args;

    va_start(args, namefmt);
    task = __kthread_create_on_node(threadfn, data, node, namefmt, args);
    va_end(args);

    return task;
}
EXPORT_SYMBOL(kthread_create_on_node);
// vim kernel/kthread.c +293
static __printf(4, 0)
struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
                                             void *data, int node,
                                             const char namefmt[],
                                             va_list args)
{
    DECLARE_COMPLETION_ONSTACK(done);
    struct task_struct *task;
    struct kthread_create_info *create = kmalloc(sizeof(*create),
                                                 GFP_KERNEL);

    if (!create)
        return ERR_PTR(-ENOMEM);
    create->threadfn = threadfn;
    create->data = data;
    create->node = node;
    create->done = &done;

    spin_lock(&kthread_create_lock);
    list_add_tail(&create->list, &kthread_create_list);
    spin_unlock(&kthread_create_lock);

    wake_up_process(kthreadd_task);
    /*
     * Wait for completion in killable state, for I might be chosen by
     * the OOM killer while kthreadd is trying to allocate memory for
     * new kernel thread.
     */
    if (unlikely(wait_for_completion_killable(&done))) {
        /*
         * If I was SIGKILLed before kthreadd (or new kernel thread)
         * calls complete(), leave the cleanup of this structure to
         * that thread.
         */
        if (xchg(&create->done, NULL))
            return ERR_PTR(-EINTR);
        /*
         * kthreadd (or new kernel thread) will call complete()
         * shortly.
         */
        wait_for_completion(&done);
    }
    task = create->result;
    if (!IS_ERR(task)) {
        static const struct sched_param param = { .sched_priority = 0 };
        char name[TASK_COMM_LEN];

        /*
         * task is already visible to other tasks, so updating
         * COMM must be protected.
         */
        vsnprintf(name, sizeof(name), namefmt, args);
        set_task_comm(task, name);
        /*
         * root may have changed our (kthreadd's) priority or CPU mask.
         * The kernel thread should not inherit these properties.
         */
        sched_setscheduler_nocheck(task, SCHED_NORMAL, &param);
        set_cpus_allowed_ptr(task, cpu_all_mask);
    }
    kfree(create);
    return task;
}
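Note the set_cpus_allowed_ptr(task, cpu_all_mask) near the end: a freshly created kthread is allowed on every CPU, isolated ones included, until something narrows its mask. For contrast, a minimal sketch (hypothetical helper name) of the standard pattern for pinning a kthread before it ever runs:

#include <linux/kthread.h>
#include <linux/err.h>

/* Hypothetical helper: create a worker pinned to @cpu before it runs. */
static struct task_struct *demo_create_pinned(int (*fn)(void *), void *data,
                                              unsigned int cpu)
{
    struct task_struct *t;

    /* Created stopped, so affinity can be set before the first wakeup. */
    t = kthread_create(fn, data, "demo_worker/%u", cpu);
    if (IS_ERR(t))
        return t;

    kthread_bind(t, cpu);   /* restrict cpus_allowed to just @cpu */
    wake_up_process(t);     /* now it can only be scheduled on @cpu */
    return t;
}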
wake_up_process
// vim kernel/sched/core.c +2141
/**
 * wake_up_process - Wake up a specific process
 * @p: The process to be woken up.
 *
 * Attempt to wake up the nominated process and move it to the set of runnable
 * processes.
 *
 * Return: 1 if the process was woken up, 0 if it was already running.
 *
 * This function executes a full memory barrier before accessing the task state.
 */
int wake_up_process(struct task_struct *p)
{
    return try_to_wake_up(p, TASK_NORMAL, 0);
}
EXPORT_SYMBOL(wake_up_process);
// vim kernel/sched/core.c +1959
/**
 * try_to_wake_up - wake up a thread
 * @p: the thread to be awakened
 * @state: the mask of task states that can be woken
 * @wake_flags: wake modifier flags (WF_*)
 *
 * If (@state & @p->state) @p->state = TASK_RUNNING.
 *
 * If the task was not queued/runnable, also place it back on a runqueue.
 *
 * Atomic against schedule() which would dequeue a task, also see
 * set_current_state().
 *
 * This function executes a full memory barrier before accessing the task
 * state; see set_current_state().
 *
 * Return: %true if @p->state changes (an actual wakeup was done),
 *         %false otherwise.
 */
static int
try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
{
    unsigned long flags;
    int cpu, success = 0;

    /*
     * If we are going to wake up a thread waiting for CONDITION we
     * need to ensure that CONDITION=1 done by the caller can not be
     * reordered with p->state check below. This pairs with mb() in
     * set_current_state() the waiting thread does.
     */
    raw_spin_lock_irqsave(&p->pi_lock, flags);
    smp_mb__after_spinlock();
    if (!(p->state & state))
        goto out;

    trace_sched_waking(p);

    /* We're going to change ->state: */
    success = 1;
    cpu = task_cpu(p);

    /*
     * Ensure we load p->on_rq _after_ p->state, otherwise it would
     * be possible to, falsely, observe p->on_rq == 0 and get stuck
     * in smp_cond_load_acquire() below.
     *
     * sched_ttwu_pending()                 try_to_wake_up()
     *   STORE p->on_rq = 1                   LOAD p->state
     *   UNLOCK rq->lock
     *
     * __schedule() (switch to task 'p')
     *   LOCK rq->lock                        smp_rmb();
     *   smp_mb__after_spinlock();
     *   UNLOCK rq->lock
     *
     * [task p]
     *   STORE p->state = UNINTERRUPTIBLE     LOAD p->on_rq
     *
     * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
     * __schedule(). See the comment for smp_mb__after_spinlock().
     */
    smp_rmb();
    if (p->on_rq && ttwu_remote(p, wake_flags))
        goto stat;

#ifdef CONFIG_SMP
    /*
     * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be
     * possible to, falsely, observe p->on_cpu == 0.
     *
     * One must be running (->on_cpu == 1) in order to remove oneself
     * from the runqueue.
     *
     * __schedule() (switch to task 'p')    try_to_wake_up()
     *   STORE p->on_cpu = 1                  LOAD p->on_rq
     *   UNLOCK rq->lock
     *
     * __schedule() (put 'p' to sleep)
     *   LOCK rq->lock                        smp_rmb();
     *   smp_mb__after_spinlock();
     *   STORE p->on_rq = 0                   LOAD p->on_cpu
     *
     * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
     * __schedule(). See the comment for smp_mb__after_spinlock().
     */
    smp_rmb();

    /*
     * If the owning (remote) CPU is still in the middle of schedule() with
     * this task as prev, wait until its done referencing the task.
     *
     * Pairs with the smp_store_release() in finish_task().
     *
     * This ensures that tasks getting woken will be fully ordered against
     * their previous state and preserve Program Order.
     */
    smp_cond_load_acquire(&p->on_cpu, !VAL);

    p->sched_contributes_to_load = !!task_contributes_to_load(p);
    p->state = TASK_WAKING;

    if (p->in_iowait) {
        delayacct_blkio_end(p);
        atomic_dec(&task_rq(p)->nr_iowait);
    }

    cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
    if (task_cpu(p) != cpu) {
        wake_flags |= WF_MIGRATED;
        psi_ttwu_dequeue(p);
        set_task_cpu(p, cpu);
    }

#else /* CONFIG_SMP */

    if (p->in_iowait) {
        delayacct_blkio_end(p);
        atomic_dec(&task_rq(p)->nr_iowait);
    }

#endif /* CONFIG_SMP */

    ttwu_queue(p, cpu, wake_flags);
stat:
    ttwu_stat(p, cpu, wake_flags);
out:
    raw_spin_unlock_irqrestore(&p->pi_lock, flags);

    return success;
}
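The call that matters for this bug is select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags): this is where the wakeup path picks the destination CPU, and on the affected kernel the fair-class idle search could hand back a CPU from an isolated domain. Below is a minimal sketch of the general constraint the upstream fixes listed at the end of this article enforce; it is illustrative, not the literal patch:

/* Sketch only: a candidate CPU must lie in the task's affinity mask and
 * inside the span of the sched_domain being searched. CPUs isolated via
 * isolcpus= belong to no sched_domain, so the span test filters them out.
 * (On kernels before v5.3, use &p->cpus_allowed instead of p->cpus_ptr.)
 */
static bool cpu_is_valid_pick(struct task_struct *p,
                              struct sched_domain *sd, int cpu)
{
    return cpumask_test_cpu(cpu, p->cpus_ptr) &&
           cpumask_test_cpu(cpu, sched_domain_span(sd));
}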
Proposed fix
Bug reproduction
Process 0 (swapper)
Check which CPU process 0 is running on before SMP (multi-core) bring-up during system initialization:
ksmd
From /var/log/messages, pull the log entries showing ksmd being scheduled onto an isolated core:
grep 'p->comm:ksmd' /var/log/messages -inr --color > ksmd.log
vim ksmd.log
kthreadd
From /var/log/messages, pull the log entries showing kthreadd being scheduled onto an isolated core:
grep 'p->comm:kthreadd' /var/log/messages -inr --color > kthreadd.log
vim kthreadd.log
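The p->comm:... strings in these greps are not stock kernel output; they presumably come from debug instrumentation added while chasing this bug. A sketch of the kind of helper that would produce such lines (its placement in the wakeup path, the HK_FLAG_DOMAIN check, and the format string are all assumptions):

#include <linux/sched/isolation.h>

/* Assumed debug helper (not in the stock kernel): log when the wakeup
 * path picks a CPU isolated with isolcpus=. housekeeping_test_cpu()
 * returns false for such CPUs.
 */
static void debug_report_isolated_pick(struct task_struct *p, int cpu)
{
    if (!housekeeping_test_cpu(cpu, HK_FLAG_DOMAIN))
        printk(KERN_WARNING "picked isolated cpu:%d p->comm:%s\n",
               cpu, p->comm);
}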
Related upstream commits:
8aeaffef8c6e sched/fair: Take the scheduling domain into account in select_idle_smt()
3e6efe87cd5c sched/fair: Remove redundant check in select_idle_smt()
3e8c6c9aac42 sched/fair: Remove task_util from effective utilization in feec()
c722f35b513f sched/fair: Bring back select_idle_smt(), but differently
6cd56ef1df39 sched/fair: Remove select_idle_smt()
df3cb4ea1fb6 sched/fair: Fix wrong cpu selecting from isolated domain