diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 1bd144ed2..dd178cde7 100755
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -142,6 +142,60 @@ config GENERIC_IRQ_DEBUGFS
 
 	  If you don't know what to do here, say N.
 
+config IRQ_SBALANCE
+	bool "SBalance IRQ balancer"
+	depends on SMP
+	default n
+	help
+	  This is a simple IRQ balancer that polls every X number of
+	  milliseconds and moves IRQs from the most interrupt-heavy CPU to the
+	  least interrupt-heavy CPUs until the heaviest CPU is no longer the
+	  heaviest. IRQs are only moved from one source CPU to any number of
+	  destination CPUs per balance run. Balancing is skipped if the gap
+	  between the most interrupt-heavy CPU and the least interrupt-heavy CPU
+	  is below the configured threshold of interrupts.
+
+	  The heaviest IRQs are targeted for migration in order to reduce the
+	  number of IRQs to migrate. If moving an IRQ would reduce overall
+	  balance, then it won't be migrated.
+
+	  The most interrupt-heavy CPU is calculated by scaling the number of
+	  new interrupts on that CPU to the CPU's current capacity. This way,
+	  interrupt heaviness takes into account factors such as thermal
+	  pressure and time spent processing interrupts rather than just the
+	  sheer number of them. This also makes SBalance aware of CPU asymmetry,
+	  where different CPUs can have different performance capacities and be
+	  proportionally balanced.
+
+if IRQ_SBALANCE
+config IRQ_SBALANCE_POLL_MSEC
+	int "Polling interval in milliseconds"
+	default 3000
+	help
+	  Perform IRQ balancing every X milliseconds.
+
+config IRQ_SBALANCE_THRESH
+	int "Balance threshold in number of interrupts"
+	default 1024
+	help
+	  There needs to be a difference of at least this many new interrupts
+	  between the heaviest and least-heavy CPUs during the last polling
+	  window in order for balancing to occur. This is to avoid balancing
+	  when the system is quiet.
+
+	  This threshold is compared to the _scaled_ interrupt counts per CPU;
+	  i.e., the number of interrupts scaled to the CPU's capacity.
+
+config SBALANCE_EXCLUDE_CPUS
+	string "CPUs to exclude from balancing"
+	help
+	  Comma-separated list of CPUs to exclude from IRQ balancing.
+
+	  For example, to ignore CPU0, CPU1, and CPU2, it is valid to provide
+	  "0,1-2" or "0-2" or "0,1,2".
+
+endif
+
 endmenu
 
 config GENERIC_IRQ_MULTI_HANDLER
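The capacity scaling described in the IRQ_SBALANCE help text above boils down to normalizing each CPU's count of new interrupts by that CPU's current capacity, so a throttled or smaller CPU is treated as heavier at a lower raw count. Below is a minimal userspace sketch of that arithmetic, not part of the patch; the numbers are made up and scaled_intrs() is a hypothetical stand-in that mirrors scale_intrs() in sbalance.c further down.

#include <stdio.h>

/* Mirrors the kernel's SCHED_CAPACITY_SCALE (1 << 10). */
#define SCHED_CAPACITY_SCALE 1024

/* Hypothetical stand-in for scale_intrs(): normalize a count by capacity. */
static unsigned int scaled_intrs(unsigned int intrs, unsigned long capacity)
{
	return intrs * SCHED_CAPACITY_SCALE / capacity;
}

int main(void)
{
	/*
	 * A little core throttled to capacity 512 that took 6000 new IRQs
	 * scales to 12000, while a big core at full capacity 1024 that took
	 * 9000 scales to 9000. The little core is therefore the heaviest CPU,
	 * and since the scaled gap (3000) exceeds the default threshold of
	 * 1024, a balance run would proceed.
	 */
	printf("little: %u\n", scaled_intrs(6000, 512));
	printf("big:    %u\n", scaled_intrs(9000, 1024));
	return 0;
}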
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index b4f53717d..a485d87af 100755
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -18,3 +18,4 @@ obj-$(CONFIG_GENERIC_IRQ_IPI) += ipi.o
 obj-$(CONFIG_SMP) += affinity.o
 obj-$(CONFIG_GENERIC_IRQ_DEBUGFS) += debugfs.o
 obj-$(CONFIG_GENERIC_IRQ_MATRIX_ALLOCATOR) += matrix.o
+obj-$(CONFIG_IRQ_SBALANCE) += sbalance.o
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index f1d83a8b4..a241db3f6 100755
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -130,6 +130,15 @@ static inline void unregister_handler_proc(unsigned int irq,
 					   struct irqaction *action) { }
 #endif
 
+#ifdef CONFIG_IRQ_SBALANCE
+extern void sbalance_desc_add(struct irq_desc *desc);
+extern void sbalance_desc_del(struct irq_desc *desc);
+#else
+static inline void sbalance_desc_add(struct irq_desc *desc) { }
+static inline void sbalance_desc_del(struct irq_desc *desc) { }
+#endif
+
+extern bool __irq_can_set_affinity(struct irq_desc *desc);
 extern bool irq_can_set_affinity_usr(unsigned int irq);
 
 extern void irq_set_thread_affinity(struct irq_desc *desc);
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 74df02894..f799994d2 100755
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -413,6 +413,7 @@ static struct irq_desc *alloc_desc(int irq, int node, unsigned int flags,
 	desc_set_defaults(irq, desc, node, affinity, owner);
 	irqd_set(&desc->irq_data, flags);
 	kobject_init(&desc->kobj, &irq_kobj_type);
+	sbalance_desc_add(desc);
 
 	return desc;
 
@@ -443,6 +444,7 @@ static void free_desc(unsigned int irq)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
 
+	sbalance_desc_del(desc);
 	irq_remove_debugfs_entry(desc);
 	unregister_irq_proc(irq, desc);
 
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index df19cc9ae..41e986c3e 100755
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -147,7 +147,7 @@ EXPORT_SYMBOL(synchronize_irq);
 #ifdef CONFIG_SMP
 cpumask_var_t irq_default_affinity;
 
-static bool __irq_can_set_affinity(struct irq_desc *desc)
+bool __irq_can_set_affinity(struct irq_desc *desc)
 {
 	if (!desc || !irqd_can_balance(&desc->irq_data) ||
 	    !desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity)
diff --git a/kernel/irq/sbalance.c b/kernel/irq/sbalance.c
new file mode 100644
index 000000000..b062c4f6a
--- /dev/null
+++ b/kernel/irq/sbalance.c
@@ -0,0 +1,357 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2023 Sultan Alsawaf .
+ */
+
+/**
+ * DOC: SBalance description
+ *
+ * This is a simple IRQ balancer that polls every X number of milliseconds and
+ * moves IRQs from the most interrupt-heavy CPU to the least interrupt-heavy
+ * CPUs until the heaviest CPU is no longer the heaviest. IRQs are only moved
+ * from one source CPU to any number of destination CPUs per balance run.
+ * Balancing is skipped if the gap between the most interrupt-heavy CPU and the
+ * least interrupt-heavy CPU is below the configured threshold of interrupts.
+ *
+ * The heaviest IRQs are targeted for migration in order to reduce the number of
+ * IRQs to migrate. If moving an IRQ would reduce overall balance, then it won't
+ * be migrated.
+ *
+ * The most interrupt-heavy CPU is calculated by scaling the number of new
+ * interrupts on that CPU to the CPU's current capacity. This way, interrupt
+ * heaviness takes into account factors such as thermal pressure and time spent
+ * processing interrupts rather than just the sheer number of them. This also
+ * makes SBalance aware of CPU asymmetry, where different CPUs can have
+ * different performance capacities and be proportionally balanced.
+ */
+
+#define pr_fmt(fmt) "sbalance: " fmt
+
+#include <linux/freezer.h>
+#include <linux/irq.h>
+#include <linux/list_sort.h>
+#include "../sched/sched.h"
+#include "internals.h"
+
+/* Perform IRQ balancing every POLL_MS milliseconds */
+#define POLL_MS CONFIG_IRQ_SBALANCE_POLL_MSEC
+
+/*
+ * There needs to be a difference of at least this many new interrupts between
+ * the heaviest and least-heavy CPUs during the last polling window in order for
+ * balancing to occur. This is to avoid balancing when the system is quiet.
+ *
+ * This threshold is compared to the _scaled_ interrupt counts per CPU; i.e.,
+ * the number of interrupts scaled to the CPU's capacity.
+ */
+#define IRQ_SCALED_THRESH CONFIG_IRQ_SBALANCE_THRESH
+
+struct bal_irq {
+	struct list_head node;
+	struct list_head move_node;
+	struct rcu_head rcu;
+	struct irq_desc *desc;
+	unsigned int delta_nr;
+	unsigned int old_nr;
+	int prev_cpu;
+};
+
+struct bal_domain {
+	struct list_head movable_irqs;
+	unsigned int intrs;
+	int cpu;
+};
+
+static LIST_HEAD(bal_irq_list);
+static DEFINE_SPINLOCK(bal_irq_lock);
+static DEFINE_PER_CPU(struct bal_domain, balance_data);
+static DEFINE_PER_CPU(unsigned long, cpu_cap);
+static cpumask_t cpu_exclude_mask __read_mostly;
+
+void sbalance_desc_add(struct irq_desc *desc)
+{
+	struct bal_irq *bi;
+
+	bi = kmalloc(sizeof(*bi), GFP_KERNEL);
+	if (WARN_ON(!bi))
+		return;
+
+	*bi = (typeof(*bi)){ .desc = desc };
+	spin_lock(&bal_irq_lock);
+	list_add_tail_rcu(&bi->node, &bal_irq_list);
+	spin_unlock(&bal_irq_lock);
+}
+
+void sbalance_desc_del(struct irq_desc *desc)
+{
+	struct bal_irq *bi;
+
+	spin_lock(&bal_irq_lock);
+	list_for_each_entry(bi, &bal_irq_list, node) {
+		if (bi->desc == desc) {
+			list_del_rcu(&bi->node);
+			kfree_rcu(bi, rcu);
+			break;
+		}
+	}
+	spin_unlock(&bal_irq_lock);
+}
+
+static int bal_irq_move_node_cmp(void *priv, const struct list_head *lhs_p,
+				 const struct list_head *rhs_p)
+{
+	const struct bal_irq *lhs = list_entry(lhs_p, typeof(*lhs), move_node);
+	const struct bal_irq *rhs = list_entry(rhs_p, typeof(*rhs), move_node);
+
+	return rhs->delta_nr - lhs->delta_nr;
+}
+
+/* Returns false if this IRQ should be totally ignored for this balancing run */
+static bool update_irq_data(struct bal_irq *bi, int *cpu)
+{
+	struct irq_desc *desc = bi->desc;
+	unsigned int nr;
+
+	/* Find the CPU which currently has this IRQ affined */
+	raw_spin_lock_irq(&desc->lock);
+	*cpu = cpumask_first(desc->irq_common_data.affinity);
+	raw_spin_unlock_irq(&desc->lock);
+	if (*cpu >= nr_cpu_ids)
+		return false;
+
+	/*
+	 * Calculate the number of new interrupts from this IRQ. It is assumed
+	 * that the IRQ has been running on the same CPU since the last
+	 * balancing run. This might not hold true if the IRQ was moved by
+	 * someone else since the last balancing run, or if the CPU this IRQ was
+	 * previously running on has since gone offline.
+	 */
+	nr = *per_cpu_ptr(desc->kstat_irqs, *cpu);
+	if (nr <= bi->old_nr) {
+		bi->old_nr = nr;
+		return false;
+	}
+
+	/* Calculate the number of new interrupts on this CPU from this IRQ */
+	bi->delta_nr = nr - bi->old_nr;
+	bi->old_nr = nr;
+	return true;
+}
+
+static int move_irq_to_cpu(struct bal_irq *bi, int cpu)
+{
+	struct irq_desc *desc = bi->desc;
+	int prev_cpu, ret;
+
+	/* Set the affinity if it wasn't changed since we looked at it */
+	raw_spin_lock_irq(&desc->lock);
+	prev_cpu = cpumask_first(desc->irq_common_data.affinity);
+	if (prev_cpu == bi->prev_cpu) {
+		ret = irq_set_affinity_locked(&desc->irq_data, cpumask_of(cpu),
+					      false);
+	} else {
+		bi->prev_cpu = prev_cpu;
+		ret = -EINVAL;
+	}
+	raw_spin_unlock_irq(&desc->lock);
+
+	if (!ret) {
+		/* Update the old interrupt count using the new CPU */
+		bi->old_nr = *per_cpu_ptr(desc->kstat_irqs, cpu);
+		pr_debug("Moved IRQ%d (CPU%d -> CPU%d)\n",
+			 irq_desc_get_irq(desc), prev_cpu, cpu);
+	}
+	return ret;
+}
+
+static unsigned int scale_intrs(unsigned int intrs, int cpu)
+{
+	/* Scale the number of interrupts to this CPU's current capacity */
+	return intrs * SCHED_CAPACITY_SCALE / per_cpu(cpu_cap, cpu);
+}
+
+/* Returns true if IRQ balancing should stop */
+static bool find_min_bd(const cpumask_t *mask, unsigned int max_intrs,
+			struct bal_domain **min_bd)
+{
+	unsigned int intrs, min_intrs = UINT_MAX;
+	struct bal_domain *bd;
+	int cpu;
+
+	for_each_cpu(cpu, mask) {
+		bd = per_cpu_ptr(&balance_data, cpu);
+		intrs = scale_intrs(bd->intrs, bd->cpu);
+
+		/* Terminate when the formerly-max CPU isn't the max anymore */
+		if (intrs > max_intrs)
+			return true;
+
+		/* Find the CPU with the lowest relative number of interrupts */
+		if (intrs < min_intrs) {
+			min_intrs = intrs;
+			*min_bd = bd;
+		}
+	}
+
+	/* Don't balance if IRQs are already balanced evenly enough */
+	return max_intrs - min_intrs < IRQ_SCALED_THRESH;
+}
+
+static void balance_irqs(void)
+{
+	static cpumask_t cpus;
+	struct bal_domain *bd, *max_bd, *min_bd;
+	unsigned int intrs, max_intrs;
+	bool moved_irq = false;
+	struct bal_irq *bi;
+	int cpu;
+
+	rcu_read_lock();
+
+	/* Find the available CPUs for balancing, if there are any */
+	cpumask_andnot(&cpus, cpu_active_mask, &cpu_exclude_mask);
+	if (unlikely(cpumask_weight(&cpus) <= 1))
+		goto unlock;
+
+	/*
+	 * Get the current capacity for each CPU. This is adjusted for time
+	 * spent processing IRQs, RT-task time, and thermal pressure. We don't
+	 * exclude time spent processing IRQs when balancing because balancing
+	 * is only done using interrupt counts rather than time spent in
+	 * interrupts. That way, time spent processing each interrupt is
+	 * considered when balancing.
+	 */
+	for_each_cpu(cpu, &cpus)
+		per_cpu(cpu_cap, cpu) = cpu_rq(cpu)->cpu_capacity;
+
+	list_for_each_entry_rcu(bi, &bal_irq_list, node) {
+		if (!update_irq_data(bi, &cpu))
+			continue;
+
+		/* Add the number of new interrupts to this CPU's count */
+		bd = per_cpu_ptr(&balance_data, cpu);
+		bd->intrs += bi->delta_nr;
+
+		/* Consider this IRQ for balancing if it's movable */
+		if (!__irq_can_set_affinity(bi->desc))
+			continue;
+
+		/* Ignore for this balancing run if something else moved it */
+		if (cpu != bi->prev_cpu) {
+			bi->prev_cpu = cpu;
+			continue;
+		}
+
+		list_add_tail(&bi->move_node, &bd->movable_irqs);
+	}
+
+	/* Find the most interrupt-heavy CPU with movable IRQs */
+	while (1) {
+		max_intrs = 0;
+		for_each_cpu(cpu, &cpus) {
+			bd = per_cpu_ptr(&balance_data, cpu);
+			intrs = scale_intrs(bd->intrs, bd->cpu);
+			if (intrs > max_intrs) {
+				max_intrs = intrs;
+				max_bd = bd;
+			}
+		}
+
+		/* No balancing to do if there aren't any movable IRQs */
+		if (unlikely(!max_intrs))
+			goto unlock;
+
+		/* Ensure the heaviest CPU has IRQs which can be moved away */
+		if (!list_empty(&max_bd->movable_irqs))
+			break;
+
+try_next_heaviest:
+		/*
+		 * If the heaviest CPU has no movable IRQs then it can neither
+		 * receive IRQs nor give IRQs. Exclude it from balancing so the
+		 * remaining CPUs can be balanced, if there are any.
+		 */
+		if (cpumask_weight(&cpus) == 2)
+			goto unlock;
+
+		cpumask_clear_cpu(max_bd->cpu, &cpus);
+	}
+
+	/* Find the CPU with the lowest relative interrupt count */
+	if (find_min_bd(&cpus, max_intrs, &min_bd))
+		goto unlock;
+
+	/* Sort movable IRQs in descending order of number of new interrupts */
+	list_sort(NULL, &max_bd->movable_irqs, bal_irq_move_node_cmp);
+
+	/* Push IRQs away from the heaviest CPU to the least-heavy CPUs */
+	list_for_each_entry(bi, &max_bd->movable_irqs, move_node) {
+		/* Skip this IRQ if it would just overload the target CPU */
+		intrs = scale_intrs(min_bd->intrs + bi->delta_nr, min_bd->cpu);
+		if (intrs >= max_intrs)
+			continue;
+
+		/* Try to migrate this IRQ, or skip it if migration fails */
+		if (move_irq_to_cpu(bi, min_bd->cpu))
+			continue;
+
+		/* Keep track of whether or not any IRQs are moved */
+		moved_irq = true;
+
+		/* Update the counts and recalculate the max scaled count */
+		min_bd->intrs += bi->delta_nr;
+		max_bd->intrs -= bi->delta_nr;
+		max_intrs = scale_intrs(max_bd->intrs, max_bd->cpu);
+
+		/* Recheck for the least-heavy CPU since it may have changed */
+		if (find_min_bd(&cpus, max_intrs, &min_bd))
+			break;
+	}
+
+	/*
+	 * If the heaviest CPU has movable IRQs which can't actually be moved,
+	 * then ignore it and try balancing the next heaviest CPU.
+	 */
+	if (!moved_irq)
+		goto try_next_heaviest;
+unlock:
+	rcu_read_unlock();
+
+	/* Reset each balance domain for the next run */
+	for_each_possible_cpu(cpu) {
+		bd = per_cpu_ptr(&balance_data, cpu);
+		INIT_LIST_HEAD(&bd->movable_irqs);
+		bd->intrs = 0;
+	}
+}
+
+static int __noreturn sbalance_thread(void *data)
+{
+	long poll_jiffies = msecs_to_jiffies(POLL_MS);
+	struct bal_domain *bd;
+	int cpu;
+
+	/* Parse the list of CPUs to exclude, if any */
+	if (cpulist_parse(CONFIG_SBALANCE_EXCLUDE_CPUS, &cpu_exclude_mask))
+		cpu_exclude_mask = CPU_MASK_NONE;
+
+	/* Initialize the data used for balancing */
+	for_each_possible_cpu(cpu) {
+		bd = per_cpu_ptr(&balance_data, cpu);
+		INIT_LIST_HEAD(&bd->movable_irqs);
+		bd->cpu = cpu;
+	}
+
+	set_freezable();
+	while (1) {
+		freezable_schedule_timeout_interruptible(poll_jiffies);
+		balance_irqs();
+	}
+}
+
+static int __init sbalance_init(void)
+{
+	BUG_ON(IS_ERR(kthread_run(sbalance_thread, NULL, "sbalanced")));
+	return 0;
+}
+late_initcall(sbalance_init);
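The "if moving an IRQ would reduce overall balance, then it won't be migrated" rule from the description corresponds to the check at the top of the migration loop in balance_irqs(): an IRQ is skipped when the destination CPU's scaled count after the move would reach or exceed the source CPU's current scaled count. Below is a minimal standalone sketch of that acceptance test, not part of the patch; the numbers are made up, and would_overload() and scaled_intrs() are hypothetical helpers named only for this illustration.

#include <stdbool.h>
#include <stdio.h>

#define SCHED_CAPACITY_SCALE 1024

static unsigned int scaled_intrs(unsigned int intrs, unsigned long capacity)
{
	return intrs * SCHED_CAPACITY_SCALE / capacity;
}

/*
 * Moving an IRQ is only worthwhile if the destination CPU's scaled count,
 * after taking on the IRQ's new interrupts, stays below the source CPU's
 * current scaled count; otherwise the move would just shift the imbalance.
 */
static bool would_overload(unsigned int dst_intrs, unsigned int irq_delta,
			   unsigned long dst_cap, unsigned int src_scaled)
{
	return scaled_intrs(dst_intrs + irq_delta, dst_cap) >= src_scaled;
}

int main(void)
{
	/* Made-up example: the source CPU currently scales to 12000. */
	unsigned int src_scaled = 12000;

	/* 2000 extra IRQs on a full-capacity CPU with 9000: 11000 < 12000. */
	printf("%d\n", would_overload(9000, 2000, 1024, src_scaled)); /* 0: move */

	/* 4000 extra IRQs on the same CPU: 13000 >= 12000, so it is skipped. */
	printf("%d\n", would_overload(9000, 4000, 1024, src_scaled)); /* 1: skip */
	return 0;
}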