// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2023 Sultan Alsawaf <sultan@kerneltoast.com>.
 */

/**
 * DOC: SBalance description
 *
 * This is a simple IRQ balancer that polls every X milliseconds and moves
 * IRQs from the most interrupt-heavy CPU to the least interrupt-heavy CPUs
 * until the heaviest CPU is no longer the heaviest. IRQs are only moved from
 * one source CPU to any number of destination CPUs per balance run.
 * Balancing is skipped if the gap between the most interrupt-heavy CPU and
 * the least interrupt-heavy CPU is below the configured threshold of
 * interrupts.
 *
 * The heaviest IRQs are targeted for migration in order to reduce the number
 * of IRQs to migrate. If moving an IRQ would reduce overall balance, then it
 * won't be migrated.
 *
 * The most interrupt-heavy CPU is calculated by scaling the number of new
 * interrupts on that CPU to the CPU's current capacity. This way, interrupt
 * heaviness takes into account factors such as thermal pressure and time
 * spent processing interrupts rather than just the sheer number of them. This
 * also makes SBalance aware of CPU asymmetry, where different CPUs can have
 * different performance capacities and be proportionally balanced.
 */
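
/*
 * Illustrative example (hypothetical numbers): a big CPU at full capacity
 * (1024) that handled 2000 new interrupts in the last window scales to 2000,
 * while a little CPU whose current capacity is 512 and handled 1500 new
 * interrupts scales to 3000. The little CPU is therefore treated as the
 * heavier of the two even though it handled fewer raw interrupts.
 */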

#define pr_fmt(fmt) "sbalance: " fmt

#include <linux/freezer.h>
#include <linux/irq.h>
#include <linux/list_sort.h>
#include "../sched/sched.h"
#include "internals.h"

/* Perform IRQ balancing every POLL_MS milliseconds */
#define POLL_MS CONFIG_IRQ_SBALANCE_POLL_MSEC

/*
 * There needs to be a difference of at least this many new interrupts between
 * the heaviest and least-heavy CPUs during the last polling window in order for
 * balancing to occur. This is to avoid balancing when the system is quiet.
 *
 * This threshold is compared to the _scaled_ interrupt counts per CPU; i.e.,
 * the number of interrupts scaled to the CPU's capacity.
 */
#define IRQ_SCALED_THRESH CONFIG_IRQ_SBALANCE_THRESH
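
/*
 * For illustration only (hypothetical Kconfig values, not defaults): with
 * CONFIG_IRQ_SBALANCE_POLL_MSEC=3000 and CONFIG_IRQ_SBALANCE_THRESH=1024, a
 * balance run happens every 3 seconds and is skipped unless the heaviest and
 * least-heavy CPUs differ by at least 1024 capacity-scaled interrupts.
 */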

struct bal_irq {
	struct list_head node;
	struct list_head move_node;
	struct rcu_head rcu;
	struct irq_desc *desc;
	unsigned int delta_nr;
	unsigned int old_nr;
	int prev_cpu;
};

struct bal_domain {
	struct list_head movable_irqs;
	unsigned int intrs;
	int cpu;
};

static LIST_HEAD(bal_irq_list);
static DEFINE_SPINLOCK(bal_irq_lock);
static DEFINE_PER_CPU(struct bal_domain, balance_data);
static DEFINE_PER_CPU(unsigned long, cpu_cap);
static cpumask_t cpu_exclude_mask __read_mostly;

void sbalance_desc_add(struct irq_desc *desc)
{
	struct bal_irq *bi;

	bi = kmalloc(sizeof(*bi), GFP_KERNEL);
	if (WARN_ON(!bi))
		return;

	*bi = (typeof(*bi)){ .desc = desc };
	spin_lock(&bal_irq_lock);
	list_add_tail_rcu(&bi->node, &bal_irq_list);
	spin_unlock(&bal_irq_lock);
}

void sbalance_desc_del(struct irq_desc *desc)
{
	struct bal_irq *bi;

	spin_lock(&bal_irq_lock);
	list_for_each_entry(bi, &bal_irq_list, node) {
		if (bi->desc == desc) {
			list_del_rcu(&bi->node);
			kfree_rcu(bi, rcu);
			break;
		}
	}
	spin_unlock(&bal_irq_lock);
}

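/*
 * Compare callback for list_sort(). list_sort() treats a positive return as
 * "lhs sorts after rhs", so returning rhs->delta_nr - lhs->delta_nr sorts the
 * movable IRQ list in descending order of new interrupts (busiest IRQs
 * first).
 */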
static int bal_irq_move_node_cmp(void *priv, const struct list_head *lhs_p,
				 const struct list_head *rhs_p)
{
	const struct bal_irq *lhs = list_entry(lhs_p, typeof(*lhs), move_node);
	const struct bal_irq *rhs = list_entry(rhs_p, typeof(*rhs), move_node);

	return rhs->delta_nr - lhs->delta_nr;
}

/* Returns false if this IRQ should be totally ignored for this balancing run */
static bool update_irq_data(struct bal_irq *bi, int *cpu)
{
	struct irq_desc *desc = bi->desc;
	unsigned int nr;

	/* Find the CPU which currently has this IRQ affined */
	raw_spin_lock_irq(&desc->lock);
	*cpu = cpumask_first(desc->irq_common_data.affinity);
	raw_spin_unlock_irq(&desc->lock);
	if (*cpu >= nr_cpu_ids)
		return false;

	/*
	 * Calculate the number of new interrupts from this IRQ. It is assumed
	 * that the IRQ has been running on the same CPU since the last
	 * balancing run. This might not hold true if the IRQ was moved by
	 * someone else since the last balancing run, or if the CPU this IRQ was
	 * previously running on has since gone offline.
	 */
	nr = *per_cpu_ptr(desc->kstat_irqs, *cpu);
	if (nr <= bi->old_nr) {
		bi->old_nr = nr;
		return false;
	}

	/* Calculate the number of new interrupts on this CPU from this IRQ */
	bi->delta_nr = nr - bi->old_nr;
	bi->old_nr = nr;
	return true;
}

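/*
 * Migrate @bi to @cpu. The IRQ's current affinity is re-read under the
 * descriptor lock and compared against the CPU recorded when the IRQ was
 * collected for this run; if something else changed the affinity in the
 * meantime, the new CPU is recorded and -EINVAL is returned so this balance
 * run skips the IRQ.
 */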
static int move_irq_to_cpu(struct bal_irq *bi, int cpu)
{
	struct irq_desc *desc = bi->desc;
	int prev_cpu, ret;

	/* Set the affinity if it wasn't changed since we looked at it */
	raw_spin_lock_irq(&desc->lock);
	prev_cpu = cpumask_first(desc->irq_common_data.affinity);
	if (prev_cpu == bi->prev_cpu) {
		ret = irq_set_affinity_locked(&desc->irq_data, cpumask_of(cpu),
					      false);
	} else {
		bi->prev_cpu = prev_cpu;
		ret = -EINVAL;
	}
	raw_spin_unlock_irq(&desc->lock);

	if (!ret) {
		/* Update the old interrupt count using the new CPU */
		bi->old_nr = *per_cpu_ptr(desc->kstat_irqs, cpu);
		pr_debug("Moved IRQ%d (CPU%d -> CPU%d)\n",
			 irq_desc_get_irq(desc), prev_cpu, cpu);
	}
	return ret;
}

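/*
 * scaled = intrs * SCHED_CAPACITY_SCALE / cpu_cap. Since a CPU's current
 * capacity never exceeds SCHED_CAPACITY_SCALE (1024), the scaled count is at
 * least the raw count, and a CPU throttled by thermal pressure or busy with
 * IRQ/RT time has its count inflated accordingly.
 */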
static unsigned int scale_intrs(unsigned int intrs, int cpu)
{
	/* Scale the number of interrupts to this CPU's current capacity */
	return intrs * SCHED_CAPACITY_SCALE / per_cpu(cpu_cap, cpu);
}

/* Returns true if IRQ balancing should stop */
static bool find_min_bd(const cpumask_t *mask, unsigned int max_intrs,
			struct bal_domain **min_bd)
{
	unsigned int intrs, min_intrs = UINT_MAX;
	struct bal_domain *bd;
	int cpu;

	for_each_cpu(cpu, mask) {
		bd = per_cpu_ptr(&balance_data, cpu);
		intrs = scale_intrs(bd->intrs, bd->cpu);

		/* Terminate when the formerly-max CPU isn't the max anymore */
		if (intrs > max_intrs)
			return true;

		/* Find the CPU with the lowest relative number of interrupts */
		if (intrs < min_intrs) {
			min_intrs = intrs;
			*min_bd = bd;
		}
	}

	/* Don't balance if IRQs are already balanced evenly enough */
	return max_intrs - min_intrs < IRQ_SCALED_THRESH;
}

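/*
 * One balance run, invoked every POLL_MS milliseconds from sbalance_thread():
 *  1. Snapshot each eligible CPU's current capacity.
 *  2. Tally the new interrupts per CPU since the last run and collect the
 *     IRQs that can actually be moved.
 *  3. Pick the CPU with the highest scaled interrupt count that has movable
 *     IRQs, excluding CPUs that can neither give nor receive IRQs.
 *  4. Push its busiest IRQs to the least-loaded CPUs until doing so would no
 *     longer improve balance.
 */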
static void balance_irqs(void)
{
	static cpumask_t cpus;
	struct bal_domain *bd, *max_bd, *min_bd;
	unsigned int intrs, max_intrs;
	bool moved_irq = false;
	struct bal_irq *bi;
	int cpu;

	rcu_read_lock();

	/* Find the available CPUs for balancing, if there are any */
	cpumask_andnot(&cpus, cpu_active_mask, &cpu_exclude_mask);
	if (unlikely(cpumask_weight(&cpus) <= 1))
		goto unlock;

	/*
	 * Get the current capacity for each CPU. This is adjusted for time
	 * spent processing IRQs, RT-task time, and thermal pressure. We don't
	 * exclude time spent processing IRQs when balancing because balancing
	 * is only done using interrupt counts rather than time spent in
	 * interrupts. That way, time spent processing each interrupt is
	 * considered when balancing.
	 */
	for_each_cpu(cpu, &cpus)
		per_cpu(cpu_cap, cpu) = cpu_rq(cpu)->cpu_capacity;

	list_for_each_entry_rcu(bi, &bal_irq_list, node) {
		if (!update_irq_data(bi, &cpu))
			continue;

		/* Add the number of new interrupts to this CPU's count */
		bd = per_cpu_ptr(&balance_data, cpu);
		bd->intrs += bi->delta_nr;

		/* Consider this IRQ for balancing if it's movable */
		if (!__irq_can_set_affinity(bi->desc))
			continue;

		/* Ignore for this balancing run if something else moved it */
		if (cpu != bi->prev_cpu) {
			bi->prev_cpu = cpu;
			continue;
		}

		list_add_tail(&bi->move_node, &bd->movable_irqs);
	}

	/* Find the most interrupt-heavy CPU with movable IRQs */
	while (1) {
		max_intrs = 0;
		for_each_cpu(cpu, &cpus) {
			bd = per_cpu_ptr(&balance_data, cpu);
			intrs = scale_intrs(bd->intrs, bd->cpu);
			if (intrs > max_intrs) {
				max_intrs = intrs;
				max_bd = bd;
			}
		}

		/* No balancing to do if there aren't any movable IRQs */
		if (unlikely(!max_intrs))
			goto unlock;

		/* Ensure the heaviest CPU has IRQs which can be moved away */
		if (!list_empty(&max_bd->movable_irqs))
			break;

try_next_heaviest:
		/*
		 * If the heaviest CPU has no movable IRQs then it can neither
		 * receive IRQs nor give IRQs. Exclude it from balancing so the
		 * remaining CPUs can be balanced, if there are any.
		 */
		if (cpumask_weight(&cpus) == 2)
			goto unlock;

		cpumask_clear_cpu(max_bd->cpu, &cpus);
	}

	/* Find the CPU with the lowest relative interrupt count */
	if (find_min_bd(&cpus, max_intrs, &min_bd))
		goto unlock;

	/* Sort movable IRQs in descending order of number of new interrupts */
	list_sort(NULL, &max_bd->movable_irqs, bal_irq_move_node_cmp);

	/* Push IRQs away from the heaviest CPU to the least-heavy CPUs */
	list_for_each_entry(bi, &max_bd->movable_irqs, move_node) {
		/* Skip this IRQ if it would just overload the target CPU */
		intrs = scale_intrs(min_bd->intrs + bi->delta_nr, min_bd->cpu);
		if (intrs >= max_intrs)
			continue;

		/* Try to migrate this IRQ, or skip it if migration fails */
		if (move_irq_to_cpu(bi, min_bd->cpu))
			continue;

		/* Keep track of whether or not any IRQs are moved */
		moved_irq = true;

		/* Update the counts and recalculate the max scaled count */
		min_bd->intrs += bi->delta_nr;
		max_bd->intrs -= bi->delta_nr;
		max_intrs = scale_intrs(max_bd->intrs, max_bd->cpu);

		/* Recheck for the least-heavy CPU since it may have changed */
		if (find_min_bd(&cpus, max_intrs, &min_bd))
			break;
	}

	/*
	 * If the heaviest CPU has movable IRQs which can't actually be moved,
	 * then ignore it and try balancing the next heaviest CPU.
	 */
	if (!moved_irq)
		goto try_next_heaviest;
unlock:
	rcu_read_unlock();

	/* Reset each balance domain for the next run */
	for_each_possible_cpu(cpu) {
		bd = per_cpu_ptr(&balance_data, cpu);
		INIT_LIST_HEAD(&bd->movable_irqs);
		bd->intrs = 0;
	}
}

static int __noreturn sbalance_thread(void *data)
{
	long poll_jiffies = msecs_to_jiffies(POLL_MS);
	struct bal_domain *bd;
	int cpu;

	/* Parse the list of CPUs to exclude, if any */
	if (cpulist_parse(CONFIG_SBALANCE_EXCLUDE_CPUS, &cpu_exclude_mask))
		cpu_exclude_mask = CPU_MASK_NONE;

	/* Initialize the data used for balancing */
	for_each_possible_cpu(cpu) {
		bd = per_cpu_ptr(&balance_data, cpu);
		INIT_LIST_HEAD(&bd->movable_irqs);
		bd->cpu = cpu;
	}

	set_freezable();
	while (1) {
		freezable_schedule_timeout_interruptible(poll_jiffies);
		balance_irqs();
	}
}

static int __init sbalance_init(void)
{
	BUG_ON(IS_ERR(kthread_run(sbalance_thread, NULL, "sbalanced")));
	return 0;
}
late_initcall(sbalance_init);