// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2023 Sultan Alsawaf.
 */

/**
 * DOC: SBalance description
 *
 * This is a simple IRQ balancer that polls every POLL_MS milliseconds and
 * moves IRQs from the most interrupt-heavy CPU to the least interrupt-heavy
 * CPUs until the heaviest CPU is no longer the heaviest. Each balance run
 * moves IRQs away from only one source CPU, though they may be spread across
 * any number of destination CPUs. Balancing is skipped if the gap between the
 * most interrupt-heavy CPU and the least interrupt-heavy CPU is below the
 * configured threshold of interrupts.
 *
 * The heaviest IRQs are targeted for migration in order to reduce the number
 * of IRQs to migrate. If moving an IRQ would reduce overall balance, then it
 * won't be migrated.
 *
 * The most interrupt-heavy CPU is calculated by scaling the number of new
 * interrupts on that CPU to the CPU's current capacity. This way, interrupt
 * heaviness takes into account factors such as thermal pressure and time spent
 * processing interrupts rather than just the sheer number of them. This also
 * makes SBalance aware of CPU asymmetry, so CPUs with different performance
 * capacities are balanced proportionally.
 */
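
/*
 * As a rough illustration of the capacity scaling described above (the
 * numbers here are hypothetical, chosen only for this example): with
 * SCHED_CAPACITY_SCALE == 1024, a little CPU at capacity 512 that received
 * 600 new interrupts scales to 600 * 1024 / 512 = 1200, while a big CPU at
 * full capacity 1024 that received 1000 new interrupts scales to 1000. The
 * little CPU is therefore treated as the heavier one despite handling fewer
 * raw interrupts.
 */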

#define pr_fmt(fmt) "sbalance: " fmt

#include <linux/freezer.h>
#include <linux/irq.h>
#include <linux/kthread.h>
#include <linux/list_sort.h>
#include "../sched/sched.h"
#include "internals.h"

/* Perform IRQ balancing every POLL_MS milliseconds */
#define POLL_MS CONFIG_IRQ_SBALANCE_POLL_MSEC

/*
 * There needs to be a difference of at least this many new interrupts between
 * the heaviest and least-heavy CPUs during the last polling window in order
 * for balancing to occur. This is to avoid balancing when the system is quiet.
 *
 * This threshold is compared to the _scaled_ interrupt counts per CPU; i.e.,
 * the number of interrupts scaled to the CPU's capacity.
 */
#define IRQ_SCALED_THRESH CONFIG_IRQ_SBALANCE_THRESH

struct bal_irq {
	struct list_head node;
	struct list_head move_node;
	struct rcu_head rcu;
	struct irq_desc *desc;
	unsigned int delta_nr;
	unsigned int old_nr;
	int prev_cpu;
};

struct bal_domain {
	struct list_head movable_irqs;
	unsigned int intrs;
	int cpu;
};

static LIST_HEAD(bal_irq_list);
static DEFINE_SPINLOCK(bal_irq_lock);
static DEFINE_PER_CPU(struct bal_domain, balance_data);
static DEFINE_PER_CPU(unsigned long, cpu_cap);
static cpumask_t cpu_exclude_mask __read_mostly;

void sbalance_desc_add(struct irq_desc *desc)
{
	struct bal_irq *bi;

	bi = kmalloc(sizeof(*bi), GFP_KERNEL);
	if (WARN_ON(!bi))
		return;

	*bi = (typeof(*bi)){ .desc = desc };
	spin_lock(&bal_irq_lock);
	list_add_tail_rcu(&bi->node, &bal_irq_list);
	spin_unlock(&bal_irq_lock);
}

void sbalance_desc_del(struct irq_desc *desc)
{
	struct bal_irq *bi;

	spin_lock(&bal_irq_lock);
	list_for_each_entry(bi, &bal_irq_list, node) {
		if (bi->desc == desc) {
			list_del_rcu(&bi->node);
			kfree_rcu(bi, rcu);
			break;
		}
	}
	spin_unlock(&bal_irq_lock);
}

static int bal_irq_move_node_cmp(void *priv, const struct list_head *lhs_p,
				 const struct list_head *rhs_p)
{
	const struct bal_irq *lhs = list_entry(lhs_p, typeof(*lhs), move_node);
	const struct bal_irq *rhs = list_entry(rhs_p, typeof(*rhs), move_node);

	return rhs->delta_nr - lhs->delta_nr;
}
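
/*
 * Note on the comparator above: list_sort() places an element after another
 * when the comparator returns a positive value, so returning
 * rhs->delta_nr - lhs->delta_nr yields a descending sort by delta_nr. For
 * example (hypothetical values), delta_nr counts of {10, 50, 30} end up
 * ordered as {50, 30, 10}, so the heaviest movable IRQs are considered first.
 */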

/* Returns false if this IRQ should be totally ignored for this balancing run */
static bool update_irq_data(struct bal_irq *bi, int *cpu)
{
	struct irq_desc *desc = bi->desc;
	unsigned int nr;

	/* Find the CPU which currently has this IRQ affined */
	raw_spin_lock_irq(&desc->lock);
	*cpu = cpumask_first(desc->irq_common_data.affinity);
	raw_spin_unlock_irq(&desc->lock);
	if (*cpu >= nr_cpu_ids)
		return false;

	/*
	 * Calculate the number of new interrupts from this IRQ. It is assumed
	 * that the IRQ has been running on the same CPU since the last
	 * balancing run. This might not hold true if the IRQ was moved by
	 * someone else since the last balancing run, or if the CPU this IRQ
	 * was previously running on has since gone offline.
	 */
	nr = *per_cpu_ptr(desc->kstat_irqs, *cpu);
	if (nr <= bi->old_nr) {
		bi->old_nr = nr;
		return false;
	}

	/* Calculate the number of new interrupts on this CPU from this IRQ */
	bi->delta_nr = nr - bi->old_nr;
	bi->old_nr = nr;
	return true;
}

static int move_irq_to_cpu(struct bal_irq *bi, int cpu)
{
	struct irq_desc *desc = bi->desc;
	int prev_cpu, ret;

	/* Set the affinity if it wasn't changed since we looked at it */
	raw_spin_lock_irq(&desc->lock);
	prev_cpu = cpumask_first(desc->irq_common_data.affinity);
	if (prev_cpu == bi->prev_cpu) {
		ret = irq_set_affinity_locked(&desc->irq_data,
					      cpumask_of(cpu), false);
	} else {
		bi->prev_cpu = prev_cpu;
		ret = -EINVAL;
	}
	raw_spin_unlock_irq(&desc->lock);

	if (!ret) {
		/* Update the old interrupt count using the new CPU */
		bi->old_nr = *per_cpu_ptr(desc->kstat_irqs, cpu);
		pr_debug("Moved IRQ%d (CPU%d -> CPU%d)\n",
			 irq_desc_get_irq(desc), prev_cpu, cpu);
	}
	return ret;
}

static unsigned int scale_intrs(unsigned int intrs, int cpu)
{
	/* Scale the number of interrupts to this CPU's current capacity */
	return intrs * SCHED_CAPACITY_SCALE / per_cpu(cpu_cap, cpu);
}

/* Returns true if IRQ balancing should stop */
static bool find_min_bd(const cpumask_t *mask, unsigned int max_intrs,
			struct bal_domain **min_bd)
{
	unsigned int intrs, min_intrs = UINT_MAX;
	struct bal_domain *bd;
	int cpu;

	for_each_cpu(cpu, mask) {
		bd = per_cpu_ptr(&balance_data, cpu);
		intrs = scale_intrs(bd->intrs, bd->cpu);

		/* Terminate when the formerly-max CPU isn't the max anymore */
		if (intrs > max_intrs)
			return true;

		/* Find the CPU with the lowest relative number of interrupts */
		if (intrs < min_intrs) {
			min_intrs = intrs;
			*min_bd = bd;
		}
	}

	/* Don't balance if IRQs are already balanced evenly enough */
	return max_intrs - min_intrs < IRQ_SCALED_THRESH;
}

static void balance_irqs(void)
{
	static cpumask_t cpus;
	struct bal_domain *bd, *max_bd, *min_bd;
	unsigned int intrs, max_intrs;
	bool moved_irq = false;
	struct bal_irq *bi;
	int cpu;

	rcu_read_lock();

	/* Find the available CPUs for balancing, if there are any */
	cpumask_andnot(&cpus, cpu_active_mask, &cpu_exclude_mask);
	if (unlikely(cpumask_weight(&cpus) <= 1))
		goto unlock;

	/*
	 * Get the current capacity for each CPU. This is adjusted for time
	 * spent processing IRQs, RT-task time, and thermal pressure. We don't
	 * exclude time spent processing IRQs when balancing because balancing
	 * is only done using interrupt counts rather than time spent in
	 * interrupts. That way, time spent processing each interrupt is
	 * considered when balancing.
	 */
	for_each_cpu(cpu, &cpus)
		per_cpu(cpu_cap, cpu) = cpu_rq(cpu)->cpu_capacity;

	list_for_each_entry_rcu(bi, &bal_irq_list, node) {
		if (!update_irq_data(bi, &cpu))
			continue;

		/* Add the number of new interrupts to this CPU's count */
		bd = per_cpu_ptr(&balance_data, cpu);
		bd->intrs += bi->delta_nr;

		/* Consider this IRQ for balancing if it's movable */
		if (!__irq_can_set_affinity(bi->desc))
			continue;

		/* Ignore for this balancing run if something else moved it */
		if (cpu != bi->prev_cpu) {
			bi->prev_cpu = cpu;
			continue;
		}

		list_add_tail(&bi->move_node, &bd->movable_irqs);
	}

	/* Find the most interrupt-heavy CPU with movable IRQs */
	while (1) {
		max_intrs = 0;
		for_each_cpu(cpu, &cpus) {
			bd = per_cpu_ptr(&balance_data, cpu);
			intrs = scale_intrs(bd->intrs, bd->cpu);
			if (intrs > max_intrs) {
				max_intrs = intrs;
				max_bd = bd;
			}
		}

		/* No balancing to do if there aren't any movable IRQs */
		if (unlikely(!max_intrs))
			goto unlock;

		/* Ensure the heaviest CPU has IRQs which can be moved away */
		if (!list_empty(&max_bd->movable_irqs))
			break;

try_next_heaviest:
		/*
		 * If the heaviest CPU has no movable IRQs then it can neither
		 * receive IRQs nor give IRQs. Exclude it from balancing so the
		 * remaining CPUs can be balanced, if there are any.
		 */
		if (cpumask_weight(&cpus) == 2)
			goto unlock;

		cpumask_clear_cpu(max_bd->cpu, &cpus);
	}

	/* Find the CPU with the lowest relative interrupt count */
	if (find_min_bd(&cpus, max_intrs, &min_bd))
		goto unlock;

	/* Sort movable IRQs in descending order of number of new interrupts */
	list_sort(NULL, &max_bd->movable_irqs, bal_irq_move_node_cmp);

	/* Push IRQs away from the heaviest CPU to the least-heavy CPUs */
	list_for_each_entry(bi, &max_bd->movable_irqs, move_node) {
		/* Skip this IRQ if it would just overload the target CPU */
		intrs = scale_intrs(min_bd->intrs + bi->delta_nr, min_bd->cpu);
		if (intrs >= max_intrs)
			continue;

		/* Try to migrate this IRQ, or skip it if migration fails */
		if (move_irq_to_cpu(bi, min_bd->cpu))
			continue;

		/* Keep track of whether or not any IRQs are moved */
		moved_irq = true;

		/* Update the counts and recalculate the max scaled count */
		min_bd->intrs += bi->delta_nr;
		max_bd->intrs -= bi->delta_nr;
		max_intrs = scale_intrs(max_bd->intrs, max_bd->cpu);

		/* Recheck for the least-heavy CPU since it may have changed */
		if (find_min_bd(&cpus, max_intrs, &min_bd))
			break;
	}

	/*
	 * If the heaviest CPU has movable IRQs which can't actually be moved,
	 * then ignore it and try balancing the next heaviest CPU.
	 */
	if (!moved_irq)
		goto try_next_heaviest;

unlock:
	rcu_read_unlock();

	/* Reset each balance domain for the next run */
	for_each_possible_cpu(cpu) {
		bd = per_cpu_ptr(&balance_data, cpu);
		INIT_LIST_HEAD(&bd->movable_irqs);
		bd->intrs = 0;
	}
}
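
/*
 * Illustrative walk-through of the overload check in balance_irqs() above
 * (all values are hypothetical): suppose the heaviest CPU has a scaled count
 * of 1000 and the least-heavy CPU is at full capacity 1024 (so its scaled
 * count equals its raw count) with 700 accumulated interrupts. An IRQ with
 * delta_nr == 200 can be moved because 700 + 200 = 900 < 1000, but an IRQ
 * with delta_nr == 400 is skipped because 700 + 400 = 1100 >= 1000 and
 * moving it would only shift the imbalance to the destination CPU.
 */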

static int __noreturn sbalance_thread(void *data)
{
	long poll_jiffies = msecs_to_jiffies(POLL_MS);
	struct bal_domain *bd;
	int cpu;

	/* Parse the list of CPUs to exclude, if any */
	if (cpulist_parse(CONFIG_SBALANCE_EXCLUDE_CPUS, &cpu_exclude_mask))
		cpu_exclude_mask = CPU_MASK_NONE;

	/* Initialize the data used for balancing */
	for_each_possible_cpu(cpu) {
		bd = per_cpu_ptr(&balance_data, cpu);
		INIT_LIST_HEAD(&bd->movable_irqs);
		bd->cpu = cpu;
	}

	set_freezable();
	while (1) {
		freezable_schedule_timeout_interruptible(poll_jiffies);
		balance_irqs();
	}
}

static int __init sbalance_init(void)
{
	BUG_ON(IS_ERR(kthread_run(sbalance_thread, NULL, "sbalanced")));
	return 0;
}
late_initcall(sbalance_init);
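
/*
 * Configuration note (a sketch, not part of this file's build): the
 * build-time knobs referenced above are expected to come from Kconfig. The
 * entries below only illustrate the assumed shape of those options; the real
 * definitions live elsewhere in the tree and may differ.
 *
 *	config IRQ_SBALANCE_POLL_MSEC
 *		int "SBalance polling interval in milliseconds"
 *
 *	config IRQ_SBALANCE_THRESH
 *		int "Scaled interrupt-count gap required before balancing"
 *
 *	config SBALANCE_EXCLUDE_CPUS
 *		string "CPUs to exclude from balancing (cpulist format)"
 *
 * With CONFIG_DYNAMIC_DEBUG enabled, the pr_debug() migration messages above
 * can be turned on at runtime via /sys/kernel/debug/dynamic_debug/control.
 */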