/*
 * EGO(Energy-Aware CPUFreq Governor) on Energy and Scheduler-Event.
 * Copyright (C) 2021, Samsung Electronic Corporation
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include
#include
#include
#include
#include "../sched.h"
#include "ems.h"
#include
#include
#include

#define IOWAIT_BOOST_MIN	(SCHED_CAPACITY_SCALE / 8)
#define HIST_SIZE		40
#define RATIO_UNIT		1000

struct ego_idle {
	int			avg_ratio[CSTATE_MAX];
	int			last_ratio[CSTATE_MAX];
	u32			prev_idx;
};

struct ego_policy {
	struct cpufreq_policy	*policy;

	raw_spinlock_t		update_lock;	/* For shared policies */
	u64			last_freq_update_time;
	s64			freq_update_delay_ns;
	unsigned int		next_freq;	/* final target freq */
	unsigned int		cached_raw_freq; /* util based raw freq */
	unsigned int		org_freq;	/* util based freq in table */
	unsigned int		eng_freq;	/* lowest energy freq */

	/* The next fields are only needed if fast switch cannot be used: */
	struct irq_work		irq_work;
	struct kthread_work	work;
	struct mutex		work_lock;
	struct kthread_worker	worker;
	struct task_struct	*thread;
	bool			work_in_progress;

	bool			limits_changed;
	bool			need_freq_update;

	/* EGO specific */
	struct cpumask		cpus;
	struct cpumask		thread_allowed_cpus;
	int			heaviest_cpu;

	/* EGO tunables */
	unsigned int		ratio;
	int			dis_buck_share;	/* ignore buck-share when computing energy */
	int			pelt_boost;	/* dynamically changed boost */
	int			htask_boost;	/* tunable boost */
	int			pelt_margin;
	int			split_pelt_margin;
	unsigned int		split_pelt_margin_freq;

	s64			up_rate_limit_ns;
	s64			split_up_rate_limit_ns;
	unsigned int		split_up_rate_limit_freq;
	s64			down_rate_limit_ns;

	bool			build_somac_wall;
	unsigned int		somac_wall;

	struct kobject		kobj;
};

struct ego_cpu {
	struct update_util_data	update_util;
	struct ego_policy	*egp;
	unsigned int		cpu;

	bool			iowait_boost_pending;
	unsigned int		iowait_boost;
	u64			last_update;

	unsigned long		bw_dl;
	unsigned long		max;
	unsigned long		util;		/* current pelt util */
	unsigned long		boosted_util;	/* current boosted util */
	unsigned long		min_cap;

	/* idle state */
	struct ego_idle		idle;
};

struct kobject *ego_kobj;
static DEFINE_PER_CPU(struct ego_cpu, ego_cpu);
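/*
 * Data layout (summary): one ego_policy instance is allocated per
 * coregroup/cpufreq policy (see ego_pre_init()), and every CPU in that
 * policy has a per-CPU ego_cpu which points back to its ego_policy via
 * egc->egp. The per-CPU idle-state history in struct ego_idle feeds the
 * energy estimation below.
 */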
/*********************************************************************/
/*			EGO Specific Implementation		     */
/*********************************************************************/
/* returns whether the cpufreq governor is EGO or not */
static inline bool ego_is_working(struct ego_policy *egp)
{
	return likely(egp) && likely(egp->policy) &&
		egp->policy->governor_data == egp;
}

/* compute freq level diff between cur freq and given freq */
static unsigned int get_diff_num_levels(struct cpufreq_policy *policy,
					unsigned int freq)
{
	int index1, index2;

	index1 = cpufreq_frequency_table_get_index(policy, policy->cur);
	index2 = cpufreq_frequency_table_get_index(policy, freq);

	return abs(index1 - index2);
}

#define ESG_MAX_DELAY_PERIODS	5
/*
 * Return true if we can delay the frequency update because the requested
 * frequency change is not large enough, and false if it is large enough.
 * "Large enough" is determined by comparing the number of frequency levels
 * to change against the time elapsed since the last frequency update.
 * For example, ESG_MAX_DELAY_PERIODS of 5 would mean an immediate frequency
 * change is allowed only if the change in frequency level is greater than or
 * equal to 5; it also means a change of a single frequency level would need
 * to wait 5 ticks to take effect.
 */
static bool ego_postpone_freq_update(struct ego_policy *egp, u64 time,
				     unsigned int target_freq)
{
	unsigned int diff_num_levels, num_periods;
	s64 elapsed, margin;

	if (egp->need_freq_update)
		return false;

	elapsed = time - egp->last_freq_update_time;

	if (egp->policy->cur < target_freq)
		return elapsed < egp->up_rate_limit_ns;

	margin = egp->freq_update_delay_ns >> 2;
	num_periods = (elapsed + margin) / egp->freq_update_delay_ns;
	if (num_periods > ESG_MAX_DELAY_PERIODS)
		return false;

	diff_num_levels = get_diff_num_levels(egp->policy, target_freq);
	if (diff_num_levels > ESG_MAX_DELAY_PERIODS - num_periods)
		return false;

	return true;
}
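/*
 * Illustrative numbers (assuming the 4 ms freq_update_delay_ns set in
 * ego_start()): a request arriving 9 ms after the last update counts as
 * num_periods = (9 + 1) / 4 = 2 elapsed periods, so only a request that
 * is more than 5 - 2 = 3 frequency levels away is applied immediately;
 * a 2-level change would be postponed until more periods have elapsed.
 */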
/*********************************************************************/
/*			To support expecting power		     */
/*********************************************************************/
static inline unsigned long
ego_compute_energy(struct ego_policy *egp, unsigned long freq)
{
	struct energy_state states[VENDOR_NR_CPUS] = { 0, };
	unsigned long time[CSTATE_MAX] = { 0 };
	unsigned long active_eng, idle_eng, capacity;
	int cpu, policy_cpu = egp->policy->cpu;

	capacity = max(et_freq_to_cap(policy_cpu, freq), (unsigned long)1);
	et_fill_energy_state(NULL, &egp->cpus, states, capacity, -1);

	/* compute normalized time */
	for_each_cpu(cpu, &egp->cpus) {
		struct ego_cpu *egc = &per_cpu(ego_cpu, cpu);
		struct ego_idle *egi = &egc->idle;
		unsigned long idle_util, idle_ratio_sum;

		states[cpu].util = egc->util;

		/* We just guess the normalized value from the clkoff/pwroff ratio */
		idle_util = max((long)(capacity - egc->util), (long)0);
		idle_ratio_sum = egi->avg_ratio[CLKOFF] + egi->avg_ratio[PWROFF];
		/* skip CPUs without idle history to avoid a division by zero */
		if (!idle_ratio_sum)
			continue;

		time[CLKOFF] += idle_util * egi->avg_ratio[CLKOFF] / idle_ratio_sum;
		time[PWROFF] += idle_util * egi->avg_ratio[PWROFF] / idle_ratio_sum;
	}

	/* compute active energy */
	active_eng = et_compute_cpu_energy(&egp->cpus, states);

	/* compute idle energy */
	idle_eng = (states[policy_cpu].static_power *
			(time[CLKOFF] * RATIO_UNIT)) / capacity;

	trace_ego_cpu_eng(policy_cpu, capacity,
			states[policy_cpu].dynamic_power,
			states[policy_cpu].static_power,
			time[CLKOFF], active_eng, idle_eng);

	return active_eng + idle_eng;
}

static void ego_compute_cpu_idle_ratio(struct ego_cpu *egc, int hist_size)
{
	int avg_ratio[CSTATE_MAX] = { 0 };
	struct ego_idle *egi = &egc->idle;
	int cpu = egc->cpu;
	int state, idx, cur_idx = mlt_cur_period(cpu);
	int update = abs(cur_idx - egi->prev_idx);
	int last_ratio = 0, cur_ratio = 0;
	int last_idx = mlt_period_with_delta(cur_idx, 1);

	if (!update)
		return;

	/* update only the last/current window for fast computation */
	if (update == 1) {
		for (state = 0; state < CSTATE_MAX; state++) {
			last_ratio = egi->last_ratio[state];
			cur_ratio = mlt_cst_value(cpu, cur_idx, state);

			/* 1. compute ratio sum */
			avg_ratio[state] = egi->avg_ratio[state] * hist_size;
			/* 2. subtract the last window ratio */
			avg_ratio[state] = max((avg_ratio[state] - last_ratio), 0);
			/* 3. add the current window ratio */
			avg_ratio[state] += cur_ratio;
		}
	} else {
		/* recompute the ratio over the whole history size */
		int cursor = cur_idx;

		for (idx = 0; idx < hist_size; idx++) {
			for (state = 0; state < CSTATE_MAX; state++)
				avg_ratio[state] += mlt_cst_value(cpu, cursor, state);
			cursor = mlt_prev_period(cursor);
		}
	}

	/* compute avg ratio */
	for (state = 0; state < CSTATE_MAX; state++)
		egi->avg_ratio[state] = avg_ratio[state] / hist_size;

	/* update last index */
	egi->prev_idx = cur_idx;

	/* save the last ratio for fast computation */
	for (state = 0; state < CSTATE_MAX; state++)
		egi->last_ratio[state] = mlt_cst_value(cpu, last_idx, state);

	trace_ego_cpu_idle_ratio(cpu, update, cur_idx,
			egi->avg_ratio[CLKOFF], egi->avg_ratio[PWROFF],
			last_ratio, cur_ratio, last_idx);
}

/* refresh the idle-state ratio snapshot for every CPU in this policy */
static inline void ego_compute_idle_ratio(struct ego_policy *egp)
{
	int cpu;

	for_each_cpu(cpu, &egp->cpus) {
		struct ego_cpu *egc = &per_cpu(ego_cpu, cpu);

		ego_compute_cpu_idle_ratio(egc, MLT_PERIOD_COUNT);
	}
}

static unsigned int ego_apply_eng_boost(unsigned int min_freq,
				unsigned int eng_freq, struct ego_policy *egp)
{
	int delta = eng_freq - min_freq;

	if (delta <= 0)
		return min_freq;

	return min_freq + (delta * egp->ratio) / RATIO_UNIT;
}

#define khz_to_mhz(x)	((x) / 1000)
static unsigned int ego_find_energy_freq(struct ego_policy *egp,
					 unsigned int org_freq)
{
	struct cpufreq_frequency_table *pos;
	unsigned long min_energy = ULONG_MAX;
	int eng_freq = -1;

	cpufreq_for_each_entry(pos, egp->policy->freq_table) {
		unsigned long energy;

		if (pos->frequency < org_freq)
			continue;

		energy = ego_compute_energy(egp, pos->frequency);
		if (energy < min_energy) {
			min_energy = energy;
			eng_freq = pos->frequency;
		}
	}

	if (eng_freq < 0)
		return org_freq;

	eng_freq = ego_apply_eng_boost(org_freq, eng_freq, egp);

	return clamp_val(eng_freq, egp->policy->min, egp->policy->max);
}
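/*
 * Example of the ratio interpolation above (illustrative frequencies):
 * with org_freq = 800000 kHz, a lowest-energy candidate of 1000000 kHz
 * and ratio = 500, the requested frequency becomes
 * 800000 + (200000 * 500) / 1000 = 900000 kHz. With the device-tree
 * default of ratio = RATIO_UNIT (1000), the energy frequency is used
 * unmodified.
 */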
/*********************************************************************/
/*		    Sysbusy state change notifier		     */
/*********************************************************************/
static int ego_sysbusy_notifier_call(struct notifier_block *nb,
				     unsigned long val, void *v)
{
	int cpu;
	enum sysbusy_state state = *(enum sysbusy_state *)v;

	if (val != SYSBUSY_STATE_CHANGE)
		return NOTIFY_OK;

	for_each_possible_cpu(cpu) {
		struct ego_policy *egp;

		if (cpu != cpumask_first(cpu_coregroup_mask(cpu)))
			continue;

		egp = per_cpu(ego_cpu, cpu).egp;
		if (!ego_is_working(egp))
			continue;

		egp->build_somac_wall = (state == SYSBUSY_SOMAC);
	}

	return NOTIFY_OK;
}

static struct notifier_block ego_sysbusy_notifier = {
	.notifier_call = ego_sysbusy_notifier_call,
};

/*********************************************************************/
/*		      EGO mode change notifier			     */
/*********************************************************************/
#define DEFAULT_PELT_MARGIN	(25)	/* 25% by default */
static int ego_mode_update_callback(struct notifier_block *nb,
				    unsigned long val, void *v)
{
	struct emstune_set *cur_set = (struct emstune_set *)v;
	struct ego_policy *egp;
	int cpu;

	for_each_possible_cpu(cpu) {
		if (cpu != cpumask_first(cpu_coregroup_mask(cpu)))
			continue;

		egp = per_cpu(ego_cpu, cpu).egp;
		if (!egp)
			continue;

		egp->pelt_boost = cur_set->cpufreq_gov.pelt_boost[cpu];
		egp->htask_boost = cur_set->cpufreq_gov.htask_boost[cpu];
		egp->pelt_margin = DEFAULT_PELT_MARGIN;
		egp->split_pelt_margin = cur_set->cpufreq_gov.split_pelt_margin[cpu];
		egp->split_pelt_margin_freq =
			cur_set->cpufreq_gov.split_pelt_margin_freq[cpu];

		egp->up_rate_limit_ns = 4 * NSEC_PER_MSEC;	/* 4 ms by default */
		egp->split_up_rate_limit_ns =
			cur_set->cpufreq_gov.split_up_rate_limit[cpu] * NSEC_PER_MSEC;
		egp->split_up_rate_limit_freq =
			cur_set->cpufreq_gov.split_up_rate_limit_freq[cpu];
		egp->down_rate_limit_ns =
			cur_set->cpufreq_gov.down_rate_limit * NSEC_PER_MSEC;

		egp->dis_buck_share = cur_set->cpufreq_gov.dis_buck_share[cpu];
	}

	return NOTIFY_OK;
}

static struct notifier_block ego_mode_update_notifier = {
	.notifier_call = ego_mode_update_callback,
};

/*********************************************************************/
/*			    SLACK TIMER				     */
/*********************************************************************/
static void ego_update_min_cap(struct cpufreq_policy *policy)
{
	unsigned int cpu;
	unsigned long max_cap, min_cap;

	max_cap = capacity_cpu_orig(policy->cpu);

	/* min_cap is the minimum utilization that maps to a frequency above policy->min */
	min_cap = (max_cap * policy->min) / policy->max;
	min_cap -= 1;

	for_each_cpu(cpu, policy->cpus)
		per_cpu(ego_cpu, cpu).min_cap = min_cap;
}

static int ego_need_slack_timer(void)
{
	unsigned int cpu = raw_smp_processor_id();
	struct ego_cpu *egc = &per_cpu(ego_cpu, cpu);
	struct ego_policy *egp = egc->egp;
	int need = 0;

	if (!ego_is_working(egp))
		return 0;

	if (egc->boosted_util > egc->min_cap) {
		need = 1;
		goto out;
	}

	/* add the timer only on the heaviest cpu in this domain */
	if (egp->heaviest_cpu == cpu) {
		/*
		 * add the timer when the frequency is raised by the energy
		 * freq, not by a min lock
		 */
		if (egp->policy->cur > egp->policy->cpuinfo.min_freq &&
		    egp->eng_freq > egp->org_freq)
			need = 1;
	}

out:
	trace_ego_need_slack_timer(cpu, egc->boosted_util, egc->min_cap,
			egp->heaviest_cpu, egp->policy->cur,
			egp->policy->cpuinfo.min_freq,
			egp->eng_freq, egp->org_freq, need);

	return need;
}
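/*
 * Worked example for the min_cap threshold above (illustrative values):
 * with max_cap = 1024 and policy->min at half of policy->max,
 * min_cap = 1024 * 1/2 - 1 = 511, so a slack timer is requested whenever
 * the boosted utilization would demand more than the policy minimum
 * frequency.
 */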
/************************ Governor internals ***********************/
static unsigned int ego_resolve_freq_wo_clamp(struct cpufreq_policy *policy,
					      unsigned int target_freq)
{
	int index;

	index = cpufreq_table_find_index_al(policy, target_freq);
	if (index < 0) {
		pr_err("target frequency(%u) out of range\n", target_freq);
		return 0;
	}

	return policy->freq_table[index].frequency;
}

static bool ego_should_update_freq(struct ego_policy *egp, u64 time)
{
	s64 delta_ns, rate_limit_ns;

	/*
	 * Since cpufreq_update_util() is called with rq->lock held for
	 * the @target_cpu, our per-CPU data is fully serialized.
	 *
	 * However, drivers cannot in general deal with cross-CPU
	 * requests, so while get_next_freq() will work, our
	 * ego_update_commit() call may not for the fast switching platforms.
	 *
	 * Hence stop here for remote requests if they aren't supported
	 * by the hardware, as calculating the frequency is pointless if
	 * we cannot in fact act on it.
	 *
	 * This is needed on the slow switching platforms too to prevent CPUs
	 * going offline from leaving stale IRQ work items behind.
	 */
	if (!cpufreq_this_cpu_can_update(egp->policy))
		return false;

	if (unlikely(egp->limits_changed)) {
		egp->limits_changed = false;
		egp->need_freq_update = true;
		return true;
	}

	delta_ns = time - egp->last_freq_update_time;

	/*
	 * EGO doesn't know the target frequency at this point, so take the
	 * minimum of the up/down rate limits to cover all cases. The exact
	 * rate limit is applied in ego_postpone_freq_update().
	 */
	rate_limit_ns = min(egp->up_rate_limit_ns, egp->down_rate_limit_ns);

	return delta_ns >= rate_limit_ns;
}

static void ego_update_pelt_margin(struct ego_policy *egp, u64 time,
				   unsigned int next_freq)
{
	if (next_freq < egp->split_pelt_margin_freq)
		egp->pelt_margin = DEFAULT_PELT_MARGIN;
	else
		egp->pelt_margin = egp->split_pelt_margin;
}

static void ego_update_up_rate_limit(struct ego_policy *egp, u64 time,
				     unsigned int next_freq)
{
	if (next_freq < egp->split_up_rate_limit_freq)
		egp->up_rate_limit_ns = 4 * NSEC_PER_MSEC;	/* 4 ms by default */
	else
		egp->up_rate_limit_ns = egp->split_up_rate_limit_ns;
}

static void ego_update_freq_variant_param(struct ego_policy *egp, u64 time,
					  unsigned int next_freq)
{
	ego_update_pelt_margin(egp, time, next_freq);
	ego_update_up_rate_limit(egp, time, next_freq);
}

static bool ego_request_freq_change(struct ego_policy *egp, u64 time,
				    unsigned int next_freq)
{
	if (!egp->need_freq_update) {
		if (egp->policy->cur == next_freq)
			return false;
	} else {
		egp->need_freq_update = false;
	}

	return true;
}

/* update the next freq and the time of the last frequency change request */
static void ego_update_next_freq(struct ego_policy *egp, u64 time,
				 unsigned int next_freq)
{
	ego_update_freq_variant_param(egp, time, next_freq);

	if (egp->next_freq > next_freq)
		next_freq = (egp->next_freq + next_freq) >> 1;

	egp->next_freq = next_freq;
	egp->last_freq_update_time = time;
}

static void ego_fast_switch(struct ego_policy *egp, u64 time,
			    unsigned int next_freq)
{
	struct cpufreq_policy *policy = egp->policy;

	if (!ego_request_freq_change(egp, time, next_freq))
		return;

	ego_update_next_freq(egp, time, next_freq);
	cpufreq_driver_fast_switch(policy, next_freq);
}

static void ego_deferred_update(struct ego_policy *egp, u64 time,
				unsigned int next_freq)
{
	if (!ego_request_freq_change(egp, time, next_freq))
		return;

	ego_update_next_freq(egp, time, next_freq);

	if (!egp->work_in_progress) {
		egp->work_in_progress = true;
		irq_work_queue(&egp->irq_work);
	}
}

static inline unsigned long ego_map_util_freq(struct ego_policy *egp,
			unsigned long util, unsigned long freq,
			unsigned long cap)
{
	return ((freq * (100 + egp->pelt_margin)) / 100) * util / cap;
}
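/*
 * ego_map_util_freq() scales the policy's maximum frequency by util/cap
 * plus the pelt margin. Illustrative numbers: with cpuinfo.max_freq =
 * 2000000 kHz, pelt_margin = 25 (DEFAULT_PELT_MARGIN), util = 512 and
 * cap = 1024, the raw request is 2000000 * 125 / 100 * 512 / 1024 =
 * 1250000 kHz, which get_next_freq() then resolves to a table entry.
 */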
/**
 * get_next_freq - Compute a new frequency for a given cpufreq policy.
 * @egp: EGO policy object to compute the new frequency for.
 * @util: Current CPU utilization.
 * @max: CPU capacity.
 *
 * If the utilization is frequency-invariant, choose the new frequency to be
 * proportional to it, that is
 *
 * next_freq = C * max_freq * util / max
 *
 * Otherwise, approximate the would-be frequency-invariant utilization by
 * util_raw * (curr_freq / max_freq) which leads to
 *
 * next_freq = C * curr_freq * util_raw / max
 *
 * Take C = 1.25 for the frequency tipping point at (util / max) = 0.8.
 *
 * The lowest driver-supported frequency which is equal or greater than the raw
 * next_freq (as calculated above) is returned, subject to policy min/max and
 * cpufreq driver limitations.
 */

/*
 * use_energy_freq - return whether to use the energy freq or not
 * At least one cpu must be busy to use the energy freq.
 */
static bool use_energy_freq(struct cpufreq_policy *policy)
{
	int cpu;

	for_each_cpu(cpu, policy->cpus) {
		if (profile_get_cpu_wratio_busy(cpu))
			return true;
	}

	return false;
}

static unsigned int get_next_freq(struct ego_policy *egp,
				  unsigned long util, unsigned long max)
{
	struct cpufreq_policy *policy = egp->policy;
	unsigned int freq, org_freq, eng_freq = 0;

	/* compute the pure frequency based on util */
	org_freq = ego_map_util_freq(egp, util, policy->cpuinfo.max_freq, max);
	if ((org_freq == egp->cached_raw_freq || egp->work_in_progress) &&
	    !egp->need_freq_update) {
		freq = max(egp->org_freq, egp->next_freq);
		goto skip_find_next_freq;
	}

	egp->cached_raw_freq = org_freq;

	/* find freq from table */
	org_freq = ego_resolve_freq_wo_clamp(policy, org_freq);
	if (egp->org_freq != org_freq) {
		egp->org_freq = org_freq;
		/* inform et of the new freq */
		et_update_freq(policy->cpu, org_freq);
	}

	/* compute the lowest energy freq */
	if (use_energy_freq(policy)) {
		ego_compute_idle_ratio(egp);
		egp->eng_freq = eng_freq = ego_find_energy_freq(egp, org_freq);
	} else {
		egp->eng_freq = 0;
	}

	freq = max(org_freq, eng_freq);

skip_find_next_freq:
	/* Apply fclamp */
	freq = fclamp_apply(policy, freq);
	freq = clamp_val(freq, policy->min, policy->max);
	freq = egp->build_somac_wall ? min(freq, egp->somac_wall) : freq;

	trace_ego_req_freq(policy->cpu, freq, policy->min, policy->max,
			   org_freq, eng_freq, util, max);

	return freq;
}
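/*
 * Putting it together: the raw util-based value is resolved to the nearest
 * table frequency at or above it (org_freq), the energy search then scans
 * only table entries at or above org_freq, and the larger of org_freq and
 * eng_freq is finally narrowed by fclamp, the policy min/max and, while
 * sysbusy reports SOMAC, the somac_wall tunable.
 */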
/*
 * This function computes an effective utilization for the given CPU, to be
 * used for frequency selection given the linear relation: f = u * f_max.
 *
 * The scheduler tracks the following metrics:
 *
 *   cpu_util_{cfs,rt,dl,irq}()
 *   cpu_bw_dl()
 *
 * Where the cfs,rt and dl util numbers are tracked with the same metric and
 * synchronized windows and are thus directly comparable.
 *
 * The cfs,rt,dl utilization are the running times measured with rq->clock_task
 * which excludes things like IRQ and steal-time. These latter are then accrued
 * in the irq utilization.
 *
 * The DL bandwidth number otoh is not a measured metric but a value computed
 * based on the task model parameters and gives the minimal utilization
 * required to meet deadlines.
 */
unsigned long ego_cpu_util(int cpu, unsigned long util_cfs, unsigned long max,
			   enum schedutil_type type, struct task_struct *p)
{
	unsigned long dl_util, util, irq;
	struct rq *rq = cpu_rq(cpu);

	/*
	 * Early check to see if IRQ/steal time saturates the CPU, can be
	 * because of inaccuracies in how we track these -- see
	 * update_irq_load_avg().
	 */
	irq = cpu_util_irq(rq);
	if (unlikely(irq >= max)) {
		util = irq;
		goto out;
	}

	/*
	 * Because the time spent on RT/DL tasks is visible as 'lost' time to
	 * CFS tasks and we use the same metric to track the effective
	 * utilization (PELT windows are synchronized) we can directly add them
	 * to obtain the CPU's actual utilization.
	 *
	 * CFS and RT utilization can be boosted or capped, depending on
	 * utilization clamp constraints requested by currently RUNNABLE
	 * tasks.
	 * When there are no CFS RUNNABLE tasks, clamps are released and
	 * frequency will be gracefully reduced with the utilization decay.
	 */
	util = util_cfs + cpu_util_rt(rq);
	if (type == FREQUENCY_UTIL)
		util = uclamp_rq_util_with(rq, util, p);

	dl_util = cpu_util_dl(rq);

	/*
	 * For frequency selection we do not make cpu_util_dl() a permanent part
	 * of this sum because we want to use cpu_bw_dl() later on, but we need
	 * to check if the CFS+RT+DL sum is saturated (ie. no idle time) such
	 * that we select f_max when there is no idle time.
	 *
	 * NOTE: numerical errors or stop class might cause us to not quite hit
	 * saturation when we should -- something for later.
	 */
	if (util + dl_util >= max) {
		util = util + dl_util;
		goto out;
	}

	/*
	 * OTOH, for energy computation we need the estimated running time, so
	 * include util_dl and ignore dl_bw.
	 */
	if (type == ENERGY_UTIL)
		util += dl_util;

	/*
	 * There is still idle time; further improve the number by using the
	 * irq metric. Because IRQ/steal time is hidden from the task clock we
	 * need to scale the task numbers:
	 *
	 *              max - irq
	 *   U' = irq + --------- * U
	 *                 max
	 */
	util = scale_irq_capacity(util, irq, max);
	util += irq;

	/*
	 * Bandwidth required by DEADLINE must always be granted while, for
	 * FAIR and RT, we use blocked utilization of IDLE CPUs as a mechanism
	 * to gracefully reduce the frequency when no tasks show up for longer
	 * periods of time.
	 *
	 * Ideally we would like to set bw_dl as min/guaranteed freq and util +
	 * bw_dl as requested freq. However, cpufreq is not yet ready for such
	 * an interface. So, we only do the latter for now.
	 */
	if (type == FREQUENCY_UTIL)
		util += cpu_bw_dl(rq);

out:
	trace_ego_sched_util(cpu, util, util_cfs, cpu_util_rt(rq),
			cpu_util_dl(rq), cpu_bw_dl(rq), cpu_util_irq(rq));

	return min(max, util);
}

static unsigned long ego_get_util(struct ego_cpu *egc)
{
	struct rq *rq = cpu_rq(egc->cpu);
	unsigned long util = ml_cpu_util(egc->cpu);
	unsigned long max = arch_scale_cpu_capacity(egc->cpu);

	egc->max = max;
	egc->bw_dl = cpu_bw_dl(rq);

	return ego_cpu_util(egc->cpu, util, max, FREQUENCY_UTIL, NULL);
}
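/*
 * Worked example of the IRQ scaling above (illustrative values): with
 * max = 1024, irq = 256 and a CFS+RT utilization of 512, the reported
 * utilization becomes U' = 256 + (1024 - 256) / 1024 * 512 = 640, i.e.
 * the task utilization is compressed into the capacity left over by IRQ
 * time before the DL bandwidth is added on top.
 */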
/**
 * ego_iowait_reset() - Reset the IO boost status of a CPU.
 * @egc: the ego data for the CPU to boost
 * @time: the update time from the caller
 * @set_iowait_boost: true if an IO boost has been requested
 *
 * The IO wait boost of a task is disabled after a tick since the last update
 * of a CPU. If a new IO wait boost is requested after more than a tick, then
 * we enable the boost starting from IOWAIT_BOOST_MIN, which improves energy
 * efficiency by ignoring sporadic wakeups from IO.
 */
static bool ego_iowait_reset(struct ego_cpu *egc, u64 time,
			     bool set_iowait_boost)
{
	s64 delta_ns = time - egc->last_update;

	/* Reset boost only if a tick has elapsed since last request */
	if (delta_ns <= TICK_NSEC)
		return false;

	egc->iowait_boost = set_iowait_boost ? IOWAIT_BOOST_MIN : 0;
	egc->iowait_boost_pending = set_iowait_boost;

	return true;
}

/**
 * ego_iowait_boost() - Updates the IO boost status of a CPU.
 * @egc: the ego data for the CPU to boost
 * @time: the update time from the caller
 * @flags: SCHED_CPUFREQ_IOWAIT if the task is waking up after an IO wait
 *
 * Each time a task wakes up after an IO operation, the CPU utilization can be
 * boosted to a certain utilization which doubles at each "frequent and
 * successive" wakeup from IO, ranging from IOWAIT_BOOST_MIN to the utilization
 * of the maximum OPP.
 *
 * To keep doubling, an IO boost has to be requested at least once per tick,
 * otherwise we restart from the utilization of the minimum OPP.
 */
static void ego_iowait_boost(struct ego_cpu *egc, u64 time, unsigned int flags)
{
	bool set_iowait_boost = flags & SCHED_CPUFREQ_IOWAIT;

	/* Reset boost if the CPU appears to have been idle enough */
	if (egc->iowait_boost &&
	    ego_iowait_reset(egc, time, set_iowait_boost))
		return;

	/* Boost only tasks waking up after IO */
	if (!set_iowait_boost)
		return;

	/* Ensure boost doubles only one time at each request */
	if (egc->iowait_boost_pending)
		return;
	egc->iowait_boost_pending = true;

	/* Double the boost at each request */
	if (egc->iowait_boost) {
		egc->iowait_boost =
			min_t(unsigned int, egc->iowait_boost << 1, SCHED_CAPACITY_SCALE);
		return;
	}

	/* First wakeup after IO: start with minimum boost */
	egc->iowait_boost = IOWAIT_BOOST_MIN;
}

/**
 * ego_iowait_apply() - Apply the IO boost to a CPU.
 * @egc: the ego data for the cpu to boost
 * @time: the update time from the caller
 * @util: the utilization to (eventually) boost
 * @max: the maximum value the utilization can be boosted to
 *
 * A CPU running a task which has woken up after an IO operation can have its
 * utilization boosted to speed up the completion of those IO operations.
 * The IO boost value is increased each time a task wakes up from IO, in
 * ego_iowait_boost(), and it's instead decreased by this function,
 * each time an increase has not been requested (!iowait_boost_pending).
 *
 * A CPU which also appears to have been idle for at least one tick has its
 * IO boost utilization reset as well.
 *
 * This mechanism is designed to boost tasks that wait on IO frequently, while
 * being more conservative on tasks which do only sporadic IO operations.
 */
static unsigned long ego_iowait_apply(struct ego_cpu *egc, u64 time,
				      unsigned long util, unsigned long max)
{
	unsigned long boost;

	/* No boost currently required */
	if (!egc->iowait_boost)
		return 0;

	/* Reset boost if the CPU appears to have been idle enough */
	if (ego_iowait_reset(egc, time, false))
		return 0;

	if (!egc->iowait_boost_pending) {
		/*
		 * No boost pending; reduce the boost value.
		 */
		egc->iowait_boost >>= 1;
		if (egc->iowait_boost < IOWAIT_BOOST_MIN) {
			egc->iowait_boost = 0;
			return 0;
		}
	}

	egc->iowait_boost_pending = false;

	/*
	 * @util is already in capacity scale; convert iowait_boost
	 * into the same scale so we can compare.
	 */
	boost = (egc->iowait_boost * max) >> SCHED_CAPACITY_SHIFT;
	boost = max(boost, util);
	boost = uclamp_rq_util_with(cpu_rq(egc->cpu), boost, NULL);

	return boost;
}
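/*
 * Boost progression, for reference: IOWAIT_BOOST_MIN is
 * SCHED_CAPACITY_SCALE / 8 = 128, so back-to-back IO wakeups within a tick
 * walk the boost through 128 -> 256 -> 512 -> 1024 (capped at
 * SCHED_CAPACITY_SCALE), while every update without a pending request
 * halves it again until it drops below 128 and is cleared.
 */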
/*
 * Make ego_should_update_freq() ignore the rate limit when DL
 * has increased the utilization.
 */
static inline void ignore_dl_rate_limit(struct ego_cpu *egc,
					struct ego_policy *egp)
{
	if (cpu_bw_dl(cpu_rq(egc->cpu)) > egc->bw_dl)
		egp->limits_changed = true;
}

static int get_boost_pelt_util(int capacity, int util, int boost)
{
	long long margin;

#if AMIGO_BUILD_VER >= 4
	margin = util * boost / 100;
#else
	if (!boost)
		return util;

	if (boost > 0)
		margin = max(capacity - util, 0) * boost;
	else
		margin = util * boost;
	margin /= 100;
#endif

	return util + margin;
}

static unsigned int ego_next_freq_shared(struct ego_cpu *egc, u64 time)
{
	struct ego_policy *egp = egc->egp;
	struct cpufreq_policy *policy = egp->policy;
	unsigned long util = 0, io_util = 0, max = 1;
	unsigned int cpu;

	for_each_cpu(cpu, policy->cpus) {
		struct ego_cpu *egc = &per_cpu(ego_cpu, cpu);
		unsigned long cpu_util, cpu_io_util, cpu_max;
		unsigned long cpu_boosted_util;

		egc->util = cpu_util = ego_get_util(egc);
		cpu_boosted_util = freqboost_cpu_boost(cpu, cpu_util);
		cpu_boosted_util = max(cpu_boosted_util,
				heavytask_cpu_boost(cpu, cpu_util, egp->htask_boost));
		cpu_boosted_util = get_boost_pelt_util(capacity_cpu(cpu),
				cpu_boosted_util, egp->pelt_boost);
		egc->boosted_util = cpu_boosted_util;
		cpu_max = egc->max;

		cpu_io_util = ego_iowait_apply(egc, time, cpu_util, cpu_max);

		/* find the heaviest util and its cpu */
		if (util < cpu_boosted_util) {
			util = cpu_boosted_util;
			egp->heaviest_cpu = cpu;
		}

		/* find the heaviest io util */
		io_util = max(io_util, cpu_io_util);

		/* find the heaviest max */
		max = max(max, cpu_max);

		trace_ego_cpu_util(cpu, egp->pelt_boost, cpu_util,
				   io_util, cpu_boosted_util);
	}

	util = max(util, io_util);

	return get_next_freq(egp, util, max);
}

static void ego_update_shared(struct update_util_data *hook, u64 time,
			      unsigned int flags)
{
	struct ego_cpu *egc = container_of(hook, struct ego_cpu, update_util);
	struct ego_policy *egp = egc->egp;
	unsigned int next_f;

	ego_iowait_boost(egc, time, flags);
	egc->last_update = time;

	ignore_dl_rate_limit(egc, egp);

	if (egc->iowait_boost || egp->limits_changed)
		raw_spin_lock(&egp->update_lock);
	else if (!raw_spin_trylock(&egp->update_lock))
		return;

	if (ego_should_update_freq(egp, time)) {
		next_f = ego_next_freq_shared(egc, time);

		if (ego_postpone_freq_update(egp, time, next_f))
			goto out;

		if (egp->policy->fast_switch_enabled)
			ego_fast_switch(egp, time, next_f);
		else
			ego_deferred_update(egp, time, next_f);
	}

out:
	raw_spin_unlock(&egp->update_lock);
}
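/*
 * get_boost_pelt_util() example (illustrative values, legacy path where
 * AMIGO_BUILD_VER < 4): with capacity = 1024, util = 400 and
 * pelt_boost = 20, margin = (1024 - 400) * 20 / 100 = 124, so the boosted
 * utilization becomes 524; a negative boost of -20 instead subtracts
 * 400 * 20 / 100 = 80. On AMIGO_BUILD_VER >= 4 the margin is simply
 * util * boost / 100.
 */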
static void ego_work(struct kthread_work *work)
{
	struct ego_policy *egp = container_of(work, struct ego_policy, work);
	unsigned int freq;
	unsigned long flags;

	/*
	 * Hold egp->update_lock shortly to handle the case where:
	 * in case egp->next_freq is read here, and then updated by
	 * ego_deferred_update() just before work_in_progress is set to false
	 * here, we may miss queueing the new update.
	 *
	 * Note: If a work was queued after the update_lock is released,
	 * ego_work() will just be called again by kthread_work code; and the
	 * request will be processed before the ego thread sleeps.
	 */
	raw_spin_lock_irqsave(&egp->update_lock, flags);
	freq = egp->next_freq;
	egp->work_in_progress = false;
	raw_spin_unlock_irqrestore(&egp->update_lock, flags);

	mutex_lock(&egp->work_lock);
	__cpufreq_driver_target(egp->policy, freq, CPUFREQ_RELATION_L);
	mutex_unlock(&egp->work_lock);
}

static void ego_irq_work(struct irq_work *irq_work)
{
	struct ego_policy *egp;

	egp = container_of(irq_work, struct ego_policy, irq_work);

	kthread_queue_work(&egp->worker, &egp->work);
}

/************************** sysfs interface ************************/
struct ego_attr {
	struct attribute attr;
	ssize_t (*show)(struct kobject *, char *);
	ssize_t (*store)(struct kobject *, const char *, size_t count);
};

#define ego_attr_rw(name)						\
static struct ego_attr name##_attr =					\
__ATTR(name, 0644, show_##name, store_##name)

#define ego_show(name)							\
static ssize_t show_##name(struct kobject *k, char *buf)		\
{									\
	struct ego_policy *egp =					\
			container_of(k, struct ego_policy, kobj);	\
									\
	return sprintf(buf, "%d\n", egp->name);				\
}

#define ego_store(name)							\
static ssize_t store_##name(struct kobject *k, const char *buf, size_t count)	\
{									\
	struct ego_policy *egp =					\
			container_of(k, struct ego_policy, kobj);	\
	int data;							\
									\
	if (!sscanf(buf, "%d", &data))					\
		return -EINVAL;						\
									\
	egp->name = data;						\
	return count;							\
}

ego_show(ratio);
ego_store(ratio);
ego_attr_rw(ratio);

ego_show(dis_buck_share);
ego_store(dis_buck_share);
ego_attr_rw(dis_buck_share);

ego_show(somac_wall);
ego_store(somac_wall);
ego_attr_rw(somac_wall);

static ssize_t show(struct kobject *kobj, struct attribute *at, char *buf)
{
	struct ego_attr *fvattr = container_of(at, struct ego_attr, attr);

	return fvattr->show(kobj, buf);
}

static ssize_t store(struct kobject *kobj, struct attribute *at,
		     const char *buf, size_t count)
{
	struct ego_attr *fvattr = container_of(at, struct ego_attr, attr);

	return fvattr->store(kobj, buf, count);
}

static const struct sysfs_ops ego_sysfs_ops = {
	.show	= show,
	.store	= store,
};

static struct attribute *ego_attrs[] = {
	&ratio_attr.attr,
	&somac_wall_attr.attr,
	&dis_buck_share_attr.attr,
	NULL
};

static struct kobj_type ktype_ego = {
	.sysfs_ops	= &ego_sysfs_ops,
	.default_attrs	= ego_attrs,
};
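/*
 * The attributes above are exposed per coregroup under the "ego" kobject
 * created in ego_register(); assuming ems_kobj sits at /sys/kernel/ems
 * (not shown in this file), the ratio tunable of the first coregroup
 * would be reachable as, e.g.:
 *
 *   cat /sys/kernel/ems/ego/coregroup0/ratio
 *   echo 500 > /sys/kernel/ems/ego/coregroup0/ratio
 */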
/********************** cpufreq governor interface *********************/
struct cpufreq_governor energy_aware_gov;

static int ego_kthread_create(struct ego_policy *egp)
{
	struct task_struct *thread;
	struct sched_param param = { .sched_priority = MAX_RT_PRIO / 2 };
	struct cpufreq_policy *policy = egp->policy;
	int ret;

	/* kthread is only required for the slow path */
	if (policy->fast_switch_enabled)
		return 0;

	kthread_init_work(&egp->work, ego_work);
	kthread_init_worker(&egp->worker);
	thread = kthread_create(kthread_worker_fn, &egp->worker,
				"ego:%d", cpumask_first(policy->related_cpus));
	if (IS_ERR(thread)) {
		pr_err("failed to create ego thread: %ld\n", PTR_ERR(thread));
		return PTR_ERR(thread);
	}

	ret = sched_setscheduler_nocheck(thread, SCHED_FIFO, &param);
	if (ret) {
		kthread_stop(thread);
		pr_warn("%s: failed to set SCHED_FIFO\n", __func__);
		return ret;
	}

	set_cpus_allowed_ptr(thread, &egp->thread_allowed_cpus);
	thread->flags |= PF_NO_SETAFFINITY;

	egp->thread = thread;
	init_irq_work(&egp->irq_work, ego_irq_work);
	mutex_init(&egp->work_lock);

	pr_info("%s: cpus=%#x, allowed-cpu=%#x\n", __func__,
		*(unsigned int *)cpumask_bits(&egp->cpus),
		*(unsigned int *)cpumask_bits(&egp->thread_allowed_cpus));

	return 0;
}

static int ego_init(struct cpufreq_policy *policy)
{
	struct ego_policy *egp = NULL;
	int cpu;

	/* State should be equivalent to EXIT */
	if (policy->governor_data)
		return -EBUSY;

	cpufreq_enable_fast_switch(policy);

	egp = per_cpu(ego_cpu, policy->cpu).egp;
	if (!egp) {
		pr_info("%s: ego_policy is not ready\n", __func__);
		goto fail_ego_init;
	}

	if (egp->policy) {
		egp->policy = policy;
		pr_info("%s: ego_policy was already initialized\n", __func__);
		goto complete_ego_init;
	}

	egp->policy = policy;
	if (ego_kthread_create(egp)) {
		pr_info("%s: failed to create kthread\n", __func__);
		goto fail_ego_init;
	}

complete_ego_init:
	if (!policy->fast_switch_enabled)
		wake_up_process(egp->thread);

	policy->governor_data = egp;

	for_each_cpu(cpu, policy->related_cpus)
		cpufreq_register_hook(cpu, NULL, ego_need_slack_timer);

	pr_info("%s: ego init complete: cpus=%#x, allowed-cpu=%#x\n", __func__,
		*(unsigned int *)cpumask_bits(&egp->cpus),
		*(unsigned int *)cpumask_bits(&egp->thread_allowed_cpus));

	return 0;

fail_ego_init:
	cpufreq_disable_fast_switch(policy);
	pr_err("initialization failed\n");

	return -1;
}

static void ego_exit(struct cpufreq_policy *policy)
{
	int cpu;

	policy->governor_data = NULL;
	cpufreq_disable_fast_switch(policy);

	for_each_cpu(cpu, policy->related_cpus)
		cpufreq_unregister_hook(cpu);
}

static int ego_start(struct cpufreq_policy *policy)
{
	struct ego_policy *egp = policy->governor_data;
	unsigned int cpu;

	egp->pelt_margin = DEFAULT_PELT_MARGIN;
	egp->freq_update_delay_ns = 4 * NSEC_PER_MSEC;
	egp->up_rate_limit_ns = 500 * NSEC_PER_MSEC;
	egp->down_rate_limit_ns = 1000 * NSEC_PER_MSEC;
	egp->last_freq_update_time = 0;
	egp->next_freq = 0;
	egp->work_in_progress = false;
	egp->limits_changed = false;
	egp->need_freq_update = false;
	egp->cached_raw_freq = 0;

	for_each_cpu(cpu, policy->cpus) {
		struct ego_cpu *egc = &per_cpu(ego_cpu, cpu);

		egc->iowait_boost_pending = false;
		egc->iowait_boost = 0;
		egc->last_update = 0;
		egc->bw_dl = 0;
		egc->max = 0;
		egc->util = 0;
		egc->boosted_util = 0;
		egc->egp = egp;
		egc->cpu = cpu;
		egc->min_cap = ULONG_MAX;
	}

	for_each_cpu(cpu, policy->cpus) {
		struct ego_cpu *egc = &per_cpu(ego_cpu, cpu);

		cpufreq_add_update_util_hook(cpu, &egc->update_util,
					     ego_update_shared);
	}

	return 0;
}

static void ego_stop(struct cpufreq_policy *policy)
{
	struct ego_policy *egp = policy->governor_data;
	unsigned int cpu;

	for_each_cpu(cpu, policy->cpus)
		cpufreq_remove_update_util_hook(cpu);

	synchronize_rcu();

	if (!policy->fast_switch_enabled) {
		irq_work_sync(&egp->irq_work);
		kthread_cancel_work_sync(&egp->work);
	}
}

static void ego_limits(struct cpufreq_policy *policy)
{
	struct ego_policy *egp = policy->governor_data;
	unsigned int target_freq;
	unsigned long flags;

	target_freq = max(egp->org_freq, egp->eng_freq);
	target_freq = clamp_val(target_freq, policy->min, policy->max);

	raw_spin_lock_irqsave(&egp->update_lock, flags);
	ego_update_min_cap(policy);
	ego_update_next_freq(egp, egp->last_freq_update_time, target_freq);
	raw_spin_unlock_irqrestore(&egp->update_lock, flags);

	if (!policy->fast_switch_enabled) {
		mutex_lock(&egp->work_lock);
		__cpufreq_driver_target(policy, target_freq, CPUFREQ_RELATION_H);
		mutex_unlock(&egp->work_lock);
	} else {
		cpufreq_driver_fast_switch(policy, target_freq);
	}
}

struct cpufreq_governor energy_aware_gov = {
	.name		= "energy_aware",
	.owner		= THIS_MODULE,
	.flags		= CPUFREQ_GOV_DYNAMIC_SWITCHING,
	.init		= ego_init,
	.exit		= ego_exit,
	.start		= ego_start,
	.stop		= ego_stop,
	.limits		= ego_limits,
};

#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_ENERGYAWARE
struct cpufreq_governor *cpufreq_default_governor(void)
{
	return &energy_aware_gov;
}
#endif
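/*
 * Usage note: unless CONFIG_CPU_FREQ_DEFAULT_GOV_ENERGYAWARE makes this the
 * default governor, it can be selected per policy through the standard
 * cpufreq sysfs interface, e.g. (policy number is illustrative):
 *
 *   echo energy_aware > /sys/devices/system/cpu/cpufreq/policy0/scaling_governor
 */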
static int ego_register(struct kobject *ems_kobj)
{
	ego_kobj = kobject_create_and_add("ego", ems_kobj);
	if (!ego_kobj)
		return -EINVAL;

	sysbusy_register_notifier(&ego_sysbusy_notifier);
	emstune_register_notifier(&ego_mode_update_notifier);

	return cpufreq_register_governor(&energy_aware_gov);
}

static struct ego_policy *ego_policy_alloc(void)
{
	return kzalloc(sizeof(struct ego_policy), GFP_KERNEL);
}

static int ego_parse_dt(struct device_node *dn, struct ego_policy *egp)
{
	struct cpumask mask;
	const char *buf;

	if (of_property_read_string(dn, "cpus", &buf)) {
		pr_err("%s: cpus property is omitted\n", __func__);
		return -1;
	} else
		cpulist_parse(buf, &egp->cpus);

	if (!of_property_read_string(dn, "thread-run-on", &buf))
		cpulist_parse(buf, &mask);
	else
		cpumask_copy(&mask, cpu_possible_mask);
	cpumask_copy(&egp->thread_allowed_cpus, &mask);

	if (of_property_read_u32(dn, "ratio", &egp->ratio))
		egp->ratio = RATIO_UNIT;

	if (of_property_read_u32(dn, "dis-buck-share", &egp->dis_buck_share))
		egp->dis_buck_share = 0;

	if (of_property_read_u32(dn, "somac_wall", &egp->somac_wall))
		egp->somac_wall = UINT_MAX;

	return 0;
}

int ego_pre_init(struct kobject *ems_kobj)
{
	struct device_node *dn, *child;
	int cpu;

	dn = of_find_node_by_path("/ems/ego");
	if (!dn)
		goto fail;

	ego_register(ems_kobj);

	for_each_child_of_node(dn, child) {
		struct ego_policy *egp;

		egp = ego_policy_alloc();
		if (!egp) {
			pr_err("%s: failed to alloc ego_policy\n", __func__);
			goto fail;
		}

		/* Parse device tree */
		if (ego_parse_dt(child, egp))
			goto fail;

		/* Init Sysfs */
		if (kobject_init_and_add(&egp->kobj, &ktype_ego, ego_kobj,
				"coregroup%d", cpumask_first(&egp->cpus)))
			goto fail;

		/* init the policy spin lock */
		raw_spin_lock_init(&egp->update_lock);

		for_each_cpu(cpu, &egp->cpus) {
			struct ego_cpu *egc = &per_cpu(ego_cpu, cpu);

			egc->egp = egp;
		}
	}

	return 0;

fail:
	for_each_possible_cpu(cpu) {
		struct ego_policy *egp = per_cpu(ego_cpu, cpu).egp;
		int i;

		if (!egp)
			continue;

		/* clear every per-cpu reference before freeing to avoid a double free */
		for_each_cpu(i, &egp->cpus)
			per_cpu(ego_cpu, i).egp = NULL;
		kfree(egp);
	}

	return -1;
}
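/*
 * Illustrative device-tree layout consumed by ego_pre_init()/ego_parse_dt()
 * (node names follow the "/ems/ego" path above; the property values below
 * are placeholders, not taken from a real platform):
 *
 *	ems {
 *		ego {
 *			coregroup0 {
 *				cpus = "0-3";
 *				thread-run-on = "0-3";
 *				ratio = <1000>;
 *				dis-buck-share = <0>;
 *				somac_wall = <1000000>;
 *			};
 *		};
 *	};
 */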