/*
 * CPUFreq governor based on Energy-Step-Data And Scheduler-Event.
 *
 * Copyright (C) 2019 Samsung Electronics Corporation
 * Author: Youngtae Lee
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#include
#include
#include
#include
#include
#include
#include

#include "../sched.h"
#include "ems.h"

#include
#include

struct esgov_policy {
	struct cpufreq_policy *policy;
	struct cpumask cpus;
	struct kobject kobj;
	raw_spinlock_t update_lock;
	struct rw_semaphore rwsem;
	bool enabled;			/* whether esg is the current cpufreq governor or not */
	bool running;			/* whether esg is running or not */

	unsigned int last_caller;
	unsigned int target_freq;	/* target frequency at the current status */
	int util;			/* target util */
	u64 last_freq_update_time;

	/* The next fields are for the tunables */
	int step;
	unsigned long step_power;	/* allowed energy at a step */
	s64 rate_delay_ns;
	int patient_mode;
	int pelt_margin;
	int pelt_boost;
	int up_rate_limit_ns;
	int down_rate_limit_ns;
	int rapid_scale_up;
	int rapid_scale_down;
	bool limits_changed;

	/* slack timer */
	int slack_expired_time_ms;

	/* no work freq press */
	int no_work_press_ratio;

	/* Tracking min/max information */
	int min_cap;			/* allowed min capacity */
	int max_cap;			/* allowed max capacity */
	int min;			/* min freq */
	int max;			/* max freq */

	/* The next fields are for frequency change work */
	bool work_in_progress;
	struct irq_work irq_work;
	struct kthread_work work;
	struct mutex work_lock;
	struct kthread_worker worker;
	struct task_struct *thread;
};

struct esgov_cpu {
	struct update_util_data update_util;
	struct esgov_policy *esg_policy;
	unsigned int cpu;

	int util;		/* target util */
	int pelt_util;		/* pelt util */
	int step_util;		/* energy step util */
	int io_util;		/* io boost util */
	int active_ratio;
	int capacity;
	int last_idx;

	bool iowait_boost_pending;
	unsigned int iowait_boost;
	u64 last_update;

	unsigned long min;	/* min util matched with min_cap */
	bool no_work_press;
};

struct esgov_param {
	struct cpumask cpus;
	int step;
	int patient_mode;
	int pelt_margin;
	int pelt_boost;
	int up_rate_limit;
	int down_rate_limit;
	int rapid_scale_up;
	int rapid_scale_down;
};

struct kobject *esg_kobj;
static struct esgov_policy __percpu **esgov_policy;
static struct esgov_cpu __percpu *esgov_cpu;
static struct esgov_param __percpu **esgov_param;

/*************************************************************************/
/*                           HELPER FUNCTION                             */
/*************************************************************************/
int find_allowed_capacity(int cpu, unsigned int freq, int power)
{
	unsigned long cur_power = et_freq_to_dpower(cpu, freq);

	return et_dpower_to_cap(cpu, cur_power + power);
}

static void esg_update_freq_range(struct cpufreq_policy *data)
{
	unsigned int new_min, new_max, new_min_idx, new_max_idx;
	struct esgov_policy *esg_policy = *per_cpu_ptr(esgov_policy, data->cpu);
	struct cpufreq_policy *policy;

	if (unlikely(!esg_policy || !esg_policy->enabled))
		return;

	policy = esg_policy->policy;

	new_min = data->min;
	new_max = data->max;

	if (esg_policy->min == new_min && esg_policy->max == new_max)
		return;

	esg_policy->min = new_min;
	esg_policy->max = new_max;

	new_min_idx = cpufreq_frequency_table_target(policy,
					new_min, CPUFREQ_RELATION_L);
	new_max_idx = cpufreq_frequency_table_target(policy,
					new_max, CPUFREQ_RELATION_H);

	new_min = esg_policy->policy->freq_table[new_min_idx].frequency;
	new_max = esg_policy->policy->freq_table[new_max_idx].frequency;

	esg_policy->min_cap = find_allowed_capacity(policy->cpu, new_min, 0);
	esg_policy->max_cap = find_allowed_capacity(policy->cpu, new_max, 0);
	esg_policy->min_cap = min(esg_policy->max_cap, esg_policy->min_cap);

	trace_esg_update_limit(policy->cpu, esg_policy->min_cap, esg_policy->max_cap);
}
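/*
 * Illustrative note (not from the original source, numbers are hypothetical):
 * "step" divides the dynamic-power range of a cluster into equal energy
 * budgets.  With et_max_dpower() = 1000, et_min_dpower() = 200 and step = 4,
 * esg_update_step() below yields step_power = (1000 - 200) / 4 = 200, i.e.
 * each step may add at most ~200 units of dynamic power on top of the
 * current operating point (see find_allowed_capacity(), which converts
 * "current power + step_power" back into a capacity).
 */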
static void esg_update_step(struct esgov_policy *esg_policy, int step)
{
	int cpu = cpumask_first(&esg_policy->cpus);

	esg_policy->step = step;
	esg_policy->step_power = (et_max_dpower(cpu) - et_min_dpower(cpu)) / step;
}

static void esg_sync_param(struct esgov_policy *esg_policy, struct esgov_param *param)
{
	esg_update_step(esg_policy, param->step);
	esg_policy->patient_mode = param->patient_mode;
	esg_policy->pelt_margin = param->pelt_margin;
	esg_policy->pelt_boost = param->pelt_boost;
	esg_policy->up_rate_limit_ns = param->up_rate_limit * NSEC_PER_MSEC;
	esg_policy->down_rate_limit_ns = param->down_rate_limit * NSEC_PER_MSEC;
	esg_policy->rapid_scale_up = param->rapid_scale_up;
	esg_policy->rapid_scale_down = param->rapid_scale_down;
}

static int esg_mode_update_callback(struct notifier_block *nb,
				unsigned long val, void *v)
{
	struct emstune_set *cur_set = (struct emstune_set *)v;
	struct esgov_policy *esg_policy;
	struct esgov_param *param;
	int cpu;

	for_each_possible_cpu(cpu) {
		if (cpu != cpumask_first(cpu_coregroup_mask(cpu)))
			continue;

		param = *per_cpu_ptr(esgov_param, cpu);
		if (unlikely(!param))
			continue;

		param->step = cur_set->esg.step[cpu];
		param->pelt_margin = cur_set->esg.pelt_margin[cpu];
		param->patient_mode = cur_set->cpufreq_gov.patient_mode[cpu];
		param->pelt_boost = cur_set->cpufreq_gov.pelt_boost[cpu];
		param->up_rate_limit = 4;
		param->down_rate_limit = 4;
		param->rapid_scale_up = cur_set->cpufreq_gov.rapid_scale_up;
		param->rapid_scale_down = cur_set->cpufreq_gov.rapid_scale_down;

		esg_policy = *per_cpu_ptr(esgov_policy, cpu);
		if (unlikely(!esg_policy || !esg_policy->enabled))
			continue;

		esg_sync_param(esg_policy, param);
	}

	return NOTIFY_OK;
}

/*
 * Return the next maximum util of a group when a task moves to dst_cpu.
 * cpus: cpumask of the target group to get the next maximum util from
 * dst_cpu: destination cpu of the task
 */
static unsigned int esgov_calc_cpu_target_util(struct esgov_cpu *esg_cpu,
			int max, int org_pelt_util, int pelt_util_diff,
			int nr_running);

int esgov_get_gov_next_cap(struct tp_env *env, struct cpumask *cpus,
				int dst_cpu, bool apply_clamp)
{
	struct esgov_policy *esg_policy;
	int cpu, src_cpu = task_cpu(env->p);
	int max_util = 0;

	esg_policy = *per_cpu_ptr(esgov_policy, cpumask_any(cpus));
	if (unlikely(!esg_policy || !esg_policy->enabled))
		return -ENODEV;

	if (esg_policy->min_cap >= esg_policy->max_cap)
		return esg_policy->max_cap;

	/* get max util of the cluster of this cpu */
	for_each_cpu(cpu, esg_policy->policy->cpus) {
		struct esgov_cpu *esg_cpu = per_cpu_ptr(esgov_cpu, cpu);
		unsigned int max = capacity_cpu_orig(cpu);
		int cpu_util, pelt_util;
		int nr_running = env->cpu_stat[cpu].nr_running;

		if (!esg_cpu)
			continue;

		pelt_util = env->cpu_stat[cpu].util;
		if (cpu != dst_cpu && cpu != src_cpu) {
			/* util of a cpu not involved in the migration */
			cpu_util = esgov_calc_cpu_target_util(esg_cpu,
					max, pelt_util, 0, nr_running);
		} else if (cpu == dst_cpu && cpu != src_cpu) {
			/* util of dst_cpu (when migrating the task) */
			cpu_util = esgov_calc_cpu_target_util(esg_cpu,
					max, pelt_util, env->task_util, nr_running);
		} else if (cpu != dst_cpu && cpu == src_cpu) {
			/* util of src_cpu (when migrating the task) */
			cpu_util = esgov_calc_cpu_target_util(esg_cpu,
					max, pelt_util, -env->task_util, nr_running);
		} else {
			/* util of src_cpu (when the task stays on src_cpu) */
			cpu_util = esgov_calc_cpu_target_util(esg_cpu,
					max, pelt_util, 0, nr_running);
		}

		if (cpu_util > max_util)
			max_util = cpu_util;
	}

	if (apply_clamp) {
		/* max_util floor/ceiling depend on the CPUFreq min/max lock */
		max_util = max(esg_policy->min_cap, max_util);
		max_util = min(esg_policy->max_cap, max_util);
	}

	return max_util;
}
/*************************************************************************/
/*                             SLACK TIMER                               */
/*************************************************************************/
struct esgov_slack_timer {
	/* for slack timer */
	unsigned long min;
	int enabled;
	struct timer_list timer;
};

/* slack timer per cpu */
static struct esgov_slack_timer __percpu *esgov_timer;

static void slack_update_min(struct cpufreq_policy *policy)
{
	unsigned int cpu;
	unsigned long max_cap, min_cap;
	struct esgov_slack_timer *slack_timer;

	max_cap = capacity_cpu_orig(policy->cpu);

	/* min_cap is the smallest capacity that maps to a frequency above policy->min */
	min_cap = (max_cap * policy->min) / policy->max;
	min_cap -= 1;

	for_each_cpu(cpu, policy->cpus) {
		slack_timer = per_cpu_ptr(esgov_timer, cpu);
		if (!slack_timer)
			continue;

		slack_timer->min = min_cap;
	}
}

static void slack_nop_timer(struct timer_list *timer)
{
	/*
	 * The purpose of the slack timer is to wake up the CPU from IDLE,
	 * in order to decrease its frequency if it is not at the minimum
	 * already.
	 *
	 * This is important for platforms where a CPU at a higher frequency
	 * consumes more power even while IDLE.
	 */
	trace_cpufreq_gov_slack_func(smp_processor_id());
}

static int esgov_cpu_pm_callback(struct notifier_block *nb,
				unsigned long event, void *v)
{
	unsigned int cpu = raw_smp_processor_id();
	struct esgov_slack_timer *slack_timer = per_cpu_ptr(esgov_timer, cpu);
	struct timer_list *timer;
	struct esgov_cpu *esg_cpu = per_cpu_ptr(esgov_cpu, cpu);

	if (!esg_cpu || !slack_timer)
		return NOTIFY_OK;

	if (!esg_cpu->esg_policy || !esg_cpu->esg_policy->running)
		return NOTIFY_OK;

	timer = &slack_timer->timer;

	switch (event) {
	case CPU_PM_ENTER:
		if (timer_pending(timer))
			del_timer_sync(timer);

		if (esg_cpu->util > slack_timer->min) {
			timer->expires = jiffies +
				msecs_to_jiffies(esg_cpu->esg_policy->slack_expired_time_ms);
			add_timer_on(timer, cpu);

			trace_cpufreq_gov_slack(cpu, esg_cpu->util,
						slack_timer->min, event);
		}
		break;

	case CPU_PM_EXIT:
		if (timer_pending(timer)) {
			del_timer_sync(timer);

			trace_cpufreq_gov_slack(cpu, esg_cpu->util,
						slack_timer->min, event);
		}
		break;
	}

	return NOTIFY_OK;
}

static struct notifier_block esg_cpu_pm_notifier = {
	.notifier_call = esgov_cpu_pm_callback,
};

#define DEFAULT_SLACK_EXPIRED_TIME	(20)
static void esgov_init_slack_timer(struct cpufreq_policy *policy)
{
	int cpu;

	for_each_cpu(cpu, policy->related_cpus) {
		struct esgov_slack_timer *slack_timer = per_cpu_ptr(esgov_timer, cpu);

		if (!slack_timer)
			continue;

		/* Initialize slack timer */
		slack_timer->min = ULONG_MAX;
		timer_setup(&slack_timer->timer, slack_nop_timer, TIMER_PINNED);
	}
}
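/*
 * Illustrative timeline (not from the original source, numbers are
 * hypothetical): with slack_expired_time_ms = 20 and slack_timer->min derived
 * from policy->min, a CPU that enters idle (CPU_PM_ENTER) while its last
 * target util is still above that "min" threshold arms a pinned 20ms timer.
 * If the CPU stays idle that long, slack_nop_timer() fires and merely wakes
 * the CPU, giving the governor a chance to re-evaluate and drop the
 * frequency; leaving idle earlier (CPU_PM_EXIT) cancels the timer.
 */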
/*************************************************************************/
/*                             IOWAIT BOOST                              */
/*************************************************************************/
/**
 * esgov_iowait_reset() - Reset the IO boost status of a CPU.
 * @esg_cpu: the esgov data for the CPU to boost
 * @time: the update time from the caller
 * @set_iowait_boost: true if an IO boost has been requested
 *
 * The IO wait boost of a task is disabled after a tick since the last update
 * of a CPU. If a new IO wait boost is requested after more than a tick, then
 * we enable the boost starting from the minimum frequency, which improves
 * energy efficiency by ignoring sporadic wakeups from IO.
 */
static bool esgov_iowait_reset(struct esgov_cpu *esg_cpu, u64 time,
			       bool set_iowait_boost)
{
	s64 delta_ns = time - esg_cpu->last_update;

	/* Reset boost only if a tick has elapsed since last request */
	if (delta_ns <= TICK_NSEC)
		return false;

	esg_cpu->iowait_boost = set_iowait_boost ? esg_cpu->min : 0;
	esg_cpu->iowait_boost_pending = set_iowait_boost;

	return true;
}

/**
 * esgov_iowait_boost() - Updates the IO boost status of a CPU.
 * @esg_cpu: the esgov data for the CPU to boost
 * @time: the update time from the caller
 * @flags: SCHED_CPUFREQ_IOWAIT if the task is waking up after an IO wait
 *
 * Each time a task wakes up after an IO operation, the CPU utilization can be
 * boosted to a certain utilization which doubles at each "frequent and
 * successive" wakeup from IO, ranging from the utilization of the minimum
 * OPP to the utilization of the maximum OPP.
 * To keep doubling, an IO boost has to be requested at least once per tick,
 * otherwise we restart from the utilization of the minimum OPP.
 */
static void esgov_iowait_boost(struct esgov_cpu *esg_cpu, u64 time,
			       unsigned int flags)
{
	bool set_iowait_boost = flags & SCHED_CPUFREQ_IOWAIT;

	/* Reset boost if the CPU appears to have been idle enough */
	if (esg_cpu->iowait_boost &&
	    esgov_iowait_reset(esg_cpu, time, set_iowait_boost))
		return;

	/* Boost only tasks waking up after IO */
	if (!set_iowait_boost)
		return;

	/* Ensure the boost doubles only once per request */
	if (esg_cpu->iowait_boost_pending)
		return;
	esg_cpu->iowait_boost_pending = true;

	/* Double the boost at each request */
	if (esg_cpu->iowait_boost) {
		esg_cpu->iowait_boost = min_t(unsigned int,
				esg_cpu->iowait_boost << 1, SCHED_CAPACITY_SCALE);
		return;
	}

	/* First wakeup after IO: start with minimum boost */
	esg_cpu->iowait_boost = esg_cpu->min;
}

/**
 * esgov_iowait_apply() - Apply the IO boost to a CPU.
 * @esg_cpu: the esgov data for the cpu to boost
 * @time: the update time from the caller
 * @max: the maximum value the utilization can be boosted to
 *
 * A CPU running a task which has woken up after an IO operation can have its
 * utilization boosted to speed up the completion of those IO operations.
 * The IO boost value is increased each time a task wakes up from IO, in
 * esgov_iowait_boost(), and it is instead decreased by this function,
 * each time an increase has not been requested (!iowait_boost_pending).
 *
 * A CPU which appears to have been idle for at least one tick also has its
 * IO boost utilization reset.
 *
 * This mechanism is designed to boost tasks that frequently wait on IO,
 * while being more conservative about tasks that perform only sporadic IO
 * operations.
 */
static unsigned long esgov_iowait_apply(struct esgov_cpu *esg_cpu, u64 time,
					unsigned long max)
{
	unsigned long boost;

	/* No boost currently required */
	if (!esg_cpu->iowait_boost)
		return 0;

	/* Reset boost if the CPU appears to have been idle enough */
	if (esgov_iowait_reset(esg_cpu, time, false))
		return 0;

	if (!esg_cpu->iowait_boost_pending) {
		/* No boost pending; reduce the boost value. */
		esg_cpu->iowait_boost >>= 1;
		if (esg_cpu->iowait_boost < esg_cpu->min) {
			esg_cpu->iowait_boost = 0;
			return 0;
		}
	}

	esg_cpu->iowait_boost_pending = false;

	boost = (esg_cpu->iowait_boost * max) >> SCHED_CAPACITY_SHIFT;
	boost = boost + (boost >> 2);

	return boost;
}
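/*
 * Illustrative lifecycle (not from the original source, numbers are
 * hypothetical): assume esg_cpu->min = 128 and max = 1024.  The first IO
 * wakeup sets iowait_boost to 128; wakeups arriving at least once per tick
 * double it (256, 512, up to SCHED_CAPACITY_SCALE).  Each update without a
 * pending request halves it in esgov_iowait_apply(), and once it drops below
 * esg_cpu->min it is cleared.  The value actually applied is scaled to the
 * CPU's capacity and padded by 25%:
 *   boost = ((iowait_boost * max) >> SCHED_CAPACITY_SHIFT) * 1.25
 */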
static struct notifier_block esg_mode_update_notifier = {
	.notifier_call = esg_mode_update_callback,
};

struct esg_attr {
	struct attribute attr;
	ssize_t (*show)(struct kobject *, char *);
	ssize_t (*store)(struct kobject *, const char *, size_t count);
};

#define esg_attr_rw(name)				\
static struct esg_attr name##_attr =			\
__ATTR(name, 0644, show_##name, store_##name)

#define esg_show_step(name, related_val)				\
static ssize_t show_##name(struct kobject *k, char *buf)		\
{									\
	struct esgov_policy *esg_policy =				\
			container_of(k, struct esgov_policy, kobj);	\
									\
	return sprintf(buf, "step: %d (energy: %lu)\n",			\
			esg_policy->name, esg_policy->related_val);	\
}

#define esg_store_step(name)						\
static ssize_t store_##name(struct kobject *k, const char *buf, size_t count)	\
{									\
	struct esgov_policy *esg_policy =				\
			container_of(k, struct esgov_policy, kobj);	\
	int data;							\
									\
	if (!sscanf(buf, "%d", &data))					\
		return -EINVAL;						\
									\
	esg_update_##name(esg_policy, data);				\
									\
	return count;							\
}

#define esg_show(name)							\
static ssize_t show_##name(struct kobject *k, char *buf)		\
{									\
	struct esgov_policy *esg_policy =				\
			container_of(k, struct esgov_policy, kobj);	\
									\
	return sprintf(buf, "%d\n", esg_policy->name);			\
}

#define esg_store(name)							\
static ssize_t store_##name(struct kobject *k, const char *buf, size_t count)	\
{									\
	struct esgov_policy *esg_policy =				\
			container_of(k, struct esgov_policy, kobj);	\
	int data;							\
									\
	if (!sscanf(buf, "%d", &data))					\
		return -EINVAL;						\
									\
	esg_policy->name = data;					\
	return count;							\
}

esg_show(slack_expired_time_ms);
esg_store(slack_expired_time_ms);
esg_attr_rw(slack_expired_time_ms);

esg_show(no_work_press_ratio);
esg_store(no_work_press_ratio);
esg_attr_rw(no_work_press_ratio);

static ssize_t show(struct kobject *kobj, struct attribute *at, char *buf)
{
	struct esg_attr *fvattr = container_of(at, struct esg_attr, attr);

	return fvattr->show(kobj, buf);
}

static ssize_t store(struct kobject *kobj, struct attribute *at,
					const char *buf, size_t count)
{
	struct esg_attr *fvattr = container_of(at, struct esg_attr, attr);

	return fvattr->store(kobj, buf, count);
}

static const struct sysfs_ops esg_sysfs_ops = {
	.show	= show,
	.store	= store,
};

static struct attribute *esg_attrs[] = {
	&slack_expired_time_ms_attr.attr,
	&no_work_press_ratio_attr.attr,
	NULL
};

static struct kobj_type ktype_esg = {
	.sysfs_ops	= &esg_sysfs_ops,
	.default_attrs	= esg_attrs,
};

static struct esgov_policy *esgov_policy_alloc(struct cpufreq_policy *policy)
{
	struct esgov_policy *esg_policy;
	struct esgov_param *param;

	/* allocate esgov_policy */
	esg_policy = kzalloc(sizeof(struct esgov_policy), GFP_KERNEL);
	if (!esg_policy)
		goto init_failed;

	/* init cpumask */
	cpumask_copy(&esg_policy->cpus, policy->related_cpus);
	if (cpumask_weight(&esg_policy->cpus) == 0)
		goto free_allocation;

	param = *per_cpu_ptr(esgov_param, cpumask_any(&esg_policy->cpus));
	esg_sync_param(esg_policy, param);

	esgov_init_slack_timer(policy);

	esg_policy->rate_delay_ns = 4 * NSEC_PER_MSEC;

	/* init sysfs */
	if (kobject_init_and_add(&esg_policy->kobj, &ktype_esg, esg_kobj,
			"coregroup%d", cpumask_first(&esg_policy->cpus)))
		goto free_allocation;

	/* init spin lock */
	raw_spin_lock_init(&esg_policy->update_lock);
	init_rwsem(&esg_policy->rwsem);

	esg_policy->policy = policy;

	return esg_policy;

free_allocation:
	kfree(esg_policy);

init_failed:
	pr_warn("%s: Failed esgov_init(cpu%d)\n", __func__, policy->cpu);

	return NULL;
}
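/*
 * Note on the frequency-change path below (summary, not from the original
 * source): when fast switching is not available, esgov_update() raises
 * esg_policy->irq_work, esgov_irq_work() queues esg_policy->work on the
 * dedicated "esgov:%d" kthread, and esgov_work() finally calls
 * __cpufreq_driver_target() under policy->rwsem and work_lock.  With fast
 * switching enabled, the frequency is written directly via
 * cpufreq_driver_fast_switch() and the kthread is never created.
 */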
static void esgov_work(struct kthread_work *work)
{
	struct esgov_policy *esg_policy = container_of(work, struct esgov_policy, work);
	unsigned int freq;
	unsigned long flags;

	raw_spin_lock_irqsave(&esg_policy->update_lock, flags);
	freq = esg_policy->target_freq;
	esg_policy->work_in_progress = false;
	raw_spin_unlock_irqrestore(&esg_policy->update_lock, flags);

	down_write(&esg_policy->policy->rwsem);
	mutex_lock(&esg_policy->work_lock);
	__cpufreq_driver_target(esg_policy->policy, freq, CPUFREQ_RELATION_L);
	mutex_unlock(&esg_policy->work_lock);
	up_write(&esg_policy->policy->rwsem);
}

static void esgov_irq_work(struct irq_work *irq_work)
{
	struct esgov_policy *esg_policy;

	esg_policy = container_of(irq_work, struct esgov_policy, irq_work);

	kthread_queue_work(&esg_policy->worker, &esg_policy->work);
}

static int esgov_kthread_create(struct esgov_policy *esg_policy)
{
	struct task_struct *thread;
	struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO / 2 };
	struct cpufreq_policy *policy = esg_policy->policy;
	struct device_node *dn;
	int ret;

	/* kthread only required for slow path */
	if (policy->fast_switch_enabled)
		return 0;

	kthread_init_work(&esg_policy->work, esgov_work);
	kthread_init_worker(&esg_policy->worker);
	thread = kthread_create(kthread_worker_fn, &esg_policy->worker,
				"esgov:%d", cpumask_first(policy->related_cpus));
	if (IS_ERR(thread)) {
		pr_err("failed to create esgov thread: %ld\n", PTR_ERR(thread));
		return PTR_ERR(thread);
	}

	ret = sched_setscheduler_nocheck(thread, SCHED_FIFO, &param);
	if (ret) {
		kthread_stop(thread);
		pr_warn("%s: failed to set SCHED_FIFO\n", __func__);
		return ret;
	}

	dn = of_find_node_by_path("/ems/esg");
	if (dn) {
		struct cpumask mask;
		const char *buf;

		cpumask_copy(&mask, cpu_possible_mask);
		if (!of_property_read_string(dn, "thread-run-on", &buf))
			cpulist_parse(buf, &mask);
		set_cpus_allowed_ptr(thread, &mask);
		thread->flags |= PF_NO_SETAFFINITY;
	}

	esg_policy->thread = thread;
	init_irq_work(&esg_policy->irq_work, esgov_irq_work);
	mutex_init(&esg_policy->work_lock);

	wake_up_process(thread);

	return 0;
}

static void esgov_policy_free(struct esgov_policy *esg_policy)
{
	kfree(esg_policy);
}

static int esgov_init(struct cpufreq_policy *policy)
{
	struct esgov_policy *esg_policy;
	int ret = 0;
	int cpu;

	if (policy->governor_data)
		return -EBUSY;

	cpufreq_enable_fast_switch(policy);

	esg_policy = *per_cpu_ptr(esgov_policy, policy->cpu);
	if (esg_policy) {
		pr_info("%s: Already allocated esgov_policy\n", __func__);
		goto complete_esg_init;
	}

	esg_policy = esgov_policy_alloc(policy);
	if (!esg_policy) {
		ret = -ENOMEM;
		goto failed_to_init;
	}

	ret = esgov_kthread_create(esg_policy);
	if (ret)
		goto free_esg_policy;

	for_each_cpu(cpu, &esg_policy->cpus)
		*per_cpu_ptr(esgov_policy, cpu) = esg_policy;

complete_esg_init:
	down_write(&esg_policy->rwsem);
	policy->governor_data = esg_policy;
	esg_policy->min = policy->min;
	esg_policy->max = policy->max;
	esg_policy->min_cap = find_allowed_capacity(policy->cpu, policy->min, 0);
	esg_policy->max_cap = find_allowed_capacity(policy->cpu, policy->max, 0);
	esg_policy->enabled = true;
	esg_policy->last_caller = UINT_MAX;
	cpufreq_register_hook(esgov_get_gov_next_cap);
	up_write(&esg_policy->rwsem);

	return 0;

free_esg_policy:
	esgov_policy_free(esg_policy);

failed_to_init:
	pr_err("initialization failed (error %d)\n", ret);

	return ret;
}
static void esgov_exit(struct cpufreq_policy *policy)
{
	struct esgov_policy *esg_policy = *per_cpu_ptr(esgov_policy, policy->cpu);

	down_write(&esg_policy->rwsem);
	cpufreq_unregister_hook();
	esg_policy->enabled = false;
	policy->governor_data = NULL;
	up_write(&esg_policy->rwsem);

	cpufreq_disable_fast_switch(policy);
}

static unsigned int get_next_freq(struct esgov_policy *esg_policy,
				unsigned long util, unsigned long max)
{
	struct cpufreq_policy *policy = esg_policy->policy;
	unsigned int freq;

	freq = (policy->cpuinfo.max_freq * util) / max;
	freq = cpufreq_driver_resolve_freq(policy, freq);
	freq = fclamp_apply(policy, freq);

	return clamp_val(freq, esg_policy->min, esg_policy->max);
}
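/*
 * Worked example for get_next_freq() (not from the original source, numbers
 * are hypothetical): with cpuinfo.max_freq = 2000000 kHz, max = 1024 and
 * util = 512, the raw request is 2000000 * 512 / 1024 = 1000000 kHz.
 * cpufreq_driver_resolve_freq() then snaps this to a real OPP, fclamp_apply()
 * applies any EMS frequency clamp, and the result is finally clamped to the
 * governor's current [min, max] frequency range.
 */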
static int esg_get_boost_pelt_util(int capacity, int util, int boost)
{
	long long margin;

	if (!boost)
		return util;

	if (boost > 0)
		margin = max(capacity - util, 0) * boost;
	else
		margin = util * boost;

	margin /= 100;

	return util + margin;
}

/*
 * Bit 31 of the nr_running argument is used as a flag: it is set only when
 * the caller is an actual frequency-update path, so that tracing and wakeup
 * boosting are applied there but not in the placement-estimation path.
 */
#define FREQ_UPDATE_BIT	(1 << 31)

/* return the target util of this cpu */
static unsigned int esgov_calc_cpu_target_util(struct esgov_cpu *esg_cpu,
			int max, int org_pelt_util, int pelt_util_diff,
			int nr_running)
{
	int util, step_util, pelt_util, io_util;
	int org_io_util, org_step_util;
	int pelt_margin, pelt_boost;

	if (unlikely(!esg_cpu->esg_policy))
		return org_pelt_util;

	org_io_util = esg_cpu->io_util;
	org_step_util = esg_cpu->step_util;
	pelt_margin = esg_cpu->esg_policy->pelt_margin;
	pelt_boost = esg_cpu->esg_policy->pelt_boost;

	/* calculate io boost util */
	io_util = org_io_util;

	/*
	 * calculate pelt_util:
	 * add pelt_util_diff and then apply the pelt margin to the sched util.
	 * pelt_util_diff: util diff caused by migrating the task
	 */
	pelt_util = org_pelt_util + pelt_util_diff;
	pelt_util = max(pelt_util, 0);
	if (pelt_util > 0) {
		pelt_util += pelt_util * pelt_margin / 100;
		pelt_util = esg_get_boost_pelt_util(capacity_cpu(esg_cpu->cpu),
							pelt_util, pelt_boost);
		pelt_util = min(max, pelt_util);
	}
	pelt_util = nr_running & ~FREQ_UPDATE_BIT ? pelt_util :
		pelt_util * esg_cpu->esg_policy->no_work_press_ratio / 100;

	/*
	 * calculate step util:
	 * if there is no running task, step util is always 0
	 */
	step_util = nr_running & ~FREQ_UPDATE_BIT ? org_step_util : 0;
	step_util = (esg_cpu->active_ratio == SCHED_CAPACITY_SCALE) ? step_util : 0;

	/* find max util */
	util = max(pelt_util, step_util);
	util = max(util, io_util);

	/* apply wakeup boost value */
	if ((nr_running & FREQ_UPDATE_BIT) && !pelt_boost)
		util = wakeboost_cpu_boost(esg_cpu->cpu, util);

	/* apply emstune boost */
	util = freqboost_cpu_boost(esg_cpu->cpu, util);

	if (nr_running & FREQ_UPDATE_BIT)
		trace_esg_cpu_util(esg_cpu->cpu, nr_running & ~FREQ_UPDATE_BIT,
				org_io_util, io_util, org_step_util, step_util,
				org_pelt_util, pelt_util, pelt_margin,
				pelt_boost, max, util);

	return util;
}

/* return max util of the cluster of this cpu */
static unsigned int esgov_get_target_util(struct esgov_policy *esg_policy,
					u64 time, unsigned long max)
{
	unsigned long max_util = 0;
	unsigned int cpu;

	/* get max util in the cluster */
	for_each_cpu(cpu, esg_policy->policy->cpus) {
		struct esgov_cpu *esg_cpu = per_cpu_ptr(esgov_cpu, cpu);
		int nr_running = cpu_rq(cpu)->nr_running;

		if (!esg_cpu)
			continue;

		esg_cpu->util = esgov_calc_cpu_target_util(esg_cpu, max,
					esg_cpu->pelt_util, 0,
					nr_running | FREQ_UPDATE_BIT);

		if (esg_cpu->util > max_util)
			max_util = esg_cpu->util;
	}

	return max_util;
}
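/*
 * Background for esgov_get_step_util() below (explanatory note, not from the
 * original source): "patient mode" holds back step-based frequency raises
 * until the CPU has been almost fully busy (active ratio above
 * PATIENT_MODE_BUSY_RATIO, roughly 93% on a 1024 scale) for patient_mode
 * consecutive MLT periods; only then is active_ratio reported as
 * SCHED_CAPACITY_SCALE, which lets the step util contribute in
 * esgov_calc_cpu_target_util().  The step util itself is the allowed
 * capacity (the current operating point plus one step_power worth of energy)
 * scaled by the average active ratio of the last two MLT periods.
 */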
/* update the step_util */
#define PATIENT_MODE_BUSY_RATIO	950
static unsigned long esgov_get_step_util(struct esgov_cpu *esg_cpu, unsigned long max)
{
	struct esgov_policy *esg_policy = *per_cpu_ptr(esgov_policy, esg_cpu->cpu);
	unsigned int freq;
	int active_ratio = 0, util, prev_idx, hist_count = 0, idx;
	int patient_tick, over_cnt = 0;
	int hist_idx = mlt_cur_period(esg_cpu->cpu);
	int cpu = esg_cpu->cpu;

	if (unlikely(!esg_policy || !esg_policy->step))
		return 0;

	if (esg_cpu->last_idx == hist_idx)
		return esg_cpu->step_util;

	/* get active ratio for patient mode */
	idx = esg_cpu->last_idx = hist_idx;
	patient_tick = esg_policy->patient_mode;
	while (hist_count++ < patient_tick) {
		if (mlt_art_value(cpu, idx) > PATIENT_MODE_BUSY_RATIO)
			over_cnt++;
		idx = mlt_prev_period(idx);
	}
	active_ratio = (patient_tick == over_cnt) ? SCHED_CAPACITY_SCALE : 0;
	esg_cpu->active_ratio = active_ratio;

	/* get active ratio for step util */
	prev_idx = mlt_prev_period(hist_idx);
	active_ratio = mlt_art_value(cpu, hist_idx) + mlt_art_value(cpu, prev_idx);
	active_ratio >>= 1;

	/* update the capacity */
	freq = esg_cpu->step_util * (esg_policy->policy->max / max);
	esg_cpu->capacity = find_allowed_capacity(esg_cpu->cpu, freq,
						esg_policy->step_power);

	/* calculate step_util */
	util = (esg_cpu->capacity * active_ratio) >> SCHED_CAPACITY_SHIFT;

	trace_esg_cpu_step_util(esg_cpu->cpu, esg_cpu->capacity,
					active_ratio, max, util);

	return util;
}

/* update cpu util */
static void esgov_update_cpu_util(struct esgov_policy *esg_policy, u64 time,
					unsigned long max)
{
	int cpu;

	for_each_cpu(cpu, esg_policy->policy->cpus) {
		struct esgov_cpu *esg_cpu = per_cpu_ptr(esgov_cpu, cpu);
		struct rq *rq = cpu_rq(cpu);

		if (!esg_cpu)
			continue;

		/* update iowait boost util */
		esg_cpu->io_util = esgov_iowait_apply(esg_cpu, time, max);

		/* update sched util */
		esg_cpu->pelt_util = ml_cpu_util(cpu) + cpu_util_rt(rq);

		/* update step_util; if the cpu is idle, we want to ignore step_util */
		esg_cpu->step_util = esgov_get_step_util(esg_cpu, max);
	}
}

static bool esgov_check_rate_delay(struct esgov_policy *esg_policy, u64 time)
{
	s64 delta_ns = time - esg_policy->last_freq_update_time;

	if (delta_ns < esg_policy->rate_delay_ns)
		return false;

	return true;
}

static unsigned int get_diff_num_levels(struct cpufreq_policy *policy,
						unsigned int freq)
{
	unsigned int index1, index2;

	index1 = cpufreq_frequency_table_get_index(policy, policy->cur);
	index2 = cpufreq_frequency_table_get_index(policy, freq);

	return abs(index1 - index2);
}

enum {
	RAPID_SCALE_UP = 1,
	RAPID_SCALE_DOWN,
	RAPID_SCALE_MUST,
};

#define ESG_MAX_DELAY_PERIODS	5
/*
 * Return true if we can delay the frequency update because the requested
 * frequency change is not large enough, and false if it is large enough.
 * The condition for "large enough" compares the number of frequency levels
 * to change against the time elapsed since the last frequency update. For
 * example, an ESG_MAX_DELAY_PERIODS of 5 means an immediate frequency change
 * is allowed only if the change in frequency level is greater than or equal
 * to 5; it also means a change of a single frequency level has to wait about
 * 5 rate-delay periods before it takes effect.
 */
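/*
 * Numeric walk-through (illustrative, not from the original source): with the
 * default rate_delay_ns of 4ms, margin is 1ms, so 10ms after the last update
 * num_periods = (10 + 1) / 4 = 2.  A request that is 3 or fewer levels away
 * from policy->cur (3 <= 5 - 2) is postponed, while a request 4 or more
 * levels away is applied immediately.
 */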
static bool esgov_postpone_freq_update(struct esgov_policy *esg_policy, u64 time,
				unsigned int target_freq, int rapid_scale)
{
	unsigned int diff_num_levels, num_periods, elapsed, margin;

	if (rapid_scale == RAPID_SCALE_MUST)
		return false;

	elapsed = time - esg_policy->last_freq_update_time;

	/* At this point target_freq differs from the current freq */
	if (esg_policy->policy->cur < target_freq) {
		u64 ramp_up_bound = esg_policy->up_rate_limit_ns;

		if (rapid_scale == RAPID_SCALE_UP)
			return false;

		if (elapsed < ramp_up_bound)
			return true;
	} else {
		u64 ramp_down_bound = esg_policy->down_rate_limit_ns;

		if (rapid_scale == RAPID_SCALE_DOWN)
			return false;

		if (elapsed < ramp_down_bound)
			return true;
	}

	/*
	 * If there is no pelt margin, it is better to increase the frequency
	 * immediately to prevent a performance drop.
	 */
	if (esg_policy->pelt_margin <= 0 && target_freq > esg_policy->policy->cur)
		return false;

	margin = esg_policy->rate_delay_ns >> 2;
	num_periods = (elapsed + margin) / esg_policy->rate_delay_ns;
	if (num_periods > ESG_MAX_DELAY_PERIODS)
		return false;

	diff_num_levels = get_diff_num_levels(esg_policy->policy, target_freq);
	if (diff_num_levels > ESG_MAX_DELAY_PERIODS - num_periods)
		return false;
	else
		return true;
}

static int esgov_check_rapid_scale(struct esgov_policy *esg_policy,
					struct esgov_cpu *esg_cpu)
{
	int cpu, nr_running = 0;

	if (esg_policy->limits_changed) {
		esg_policy->limits_changed = false;
		return RAPID_SCALE_MUST;
	}

	for_each_cpu(cpu, &esg_cpu->esg_policy->cpus)
		nr_running += cpu_rq(cpu)->nr_running;

	/* rapid scale up */
	if (esg_cpu->no_work_press && wakeboost_pending(esg_cpu->cpu)) {
		esg_cpu->no_work_press = false;
		if (esg_cpu->esg_policy->rapid_scale_up)
			return RAPID_SCALE_UP;
	}

	/* rapid scale down */
	if (!esg_cpu->no_work_press && !wakeboost_pending(esg_cpu->cpu)) {
		esg_cpu->no_work_press = true;
		if (esg_cpu->esg_policy->rapid_scale_down)
			return RAPID_SCALE_DOWN;
	}

	return 0;
}
static void esgov_update(struct update_util_data *hook, u64 time, unsigned int flags)
{
	struct esgov_cpu *esg_cpu = container_of(hook, struct esgov_cpu, update_util);
	struct esgov_policy *esg_policy = esg_cpu->esg_policy;
	unsigned long max = capacity_cpu_orig(esg_cpu->cpu);
	unsigned int target_util, target_freq;
	int rapid_scale;

	if (unlikely(!esg_policy))
		return;

	/* check iowait boost */
	esgov_iowait_boost(esg_cpu, time, flags);
	esg_cpu->last_update = time;

	if (!cpufreq_this_cpu_can_update(esg_policy->policy))
		return;

	/*
	 * Try to take the lock. If somebody else is holding it, this
	 * updater (cpu) skips the frequency update, because the holder has
	 * already updated, or is about to update, the last update time
	 * very close to now.
	 */
	if (!raw_spin_trylock(&esg_policy->update_lock))
		return;

	rapid_scale = esgov_check_rapid_scale(esg_policy, esg_cpu);

	/* check rate delay */
	if (!rapid_scale && !esgov_check_rate_delay(esg_policy, time))
		goto out;

	/* update cpu_util of this cluster */
	esgov_update_cpu_util(esg_policy, time, max);

	/* update target util of the cluster of this cpu */
	target_util = esgov_get_target_util(esg_policy, time, max);

	/* get target freq for the new target util */
	target_freq = get_next_freq(esg_policy, target_util, max);
	if (esg_policy->policy->cur == target_freq)
		goto out;

	/* inform et of the new freq */
	et_update_freq(esg_cpu->cpu, target_freq);

	if (esgov_postpone_freq_update(esg_policy, time, target_freq, rapid_scale))
		goto out;

	if (esg_policy->work_in_progress)
		goto out;

	esg_policy->last_caller = smp_processor_id();
	esg_policy->util = target_util;
	esg_policy->target_freq = target_freq;
	esg_policy->last_freq_update_time = time;

	trace_esg_req_freq(esg_policy->policy->cpu, esg_policy->util,
				esg_policy->target_freq, rapid_scale);

	if (esg_policy->policy->fast_switch_enabled) {
		cpufreq_driver_fast_switch(esg_policy->policy, target_freq);
	} else {
		esg_policy->work_in_progress = true;
		irq_work_queue(&esg_policy->irq_work);
	}

out:
	raw_spin_unlock(&esg_policy->update_lock);
}

static int esgov_start(struct cpufreq_policy *policy)
{
	struct esgov_policy *esg_policy = policy->governor_data;
	unsigned int cpu;

	/* TODO: We SHOULD implement FREQVAR-RATE-DELAY based on SchedTune */
	esg_policy->last_freq_update_time = 0;
	esg_policy->target_freq = 0;

	/* Initialize slack expired time */
	esg_policy->slack_expired_time_ms = DEFAULT_SLACK_EXPIRED_TIME;	/* default 20ms */

	/* Initialize no-work press ratio */
	esg_policy->no_work_press_ratio = 100;	/* default 100% */

	esg_policy->limits_changed = false;
	esg_policy->running = 1;

	for_each_cpu(cpu, policy->cpus) {
		struct esgov_cpu *esg_cpu = per_cpu_ptr(esgov_cpu, cpu);

		esg_cpu->esg_policy = esg_policy;
		esg_cpu->cpu = cpu;
		esg_cpu->min =
			(SCHED_CAPACITY_SCALE * policy->cpuinfo.min_freq) /
			policy->cpuinfo.max_freq;
	}

	for_each_cpu(cpu, policy->cpus) {
		struct esgov_cpu *esg_cpu = per_cpu_ptr(esgov_cpu, cpu);

		cpufreq_add_update_util_hook(cpu, &esg_cpu->update_util, esgov_update);
	}

	return 0;
}

static void esgov_stop(struct cpufreq_policy *policy)
{
	struct esgov_policy *esg_policy = policy->governor_data;
	unsigned int cpu;

	for_each_cpu(cpu, policy->cpus)
		cpufreq_remove_update_util_hook(cpu);

	synchronize_rcu();

	if (!policy->fast_switch_enabled)
		irq_work_sync(&esg_policy->irq_work);

	esg_policy->running = 0;
}

static void esgov_limits(struct cpufreq_policy *policy)
{
	struct esgov_policy *esg_policy = policy->governor_data;
	unsigned long max = capacity_cpu_orig(policy->cpu);
	unsigned int target_util, target_freq = 0;

	/* These don't need to take work_lock */
	slack_update_min(policy);
	esg_update_freq_range(policy);

	if (!policy->fast_switch_enabled) {
		mutex_lock(&esg_policy->work_lock);
		cpufreq_policy_apply_limits(policy);

		/* get target util of the cluster of this cpu */
		target_util = esgov_get_target_util(esg_policy, 0, max);

		/* get target freq for the new target util */
		target_freq = get_next_freq(esg_policy, target_util, max);

		/*
		 * After a freq limits change, CPUFreq's policy->cur can differ
		 * from ESG's target freq. In that case, explicitly change the
		 * current freq to ESG's target freq.
		 */
		if (policy->cur != target_freq)
			__cpufreq_driver_target(policy, target_freq, CPUFREQ_RELATION_L);

		mutex_unlock(&esg_policy->work_lock);
	} else
		esg_policy->limits_changed = true;
}
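/*
 * Usage note (not from the original source): once this governor is built in
 * and registered, it can be selected per policy through the standard CPUFreq
 * interface, e.g.:
 *   echo energy_step > /sys/devices/system/cpu/cpufreq/policy0/scaling_governor
 * or made the default via CONFIG_CPU_FREQ_DEFAULT_GOV_ENERGYSTEP (see
 * cpufreq_default_governor() below).
 */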
struct cpufreq_governor energy_step_gov = {
	.name			= "energy_step",
	.owner			= THIS_MODULE,
	.flags			= CPUFREQ_GOV_DYNAMIC_SWITCHING,
	.init			= esgov_init,
	.exit			= esgov_exit,
	.start			= esgov_start,
	.stop			= esgov_stop,
	.limits			= esgov_limits,
};

#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_ENERGYSTEP
unsigned long cpufreq_governor_get_util(unsigned int cpu)
{
	struct esgov_cpu *esg_cpu = per_cpu_ptr(esgov_cpu, cpu);

	if (!esg_cpu)
		return 0;

	return esg_cpu->util;
}

unsigned int cpufreq_governor_get_freq(int cpu)
{
	struct esgov_policy *esg_policy;
	unsigned int freq;
	unsigned long flags;

	if (cpu < 0)
		return 0;

	esg_policy = *per_cpu_ptr(esgov_policy, cpu);
	if (!esg_policy)
		return 0;

	raw_spin_lock_irqsave(&esg_policy->update_lock, flags);
	freq = esg_policy->target_freq;
	raw_spin_unlock_irqrestore(&esg_policy->update_lock, flags);

	return freq;
}

struct cpufreq_governor *cpufreq_default_governor(void)
{
	return &energy_step_gov;
}
#endif

static int esgov_register(void)
{
	cpu_pm_register_notifier(&esg_cpu_pm_notifier);
	emstune_register_notifier(&esg_mode_update_notifier);

	return cpufreq_register_governor(&energy_step_gov);
}

#define DEFAULT_ESG_STEP	(4)
#define DEFAULT_PATIENT_MODE	(0)
#define DEFAULT_PELT_MARGIN	(25)
#define DEFAULT_PELT_BOOST	(0)
int esgov_pre_init(struct kobject *ems_kobj)
{
	struct device_node *dn, *child;
	struct esgov_param *param;
	int ret = 0, cpu;

	esgov_policy = alloc_percpu(struct esgov_policy *);
	esgov_cpu = alloc_percpu(struct esgov_cpu);
	esgov_param = alloc_percpu(struct esgov_param *);
	esgov_timer = alloc_percpu(struct esgov_slack_timer);

	esgov_register();

	dn = of_find_node_by_path("/ems/esg");
	for_each_child_of_node(dn, child) {
		const char *buf;

		param = kzalloc(sizeof(struct esgov_param), GFP_KERNEL);
		if (!param) {
			pr_err("%s: failed to alloc esgov_param\n", __func__);
			ret = -ENOMEM;
			goto fail;
		}

		if (of_property_read_string(child, "shared-cpus", &buf)) {
			pr_err("%s: shared-cpus property is omitted\n", __func__);
			ret = -ENODATA;
			goto fail;
		} else
			cpulist_parse(buf, &param->cpus);

		for_each_cpu(cpu, &param->cpus)
			*per_cpu_ptr(esgov_param, cpu) = param;
	}

	esg_kobj = kobject_create_and_add("energy_step", ems_kobj);
	if (!esg_kobj)
		return -EINVAL;

	return ret;

fail:
	for_each_possible_cpu(cpu)
		kfree(*per_cpu_ptr(esgov_param, cpu));

	return ret;
}
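/*
 * Usage note (not from the original source; the absolute sysfs path depends
 * on where the parent EMS kobject is created): the per-cluster tunables
 * registered above are exposed as
 *   .../energy_step/coregroup<N>/slack_expired_time_ms
 *   .../energy_step/coregroup<N>/no_work_press_ratio
 * and accept plain integers, e.g. "echo 40 > slack_expired_time_ms" lets an
 * idle CPU keep a non-minimum frequency for up to 40ms before the slack
 * timer wakes it for re-evaluation.
 */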