/****************************************************************************
 * Helper driver for Solarflare network adapters
 * This driver applies tweaks needed for high performance on some systems
 *
 * Copyright 2007:      Solarflare Communications Inc,
 *                      9501 Jeronimo Road, Suite 250,
 *                      Irvine, CA 92618, USA
 *
 * Developed by Solarflare Communications <linux-net-drivers@solarflare.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation, incorporated herein by reference.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 ****************************************************************************
 */

/* This file implements an alternate idle loop to work around an A1
 * Silicon issue.
 */

#include <linux/version.h>
#include <linux/module.h>

#include <linux/sched.h>
#include <linux/notifier.h>
#include <asm/semaphore.h>
#include <linux/timer.h>
#include <linux/signal.h>
#include <linux/smp_lock.h>
#include <linux/cpu.h>

#ifdef EFX_USE_KCOMPAT
# include "config.h"
# include "kernel_compat.h"
#endif

#include "idle.h"
#include "efx.h"

/* Compatibility shim: on non-SMP builds of kernels >= 2.6.11 and < 2.6.15,
 * every set_cpus_allowed() call in this file is redirected to a stub that
 * does nothing and reports success (0). */
#if !defined(CONFIG_SMP) && LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,11) \
	&& LINUX_VERSION_CODE < KERNEL_VERSION(2,6,15)
	/* set_cpus_allowed() refers to cpu_online_map, which does not exist */
	static inline int efx_set_cpus_allowed(task_t *p, cpumask_t new_mask)
	{
		return 0;
	}
	#define set_cpus_allowed efx_set_cpus_allowed
#endif

/* On kernels before 2.6.15, define task_thread_info() as a direct read of
 * the task's thread_info field (presumably those kernels do not supply the
 * accessor themselves -- TODO confirm). */
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,15)
	#define task_thread_info(task) (task)->thread_info
#endif

/* Idle polling mode (read-only module parameter).
 *   0: never start the enhanced idle loop.
 *   1: start it the first time efx_idle_enhance() is called.
 *   2: start it from efx_idle_init().
 * efx_idle_enhance() rewrites 1 to 2 so that start-up is attempted only
 * once. */
static unsigned int idle_enable = 1;
module_param(idle_enable, uint, 0444);
MODULE_PARM_DESC(idle_enable, "sfc_tune: Idle polling (0=>disabled,"
		 " 1=>auto, 2=>enabled)");

/* Taken (with down()/up()) by efx_idle_init(), efx_idle_enhance() and
 * efx_idle_fini() so that start-up and shutdown cannot overlap. */
static DECLARE_MUTEX(efx_idle_startup_mutex);

/* Point @task at the thread_info block @info.
 *
 * From 2.6.22 onwards the pointer is kept in task->stack; earlier kernels
 * keep it in task->thread_info.
 */
static inline void efx_set_thread_info(struct task_struct *task,
				       struct thread_info *info)
{
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,22)
	task->stack = (char *)info;
#else
	task->thread_info = info;
#endif
}

/* Handling for kernels that define CPU_OFFLINE but not CPU_DOWN_PREPARE.
 * NOTE(review): the #error aborts the build in that configuration, so the
 * alias defined after it never takes effect.  It is not clear from this
 * file whether the #error is a deliberate tripwire or a leftover --
 * confirm before removing either line. */
#if !defined(CPU_DOWN_PREPARE) && defined(CPU_OFFLINE)
#    error CPU_OFFLINE
#    define CPU_DOWN_PREPARE CPU_OFFLINE
#endif

/* A kernel thread that is driven through a fixed create/start/stop
 * handshake by the code that owns it (see efx_thread_create(),
 * efx_thread_start() and efx_thread_stop()). */
struct efx_thread {
	/* Thread name; passed to daemonize() by the thread itself. */
	char name[32];
	/* Thread body.  Its return value is reported as the result of the
	 * "stop" step. */
	int (*func) (struct efx_thread *);

	/* The thread's task.  Set by the thread while handling "create" and
	 * cleared by it while handling "stop". */
	struct task_struct *task;
	/* There's no need for request code because the requests are
	 * always performed in the order: create, start, stop.  The
	 * thread calls down() on request_sem and the client calls
	 * up() to indicate that the next step should be performed by
	 * the thread.  The client calls down() on response_sem and
	 * the thread calls up() to indicate that the step has been
	 * completed. */
	struct semaphore request_sem;
	struct semaphore response_sem;
	/* The success status of the previous step. */
	int result;
	/* Set to 1 to indicate that the thread should stop. */
	int is_stopping;
	/* The workqueue item for creating the thread. */
	struct work_struct work;
};

/* Values of efx_idle_info.state, the per-CPU state machine. */
/* The module is shutting down; efx_idle_enable() leaves the CPU alone. */
#define EFX_IDLE_STATE_UNLOAD  1
/* The CPU is not online.  Initial state, and the state entered when the
 * CPU is about to go down. */
#define EFX_IDLE_STATE_OFFLINE 2
/* The CPU is online but nothing has been started for it yet. */
#define EFX_IDLE_STATE_INIT    3
/* The per-CPU timer is armed and is looking for the CPU's idle task. */
#define EFX_IDLE_STATE_TIMER   4
/* The timer has recorded its result; the switcher thread acts next. */
#define EFX_IDLE_STATE_READY   5
/* The enhanced idle loop has been swapped in on this CPU. */
#define EFX_IDLE_STATE_ACTIVE  6

/* Per-CPU bookkeeping for the enhanced idle loop. */
struct efx_idle_info {
	/* One of the EFX_IDLE_STATE_* values. */
	int state;
	/* The replacement thread that runs efx_idle_thread(). */
	struct efx_thread new_idle;
	/* The CPU's original idle task as found by the timer, or NULL if it
	 * has not been found (yet). */
	struct task_struct *old_idle_task;
	/* Timer whose handler (efx_idle_timer_func) looks for the idle
	 * task. */
	struct timer_list timer;
};

/* One efx_idle_info per CPU. */
static DEFINE_PER_CPU(struct efx_idle_info, efx_idle_info);

/* Held (with down()/up()) around calls to efx_idle_enable() and
 * efx_idle_disable() and around direct updates of the per-CPU state by the
 * switcher thread, the CPU notifier and the init/fini paths. */
static DECLARE_MUTEX(efx_idle_switcher_mutex);

/* The single "sfc_idle" thread that walks the online CPUs and enables or
 * disables the enhanced idle loop on each.  It is woken with SIGHUP. */
static struct efx_thread efx_idle_switcher;

/* Bitmask selecting the CPUs that get the enhanced idle loop: CPU n is
 * selected when bit (n % BITS_PER_LONG) is set.  Writable at run time
 * (mode 0644); the switcher thread re-reads it on every pass. */
static unsigned long idle_cpus = ~0;
module_param(idle_cpus, ulong, 0644);
MODULE_PARM_DESC(idle_cpus,
		 "The set of CPUs which should use the enhanced idle loop");


/*** Thread helpers ******************************************************/

/* Entry point handed to kernel_thread() for every efx_thread.
 *
 * @arg is the struct efx_thread describing this thread.  The wrapper
 * detaches from its creator with daemonize(), then services the three
 * requests in their fixed order: acknowledge "create", acknowledge
 * "start", run the thread function, and finally acknowledge "stop".
 * Each request is taken from request_sem and answered on response_sem.
 *
 * Returns whatever the thread function returned.
 */
static int efx_thread_wrapper(void *arg)
{
	struct efx_thread *self = arg;
	int rc;

	lock_kernel();
	daemonize(self->name);
	unlock_kernel();

	/* "create": report success and publish our task pointer. */
	down(&self->request_sem);
	self->result = 0;
	self->task = current;
	up(&self->response_sem);

	/* "start": nothing to report, just acknowledge. */
	down(&self->request_sem);
	EFX_BUG_ON_PARANOID(self->is_stopping);
	up(&self->response_sem);

	/* Run the body until it decides to return. */
	rc = self->func(self);

	/* "stop": hand back the body's result and withdraw the task
	 * pointer before acknowledging. */
	down(&self->request_sem);
	EFX_BUG_ON_PARANOID(!self->is_stopping);
	self->result = rc;
	self->task = NULL;
	up(&self->response_sem);

	return rc;
}


/* Workqueue callback that spawns the kernel thread for an efx_thread.
 *
 * @arg is the work item embedded in the struct efx_thread.  On success the
 * new thread (efx_thread_wrapper) answers the pending "create" request
 * itself.  If kernel_thread() fails there is no thread to answer, so this
 * function answers on its behalf and reports the negative error code in
 * thread->result.
 */
static void efx_thread_work_fn(struct work_struct *arg)
{
	int pid;
	struct efx_thread *thread = container_of(arg, struct efx_thread, work);

	pid = kernel_thread(efx_thread_wrapper, thread,
			    CLONE_FS | CLONE_FILES | CLONE_SIGHAND);
	if (pid < 0) {
		/* Terminate the message with a newline so that it is not
		 * merged with whatever is logged next. */
		printk(KERN_ERR "%s: kernel_thread returned %d (%s)\n",
		       __func__, pid, thread->name);
		/* Handle the "create" request. */
		down(&thread->request_sem);
		thread->result = pid;
		up(&thread->response_sem);
	}
}


/* Create (but do not start) a thread.
 *
 * @thread: descriptor to initialise; thread->name must already be filled
 *          in by the caller.
 * @func:   function the thread will run once efx_thread_start() is called.
 *
 * The thread is spawned from a workqueue item, and this function blocks
 * until the "create" step has been acknowledged.
 *
 * Returns 0 on success, -EBUSY if the work item could not be queued, or
 * the negative value returned by kernel_thread() if spawning failed.
 */
static int efx_thread_create(struct efx_thread *thread,
			     int (*func) (struct efx_thread *))
{
	/* thread->name should be initialised already */

	/* Make sure the name is terminated. */
	thread->name[sizeof(thread->name) - 1] = 0;
	thread->func = func;
	thread->task = NULL;
	thread->result = 0;
	sema_init(&thread->request_sem, 0);
	sema_init(&thread->response_sem, 0);
	thread->is_stopping = 0;

	memset(&thread->work, 0, sizeof(thread->work));

	INIT_WORK(&thread->work, efx_thread_work_fn);

	if (schedule_work(&thread->work) == 0) {
		/* Terminate the message with a newline so that it is not
		 * merged with whatever is logged next. */
		printk(KERN_ERR "%s: schedule_work failed (%s)\n",
			__func__, thread->name);
		return -EBUSY;
	}

	/* Post the "create" request and wait for it to be answered, either
	 * by the new thread or by the work function on failure. */
	up(&thread->request_sem);
	down(&thread->response_sem);

	return thread->result;
}


/* Let a successfully created thread begin running its function.
 *
 * Posts the "start" request and blocks until the thread has acknowledged
 * it.  Must follow a successful efx_thread_create(): the thread's task
 * pointer is expected to be set.
 */
static void efx_thread_start(struct efx_thread *thread)
{
	EFX_BUG_ON_PARANOID(thread->is_stopping);
	EFX_BUG_ON_PARANOID(!thread->task);

	thread->is_stopping = 0;

	/* Wait for the task to start. */
	up(&thread->request_sem);
	down(&thread->response_sem);

	EFX_BUG_ON_PARANOID(!thread->task);
}


/* Ask a running thread to finish and wait until it has done so.
 *
 * Raises the stopping flag, wakes the thread in case it is sleeping, then
 * posts the "stop" request and blocks until the thread acknowledges it.
 * The thread clears thread->task as part of that acknowledgement.
 *
 * Returns the value the thread function returned.
 */
static int efx_thread_stop(struct efx_thread *thread)
{
	EFX_BUG_ON_PARANOID(thread->is_stopping);
	EFX_BUG_ON_PARANOID(!thread->task);

	/* Set stopping flag.  The volatile access forces a real store that
	 * the matching volatile read in efx_thread_should_stop() picks up. */
	*(volatile int *)&thread->is_stopping = 1;
	wake_up_process(thread->task);

	/* Wait for the task to exit. */
	up(&thread->request_sem);
	down(&thread->response_sem);

	EFX_BUG_ON_PARANOID(thread->task);
	return thread->result;
}

/* Report whether efx_thread_stop() has been called for @thread.
 *
 * The flag is read through a volatile pointer so that a loop polling this
 * function re-reads memory on every call.  Returns non-zero if the thread
 * should stop.
 */
static inline int efx_thread_should_stop(struct efx_thread *thread)
{
	volatile int *stopping = &thread->is_stopping;

	return *stopping;
}

/*************************************************************************/


/* Body of the per-CPU replacement idle thread.
 *
 * After efx_switch_tasks() has exchanged the contexts of the original idle
 * task and this thread, the same loop is executed under either identity:
 *  - if a reschedule is pending, call schedule();
 *  - otherwise, when current is not thread->task, busy-wait with
 *    cpu_relax() until a reschedule is needed (the enhanced idle loop);
 *  - otherwise, when current is thread->task, sleep interruptibly until
 *    efx_thread_stop() raises the stopping flag.
 *
 * The first pass through the sleeping branch posts response_sem once;
 * efx_idle_enable() waits on that before swapping the tasks.
 *
 * Returns 0 once the thread has been told to stop.
 */
static int efx_idle_thread(struct efx_thread *thread)
{
	int first_time = 1;

	/* This function can run in the context of either an idle task
	 * or an efx_idle kernel thread.  The context cannot change
	 * while preemption is disabled. */

	preempt_disable();

	while (1) {
		/* This isn't actually most likely, but it's the path
		 * where latency matters.  It doesn't matter how long
		 * it takes to get into the idle loop. */
		if (likely(need_resched())) {
			/* If something else needs to run, let it
			 * run. */
			preempt_enable_no_resched();
			schedule();
			preempt_disable();

		} else if (likely(current != thread->task)) {
			/* Enhanced idle loop: spin instead of halting the
			 * CPU until there is something to run. */
			while (!need_resched())
				cpu_relax();

		} else {
			/* We're running in the context of the
			 * efx_idle kernel thread.  Wait for someone
			 * to tell us to stop. */
			set_current_state(TASK_INTERRUPTIBLE);

			/* Inform the switcher thread that this task
			 * is now sleeping. */
			if (first_time) {
				first_time = 0;
				up(&thread->response_sem);
			}

			/* Don't sleep if this thread should stop.  The
			 * flag is tested after the task state is set, so
			 * a wake-up from efx_thread_stop() is not lost. */
			if (efx_thread_should_stop(thread)) {
				__set_current_state(TASK_RUNNING);
				break;
			}

			preempt_enable_no_resched();
			schedule();
			preempt_disable();

			/* This can run in the efx_idle kernel thread
			 * or in the idle task.  It doesn't matter.
			 * If schedule returned then we're running. */
			BUG_ON(current->state != TASK_RUNNING);
		}
	}

	preempt_enable();

	return 0;
}


/*************************************************************************/

/* Swap the idle tasks on the specified CPU.
 *
 * Exchanges the saved CPU context (struct thread_struct) and the
 * thread_info/stack pointers of the CPU's original idle task and the
 * replacement thread, then repairs the thread_info back-pointers.  The
 * same call swaps them back again.
 *
 * The caller must hold efx_idle_switcher_mutex.
 *
 * Returns 0 on success, the error from set_cpus_allowed() if the calling
 * task could not be bound to @cpu, or -EAGAIN if it was bound but is not
 * running there.  Any failure is also logged.
 */
static int efx_switch_tasks(int cpu)
{
	/* Scratch space for the swap.
	 * NOTE(review): presumably static to keep a struct thread_struct
	 * off the kernel stack; sharing it relies on the mutex requirement
	 * stated above -- confirm. */
	static union {
		struct thread_struct thread_struct;
		struct thread_info *thread_info;
	} tmp;

	struct efx_idle_info *info;

	struct task_struct *old_task;
	struct task_struct *new_task;

	cpumask_t allowed_cpus, saved_cpus;
	int curr_cpu;
	int err;

	info = &per_cpu(efx_idle_info, cpu);

	old_task = info->old_idle_task;
	new_task = info->new_idle.task;

	BUG_ON(old_task->state != TASK_RUNNING);
	BUG_ON(new_task->state != TASK_STOPPED &&
	       new_task->state != TASK_INTERRUPTIBLE);

	BUG_ON(task_thread_info(old_task)->task != old_task);
	BUG_ON(task_thread_info(new_task)->task != new_task);

	/* Bind this task to the specified CPU. */
	saved_cpus = current->cpus_allowed;
	allowed_cpus = cpumask_of_cpu(cpu);
	err = set_cpus_allowed(current, allowed_cpus);

	/* Make sure the new task saves its state by binding it to
	 * this CPU. */
	set_cpus_allowed(new_task, allowed_cpus);

	if (likely(err == 0)) {
		/* get_cpu() disables preemption until put_cpu(). */
		curr_cpu = get_cpu();

		if (likely(curr_cpu == cpu)) {
			/* This is the correct CPU.  Switch the
			 * contexts. */

			tmp.thread_struct = old_task->thread;
			old_task->thread = new_task->thread;
			new_task->thread = tmp.thread_struct;

			tmp.thread_info = task_thread_info(old_task);
			efx_set_thread_info(old_task,
					    task_thread_info(new_task));
			efx_set_thread_info(new_task, tmp.thread_info);

			/* Each thread_info must point back at the task
			 * that now owns it. */
			task_thread_info(old_task)->task = old_task;
			task_thread_info(new_task)->task = new_task;
		} else {
			err = -EAGAIN;
		}

		put_cpu();
		/* Undo the temporary binding of the calling task. */
		set_cpus_allowed(current, saved_cpus);
	}

	if (unlikely(err != 0)) {
		printk(KERN_ERR "Error binding to CPU %d, err = %d\n",
			cpu, err);
		return err;
	}

	return err;
}


/* Arm the idle-task search timer for @cpu to fire after @delay jiffies
 * and move the CPU to the TIMER state.
 *
 * add_timer_on() would be the natural call but it is not exported, so the
 * calling task is temporarily bound to the target CPU around mod_timer()
 * instead.  The binding is best effort: its result is not checked, and
 * the timer handler copes with running on the wrong CPU.
 */
static void efx_idle_start_timer(int cpu, unsigned delay)
{
	struct efx_idle_info *cpu_info = &per_cpu(efx_idle_info, cpu);
	cpumask_t prev_mask = current->cpus_allowed;
	cpumask_t target_mask = cpumask_of_cpu(cpu);

	/* Move to the target CPU (if we can) before arming the timer. */
	set_cpus_allowed(current, target_mask);

	cpu_info->state = EFX_IDLE_STATE_TIMER;
	mod_timer(&cpu_info->timer, jiffies + delay);

	/* Give the caller its original affinity back. */
	set_cpus_allowed(current, prev_mask);
}


/* Enable the enhanced idle thread on the specified CPU.
 *
 * This performs the next operation required to run the enhanced idle
 * thread on the specified CPU.  If more work needs to be done, the
 * switcher task will get scheduled to complete it.
 *
 * Progression: INIT -> TIMER (timer armed) -> READY (set by the timer
 * handler) -> ACTIVE (tasks swapped).  UNLOAD and OFFLINE CPUs are left
 * untouched.
 *
 * This function must only be called with efx_idle_switcher_mutex held.
 */
static void efx_idle_enable(int cpu)
{
	struct efx_idle_info *info;
	int err;

	info = &per_cpu(efx_idle_info, cpu);

	switch (info->state) {
	case EFX_IDLE_STATE_UNLOAD:
	case EFX_IDLE_STATE_OFFLINE:
		/* Trying to modify an offline CPU would be silly.
		 * Don't do that. */
		break;

	case EFX_IDLE_STATE_INIT:
		EFX_BUG_ON_PARANOID(info->old_idle_task);
		EFX_BUG_ON_PARANOID(info->new_idle.task);

		/* When the timer fires, it will send a signal
		 * to the switcher thread to continue
		 * processing. */
		efx_idle_start_timer(cpu, 1);
		break;

	case EFX_IDLE_STATE_TIMER:
		EFX_BUG_ON_PARANOID(info->new_idle.task);

		/* Keep waiting for the timer to fire. */
		break;

	case EFX_IDLE_STATE_READY:
		/* Make sure the timer has completed.  This ensures
		 * that the values written by the timer are
		 * visible. */
		del_timer_sync(&info->timer);

		/* The timer ran on the wrong CPU and found nothing.  Try
		 * again after a longer delay. */
		if (info->old_idle_task == NULL) {
			efx_idle_start_timer(cpu, HZ);
			break;
		}

		EFX_BUG_ON_PARANOID(!info->old_idle_task);
		EFX_BUG_ON_PARANOID(info->new_idle.task);

		/* Create the replacement thread. */
		snprintf(info->new_idle.name, sizeof(info->new_idle.name),
			 "sfc_idle/%d", cpu);
		err = efx_thread_create(&info->new_idle, &efx_idle_thread);
		if (err < 0) {
			/* The state stays READY, so creation is attempted
			 * again on the switcher's next pass. */
			printk(KERN_ERR "Error creating sfc_idle thread %d "
			       "(error %d)\n", cpu, err);
			break;
		}

		efx_thread_start(&info->new_idle);

		/* Make sure the task will be stopped when it's next
		 * rescheduled.  This doesn't ensure that the task
		 * state has been saved, but efx_switch_tasks takes
		 * care of that. */
		down(&info->new_idle.response_sem);

		if (efx_switch_tasks(cpu) == 0)
			info->state = EFX_IDLE_STATE_ACTIVE;
		else
			/* An error occurred.  Delete the thread.  The
			 * state stays READY. */
			efx_thread_stop(&info->new_idle);

		break;

	case EFX_IDLE_STATE_ACTIVE:
		/* It's already running.  Nothing to do. */
		EFX_BUG_ON_PARANOID(!info->old_idle_task);
		EFX_BUG_ON_PARANOID(!info->new_idle.task);
		break;

	default:
		printk(KERN_ERR "Invalid sfc_idle state %d\n", info->state);
		break;
	}

}

/* Turn the enhanced idle loop off for one CPU.
 *
 * What has to be undone depends on how far the CPU had progressed:
 *  - ACTIVE: swap the original idle task back in and stop the
 *    replacement thread; if the swap fails the CPU stays ACTIVE;
 *  - TIMER or READY: cancel the timer and forget any idle task found;
 *  - UNLOAD, OFFLINE or INIT: nothing to do.
 * A CPU that had something to undo ends up in the INIT state.
 *
 * The caller must hold efx_idle_switcher_mutex.
 */
static void efx_idle_disable(int cpu)
{
	struct efx_idle_info *cpu_info = &per_cpu(efx_idle_info, cpu);

	switch (cpu_info->state) {
	case EFX_IDLE_STATE_ACTIVE:
		EFX_BUG_ON_PARANOID(!cpu_info->old_idle_task);
		EFX_BUG_ON_PARANOID(!cpu_info->new_idle.task);

		/* Restore the original idle task first; only then is it
		 * safe to stop the replacement thread. */
		if (efx_switch_tasks(cpu) != 0)
			break;

		efx_thread_stop(&cpu_info->new_idle);
		cpu_info->new_idle.task = NULL;
		cpu_info->old_idle_task = NULL;
		cpu_info->state = EFX_IDLE_STATE_INIT;
		break;

	case EFX_IDLE_STATE_TIMER:
	case EFX_IDLE_STATE_READY:
		/* Wait for any running timer handler to finish. */
		del_timer_sync(&cpu_info->timer);

		EFX_BUG_ON_PARANOID(cpu_info->new_idle.task);

		/* An idle task may already have been found, but dropping
		 * it and returning to INIT is simpler than remembering it,
		 * and avoids deleting the timer again on every pass. */
		cpu_info->old_idle_task = NULL;
		cpu_info->state = EFX_IDLE_STATE_INIT;
		break;

	case EFX_IDLE_STATE_UNLOAD:
	case EFX_IDLE_STATE_OFFLINE:
	case EFX_IDLE_STATE_INIT:
		/* No timer and no thread exist in these states. */
		EFX_BUG_ON_PARANOID(cpu_info->old_idle_task);
		EFX_BUG_ON_PARANOID(cpu_info->new_idle.task);
		break;

	default:
		printk(KERN_ERR "Invalid sfc_idle state %d\n",
		       cpu_info->state);
		break;
	}
}


/*************************************************************************/

#  if defined(CONFIG_HOTPLUG_CPU)
/* CPU hotplug notifier callback.
 *
 * @nb:      the registered notifier block (unused).
 * @action:  hotplug event.
 * @cpu_ptr: the CPU number, passed as a pointer-sized integer.
 *
 * A CPU that has come online (or failed to go down) is moved from OFFLINE
 * to INIT and the switcher thread is signalled to do the rest.  A CPU that
 * is about to go down has its enhanced idle loop disabled and is marked
 * OFFLINE.  Other events are ignored.  Always returns 0.
 */
static int efx_cpu_notifier_func(struct notifier_block *nb,
				 unsigned long action, void *cpu_ptr)
{
	int cpu = (long)cpu_ptr;
	struct efx_idle_info *info = &per_cpu(efx_idle_info, cpu);

	down(&efx_idle_switcher_mutex);

	switch (action) {
#    if defined(CPU_DOWN_FAILED)
	case CPU_DOWN_FAILED:	/* The CPU didn't go offline after all. */
#    endif
	case CPU_ONLINE:	/* A new CPU has come online. */
		/* A CPU in the UNLOAD state is deliberately left alone. */
		if (info->state == EFX_IDLE_STATE_OFFLINE)
			info->state = EFX_IDLE_STATE_INIT;

		/* Just let the switcher thread do all the work. */
		send_sig(SIGHUP, efx_idle_switcher.task, 1);
		break;

#    if defined(CPU_DOWN_PREPARE)
	case CPU_DOWN_PREPARE:	/* A CPU is going offline. */
		efx_idle_disable(cpu);
		/* efx_idle_disable() leaves the CPU ACTIVE if the tasks
		 * could not be swapped back; only mark it OFFLINE when the
		 * disable really reached INIT. */
		EFX_WARN_ON_PARANOID(info->state != EFX_IDLE_STATE_INIT);
		if (info->state == EFX_IDLE_STATE_INIT)
			info->state = EFX_IDLE_STATE_OFFLINE;
		break;
#    endif
	}

	up(&efx_idle_switcher_mutex);

	return 0;
}
#  endif


#  if defined(CONFIG_HOTPLUG_CPU)
/* Notifier block registered with the CPU hotplug system by
 * efx_idle_ensure_init() and unregistered by efx_idle_fini(). */
static struct notifier_block efx_cpu_notifier = {
	.notifier_call = efx_cpu_notifier_func,
	.priority = 0
};
#  endif


/* Body of the "sfc_idle" switcher thread.
 *
 * Each pass walks the online CPUs under efx_idle_switcher_mutex and calls
 * efx_idle_enable() or efx_idle_disable() on each according to the
 * idle_cpus bitmask, then sleeps until SIGHUP arrives (sent by the timer
 * handler and by the CPU notifier) or efx_thread_stop() wakes it.
 *
 * Returns 0 once the thread has been told to stop.
 */
static int efx_idle_switcher_thread(struct efx_thread *thread)
{
	sigset_t blocked;
	int cpu;

	/* Set up signal handling so that SIGHUP can wake this thread. */
	siginitsetinv(&blocked, sigmask(SIGHUP));
	sigprocmask(SIG_SETMASK, &blocked, NULL);
	allow_signal(SIGHUP);

	/* Wait before starting to avoid a race with the CPU hotplug
	 * system. */
	schedule_timeout_interruptible(HZ);

	while (1) {
		/* Signals are used to wake this thread.  The signal
		 * is sent after something has changed, the state is
		 * checked after the signal is deleted and the task
		 * will not sleep with a signal pending so there's no
		 * chance that anything will go unnoticed. */
		if (signal_pending(current))
			flush_signals(current);

		down(&efx_idle_switcher_mutex);

		/* Check all the CPUs and make sure they are in the
		 * correct state. */
		for_each_online_cpu(cpu) {
			/* CPU numbers wrap modulo the width of idle_cpus. */
			if ((idle_cpus >> (cpu % BITS_PER_LONG)) & 1)
				efx_idle_enable(cpu);
			else
				efx_idle_disable(cpu);
		}

		up(&efx_idle_switcher_mutex);

		/* Wait for another signal to arrive.  Set the task
		 * state with preemption disabled to avoid racing with
		 * efx_thread_stop and a schedule triggered by a timer
		 * tick. */
		preempt_disable();
		set_current_state(TASK_INTERRUPTIBLE);

		if (efx_thread_should_stop(thread)) {
			__set_task_state(current, TASK_RUNNING);
			preempt_enable();
			break;
		}

		preempt_enable_no_resched();
		schedule();
	}

	return 0;
}


/* Timer handler that looks for the idle task of CPU @cpu.
 *
 * The handler keeps re-arming itself, one jiffy at a time, until it fires
 * while the idle task (pid 0) is current.  It then records that task,
 * or NULL if it turns out to be running on a different CPU, moves the
 * CPU to the READY state and signals the switcher thread.
 */
static void efx_idle_timer_func(unsigned long cpu)
{
	struct efx_idle_info *cpu_info = &per_cpu(efx_idle_info, cpu);

	EFX_BUG_ON_PARANOID(cpu_info->state != EFX_IDLE_STATE_TIMER);

	/* Interrupted something other than the idle task: retry soon. */
	if (likely(current->pid != 0)) {
		mod_timer(&cpu_info->timer, jiffies + 1);
		return;
	}

	/* Only trust the idle task if we really are on the intended CPU.
	 * Ending up elsewhere should not happen; a NULL result makes the
	 * switcher start the search again. */
	cpu_info->old_idle_task =
		(smp_processor_id() == cpu) ? current : NULL;

	/* Hand over to the switcher. */
	cpu_info->state = EFX_IDLE_STATE_READY;
	send_sig(SIGHUP, efx_idle_switcher.task, 1);
}


/*************************************************************************/

/* Bring up the enhanced idle machinery.
 *
 * Resets the per-CPU bookkeeping for every possible CPU, creates the
 * switcher thread, registers the CPU hotplug notifier (when hotplug is
 * configured), marks the online CPUs as INIT and finally starts the
 * switcher thread.
 *
 * Callers in this file hold efx_idle_startup_mutex.
 *
 * Returns 0 on success, -EBUSY if the switcher thread already exists, or
 * the error from efx_thread_create().
 */
static int efx_idle_ensure_init(void)
{
	struct efx_idle_info *cpu_info;
	int selected_cpus = 0;
	int cpu;
	int rc;

	/* Refuse to initialise twice. */
	if (efx_idle_switcher.task != NULL)
		return -EBUSY;

	/* Every possible CPU starts out OFFLINE with an idle timer that
	 * is set up but not armed. */
	for_each_cpu_mask(cpu, cpu_possible_map) {
		cpu_info = &per_cpu(efx_idle_info, cpu);
		setup_timer(&cpu_info->timer, efx_idle_timer_func, cpu);
		cpu_info->old_idle_task = NULL;
		cpu_info->new_idle.task = NULL;
		cpu_info->state = EFX_IDLE_STATE_OFFLINE;
	}

	snprintf(efx_idle_switcher.name, sizeof(efx_idle_switcher.name),
		 "sfc_idle");
	rc = efx_thread_create(&efx_idle_switcher, &efx_idle_switcher_thread);
	if (rc < 0) {
		printk(KERN_ERR "Error creating sfc_idle thread (error %d)\n",
		       rc);
		return rc;
	}
#  if defined(CONFIG_HOTPLUG_CPU)
	/* Registered only now because the notifier signals the switcher
	 * task, which therefore has to exist first. */
	register_cpu_notifier(&efx_cpu_notifier);
#  endif

	/* Promote the CPUs that are online right now.  This races with
	 * CPU hotplug: if a CPU is on its way down and its notifiers have
	 * already run, its idle task could still get replaced, which
	 * would wedge the unplug.  Nothing can be done about that here,
	 * but the delay at the start of the switcher thread should make
	 * it impossible to hit in practice. */
	down(&efx_idle_switcher_mutex);
	for_each_online_cpu(cpu) {
		cpu_info = &per_cpu(efx_idle_info, cpu);
		if (cpu_info->state == EFX_IDLE_STATE_OFFLINE)
			cpu_info->state = EFX_IDLE_STATE_INIT;
		if ((idle_cpus >> (cpu % BITS_PER_LONG)) & 1)
			selected_cpus++;
	}
	up(&efx_idle_switcher_mutex);

	printk(KERN_INFO "Starting enhanced idle thread on %d CPUs\n",
	       selected_cpus);

	efx_thread_start(&efx_idle_switcher);

	return 0;
}


/* Shut the enhanced idle machinery down.
 *
 * Does nothing if the switcher thread was never created.  Otherwise the
 * enhanced idle loop is disabled on every online CPU, those CPUs are
 * marked UNLOAD, the hotplug notifier is removed (when hotplug is
 * configured) and the switcher thread is stopped.
 */
void efx_idle_fini(void)
{
	struct efx_idle_info *cpu_info;
	int cpu;

	down(&efx_idle_startup_mutex);

	if (efx_idle_switcher.task == NULL) {
		/* Never started: nothing to tear down. */
		up(&efx_idle_startup_mutex);
		return;
	}

	/* Restore the original idle tasks while the notifier is still
	 * registered, so that this cannot race with CPU hotplug. */
	down(&efx_idle_switcher_mutex);
	for_each_online_cpu(cpu) {
		cpu_info = &per_cpu(efx_idle_info, cpu);
		efx_idle_disable(cpu);
		EFX_BUG_ON_PARANOID(cpu_info->state !=
				    EFX_IDLE_STATE_INIT);
		/* UNLOAD keeps the CPU from being enabled again. */
		cpu_info->state = EFX_IDLE_STATE_UNLOAD;
	}
	up(&efx_idle_switcher_mutex);

#  if defined(CONFIG_HOTPLUG_CPU)
	/* With every enhanced idle task stopped, hotplug events are of
	 * no further interest.  The notifier signals the switcher task,
	 * so it has to go before that task is stopped. */
	unregister_cpu_notifier(&efx_cpu_notifier);
#  endif

	efx_thread_stop(&efx_idle_switcher);

	up(&efx_idle_startup_mutex);
}

/* Module start-up hook for the enhanced idle loop.
 *
 * Starts the machinery straight away only when idle_enable is 2 or more.
 * In the other modes nothing is done here.
 *
 * Returns 0, or the error from efx_idle_ensure_init().
 */
int efx_idle_init(void)
{
	int rc;

	down(&efx_idle_startup_mutex);
	rc = (idle_enable >= 2) ? efx_idle_ensure_init() : 0;
	up(&efx_idle_startup_mutex);

	return rc;
}

#ifndef CONFIG_XEN
/* Start the enhanced idle loop on demand (idle_enable == 1, "auto").
 *
 * idle_enable is advanced to 2 before the attempt, so start-up is tried
 * at most once regardless of whether it succeeds.  In any other mode the
 * call does nothing.
 *
 * Returns 0, or the error from efx_idle_ensure_init().
 */
int efx_idle_enhance(void)
{
	int rc;

	down(&efx_idle_startup_mutex);
	if (idle_enable != 1) {
		rc = 0;
	} else {
		idle_enable = 2;
		rc = efx_idle_ensure_init();
	}
	up(&efx_idle_startup_mutex);

	return rc;
}
#endif
