/*
* 'schedule()' is the scheduler function. It's a very simple and nice
* scheduler: it's not perfect, but certainly works for most things.
*
* The goto is "interesting".
*
* NOTE!! Task 0 is the 'idle' task, which gets called when no other
* tasks can run. It can not be killed, and it cannot sleep. The 'state'
* information in task[0] is never used.
*
* Takes no arguments and returns nothing; it works on 'current'.
*
* One pass through the function does the following, in order:
*  - BUG() if current has no active_mm or if we are in interrupt context;
*  - drop the kernel lock for prev and take runqueue_lock (irqs off);
*  - refill and requeue an exhausted SCHED_RR prev, then decide from
*    prev->state whether prev stays on the runqueue;
*  - scan the runqueue for the schedulable task with the highest
*    goodness(); if the best value is exactly 0, recompute every task's
*    counter and scan again;
*  - record the choice in this CPU's schedule_data, drop runqueue_lock;
*  - if the choice is prev itself, just clear SCHED_YIELD; otherwise
*    hand over the mm, switch_to() the new task and run __schedule_tail();
*  - retake the kernel lock and start over if need_resched got set.
*/
asmlinkage void schedule(void)
{
struct schedule_data * sched_data;
struct task_struct *prev, *next, *p;
struct list_head *tmp;
int this_cpu, c;
spin_lock_prefetch(&runqueue_lock);
if (!current->active_mm) BUG();
/* Re-entry point when need_resched is found set again at the bottom. */
need_resched_back:
prev = current;
this_cpu = prev->processor;
if (unlikely(in_interrupt())) {
printk("Scheduling in interrupt\n");
BUG();
}
release_kernel_lock(prev, this_cpu);
/*
* 'sched_data' is protected by the fact that we can run
* only one process per CPU.
*/
sched_data = & aligned_data[this_cpu].schedule_data;
spin_lock_irq(&runqueue_lock);
/* move an exhausted RR process to be last.. */
if (unlikely(prev->policy == SCHED_RR))
if (!prev->counter) {
prev->counter = NICE_TO_TICKS(prev->nice);
move_last_runqueue(prev);
}
/*
* Decide whether prev stays on the runqueue. The fall-through is
* deliberate: an interruptible task with a signal pending is made
* TASK_RUNNING again and kept; an interruptible task without one
* falls into 'default' and is removed like every other non-running
* state; TASK_RUNNING is left untouched.
*/
switch (prev->state) {
case TASK_INTERRUPTIBLE:
if (signal_pending(prev)) {
prev->state = TASK_RUNNING;
break;
}
default:
del_from_runqueue(prev);
case TASK_RUNNING:;
}
prev->need_resched = 0;
/*
* this is the scheduler proper:
*/
repeat_schedule:
/*
* Default process to select: this CPU's idle task, with a weight of
* -1000 so that any schedulable task scoring higher replaces it.
*/
next = idle_task(this_cpu);
c = -1000;
/* Strict '>' means the first task seen with the best weight wins ties. */
list_for_each(tmp, &runqueue_head) {
p = list_entry(tmp, struct task_struct, run_list);
if (can_schedule(p, this_cpu)) {
int weight = goodness(p, this_cpu, prev->active_mm);
if (weight > c)
c = weight, next = p;
}
}
/* Do we need to re-calculate counters? (best weight was exactly 0) */
if (unlikely(!c)) {
/* This 'p' shadows the function-scope 'p' for the loop below. */
struct task_struct *p;
/*
* The runqueue lock is dropped while the task list is walked
* under tasklist_lock, then retaken before rescanning.
*/
spin_unlock_irq(&runqueue_lock);
read_lock(&tasklist_lock);
/* Halve what is left of each counter and add the nice-based refill. */
for_each_task(p)
p->counter = (p->counter >> 1) + NICE_TO_TICKS(p->nice);
read_unlock(&tasklist_lock);
spin_lock_irq(&runqueue_lock);
goto repeat_schedule;
}
/*
* from this point on nothing can prevent us from
* switching to the next task, save this fact in
* sched_data.
*/
sched_data->curr = next;
task_set_cpu(next, this_cpu);
spin_unlock_irq(&runqueue_lock);
if (unlikely(prev == next)) {
/* We won't go through the normal tail, so do this by hand */
prev->policy &= ~SCHED_YIELD;
goto same_process;
}
kstat.context_swtch++;
/*
* there are 3 processes which are affected by a context switch:
*
* prev == .... ==> (last => next)
*
* It's the 'much more previous' 'prev' that is on next's stack,
* but prev is set to (the just run) 'last' process by switch_to().
* This might sound slightly confusing but makes tons of sense.
*/
prepare_to_switch();
{
struct mm_struct *mm = next->mm;
struct mm_struct *oldmm = prev->active_mm;
if (!mm) {
/*
* next has no mm of its own: it borrows the outgoing active_mm,
* takes a reference on it (mm_count) and enters lazy-TLB mode.
*/
if (next->active_mm) BUG();
next->active_mm = oldmm;
atomic_inc(&oldmm->mm_count);
enter_lazy_tlb(oldmm, next, this_cpu);
} else {
/* next owns an mm; its active_mm must already be that mm. */
if (next->active_mm != mm) BUG();
switch_mm(oldmm, mm, next, this_cpu);
}
/*
* prev had no mm of its own, so oldmm was only borrowed:
* give it up and drop the reference prev was holding.
*/
if (!prev->mm) {
prev->active_mm = NULL;
mmdrop(oldmm);
}
}
/*
* This just switches the register state and the
* stack. The third argument is where switch_to() stores the
* 'last' task, so 'prev' is overwritten by the time the next
* statement runs.
*/
switch_to(prev, next, prev);
__schedule_tail(prev);
same_process:
reacquire_kernel_lock(current);
/* Someone asked for another reschedule in the meantime: go again. */
if (current->need_resched)
goto need_resched_back;
return;
}
/*
* schedule_tail() is getting called from the fork return path. This
* cleans up all remaining scheduler things, without impacting the
* common case.
*
* NOTE(review): the paragraph above names schedule_tail(); the function
* defined here is __schedule_tail(), which in this file is called from
* schedule() right after switch_to().
*
* prev: the task that was just switched away from.
*
* Without CONFIG_SMP this only clears SCHED_YIELD in prev->policy.
* With CONFIG_SMP it also releases prev's CPU under task_lock() and, if
* prev is still TASK_RUNNING, is not an idle task and had not yielded,
* calls reschedule_idle(prev) under runqueue_lock.
*/
static inline void __schedule_tail(struct task_struct *prev)
{
#ifdef CONFIG_SMP
int policy;
/*
* prev->policy can be written from here only before `prev'
* can be scheduled (before setting prev->cpus_runnable to ~0UL).
* Of course it must also be read before allowing prev
* to be rescheduled, but since the write depends on the read
* to complete, wmb() is enough. (the spin_lock() acquired
* before setting cpus_runnable is not enough because the spin_lock()
* common code semantics allows code outside the critical section
* to enter inside the critical section)
*
* 'policy' keeps the value from before SCHED_YIELD is cleared; the
* slow path below tests that snapshot, not prev->policy.
*/
policy = prev->policy;
prev->policy = policy & ~SCHED_YIELD;
wmb();
/*
* fast path falls through. We have to clear cpus_runnable before
* checking prev->state to avoid a wakeup race. Protect against
* the task exiting early.
*/
task_lock(prev);
task_release_cpu(prev);
mb();
if (prev->state == TASK_RUNNING)
goto needs_resched;
/* Common exit: both the fast path and the slow path unlock here. */
out_unlock:
task_unlock(prev); /* Synchronise here with release_task() if prev is TASK_ZOMBIE */
return;
/*
* Slow path - we 'push' the previous process and
* reschedule_idle() will attempt to find a new
* processor for it. (but it might preempt the
* current process as well.) We must take the runqueue
* lock and re-check prev->state to be correct. It might
* still happen that this process has a preemption
* 'in progress' already - but this is not a problem and
* might happen in other circumstances as well.
*/
needs_resched:
{
unsigned long flags;
/*
* Avoid taking the runqueue lock in cases where
* no preemption-check is necessary: prev is this CPU's
* idle task, or prev had SCHED_YIELD set on entry.
*/
if ((prev == idle_task(smp_processor_id())) ||
(policy & SCHED_YIELD))
goto out_unlock;
spin_lock_irqsave(&runqueue_lock, flags);
/* Re-check under the lock: still runnable and not on any CPU. */
if ((prev->state == TASK_RUNNING) && !task_has_cpu(prev))
reschedule_idle(prev);
spin_unlock_irqrestore(&runqueue_lock, flags);
goto out_unlock;
}
#else
/* Uniprocessor: nothing to hand over, just clear the yield flag. */
prev->policy &= ~SCHED_YIELD;
#endif /* CONFIG_SMP */
}
/*
* switch_to(prev, next, last) - switch stack and instruction pointer
* from task 'prev' to task 'next'.
*
* Sequence performed by the asm:
*  - push %esi, %edi and %ebp on prev's stack;
*  - store %esp in prev->thread.esp and load it from next->thread.esp;
*  - store the address of local label 1 in prev->thread.eip;
*  - push next->thread.eip and jump (not call) to __switch_to, so the
*    pushed value sits where a return address would be;
*  - label 1: pop %ebp, %edi and %esi again.
*
* Register operands: prev is passed in %eax and %ebx, next in %edx.
* Whatever %ebx holds when the asm ends is written to 'last'.
* NOTE(review): presumably __switch_to receives its two arguments in
* %eax/%edx and leaves %ebx intact - confirm against its declaration.
*
* The asm has no clobber list.
*/
#define switch_to(prev,next,last) do { \
asm volatile("pushl %%esi\n\t" \
"pushl %%edi\n\t" \
"pushl %%ebp\n\t" \
"movl %%esp,%0\n\t" /* save ESP */ \
"movl %3,%%esp\n\t" /* restore ESP */ \
"movl $1f,%1\n\t" /* save EIP */ \
"pushl %4\n\t" /* restore EIP */ \
"jmp __switch_to\n" \
"1:\t" \
"popl %%ebp\n\t" \
"popl %%edi\n\t" \
"popl %%esi\n\t" \
:"=m" (prev->thread.esp),"=m" (prev->thread.eip), \
"=b" (last) \
:"m" (next->thread.esp),"m" (next->thread.eip), \
"a" (prev), "d" (next), \
"b" (prev)); \
} while (0)
/*
 * __switch_to(prev_p, next_p) - per-thread CPU state hand-over from
 * task prev_p to task next_p. The switch_to() macro jumps here after
 * it has already swapped the stack pointer.
 *
 * In order, this function:
 *   - runs unlazy_fpu() on the outgoing task;
 *   - installs the incoming task's esp0 in this CPU's TSS;
 *   - saves %fs/%gs for the outgoing task and loads the incoming ones;
 *   - reloads the debug registers if the incoming debugreg[7] is set;
 *   - refreshes the TSS I/O bitmap if either task uses ioperm.
 *
 * Background carried over from the original notes:
 *
 * We fsave/fwait so that an exception goes off at the right time
 * (as a call from the fsave or fwait in effect) rather than to
 * the wrong process. Lazy FP saving no longer makes any sense
 * with modern CPUs, and this simplifies a lot of things (SMP
 * and UP become the same).
 *
 * x86 hardware task switching used to be used here. It was dropped
 * because there is no reasonable way to recover from saved state
 * that has gone bad (stale segment register values in particular)
 * when the hardware performs the switch. Speed was not the reason:
 * this code is not noticeably faster, although there is room to
 * improve it. The real gain is flexibility.
 */
void __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *outgoing = &prev_p->thread;
	struct thread_struct *incoming = &next_p->thread;
	struct tss_struct *cpu_tss = &init_tss[smp_processor_id()];

	/* Deal with the outgoing task's FPU state before anything else. */
	unlazy_fpu(prev_p);

	/* Install the incoming task's esp0 in this CPU's TSS. */
	cpu_tss->esp0 = incoming->esp0;

	/*
	 * Only %fs and %gs need saving: %es and %ds are always kernel
	 * segments while inside the kernel.
	 */
	asm volatile("movl %%fs,%0":"=m" (*(int *)&outgoing->fs));
	asm volatile("movl %%gs,%0":"=m" (*(int *)&outgoing->gs));

	/* ...and bring in the incoming task's values. */
	loadsegment(fs, incoming->fs);
	loadsegment(gs, incoming->gs);

	/* Debug registers are reloaded only when debugreg[7] is non-zero. */
	if (incoming->debugreg[7] != 0) {
		loaddebug(incoming, 0);
		loaddebug(incoming, 1);
		loaddebug(incoming, 2);
		loaddebug(incoming, 3);
		/* 4 and 5 are skipped */
		loaddebug(incoming, 6);
		loaddebug(incoming, 7);
	}

	if (incoming->ioperm) {
		/*
		 * Copy the incoming task's I/O bitmap into the TSS.
		 * A 4-cacheline copy is not great, but it only hits
		 * processes that use ioperm(). [Putting the TSSs into
		 * 4k-tlb mapped regions and playing VM tricks to switch
		 * the IO bitmap is not really acceptable.]
		 */
		memcpy(cpu_tss->io_bitmap, incoming->io_bitmap,
		       IO_BITMAP_SIZE*sizeof(unsigned long));
		cpu_tss->bitmap = IO_BITMAP_OFFSET;
	} else if (outgoing->ioperm) {
		/*
		 * Only the outgoing task had a bitmap: point the offset
		 * outside of the TSS limit, which gives a nicely
		 * controllable SIGSEGV if the incoming process tries a
		 * port IO instruction. The first sys_ioperm() call sets
		 * the bitmap up properly.
		 */
		cpu_tss->bitmap = INVALID_IO_BITMAP_OFFSET;
	}
}
/*
 * goodness() rates how desirable it is to run task 'p' next on CPU
 * 'this_cpu', given that 'this_mm' is the mm currently in use there.
 * Different processes can be weighed against each other depending
 * on what CPU they've run on lately etc, to try to limit cache and
 * TLB miss penalties.
 *
 * Return values:
 *   -1000: never select this
 *      -1: the task has SCHED_YIELD set
 *       0: out of time, recalculate counters (but it might still be
 *          selected)
 *     +ve: "goodness" value (the larger, the better)
 *   +1000: realtime process, select this.
 */
static inline int goodness(struct task_struct * p, int this_cpu, struct mm_struct *this_mm)
{
	int score;

	/*
	 * A yielding task ranks after every other runnable process
	 * but still ahead of the idle thread, and -1 (rather than 0)
	 * does not trigger a counter recalculation.
	 */
	if (p->policy & SCHED_YIELD)
		return -1;

	/*
	 * Realtime process: always beats the normal ones, ordered
	 * among themselves by rt_priority.
	 */
	if (p->policy != SCHED_OTHER)
		return 1000 + p->rt_priority;

	/*
	 * Normal (SCHED_OTHER) process: start from the clock ticks it
	 * has left. With none left, skip all the bonuses.
	 */
	score = p->counter;
	if (score == 0)
		return 0;

#ifdef CONFIG_SMP
	/*
	 * Largish bonus for staying on the same processor (which is
	 * the same as penalizing a move from another one).
	 */
	if (p->processor == this_cpu)
		score += PROC_CHANGE_PENALTY;
#endif

	/* Slight bonus for sharing the current mm, or having none. */
	if (!p->mm || p->mm == this_mm)
		score += 1;

	/* A lower nice value yields a higher score. */
	score += 20 - p->nice;
	return score;
}