Linux kernel & device driver programming

Cross-Referenced Linux and Device Driver Code

[ source navigation ] [ diff markup ] [ identifier search ] [ freetext search ] [ file search ]
Version: [ 2.6.11.8 ] [ 2.6.25 ] [ 2.6.25.8 ] [ 2.6.31.13 ] Architecture: [ i386 ]
  1 /*
  2  *  Copyright (C) 1995  Linus Torvalds
  3  *
  4  *  Pentium III FXSR, SSE support
  5  *      Gareth Hughes <gareth@valinux.com>, May 2000
  6  *
  7  *  X86-64 port
  8  *      Andi Kleen.
  9  *
 10  *      CPU hotplug support - ashok.raj@intel.com
 11  */
 12 
 13 /*
 14  * This file handles the architecture-dependent parts of process handling.
 15  */
 16 
 17 #include <linux/stackprotector.h>
 18 #include <linux/cpu.h>
 19 #include <linux/errno.h>
 20 #include <linux/sched.h>
 21 #include <linux/fs.h>
 22 #include <linux/kernel.h>
 23 #include <linux/mm.h>
 24 #include <linux/elfcore.h>
 25 #include <linux/smp.h>
 26 #include <linux/slab.h>
 27 #include <linux/user.h>
 28 #include <linux/interrupt.h>
 29 #include <linux/utsname.h>
 30 #include <linux/delay.h>
 31 #include <linux/module.h>
 32 #include <linux/ptrace.h>
 33 #include <linux/notifier.h>
 34 #include <linux/kprobes.h>
 35 #include <linux/kdebug.h>
 36 #include <linux/tick.h>
 37 #include <linux/prctl.h>
 38 #include <linux/uaccess.h>
 39 #include <linux/io.h>
 40 #include <linux/ftrace.h>
 41 #include <linux/dmi.h>
 42 
 43 #include <asm/pgtable.h>
 44 #include <asm/system.h>
 45 #include <asm/processor.h>
 46 #include <asm/i387.h>
 47 #include <asm/mmu_context.h>
 48 #include <asm/prctl.h>
 49 #include <asm/desc.h>
 50 #include <asm/proto.h>
 51 #include <asm/ia32.h>
 52 #include <asm/idle.h>
 53 #include <asm/syscalls.h>
 54 #include <asm/ds.h>
 55 
 56 asmlinkage extern void ret_from_fork(void);
 57 
    /*
     * Per-CPU pointer to the task running on that CPU.  Initialised to
     * init_task here and rewritten by __switch_to() on every context switch.
     */
 58 DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
 59 EXPORT_PER_CPU_SYMBOL(current_task);
 60 
    /*
     * Per-CPU stack pointer slot: start_thread() stores the new stack pointer
     * here, and __switch_to() saves it into prev->usersp and reloads it from
     * next->usersp.
     */
 61 DEFINE_PER_CPU(unsigned long, old_rsp);
    /* Per-CPU idle flag: set by enter_idle(), test-and-cleared by __exit_idle(). */
 62 static DEFINE_PER_CPU(unsigned char, is_idle);
 63 
    /*
     * NOTE(review): not referenced anywhere else in this file; presumably the
     * default clone flags for kernel threads - confirm against its users.
     */
 64 unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
 65 
    /* Notifier chain called with IDLE_START / IDLE_END by enter_idle() / __exit_idle(). */
 66 static ATOMIC_NOTIFIER_HEAD(idle_notifier);
 67 
 /*
  * Add notifier block @n to the idle_notifier chain.  Blocks on that chain
  * are called with IDLE_START from enter_idle() and with IDLE_END from
  * __exit_idle().
  */
 68 void idle_notifier_register(struct notifier_block *n)
 69 {
 70         atomic_notifier_chain_register(&idle_notifier, n);
 71 }
 72 EXPORT_SYMBOL_GPL(idle_notifier_register);
 73 
 /*
  * Remove notifier block @n from the idle_notifier chain; the counterpart of
  * idle_notifier_register().
  */
 74 void idle_notifier_unregister(struct notifier_block *n)
 75 {
 76         atomic_notifier_chain_unregister(&idle_notifier, n);
 77 }
 78 EXPORT_SYMBOL_GPL(idle_notifier_unregister);
 79 
 /*
  * Mark the current CPU as idle (per-CPU is_idle = 1) and then run the
  * idle_notifier chain with IDLE_START.
  */
 80 void enter_idle(void)
 81 {
 82         percpu_write(is_idle, 1);
 83         atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
 84 }
 85 
 /*
  * Leave the idle state on this CPU: test-and-clear bit 0 of the per-CPU
  * is_idle flag and, only if it was set, run the idle_notifier chain with
  * IDLE_END.  Calling it when the flag is already clear does nothing, so the
  * IDLE_END notification is sent at most once per enter_idle().
  */
 86 static void __exit_idle(void)
 87 {
 88         if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
 89                 return;
 90         atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
 91 }
 92 
 93 /* Called from interrupts to signify idle end.
     * Only acts when the interrupted task is the idle task (pid 0); for any
     * other task it returns without touching the idle state. */
 94 void exit_idle(void)
 95 {
 96         /* idle loop has pid 0 */
 97         if (current->pid)
 98                 return;
 99         __exit_idle();
100 }
101 
102 #ifndef CONFIG_SMP
    /*
     * Fallback used when CONFIG_SMP is not set.  cpu_idle() calls play_dead()
     * only when cpu_is_offline() reports the current CPU as offline; in this
     * configuration reaching it triggers BUG().
     * NOTE(review): the CONFIG_SMP variant is not defined in this file.
     */
103 static inline void play_dead(void)
104 {
105         BUG();
106 }
107 #endif
108 
109 /*
110  * The idle thread. There's no useful work to be
111  * done, so just try to conserve power and have a
112  * low exit latency (ie sit in a loop waiting for
113  * somebody to say that they'd like to reschedule)
     *
     * Never returns: the outer loop alternates between idling (inner loop,
     * while need_resched() is false) and calling schedule().
114  */
115 void cpu_idle(void)
116 {
            /* NOTE(review): TS_POLLING presumably tells wakers that this
               thread polls need_resched() - confirm. */
117         current_thread_info()->status |= TS_POLLING;
118 
119         /*
120          * If we're the non-boot CPU, nothing set the stack canary up
121          * for us.  CPU0 already has it initialized but no harm in
122          * doing it again.  This is a good place for updating it, as
123          * we won't ever return from this function (so the invalid
124          * canaries already on the stack won't ever trigger).
125          */
126         boot_init_stack_canary();
127 
128         /* endless idle loop with no priority at all */
129         while (1) {
130                 tick_nohz_stop_sched_tick(1);
131                 while (!need_resched()) {
132 
133                         rmb();
134 
                            /* An offlined CPU leaves the idle loop here. */
135                         if (cpu_is_offline(smp_processor_id()))
136                                 play_dead();
137                         /*
138                          * Idle routines should keep interrupts disabled
139                          * from here on, until they go to idle.
140                          * Otherwise, idle callbacks can misfire.
141                          */
142                         local_irq_disable();
143                         enter_idle();
144                         /* Don't trace irqs off for idle */
145                         stop_critical_timings();
146                         pm_idle();
147                         start_critical_timings();
148                         /* In many cases the interrupt that ended idle
149                            has already called exit_idle. But some idle
150                            loops can be woken up without interrupt. */
151                         __exit_idle();
152                 }
153 
                    /*
                     * need_resched() is set: restart the tick and call
                     * schedule() with preemption enabled; preemption is
                     * disabled again once this thread is switched back in.
                     */
154                 tick_nohz_restart_sched_tick();
155                 preempt_enable_no_resched();
156                 schedule();
157                 preempt_disable();
158         }
159 }
160 
161 /* Prints also some state that isn't saved in the pt_regs */
162 void __show_regs(struct pt_regs *regs, int all)
163 {
164         unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
165         unsigned long d0, d1, d2, d3, d6, d7;
166         unsigned int fsindex, gsindex;
167         unsigned int ds, cs, es;
168         const char *board;
169 
170         printk("\n");
171         print_modules();
172         board = dmi_get_system_info(DMI_PRODUCT_NAME);
173         if (!board)
174                 board = "";
175         printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s %s\n",
176                 current->pid, current->comm, print_tainted(),
177                 init_utsname()->release,
178                 (int)strcspn(init_utsname()->version, " "),
179                 init_utsname()->version, board);
180         printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
181         printk_address(regs->ip, 1);
182         printk(KERN_INFO "RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss,
183                         regs->sp, regs->flags);
184         printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n",
185                regs->ax, regs->bx, regs->cx);
186         printk(KERN_INFO "RDX: %016lx RSI: %016lx RDI: %016lx\n",
187                regs->dx, regs->si, regs->di);
188         printk(KERN_INFO "RBP: %016lx R08: %016lx R09: %016lx\n",
189                regs->bp, regs->r8, regs->r9);
190         printk(KERN_INFO "R10: %016lx R11: %016lx R12: %016lx\n",
191                regs->r10, regs->r11, regs->r12);
192         printk(KERN_INFO "R13: %016lx R14: %016lx R15: %016lx\n",
193                regs->r13, regs->r14, regs->r15);
194 
195         asm("movl %%ds,%0" : "=r" (ds));
196         asm("movl %%cs,%0" : "=r" (cs));
197         asm("movl %%es,%0" : "=r" (es));
198         asm("movl %%fs,%0" : "=r" (fsindex));
199         asm("movl %%gs,%0" : "=r" (gsindex));
200 
201         rdmsrl(MSR_FS_BASE, fs);
202         rdmsrl(MSR_GS_BASE, gs);
203         rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
204 
205         if (!all)
206                 return;
207 
208         cr0 = read_cr0();
209         cr2 = read_cr2();
210         cr3 = read_cr3();
211         cr4 = read_cr4();
212 
213         printk(KERN_INFO "FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
214                fs, fsindex, gs, gsindex, shadowgs);
215         printk(KERN_INFO "CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
216                         es, cr0);
217         printk(KERN_INFO "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
218                         cr4);
219 
220         get_debugreg(d0, 0);
221         get_debugreg(d1, 1);
222         get_debugreg(d2, 2);
223         printk(KERN_INFO "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
224         get_debugreg(d3, 3);
225         get_debugreg(d6, 6);
226         get_debugreg(d7, 7);
227         printk(KERN_INFO "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
228 }
229 
/*
 * Print the current CPU number, the full register dump for @regs
 * (__show_regs() with all = 1) and then a trace via show_trace(), which is
 * given the address just past @regs and regs->bp.
 */
230 void show_regs(struct pt_regs *regs)
231 {
232         printk(KERN_INFO "CPU %d:", smp_processor_id());
233         __show_regs(regs, 1);
234         show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
235 }
236 
237 void release_thread(struct task_struct *dead_task)
238 {
239         if (dead_task->mm) {
240                 if (dead_task->mm->context.size) {
241                         printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
242                                         dead_task->comm,
243                                         dead_task->mm->context.ldt,
244                                         dead_task->mm->context.size);
245                         BUG();
246                 }
247         }
248 }
249 
250 static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
251 {
252         struct user_desc ud = {
253                 .base_addr = addr,
254                 .limit = 0xfffff,
255                 .seg_32bit = 1,
256                 .limit_in_pages = 1,
257                 .useable = 1,
258         };
259         struct desc_struct *desc = t->thread.tls_array;
260         desc += tls;
261         fill_ldt(desc, &ud);
262 }
263 
/*
 * Return the base address stored in slot @tls of @t's TLS descriptor array,
 * i.e. the value previously installed with set_32bit_tls().
 */
264 static inline u32 read_32bit_tls(struct task_struct *t, int tls)
265 {
266         return get_desc_base(&t->thread.tls_array[tls]);
267 }
268 
269 /*
270  * This gets called before we allocate a new thread and copy
271  * the current task into it.
     *
     * The only work done is unlazy_fpu(@tsk).
     * NOTE(review): presumably this saves lazily held FPU state into @tsk so
     * the copy sees up-to-date contents - confirm against unlazy_fpu().
272  */
273 void prepare_to_copy(struct task_struct *tsk)
274 {
275         unlazy_fpu(tsk);
276 }
277 
278 int copy_thread(unsigned long clone_flags, unsigned long sp,
279                 unsigned long unused,
280         struct task_struct *p, struct pt_regs *regs)
281 {
282         int err;
283         struct pt_regs *childregs;
284         struct task_struct *me = current;
285 
286         childregs = ((struct pt_regs *)
287                         (THREAD_SIZE + task_stack_page(p))) - 1;
288         *childregs = *regs;
289 
290         childregs->ax = 0;
291         childregs->sp = sp;
292         if (sp == ~0UL)
293                 childregs->sp = (unsigned long)childregs;
294 
295         p->thread.sp = (unsigned long) childregs;
296         p->thread.sp0 = (unsigned long) (childregs+1);
297         p->thread.usersp = me->thread.usersp;
298 
299         set_tsk_thread_flag(p, TIF_FORK);
300 
301         p->thread.fs = me->thread.fs;
302         p->thread.gs = me->thread.gs;
303 
304         savesegment(gs, p->thread.gsindex);
305         savesegment(fs, p->thread.fsindex);
306         savesegment(es, p->thread.es);
307         savesegment(ds, p->thread.ds);
308 
309         if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
310                 p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
311                 if (!p->thread.io_bitmap_ptr) {
312                         p->thread.io_bitmap_max = 0;
313                         return -ENOMEM;
314                 }
315                 memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
316                                 IO_BITMAP_BYTES);
317                 set_tsk_thread_flag(p, TIF_IO_BITMAP);
318         }
319 
320         /*
321          * Set a new TLS for the child thread?
322          */
323         if (clone_flags & CLONE_SETTLS) {
324 #ifdef CONFIG_IA32_EMULATION
325                 if (test_thread_flag(TIF_IA32))
326                         err = do_set_thread_area(p, -1,
327                                 (struct user_desc __user *)childregs->si, 0);
328                 else
329 #endif
330                         err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
331                 if (err)
332                         goto out;
333         }
334 
335         clear_tsk_thread_flag(p, TIF_DS_AREA_MSR);
336         p->thread.ds_ctx = NULL;
337 
338         clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
339         p->thread.debugctlmsr = 0;
340 
341         err = 0;
342 out:
343         if (err && p->thread.io_bitmap_ptr) {
344                 kfree(p->thread.io_bitmap_ptr);
345                 p->thread.io_bitmap_max = 0;
346         }
347         return err;
348 }
349 
/*
 * Set up the current task to begin executing user code.
 *
 * @regs:   register frame that will be used on the return to user space
 * @new_ip: instruction pointer to start at
 * @new_sp: initial user stack pointer
 *
 * FS, ES, DS and GS are loaded with the null selector, @regs gets the new
 * ip/sp plus the user code and data selectors, the per-CPU old_rsp slot is
 * set to @new_sp, the address limit is set to USER_DS and the task's old
 * extended (FP) state is freed.
 */
350 void
351 start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
352 {
353         loadsegment(fs, 0);
354         loadsegment(es, 0);
355         loadsegment(ds, 0);
356         load_gs_index(0);
357         regs->ip                = new_ip;
358         regs->sp                = new_sp;
359         percpu_write(old_rsp, new_sp);
360         regs->cs                = __USER_CS;
361         regs->ss                = __USER_DS;
            /* NOTE(review): 0x200 is presumably the EFLAGS interrupt-enable
               bit (IF) - confirm. */
362         regs->flags             = 0x200;
363         set_fs(USER_DS);
364         /*
365          * Free the old FP and other extended state
366          */
367         free_thread_xstate(current);
368 }
369 EXPORT_SYMBOL_GPL(start_thread);
370 
371 /*
372  *      switch_to(x,y) should switch tasks from x to y.
373  *
     * @prev_p: task being switched out
     * @next_p: task being switched in
     *
     * Returns @prev_p.
     *
374  * This could still be optimized:
375  * - fold all the options into a flag word and test it with a single test.
376  * - could test fs/gs bitsliced
377  *
378  * Kprobes not supported here. Set the probe on schedule instead.
379  * Function graph tracer not supported too.
380  */
381 __notrace_funcgraph struct task_struct *
382 __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
383 {
384         struct thread_struct *prev = &prev_p->thread;
385         struct thread_struct *next = &next_p->thread;
386         int cpu = smp_processor_id();
387         struct tss_struct *tss = &per_cpu(init_tss, cpu);
388         unsigned fsindex, gsindex;
389 
390         /* we're going to use this soon, after a few expensive things */
391         if (next_p->fpu_counter > 5)
392                 prefetch(next->xstate);
393 
394         /*
395          * Reload esp0, LDT and the page table pointer:
             * (NOTE(review): only load_sp0() is called here; no LDT or
             * page table reload is visible in this function.)
396          */
397         load_sp0(tss, next);
398 
399         /*
400          * Switch DS and ES.
401          * This won't pick up thread selector changes, but I guess that is ok.
             * The reload is skipped when both the old and new selector are 0.
402          */
403         savesegment(es, prev->es);
404         if (unlikely(next->es | prev->es))
405                 loadsegment(es, next->es);
406 
407         savesegment(ds, prev->ds);
408         if (unlikely(next->ds | prev->ds))
409                 loadsegment(ds, next->ds);
410 
411 
412         /* We must save %fs and %gs before load_TLS() because
413          * %fs and %gs may be cleared by load_TLS().
414          *
415          * (e.g. xen_load_tls())
416          */
417         savesegment(fs, fsindex);
418         savesegment(gs, gsindex);
419 
420         load_TLS(next, cpu);
421 
422         /*
423          * Leave lazy mode, flushing any hypercalls made here.
424          * This must be done before restoring TLS segments so
425          * the GDT and LDT are properly updated, and must be
426          * done before math_state_restore, so the TS bit is up
427          * to date.
428          */
429         arch_end_context_switch(next_p);
430 
431         /*
432          * Switch FS and GS.
433          *
434          * Segment register != 0 always requires a reload.  Also
435          * reload when it has changed.  When prev process used 64bit
436          * base always reload to avoid an information leak.
437          */
438         if (unlikely(fsindex | next->fsindex | prev->fs)) {
439                 loadsegment(fs, next->fsindex);
440                 /*
441                  * Check if the user used a selector != 0; if yes
442                  *  clear 64bit base, since overloaded base is always
443                  *  mapped to the Null selector
444                  */
445                 if (fsindex)
446                         prev->fs = 0;
447         }
448         /* when next process has a 64bit base use it */
449         if (next->fs)
450                 wrmsrl(MSR_FS_BASE, next->fs);
451         prev->fsindex = fsindex;
452 
            /* Same scheme as for FS above, applied to GS. */
453         if (unlikely(gsindex | next->gsindex | prev->gs)) {
454                 load_gs_index(next->gsindex);
455                 if (gsindex)
456                         prev->gs = 0;
457         }
            /* NOTE(review): the base goes to MSR_KERNEL_GS_BASE rather than
               MSR_GS_BASE, presumably because the user GS base is held in the
               kernel-GS MSR while running in the kernel - confirm. */
458         if (next->gs)
459                 wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
460         prev->gsindex = gsindex;
461 
462         /* Must be after DS reload */
463         unlazy_fpu(prev_p);
464 
465         /*
466          * Switch the PDA and FPU contexts.
             * old_rsp, current_task and kernel_stack are per-CPU values
             * that must now describe next_p.
467          */
468         prev->usersp = percpu_read(old_rsp);
469         percpu_write(old_rsp, next->usersp);
470         percpu_write(current_task, next_p);
471 
472         percpu_write(kernel_stack,
473                   (unsigned long)task_stack_page(next_p) +
474                   THREAD_SIZE - KERNEL_STACK_OFFSET);
475 
476         /*
477          * Now maybe reload the debug registers and handle I/O bitmaps
478          */
479         if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
480                      task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
481                 __switch_to_xtra(prev_p, next_p, tss);
482 
483         /* If the task has used fpu the last 5 timeslices, just do a full
484          * restore of the math state immediately to avoid the trap; the
485          * chances of needing FPU soon are obviously high now
486          *
487          * tsk_used_math() checks prevent calling math_state_restore(),
488          * which can sleep in the case of !tsk_used_math()
489          */
490         if (tsk_used_math(next_p) && next_p->fpu_counter > 5)
491                 math_state_restore();
492         return prev_p;
493 }
494 
495 /*
496  * sys_execve() executes a new program.
497  */
498 asmlinkage
499 long sys_execve(char __user *name, char __user * __user *argv,
500                 char __user * __user *envp, struct pt_regs *regs)
501 {
502         long error;
503         char *filename;
504 
505         filename = getname(name);
506         error = PTR_ERR(filename);
507         if (IS_ERR(filename))
508                 return error;
509         error = do_execve(filename, argv, envp, regs);
510         putname(filename);
511         return error;
512 }
513 
/*
 * Switch the current task to 64-bit mode: clear TIF_IA32 and drop
 * READ_IMPLIES_EXEC from its personality.
 */
514 void set_personality_64bit(void)
515 {
516         /* inherit personality from parent */
517 
518         /* Make sure to be in 64bit mode */
519         clear_thread_flag(TIF_IA32);
520 
521         /* TBD: overwrites user setup. Should have two bits.
522            But 64bit processes have always behaved this way,
523            so it's not too bad. The main problem is just that
524            32bit children are affected again. */
525         current->personality &= ~READ_IMPLIES_EXEC;
526 }
527 
/*
 * clone() system call entry.  When @newsp is 0 the child is started on the
 * caller's current stack pointer (regs->sp); everything else is passed
 * straight to do_fork(), whose result is returned.
 */
528 asmlinkage long
529 sys_clone(unsigned long clone_flags, unsigned long newsp,
530           void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
531 {
532         if (!newsp)
533                 newsp = regs->sp;
534         return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
535 }
536 
/*
 * Switch the current task to 32-bit (IA32) mode: set TIF_IA32 and set
 * TS_COMPAT in the thread status.
 */
537 void set_personality_ia32(void)
538 {
539         /* inherit personality from parent */
540 
541         /* Make sure to be in 32bit mode */
542         set_thread_flag(TIF_IA32);
543 
544         /* Prepare the first "return" to user space */
545         current_thread_info()->status |= TS_COMPAT;
546 }
547 
548 unsigned long get_wchan(struct task_struct *p)
549 {
550         unsigned long stack;
551         u64 fp, ip;
552         int count = 0;
553 
554         if (!p || p == current || p->state == TASK_RUNNING)
555                 return 0;
556         stack = (unsigned long)task_stack_page(p);
557         if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
558                 return 0;
559         fp = *(u64 *)(p->thread.sp);
560         do {
561                 if (fp < (unsigned long)stack ||
562                     fp >= (unsigned long)stack+THREAD_SIZE)
563                         return 0;
564                 ip = *(u64 *)(fp+8);
565                 if (!in_sched_functions(ip))
566                         return ip;
567                 fp = *(u64 *)fp;
568         } while (count++ < 16);
569         return 0;
570 }
571 
572 long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
573 {
574         int ret = 0;
575         int doit = task == current;
576         int cpu;
577 
578         switch (code) {
579         case ARCH_SET_GS:
580                 if (addr >= TASK_SIZE_OF(task))
581                         return -EPERM;
582                 cpu = get_cpu();
583                 /* handle small bases via the GDT because that's faster to
584                    switch. */
585                 if (addr <= 0xffffffff) {
586                         set_32bit_tls(task, GS_TLS, addr);
587                         if (doit) {
588                                 load_TLS(&task->thread, cpu);
589                                 load_gs_index(GS_TLS_SEL);
590                         }
591                         task->thread.gsindex = GS_TLS_SEL;
592                         task->thread.gs = 0;
593                 } else {
594                         task->thread.gsindex = 0;
595                         task->thread.gs = addr;
596                         if (doit) {
597                                 load_gs_index(0);
598                                 ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
599                         }
600                 }
601                 put_cpu();
602                 break;
603         case ARCH_SET_FS:
604                 /* Not strictly needed for fs, but do it for symmetry
605                    with gs */
606                 if (addr >= TASK_SIZE_OF(task))
607                         return -EPERM;
608                 cpu = get_cpu();
609                 /* handle small bases via the GDT because that's faster to
610                    switch. */
611                 if (addr <= 0xffffffff) {
612                         set_32bit_tls(task, FS_TLS, addr);
613                         if (doit) {
614                                 load_TLS(&task->thread, cpu);
615                                 loadsegment(fs, FS_TLS_SEL);
616                         }
617                         task->thread.fsindex = FS_TLS_SEL;
618                         task->thread.fs = 0;
619                 } else {
620                         task->thread.fsindex = 0;
621                         task->thread.fs = addr;
622                         if (doit) {
623                                 /* set the selector to 0 to not confuse
624                                    __switch_to */
625                                 loadsegment(fs, 0);
626                                 ret = checking_wrmsrl(MSR_FS_BASE, addr);
627                         }
628                 }
629                 put_cpu();
630                 break;
631         case ARCH_GET_FS: {
632                 unsigned long base;
633                 if (task->thread.fsindex == FS_TLS_SEL)
634                         base = read_32bit_tls(task, FS_TLS);
635                 else if (doit)
636                         rdmsrl(MSR_FS_BASE, base);
637                 else
638                         base = task->thread.fs;
639                 ret = put_user(base, (unsigned long __user *)addr);
640                 break;
641         }
642         case ARCH_GET_GS: {
643                 unsigned long base;
644                 unsigned gsindex;
645                 if (task->thread.gsindex == GS_TLS_SEL)
646                         base = read_32bit_tls(task, GS_TLS);
647                 else if (doit) {
648                         savesegment(gs, gsindex);
649                         if (gsindex)
650                                 rdmsrl(MSR_KERNEL_GS_BASE, base);
651                         else
652                                 base = task->thread.gs;
653                 } else
654                         base = task->thread.gs;
655                 ret = put_user(base, (unsigned long __user *)addr);
656                 break;
657         }
658 
659         default:
660                 ret = -EINVAL;
661                 break;
662         }
663 
664         return ret;
665 }
666 
/*
 * arch_prctl() system call entry: applies do_arch_prctl() to the current task
 * with the given @code and @addr and returns its result.
 */
667 long sys_arch_prctl(int code, unsigned long addr)
668 {
669         return do_arch_prctl(current, code, addr);
670 }
671 
672 
  This page was automatically generated by the LXR engine.