to other notes

Linux Application-Level System Calls

($Revision: 1.1 $)
 

These notes explain how an application-level call to the C library routine read() makes its way down to a device driver, through the various levels of implementation.

The code in the notes below came from Linux kernel version 2.4.22, and from glibc-2.3.2, for the Intel Pentium architecture. There will be variations for different versions of the kernel and C library, as well as different architectures, but the overall design and flow of control will be similar.

  1. The user-level library call read(fd, &buf, len) is implemented by a call to the function read, whose code is generated dynamically by the make process for glibc-2.3.2 as follows:

    echo '#include <sysdep-cancel.h>'; \
    echo 'PSEUDO (__libc_read, read, 3)'; \
    echo '	ret'; \
    echo 'PSEUDO_END(__libc_read)'; \
    echo 'libc_hidden_def (__libc_read)'; \
    echo 'weak_alias (__libc_read, __read)'; \
    echo 'libc_hidden_weak (__read)'; \
    echo 'weak_alias (__libc_read, read)'; \
    echo 'libc_hidden_weak (read)'; \

    The macro PSEUDO is defined in glibc-2.3.2/sysdeps/unix/sysv/linux/i386/sysdep.h as follows:

    #define	PSEUDO(name, syscall_name, args)		      \
      .text;						      \
      ENTRY (name)						      \
        DO_CALL (syscall_name, args);			      \
        cmpl $-4095, %eax;					      \
        jae SYSCALL_ERROR_LABEL;				      \
      L(pseudo_end):
  2. The macro DO_CALL is defined in glibc-2.3.2/sysdeps/unix/sysv/linux/i386/sysdep.h as follows:

    #define DO_CALL(syscall_name, args)			      \
        PUSHARGS_##args					      \
        DOARGS_##args					      \
        movl $SYS_ify (syscall_name), %eax;			      \
        ENTER_KERNEL					      \
        POPARGS_##args

    The macro ENTER_KERNEL is defined in glibc-2.3.2/sysdeps/unix/sysv/linux/i386/sysdep.h as follows:

    /* The original calling convention for system calls on Linux/i386 is
       to use int $0x80.  */
    #ifdef I386_USE_SYSENTER
    # ifdef SHARED
    #  define ENTER_KERNEL call *%gs:SYSINFO_OFFSET
    # else
    #  define ENTER_KERNEL call *_dl_sysinfo
    # endif
    #else
    # define ENTER_KERNEL int $0x80
    #endif

    Using the disassembly feature of the gdb debugger, running on my workstation with Red Hat Linux 9.0, I found the following:

    Dump of assembler code for function read:
    0x420d23d0 :	cmpl   $0x0,%gs:0xc
    0x420d23d8 :	jne    0x420d23fc 
    0x420d23da :	push   %ebx
    0x420d23db :	mov    0x10(%esp,1),%edx
    0x420d23df :	mov    0xc(%esp,1),%ecx
    0x420d23e3 :	mov    0x8(%esp,1),%ebx
    0x420d23e7 :	mov    $0x3,%eax
    0x420d23ec :	call   *%gs:0x10
    0x420d23f3 :	pop    %ebx
    0x420d23f4 :	cmp    $0xfffff001,%eax
    0x420d23f9 :	jae    0x420d242d 
    0x420d23fb :	ret    
    ...

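    The indirect call through %gs:0x10 in this disassembly corresponds to the "call *%gs:SYSINFO_OFFSET" variant of ENTER_KERNEL shown above. The location of the call is (apparently) a "call gate", a special descriptor that was set up previously by the kernel for user processes to make kernel calls.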

    The literal 3, identifying the desired system service as "read", is passed as an argument in register EAX.

    The call gate was set up previously by the following code in arch/i386/kernel/traps.c:

    set_system_gate(SYSCALL_VECTOR,&system_call);
    static void __init set_system_gate(unsigned int n, void *addr)
    {
    	_set_gate(idt_table+n,15,3,addr);
    }
    #define _set_gate(gate_addr,type,dpl,addr) \
    do { \
      int __d0, __d1; \
      __asm__ __volatile__ ("movw %%dx,%%ax\n\t" \
    	"movw %4,%%dx\n\t" \
    	"movl %%eax,%0\n\t" \
    	"movl %%edx,%1" \
    	:"=m" (*((long *) (gate_addr))), \
    	 "=m" (*(1+(long *) (gate_addr))), "=&a" (__d0), "=&d" (__d1) \
    	:"i" ((short) (0x8000+(dpl<<13)+(type<<8))), \
    	 "3" ((char *) (addr)),"2" (__KERNEL_CS << 16)); \
    } while (0)
  3. The gate transfers control to the entry point system_call in the assembly language code module in file arch/i386/kernel/entry.S. The first part of this code is:

    ENTRY(system_call)
    	pushl %eax			# save orig_eax
    	SAVE_ALL
    	GET_CURRENT(%ebx)
    	testb $0x02,tsk_ptrace(%ebx)	# PT_TRACESYS
    	jne tracesys
    	cmpl $(NR_syscalls),%eax
    	jae badsys
    	call *SYMBOL_NAME(sys_call_table)(,%eax,4)
    	movl %eax,EAX(%esp)		# save the return value
    ENTRY(ret_from_sys_call)
    	cli				# need_resched and signals atomic test
    	cmpl $0,need_resched(%ebx)
    	jne reschedule
    	cmpl $0,sigpending(%ebx)
    	jne signal_return
    restore_all:
    	RESTORE_ALL
    ...
  4. The routine above makes an indirect call using the address stored in the data structure sys_call_table at the index specified by the call parameter in EAX, which in this case is 3 (each table entry is 4 bytes, hence the scale factor of 4 in the call instruction):

    .data
    ENTRY(sys_call_table)
    	.long SYMBOL_NAME(sys_ni_syscall)	/* 0  -  old "setup()" system call*/
    	.long SYMBOL_NAME(sys_exit)
    	.long SYMBOL_NAME(sys_fork)
    	.long SYMBOL_NAME(sys_read)
    	.long SYMBOL_NAME(sys_write)
    ...
  5. The function sys_read is defined in fs/read_write.c as:

    asmlinkage ssize_t sys_read(unsigned int fd, char * buf, size_t count)
    {
            ssize_t ret;
            struct file * file;
            ret = -EBADF;
            file = fget(fd);
            if (file) {
                    if (file->f_mode & FMODE_READ) {
                            ret = locks_verify_area(FLOCK_VERIFY_READ, file->f_dentry->d_inode,
                                                    file, file->f_pos, count);
                            if (!ret) {
                                    ssize_t (*read)(struct file *, char *, size_t, loff_t *);
                                    ret = -EINVAL;
                                    if (file->f_op && (read = file->f_op->read) != NULL)
                                            ret = read(file, buf, count, &file->f_pos);
                            }
                    }
                    if (ret > 0)
                            dnotify_parent(file->f_dentry, DN_ACCESS);
                    fput(file);
            }
            return ret;
    }

    This code calls fget(fd) to get a pointer to the struct file object that corresponds to the file descriptor fd passed in by the user, uses the f_op pointer of this object to find the appropriate read routine, and eventually calls that routine.

    It passes along the user's buf and count parameters, but also passes along a pointer to the struct file object and a pointer to the f_pos field of that object.

  6. Finally, the read method associated with the file object is called.

© 2004 T. P. Baker. No part of this publication may be reproduced, stored in a retrieval system, or transmitted in any form or by any means without written permission. (Last updated by $Author: baker $ on $Date: 2008/04/28 12:41:35 $.)