但它们又不是全然无关的,不能设计成单独的进程。所以就须要比进程更小的单位,它们独立被调度,又共享一些资源。
这就是线程组。
struct task_struct {volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */struct thread_info *thread_info;atomic_t usage;unsigned long flags; /* per process flags, defined below */unsigned long ptrace;int lock_depth; /* Lock depth */int prio, static_prio;struct list_head run_list;prio_array_t *array;unsigned long sleep_avg;unsigned long long timestamp, last_ran;int activated;unsigned long policy;cpumask_t cpus_allowed;unsigned int time_slice, first_time_slice;#ifdef CONFIG_SCHEDSTATSstruct sched_info sched_info;#endifstruct list_head tasks;/** ptrace_list/ptrace_children forms the list of my children* that were stolen by a ptracer.*/struct list_head ptrace_children;struct list_head ptrace_list;struct mm_struct *mm, *active_mm;/* task state */struct linux_binfmt *binfmt;long exit_state;int exit_code, exit_signal;int pdeath_signal; /* The signal sent when the parent dies *//* ???
*/unsigned long personality;unsigned did_exec:1;pid_t pid;pid_t tgid;/** pointers to (original) parent process, youngest child, younger sibling,* older sibling, respectively. (p->father can be replaced with* p->parent->pid)*/struct task_struct *real_parent; /* real parent process (when being debugged) */struct task_struct *parent; /* parent process *//** children/sibling forms the list of my children plus the* tasks I‘m ptracing.*/struct list_head children; /* list of my children */struct list_head sibling; /* linkage in my parent‘s children list */struct task_struct *group_leader; /* threadgroup leader *//* PID/PID hash table linkage. */struct pid pids[PIDTYPE_MAX];struct completion *vfork_done; /* for vfork() */int __user *set_child_tid; /* CLONE_CHILD_SETTID */int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */unsigned long rt_priority;unsigned long it_real_value, it_real_incr;cputime_t it_virt_value, it_virt_incr;cputime_t it_prof_value, it_prof_incr;struct timer_list real_timer;cputime_t utime, stime;unsigned long nvcsw, nivcsw; /* context switch counts */struct timespec start_time;/* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */unsigned long min_flt, maj_flt;/* process credentials */uid_t uid,euid,suid,fsuid;gid_t gid,egid,sgid,fsgid;struct group_info *group_info;kernel_cap_t cap_effective, cap_inheritable, cap_permitted;unsigned keep_capabilities:1;struct user_struct *user;#ifdef CONFIG_KEYSstruct key *session_keyring; /* keyring inherited over fork */struct key *process_keyring; /* keyring private to this process (CLONE_THREAD) */struct key *thread_keyring; /* keyring private to this thread */#endifint oomkilladj; /* OOM kill score adjustment (bit shift). 
*/char comm[TASK_COMM_LEN];/* file system info */int link_count, total_link_count;/* ipc stuff */struct sysv_sem sysvsem;/* CPU-specific state of this task */struct thread_struct thread;/* filesystem information */struct fs_struct *fs;/* open file information */struct files_struct *files;/* namespace */struct namespace *namespace;/* signal handlers */struct signal_struct *signal;struct sighand_struct *sighand;sigset_t blocked, real_blocked;struct sigpending pending;unsigned long sas_ss_sp;size_t sas_ss_size;int (*notifier)(void *priv);void *notifier_data;sigset_t *notifier_mask;void *security;struct audit_context *audit_context;/* Thread group tracking */u32 parent_exec_id;u32 self_exec_id;/* Protection of (de-)allocation: mm, files, fs, tty, keyrings */spinlock_t alloc_lock;/* Protection of proc_dentry: nesting proc_lock, dcache_lock, write_lock_irq(&tasklist_lock); */spinlock_t proc_lock;/* context-switch lock */spinlock_t switch_lock;/* journalling filesystem info */void *journal_info;/* VM state */struct reclaim_state *reclaim_state;struct dentry *proc_dentry;struct backing_dev_info *backing_dev_info;struct io_context *io_context;unsigned long ptrace_message;siginfo_t *last_siginfo; /* For ptrace use. *//** current io wait handle: wait queue entry to use for io waits* If this thread is processing aio, this points at the waitqueue* inside the currently handled kiocb. It may be NULL (i.e. default* to a stack based synchronous wait) if its doing sync IO.*/wait_queue_t *io_wait;/* i/o counters(bytes read/written, #syscalls */u64 rchar, wchar, syscr, syscw;#if defined(CONFIG_BSD_PROCESS_ACCT)u64 acct_rss_mem1; /* accumulated rss usage */u64 acct_vm_mem1; /* accumulated virtual memory usage */clock_t acct_stimexpd; /* clock_t-converted stime since last update */#endif#ifdef CONFIG_NUMAstruct mempolicy *mempolicy;short il_next;#endif};
// include/linux/sched.hunion thread_union{struct thread_info thread_info;unsigned long stack[2048];}
链表头是init_task描述符,它是cpu0上的0号进程,也叫swapper进程。
task_struct中的state字段描述了进程当前所处的状态:
如硬件中断、等待的资源被释放、接收到一个信号。
宏:
进程状态转换图:
这种情况下,进程状态没有提供足够的信息来快速恢复进程,所以有必要引进附加的进程链表。linux用等待队列实现这种链表。
也就是说,Linux进程之间的关系能够组织为一棵树,其根节点为0号进程。
task_struct中相关字段:
除了父子关系,进程还存在其它关系(线程组、进程组、登录会话、调试跟踪):
进程组和会话中的进程安排:
proc1 | proc2 &
proc3 | proc4 | proc5
struct pid{int nr;//冗余?struct hlist_node pid_chain;struct list_head pid_list;}
进程是在系统执行过程中动态创建的。比如:用户在shell中输入一条命令、程序执行fork或pthread_create等。
此时,进程怎样创建呢?-->
fork系统调用,以前的做法是,子进程复制父进程所拥有的资源。
可是非常多情况下,子进程要做与父进程不同的事。所以子进程马上调用execve(),复制的数据马上丢弃。所以效率低。
后来引入了vfork系统调用,子进程共享其父进程的内存地址空间,并阻塞父进程的执行,一直到子进程退出或执行一个新的程序。
如今的fork引入了写时复制技术(copy-on-write) --> vfork的优势不再,应避免使用。
此外,clone系统调用允许精细地控制子进程共享哪些父进程的数据,被用来实现轻量级进程。下表列出了clone的共享标志:
/* include/linux/sched.h */

/*
 * cloning flags:
 */
#define CSIGNAL			0x000000ff	/* signal mask to be sent at exit */
#define CLONE_VM		0x00000100	/* set if VM shared between processes */
#define CLONE_FS		0x00000200	/* set if fs info shared between processes */
#define CLONE_FILES		0x00000400	/* set if open files shared between processes */
#define CLONE_SIGHAND		0x00000800	/* set if signal handlers and blocked signals shared */
#define CLONE_PTRACE		0x00002000	/* set if we want to let tracing continue on the child too */
#define CLONE_VFORK		0x00004000	/* set if the parent wants the child to wake it up on mm_release */
#define CLONE_PARENT		0x00008000	/* set if we want to have the same parent as the cloner */
#define CLONE_THREAD		0x00010000	/* Same thread group? */
#define CLONE_NEWNS		0x00020000	/* New namespace group? */
#define CLONE_SYSVSEM		0x00040000	/* share system V SEM_UNDO semantics */
#define CLONE_SETTLS		0x00080000	/* create a new TLS for the child */
#define CLONE_PARENT_SETTID	0x00100000	/* set the TID in the parent */
#define CLONE_CHILD_CLEARTID	0x00200000	/* clear the TID in the child */
#define CLONE_DETACHED		0x00400000	/* Unused, ignored */
#define CLONE_UNTRACED		0x00800000	/* set if the tracing process can't force CLONE_PTRACE on this clone */
#define CLONE_CHILD_SETTID	0x01000000	/* set the TID in the child */
#define CLONE_STOPPED		0x02000000	/* Start in stopped state */

/*
 * List of flags we want to share for kernel threads,
 * if only because they are not used by them anyway.
 */
#define CLONE_KERNEL	(CLONE_FS | CLONE_FILES | CLONE_SIGHAND)
// kernel/fork.c/** Ok, this is the main fork-routine.** It copies the process, and if successful kick-starts* it and waits for it to finish using the VM if required.*/long do_fork(unsigned long clone_flags,unsigned long stack_start,struct pt_regs *regs,unsigned long stack_size,int __user *parent_tidptr,int __user *child_tidptr){struct task_struct *p;int trace = 0;long pid = alloc_pidmap();//通过查找pidmap_array位图,为子进程分配新的PIDif (pid < 0)return -EAGAIN;/* 检查子进程是否要跟踪*/if (unlikely(current->ptrace)) {trace = fork_traceflag (clone_flags);if (trace)clone_flags |= CLONE_PTRACE;}/* 核心!复制父进程的task_struct,并申请了内核栈和thread_info */p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr, pid);/** Do this prior waking up the new thread - the thread pointer* might get invalid after that point, if the thread exits quickly.*/if (!IS_ERR(p)) {struct completion vfork;if (clone_flags & CLONE_VFORK) {p->vfork_done = &vfork;init_completion(&vfork);}/* 假设设置了CLONE_STOPPED标志。或要跟踪子进程,那么子进程被设置成TASK_STOPPED,并为子进程添加挂起的SIGSTOP信号。在还有一进程把子进程的状态恢复为TASK_RUNNING之前(一般是SIGCONT信号)。子进程不得执行*/if ((p->ptrace & PT_PTRACED) || (clone_flags & CLONE_STOPPED)) {/** We‘ll start up with an immediate SIGSTOP.*/sigaddset(&p->pending.signal, SIGSTOP);set_tsk_thread_flag(p, TIF_SIGPENDING);}/* 唤醒子进程,1)若父子进程在同一cpu且不能共享页表(CLONE_VM=0),则在执行队列中,把子进程插入在父进程前面,以避免不必要的写时复制开销;2)不同cpu或CLONE_VM=1,把子进程插入现在成执行队列的队尾 */if (!(clone_flags & CLONE_STOPPED))wake_up_new_task(p, clone_flags);elsep->state = TASK_STOPPED;/* 假设父进程被跟踪,则把子进程pid保存,以使祖父进程(debugger)获取 */if (unlikely (trace)) {current->ptrace_message = pid;ptrace_notify ((trace << 8) | SIGTRAP);}/* vfrok要求父进程挂起,直到子进程结束或执行新的程序 */if (clone_flags & CLONE_VFORK) {wait_for_completion(&vfork);if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE))ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP);}} else {free_pidmap(pid);pid = PTR_ERR(p);}return pid;}
如下所示:
/*
 * fork() demonstration: shows which data the child gets its own copy of.
 *
 * The child increments both a global and a stack variable; the parent
 * sleeps and then prints its own, unchanged, values.  "before fork" is
 * deliberately NOT flushed before fork(): when stdout is fully buffered
 * (e.g. redirected to a file) the pending text is duplicated into the
 * child and appears twice.
 */
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <unistd.h>

int glob = 6;				/* global in initialized data */
char buf[] = "a write to stdout\n";

int main(void)
{
	int var;			/* automatic variable on the stack */
	pid_t pid;

	var = 88;

	/* write() is unbuffered, so this text is emitted exactly once. */
	if (write(STDOUT_FILENO, buf, sizeof(buf) - 1) != (ssize_t)(sizeof(buf) - 1)) {
		/* stderr, so the message cannot sit in the stdout buffer across fork() */
		fprintf(stderr, "write error\n");
		exit(EXIT_FAILURE);
	}

	printf("before fork\n");	/* intentionally not flushed */

	pid = fork();
	if (pid < 0) {
		/* No child exists: stop here instead of reporting as the parent. */
		fprintf(stderr, "fork error\n");
		exit(EXIT_FAILURE);
	} else if (pid == 0) {		/* child */
		glob++;
		var++;
	} else {			/* parent */
		sleep(2);
	}

	/* pid_t has no printf conversion of its own; widen to long. */
	printf("pid = %ld, glob = %d, var = %d\n", (long)getpid(), glob, var);
	exit(0);
}
fork时,父进程数据空间拷贝到子进程中,缓冲区也随之被拷贝到子进程中。
(详见《UNIX环境高级编程》)
从用户态看来,子进程继承了父进程的(有些需要结合《UNIX环境高级编程》上下文才能看懂):
打开文件
实际用户ID、实际组ID、有效用户ID、有效组ID
附加组ID
进程组ID
会话ID
控制终端
设置用户ID标志和设置组ID标志
当前工作目录
根目录
文件模式创建屏蔽字
信号屏蔽和安排
针对任一打开文件描述符的执行时关闭(close-on-exec)标志
环境
连接的共享存储段
存储映射
资源限制
fork的返回值
进程ID不同
父进程ID
子进程的tms_utime、tms_stime、tms_cutime以及tms_cstime均被设置为0
父进程设置的文件锁不会被子进程继承
子进程的未处理的闹钟被清除
子进程的未处理信号集设置为空集
// init/main.c/** Activate the first processor.*/asmlinkage void __init start_kernel(void){char * command_line;extern struct kernel_param __start___param[], __stop___param[];/** Interrupts are still disabled. Do necessary setups, then* enable them*/lock_kernel();page_address_init();printk(linux_banner);setup_arch(&command_line);setup_per_cpu_areas();/** Mark the boot cpu "online" so that it can call console drivers in* printk() and can access its per-cpu storage.*/smp_prepare_boot_cpu();/** Set up the scheduler prior starting any interrupts (such as the* timer interrupt). Full topology setup happens at smp_init()* time - but meanwhile we still have a functioning scheduler.*/sched_init();/** Disable preemption - early bootup scheduling is extremely* fragile until we cpu_idle() for the first time.*/preempt_disable();build_all_zonelists();page_alloc_init();//初始化伙伴系统printk("Kernel command line: %s\n", saved_command_line);parse_early_param();parse_args("Booting kernel", command_line, __start___param,__stop___param - __start___param,&unknown_bootoption);sort_main_extable();trap_init();rcu_init();init_IRQ();pidhash_init();init_timers();softirq_init();time_init();/** HACK ALERT! This is early. We‘re enabling the console before* we‘ve done PCI setups etc, and console_init() must be aware of* this. 
But we do want output early, in case something goes wrong.*/console_init();if (panic_later)panic(panic_later, panic_param);profile_init();local_irq_enable();#ifdef CONFIG_BLK_DEV_INITRDif (initrd_start && !initrd_below_start_ok &&initrd_start < min_low_pfn << PAGE_SHIFT) {printk(KERN_CRIT "initrd overwritten (0x%08lx < 0x%08lx) - ""disabling it.\n",initrd_start,min_low_pfn << PAGE_SHIFT);initrd_start = 0;}#endifvfs_caches_init_early();mem_init();kmem_cache_init();//初始化slabnuma_policy_init();if (late_time_init)late_time_init();calibrate_delay();//确定cpu时钟速度pidmap_init();pgtable_cache_init();prio_tree_init();anon_vma_init();#ifdef CONFIG_X86if (efi_enabled)efi_enter_virtual_mode();#endiffork_init(num_physpages);proc_caches_init();buffer_init();unnamed_dev_init();security_init();vfs_caches_init(num_physpages);radix_tree_init();signals_init();/* rootfs populating might need page-writeback */page_writeback_init();#ifdef CONFIG_PROC_FSproc_root_init();#endifcheck_bugs();acpi_early_init(); /* before LAPIC and SMP init *//* Do the rest non-__init‘ed, we‘re now alive */rest_init();//继续,后面会创建1号init进程,最后cpu_idle(),用以cpu没进程运行时替补}/** We need to finalize in a non-__init function or else race conditions* between the root thread and the init thread may cause start_kernel to* be reaped by free_initmem before the root thread has proceeded to* cpu_idle.** gcc-3.4 accidentally inlines this function, so use noinline.*/static void noinline rest_init(void)__releases(kernel_lock){kernel_thread(init, NULL, CLONE_FS | CLONE_SIGHAND);numa_default_policy();unlock_kernel();preempt_enable_no_resched();cpu_idle();}
static int init(void * unused){lock_kernel();/** Tell the world that we‘re going to be the grim* reaper of innocent orphaned children.** We don‘t want people to have to make incorrect* assumptions about where in the task array this* can be found.*/child_reaper = current;/* Sets up cpus_possible() */smp_prepare_cpus(max_cpus); /* 这里创建其它0号进程 */do_pre_smp_initcalls();fixup_cpu_present_map();smp_init();sched_init_smp();/** Do this before initcalls, because some drivers want to access* firmware files.*/populate_rootfs();do_basic_setup();/** check if there is an early userspace init. If yes, let it do all* the work*/if (sys_access((const char __user *) "/init", 0) == 0)execute_command = "/init";elseprepare_namespace();/** Ok, we have completed the initial bootup, and* we‘re essentially up and running. Get rid of the* initmem segments and start the user-mode stuff..*/free_initmem();unlock_kernel();system_state = SYSTEM_RUNNING;numa_default_policy();if (sys_open((const char __user *) "/dev/console", O_RDWR, 0) < 0)printk("Warning: unable to open an initial console.\n");(void) sys_dup(0);(void) sys_dup(0);/** We try each of these until one succeeds.** The Bourne shell can be used instead of init if we are* trying to recover a really broken machine.*/if (execute_command)run_init_process(execute_command);run_init_process("/sbin/init");run_init_process("/etc/init");run_init_process("/bin/init");run_init_process("/bin/sh");panic("No init found. Try passing init= option to kernel.");}
一些内核线程的样例:
进程终止有8种方式:
两个进程终止的系统调用:
c库函数exit()基于此系统调用
// kernel/exit.cfastcall NORET_TYPE void do_exit(long code){struct task_struct *tsk = current;int group_dead;profile_task_exit(tsk);if (unlikely(in_interrupt()))panic("Aiee, killing interrupt handler!");if (unlikely(!tsk->pid))panic("Attempted to kill the idle task!");if (unlikely(tsk->pid == 1))panic("Attempted to kill init!");if (tsk->io_context)exit_io_context();if (unlikely(current->ptrace & PT_TRACE_EXIT)) {current->ptrace_message = code;ptrace_notify((PTRACE_EVENT_EXIT << 8) | SIGTRAP);}/* 更新状态,进程正在退出*/tsk->flags |= PF_EXITING;del_timer_sync(&tsk->real_timer);if (unlikely(in_atomic()))printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",current->comm, current->pid,preempt_count());acct_update_integrals();update_mem_hiwater();group_dead = atomic_dec_and_test(&tsk->signal->live);if (group_dead)acct_process(code);/* 解除对内存。信号量,文件系统。打开文件,命名空间等的引用。非共享则删除 */exit_mm(tsk);exit_sem(tsk);__exit_files(tsk);__exit_fs(tsk);exit_namespace(tsk);exit_thread();exit_keys(tsk);if (group_dead && tsk->signal->leader)disassociate_ctty(1);module_put(tsk->thread_info->exec_domain->module);if (tsk->binfmt)module_put(tsk->binfmt->module);/* exit_code,系统调用參数(正常终止)或内核提供的错误码(异常终止)*/tsk->exit_code = code;/* 更新亲属关系,子进程将被兄弟进程或init收养* 是否须要向父进程发送SIGCHLD信号* release_task()回收进程其它数据结构占用的内存* 进程EXIT_DEAD或EXIT_ZOMBIE*/exit_notify(tsk);#ifdef CONFIG_NUMAmpol_free(tsk->mempolicy);tsk->mempolicy = NULL;#endifBUG_ON(!(current->flags & PF_DEAD));/* 进程调度,一去不回 */schedule();BUG();/* Avoid "noreturn function does return". */for (;;) ;}
这里仅涉及内核如何完成进程切换,而不涉及调度机制和算法策略。也就是说,这里假定调度程序已经选好了合适的进程,只讨论如何换下旧进程、装上新进程。
因此cpu寄存器的保存和恢复是进程切换的重要内容。
在执行进程切换之前,用户态进程使用的全部寄存器内容都已保存在内核态堆栈上。
进程切换由两步组成:
这些资源包括:
/* include/asm-generic/resource.h */

/*
 * Resource limit IDs: each value indexes one per-process resource limit.
 * RLIM_NLIMITS is the number of IDs defined here.
 */
#define RLIMIT_CPU		0	/* CPU time in sec */
#define RLIMIT_FSIZE		1	/* Maximum filesize */
#define RLIMIT_DATA		2	/* max data size */
#define RLIMIT_STACK		3	/* max stack size */
#define RLIMIT_CORE		4	/* max core file size */
#define RLIMIT_RSS		5	/* max resident set size */
#define RLIMIT_NPROC		6	/* max number of processes */
#define RLIMIT_NOFILE		7	/* max number of open files */
#define RLIMIT_MEMLOCK		8	/* max locked-in-memory address space */
#define RLIMIT_AS		9	/* address space limit */
#define RLIMIT_LOCKS		10	/* maximum file locks held */
#define RLIMIT_SIGPENDING	11	/* max number of pending signals */
#define RLIMIT_MSGQUEUE		12	/* maximum bytes in POSIX mqueues */

#define RLIM_NLIMITS		13
/* include/linux/resource.h */

/*
 * One resource limit expressed as a pair of values.
 * NOTE(review): presumably rlim_cur is the current (soft) limit and
 * rlim_max the ceiling it may be raised to - confirm against getrlimit(2).
 */
struct rlimit {
	unsigned long rlim_cur;
	unsigned long rlim_max;
};
原文:http://www.cnblogs.com/cxchanpin/p/6915152.html