Linux Source Code Reading Notes 02: Process Principles and System Calls

Author: admin · Published: 2024-06-17

Processes and the Process Lifecycle

  • Process: a program that is currently running on the computer. A process is not the basic unit of execution; it is a container for threads. A program is a description of instructions, data, and their organization, while a process is the actual running instance of a program.
  • The Linux kernel refers to a process as a task. A process's virtual address space is divided into a user virtual address space and a kernel virtual address space; all processes share the kernel virtual address space, and each process has its own independent user virtual address space.

Two Special Forms of Processes

  1. A process without a user virtual address space is called a kernel thread.
  2. Processes that share a user virtual address space are called user threads.
  3. All user threads that share the same user virtual address space form a thread group.
| C standard library | Linux kernel |
| --- | --- |
| A process containing multiple threads | Thread group |
| A process containing a single thread | Process, or task |
| Thread | A process that shares the user virtual address space |
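The mapping in the table above can be observed from user space: threads created with pthread_create() share one user virtual address space, so they all report the same PID (the thread group ID) while each has its own task ID inside the kernel. A minimal sketch, assuming glibc 2.30+ for the gettid() wrapper (older systems can use syscall(SYS_gettid)); compile with -pthread:

#define _GNU_SOURCE
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

/* Each thread prints the shared PID (thread group ID) and its own TID. */
static void *worker(void *arg)
{
	printf("worker: pid=%d tid=%d\n", getpid(), gettid());
	return NULL;
}

int main(void)
{
	pthread_t t;

	printf("main:   pid=%d tid=%d\n", getpid(), gettid());
	pthread_create(&t, NULL, worker, NULL);	/* a new task in the same thread group */
	pthread_join(t, NULL);
	return 0;
}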


Linux uses the ps command to display the state of the processes currently on the system. It shows a snapshot of process state, not a continuously updating view; for continuous monitoring, use the top command (a short C sketch that reads the same state from /proc follows the field list below). The main fields of the ps output are:

  • USER: the user who owns the process
  • PID: the process ID
  • VSZ: the virtual memory used by the process (KB)
  • RSS: the resident (physical) memory used by the process
  • TTY: the terminal on which the process is running
  • STAT: the current state of the process
    • S: sleeping, but can be woken up
    • R: running
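The state shown in the STAT column ultimately comes from the kernel, exported through /proc. As a rough illustration (not how ps is actually implemented), the following sketch reads the State: line from /proc/<pid>/status, assuming the standard procfs layout:

#include <stdio.h>
#include <string.h>

/* Print the "State:" line of /proc/<pid>/status, e.g. "State:  S (sleeping)". */
int main(int argc, char **argv)
{
	char path[64], line[256];
	const char *pid = (argc > 1) ? argv[1] : "self";
	FILE *f;

	snprintf(path, sizeof(path), "/proc/%s/status", pid);
	f = fopen(path, "r");
	if (!f) {
		perror("fopen");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		if (strncmp(line, "State:", 6) == 0) {
			fputs(line, stdout);
			break;
		}
	}
	fclose(f);
	return 0;
}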

The Process Lifecycle

  • Process states

    • Created state
    • Ready state
    • Running state
    • Blocked state
    • Terminated state


    • The Linux kernel provides APIs for setting the process state (a typical sleep/wake sketch follows this list):
      • TASK_RUNNING: running or ready; in the kernel this single value covers both the running state and the ready state;
      • TASK_INTERRUPTIBLE: interruptible sleep (also called light sleep); when the condition the process is blocked on is satisfied, or a signal arrives, the kernel changes the state back to TASK_RUNNING and puts the process on the run queue;
      • TASK_UNINTERRUPTIBLE: uninterruptible sleep (also called deep sleep); the process sleeps without being disturbed by signals. Processes marked D in ps output are in this state;
      • __TASK_STOPPED: stopped state (for example, stopped by SIGSTOP);
      • EXIT_ZOMBIE: zombie state
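Inside the kernel these states are usually set just before a task gives up the CPU. The following is a minimal sketch of the common wait pattern, assuming kernel-module context; the wait queue and condition names are illustrative, not taken from any particular driver:

#include <linux/sched.h>
#include <linux/wait.h>

static DECLARE_WAIT_QUEUE_HEAD(demo_waitq);	/* statically initialized wait queue */
static int demo_condition;			/* set by another context, which then calls wake_up(&demo_waitq) */

static void wait_for_condition(void)
{
	DEFINE_WAIT(wait);

	while (!demo_condition) {
		/* Mark ourselves TASK_INTERRUPTIBLE (light sleep) and enqueue on the wait queue. */
		prepare_to_wait(&demo_waitq, &wait, TASK_INTERRUPTIBLE);
		if (!demo_condition)
			schedule();	/* actually sleep until woken up */
		finish_wait(&demo_waitq, &wait);
	}
}

Equivalently, the wait_event_interruptible() helper wraps this whole loop.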

Analysis of the task_struct Data Structure

The kernel abstracts a process into a process control block (PCB, Process Control Block); the Linux kernel describes the process control block with the task_struct structure.

Analysis of the core members of the Linux kernel process descriptor (control block) task_struct:

// Process control block
struct task_struct {
#ifdef CONFIG_THREAD_INFO_IN_TASK
/*
* For reasons of header soup (see current_thread_info()), this
* must be the first element of task_struct.
*/
struct thread_info		thread_info;
#endif
/* -1 unrunnable, 0 runnable, >0 stopped: */
volatile long			state; // Process state flag
/*
* This begins the randomizable portion of task_struct. Only
* scheduling-critical items should be added above here.
*/
randomized_struct_fields_start
void				*stack; // Points to the kernel stack
refcount_t			usage;
/* Per task flags (PF_*), defined further below: */
unsigned int			flags;
unsigned int			ptrace;
#ifdef CONFIG_SMP
struct llist_node		wake_entry;
int				on_cpu;
#ifdef CONFIG_THREAD_INFO_IN_TASK
/* Current CPU: */
unsigned int			cpu;
#endif
unsigned int			wakee_flips;
unsigned long			wakee_flip_decay_ts;
struct task_struct		*last_wakee;
/*
* recent_used_cpu is initially set as the last CPU used by a task
* that wakes affine another task. Waker/wakee relationships can
* push tasks around a CPU where each wakeup moves to the next one.
* Tracking a recently used CPU allows a quick search for a recently
* used CPU that may be idle.
*/
int				recent_used_cpu;
int				wake_cpu;
#endif
int				on_rq;
/* Scheduling policy and priorities */
int				prio;
int				static_prio;
int				normal_prio;
unsigned int			rt_priority;
const struct sched_class	*sched_class;
struct sched_entity		se;
struct sched_rt_entity		rt;
#ifdef CONFIG_CGROUP_SCHED
struct task_group		*sched_task_group;
#endif
struct sched_dl_entity		dl;
#ifdef CONFIG_UCLAMP_TASK
/* Clamp values requested for a scheduling entity */
struct uclamp_se		uclamp_req[UCLAMP_CNT];
/* Effective clamp values used for a scheduling entity */
struct uclamp_se		uclamp[UCLAMP_CNT];
#endif
#ifdef CONFIG_PREEMPT_NOTIFIERS
/* List of struct preempt_notifier: */
struct hlist_head		preempt_notifiers;
#endif
#ifdef CONFIG_BLK_DEV_IO_TRACE
unsigned int			btrace_seq;
#endif
unsigned int			policy;
int				nr_cpus_allowed; // Number of CPUs the task is allowed to run on
const cpumask_t			*cpus_ptr; // Mask of the CPUs this process is allowed to run on
cpumask_t			cpus_mask;
#ifdef CONFIG_PREEMPT_RCU
int				rcu_read_lock_nesting;
union rcu_special		rcu_read_unlock_special;
struct list_head		rcu_node_entry;
struct rcu_node			*rcu_blocked_node;
#endif /* #ifdef CONFIG_PREEMPT_RCU */
#ifdef CONFIG_TASKS_RCU
unsigned long			rcu_tasks_nvcsw;
u8				rcu_tasks_holdout;
u8				rcu_tasks_idx;
int				rcu_tasks_idle_cpu;
struct list_head		rcu_tasks_holdout_list;
#endif /* #ifdef CONFIG_TASKS_RCU */
struct sched_info		sched_info;
struct list_head		tasks;
#ifdef CONFIG_SMP
struct plist_node		pushable_tasks;
struct rb_node			pushable_dl_tasks;
#endif
// Points to the memory descriptor. For a normal process, mm and active_mm point to the same memory descriptor; for a kernel thread, mm is a null pointer.
// While a kernel thread is running, active_mm points to a memory descriptor borrowed from the previous process.
struct mm_struct		*mm;
struct mm_struct		*active_mm; 
/* Per-thread vma caching: */
struct vmacache			vmacache;
#ifdef SPLIT_RSS_COUNTING
struct task_rss_stat		rss_stat;
#endif
int				exit_state;
int				exit_code;
int				exit_signal;
/* The signal sent when the parent dies: */
int				pdeath_signal;
/* JOBCTL_*, siglock protected: */
unsigned long			jobctl;
/* Used for emulating ABI behavior of previous Linux versions: */
unsigned int			personality;
/* Scheduler bits, serialized by scheduler locks: */
unsigned			sched_reset_on_fork:1;
unsigned			sched_contributes_to_load:1;
unsigned			sched_migrated:1;
unsigned			sched_remote_wakeup:1;
#ifdef CONFIG_PSI
unsigned			sched_psi_wake_requeue:1;
#endif
/* Force alignment to the next boundary: */
unsigned			:0;
/* Unserialized, strictly 'current' */
/* Bit to tell LSMs we're in execve(): */
unsigned			in_execve:1;
unsigned			in_iowait:1;
#ifndef TIF_RESTORE_SIGMASK
unsigned			restore_sigmask:1;
#endif
#ifdef CONFIG_MEMCG
unsigned			in_user_fault:1;
#endif
#ifdef CONFIG_COMPAT_BRK
unsigned			brk_randomized:1;
#endif
#ifdef CONFIG_CGROUPS
/* disallow userland-initiated cgroup migration */
unsigned			no_cgroup_migration:1;
/* task is frozen/stopped (used by the cgroup freezer) */
unsigned			frozen:1;
#endif
#ifdef CONFIG_BLK_CGROUP
/* to be used once the psi infrastructure lands upstream. */
unsigned			use_memdelay:1;
#endif
unsigned long			atomic_flags; /* Flags requiring atomic access. */
struct restart_block		restart_block;
pid_t				pid; // Global process ID
pid_t				tgid; // Global thread group ID
#ifdef CONFIG_STACKPROTECTOR
/* Canary value for the -fstack-protector GCC feature: */
unsigned long			stack_canary;
#endif
/*
* Pointers to the (original) parent process, youngest child, younger sibling,
* older sibling, respectively.  (p->father can be replaced with
* p->real_parent->pid)
*/
/* Real parent process: */
struct task_struct __rcu	*real_parent; // Points to the real parent process
/* Recipient of SIGCHLD, wait4() reports: */
struct task_struct __rcu	*parent; // Points to the parent; if this process is being traced, this is the tracing process, otherwise it is the same as real_parent
/*
* Children/sibling form the list of natural children:
*/
struct list_head		children;
struct list_head		sibling;
struct task_struct		*group_leader; // Points to the thread group leader
/*
* 'ptraced' is the list of tasks this task is using ptrace() on.
*
* This includes both natural children and PTRACE_ATTACH targets.
* 'ptrace_entry' is this task's link on the p->parent->ptraced list.
*/
struct list_head		ptraced;
struct list_head		ptrace_entry;
/* PID/PID hash table linkage. */
struct pid			*thread_pid;
struct hlist_node		pid_links[PIDTYPE_MAX]; // Hash-list links for the process ID, process group ID, and session ID
struct list_head		thread_group;
struct list_head		thread_node;
struct completion		*vfork_done;
/* CLONE_CHILD_SETTID: */
int __user			*set_child_tid;
/* CLONE_CHILD_CLEARTID: */
int __user			*clear_child_tid;
u64				utime;
u64				stime;
#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
u64				utimescaled;
u64				stimescaled;
#endif
u64				gtime;
struct prev_cputime		prev_cputime;
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
struct vtime			vtime;
#endif
#ifdef CONFIG_NO_HZ_FULL
atomic_t			tick_dep_mask;
#endif
/* Context switch counts: */
unsigned long			nvcsw;
unsigned long			nivcsw;
/* Monotonic time in nsecs: */
u64				start_time;
/* Boot based time in nsecs: */
u64				start_boottime;
/* MM fault and swap info: this can arguably be seen as either mm-specific or thread-specific: */
unsigned long			min_flt;
unsigned long			maj_flt;
/* Empty if CONFIG_POSIX_CPUTIMERS=n */
struct posix_cputimers		posix_cputimers;
/* Process credentials: */
/* Tracer's credentials at attach: */
const struct cred __rcu		*ptracer_cred;
/* Objective and real subjective task credentials (COW): */
const struct cred __rcu		*real_cred; // Points to the objective and real subjective credentials
/* Effective (overridable) subjective task credentials (COW): */
const struct cred __rcu		*cred; // Points to the effective subjective credentials; can be changed temporarily
#ifdef CONFIG_KEYS
/* Cached requested key. */
struct key			*cached_requested_key;
#endif
/*
* executable name, excluding path.
*
* - normally initialized setup_new_exec()
* - access it with [gs]et_task_comm()
* - lock it with task_lock()
*/
char				comm[TASK_COMM_LEN]; // Process name
struct nameidata		*nameidata;
// The next two members are used for System V IPC: semaphores and shared memory
#ifdef CONFIG_SYSVIPC
struct sysv_sem			sysvsem;
struct sysv_shm			sysvshm;
#endif
#ifdef CONFIG_DETECT_HUNG_TASK
unsigned long			last_switch_count;
unsigned long			last_switch_time;
#endif
/* Filesystem information: */
struct fs_struct		*fs; // Filesystem information, mainly the process's root directory and current working directory
/* Open file information: */
struct files_struct		*files; // Open file table
/* Namespaces: */
struct nsproxy			*nsproxy; // Namespaces
// The following members are used for signal handling
/* Signal handlers: */
struct signal_struct		*signal;
struct sighand_struct __rcu		*sighand;
sigset_t			blocked;
sigset_t			real_blocked;
/* Restored if set_restore_sigmask() was used: */
sigset_t			saved_sigmask;
struct sigpending		pending;
unsigned long			sas_ss_sp;
size_t				sas_ss_size;
unsigned int			sas_ss_flags;
struct callback_head		*task_works;
#ifdef CONFIG_AUDIT
#ifdef CONFIG_AUDITSYSCALL
struct audit_context		*audit_context;
#endif
kuid_t				loginuid;
unsigned int			sessionid;
#endif
struct seccomp			seccomp;
/* Thread group tracking: */
u64				parent_exec_id;
u64				self_exec_id;
/* Protection against (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed, mempolicy: */
spinlock_t			alloc_lock;
/* Protection of the PI data structures: */
raw_spinlock_t			pi_lock;
struct wake_q_node		wake_q;
#ifdef CONFIG_RT_MUTEXES
/* PI waiters blocked on a rt_mutex held by this task: */
struct rb_root_cached		pi_waiters;
/* Updated under owner's pi_lock and rq lock */
struct task_struct		*pi_top_task;
/* Deadlock detection and priority inheritance handling: */
struct rt_mutex_waiter		*pi_blocked_on;
#endif
#ifdef CONFIG_DEBUG_MUTEXES
/* Mutex deadlock detection: */
struct mutex_waiter		*blocked_on;
#endif
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
int				non_block_count;
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
unsigned int			irq_events;
unsigned long			hardirq_enable_ip;
unsigned long			hardirq_disable_ip;
unsigned int			hardirq_enable_event;
unsigned int			hardirq_disable_event;
int				hardirqs_enabled;
int				hardirq_context;
unsigned long			softirq_disable_ip;
unsigned long			softirq_enable_ip;
unsigned int			softirq_disable_event;
unsigned int			softirq_enable_event;
int				softirqs_enabled;
int				softirq_context;
#endif
#ifdef CONFIG_LOCKDEP
# define MAX_LOCK_DEPTH			48UL
u64				curr_chain_key;
int				lockdep_depth;
unsigned int			lockdep_recursion;
struct held_lock		held_locks[MAX_LOCK_DEPTH];
#endif
#ifdef CONFIG_UBSAN
unsigned int			in_ubsan;
#endif
/* Journalling filesystem info: */
void				*journal_info;
/* Stacked block device info: */
struct bio_list			*bio_list;
#ifdef CONFIG_BLOCK
/* Stack plugging: */
struct blk_plug			*plug;
#endif
/* VM state: */
struct reclaim_state		*reclaim_state;
struct backing_dev_info		*backing_dev_info;
struct io_context		*io_context;
#ifdef CONFIG_COMPACTION
struct capture_control		*capture_control;
#endif
/* Ptrace state: */
unsigned long			ptrace_message;
kernel_siginfo_t		*last_siginfo;
struct task_io_accounting	ioac;
#ifdef CONFIG_PSI
/* Pressure stall state */
unsigned int			psi_flags;
#endif
#ifdef CONFIG_TASK_XACCT
/* Accumulated RSS usage: */
u64				acct_rss_mem1;
/* Accumulated virtual memory usage: */
u64				acct_vm_mem1;
/* stime + utime since last update: */
u64				acct_timexpd;
#endif
#ifdef CONFIG_CPUSETS
/* Protected by ->alloc_lock: */
nodemask_t			mems_allowed;
/* Seqence number to catch updates: */
seqcount_t			mems_allowed_seq;
int				cpuset_mem_spread_rotor;
int				cpuset_slab_spread_rotor;
#endif
#ifdef CONFIG_CGROUPS
/* Control Group info protected by css_set_lock: */
struct css_set __rcu		*cgroups;
/* cg_list protected by css_set_lock and tsk->alloc_lock: */
struct list_head		cg_list;
#endif
#ifdef CONFIG_X86_CPU_RESCTRL
u32				closid;
u32				rmid;
#endif
#ifdef CONFIG_FUTEX
struct robust_list_head __user	*robust_list;
#ifdef CONFIG_COMPAT
struct compat_robust_list_head __user *compat_robust_list;
#endif
struct list_head		pi_state_list;
struct futex_pi_state		*pi_state_cache;
struct mutex			futex_exit_mutex;
unsigned int			futex_state;
#endif
#ifdef CONFIG_PERF_EVENTS
struct perf_event_context	*perf_event_ctxp[perf_nr_task_contexts];
struct mutex			perf_event_mutex;
struct list_head		perf_event_list;
#endif
#ifdef CONFIG_DEBUG_PREEMPT
unsigned long			preempt_disable_ip;
#endif
#ifdef CONFIG_NUMA
/* Protected by alloc_lock: */
struct mempolicy		*mempolicy;
short				il_prev;
short				pref_node_fork;
#endif
#ifdef CONFIG_NUMA_BALANCING
int				numa_scan_seq;
unsigned int			numa_scan_period;
unsigned int			numa_scan_period_max;
int				numa_preferred_nid;
unsigned long			numa_migrate_retry;
/* Migration stamp: */
u64				node_stamp;
u64				last_task_numa_placement;
u64				last_sum_exec_runtime;
struct callback_head		numa_work;
/*
* This pointer is only modified for current in syscall and
* pagefault context (and for tasks being destroyed), so it can be read
* from any of the following contexts:
*  - RCU read-side critical section
*  - current->numa_group from everywhere
*  - task's runqueue locked, task not running
*/
struct numa_group __rcu		*numa_group;
/*
* numa_faults is an array split into four regions:
* faults_memory, faults_cpu, faults_memory_buffer, faults_cpu_buffer
* in this precise order.
*
* faults_memory: Exponential decaying average of faults on a per-node
* basis. Scheduling placement decisions are made based on these
* counts. The values remain static for the duration of a PTE scan.
* faults_cpu: Track the nodes the process was running on when a NUMA
* hinting fault was incurred.
* faults_memory_buffer and faults_cpu_buffer: Record faults per node
* during the current scan window. When the scan completes, the counts
* in faults_memory and faults_cpu decay and these values are copied.
*/
unsigned long			*numa_faults;
unsigned long			total_numa_faults;
/*
* numa_faults_locality tracks if faults recorded during the last
* scan window were remote/local or failed to migrate. The task scan
* period is adapted based on the locality of the faults with different
* weights depending on whether they were shared or private faults
*/
unsigned long			numa_faults_locality[3];
unsigned long			numa_pages_migrated;
#endif /* CONFIG_NUMA_BALANCING */
#ifdef CONFIG_RSEQ
struct rseq __user *rseq;
u32 rseq_sig;
/*
* RmW on rseq_event_mask must be performed atomically
* with respect to preemption.
*/
unsigned long rseq_event_mask;
#endif
struct tlbflush_unmap_batch	tlb_ubc;
union {
refcount_t		rcu_users;
struct rcu_head		rcu;
};
/* Cache last used pipe for splice(): */
struct pipe_inode_info		*splice_pipe;
struct page_frag		task_frag;
#ifdef CONFIG_TASK_DELAY_ACCT
struct task_delay_info		*delays;
#endif
#ifdef CONFIG_FAULT_INJECTION
int				make_it_fail;
unsigned int			fail_nth;
#endif
/*
* When (nr_dirtied >= nr_dirtied_pause), it's time to call
* balance_dirty_pages() for a dirty throttling pause:
*/
int				nr_dirtied;
int				nr_dirtied_pause;
/* Start of a write-and-pause period: */
unsigned long			dirty_paused_when;
#ifdef CONFIG_LATENCYTOP
int				latency_record_count;
struct latency_record		latency_record[LT_SAVECOUNT];
#endif
/*
* Time slack values; these are used to round up poll() and
* select() etc timeout values. These are in nanoseconds.
*/
u64				timer_slack_ns;
u64				default_timer_slack_ns;
#ifdef CONFIG_KASAN
unsigned int			kasan_depth;
#endif
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
/* Index of current stored address in ret_stack: */
int				curr_ret_stack;
int				curr_ret_depth;
/* Stack of return addresses for return function tracing: */
struct ftrace_ret_stack		*ret_stack;
/* Timestamp for last schedule: */
unsigned long long		ftrace_timestamp;
/*
* Number of functions that haven't been traced
* because of depth overrun:
*/
atomic_t			trace_overrun;
/* Pause tracing: */
atomic_t			tracing_graph_pause;
#endif
#ifdef CONFIG_TRACING
/* State flags for use by tracers: */
unsigned long			trace;
/* Bitmask and counter of trace recursion: */
unsigned long			trace_recursion;
#endif /* CONFIG_TRACING */
#ifdef CONFIG_KCOV
/* See kernel/kcov.c for more details. */
/* Coverage collection mode enabled for this task (0 if disabled): */
unsigned int			kcov_mode;
/* Size of the kcov_area: */
unsigned int			kcov_size;
/* Buffer for coverage collection: */
void				*kcov_area;
/* KCOV descriptor wired with this task or NULL: */
struct kcov			*kcov;
/* KCOV common handle for remote coverage collection: */
u64				kcov_handle;
/* KCOV sequence number: */
int				kcov_sequence;
#endif
#ifdef CONFIG_MEMCG
struct mem_cgroup		*memcg_in_oom;
gfp_t				memcg_oom_gfp_mask;
int				memcg_oom_order;
/* Number of pages to reclaim on returning to userland: */
unsigned int			memcg_nr_pages_over_high;
/* Used by memcontrol for targeted memcg charge: */
struct mem_cgroup		*active_memcg;
#endif
#ifdef CONFIG_BLK_CGROUP
struct request_queue		*throttle_queue;
#endif
#ifdef CONFIG_UPROBES
struct uprobe_task		*utask;
#endif
#if defined(CONFIG_BCACHE) || defined(CONFIG_BCACHE_MODULE)
unsigned int			sequential_io;
unsigned int			sequential_io_avg;
#endif
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
unsigned long			task_state_change;
#endif
int				pagefault_disabled;
#ifdef CONFIG_MMU
struct task_struct		*oom_reaper_list;
#endif
#ifdef CONFIG_VMAP_STACK
struct vm_struct		*stack_vm_area;
#endif
#ifdef CONFIG_THREAD_INFO_IN_TASK
/* A live task holds one reference: */
refcount_t			stack_refcount;
#endif
#ifdef CONFIG_LIVEPATCH
int patch_state;
#endif
#ifdef CONFIG_SECURITY
/* Used by LSM modules for access restriction: */
void				*security;
#endif
#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
unsigned long			lowest_stack;
unsigned long			prev_lowest_stack;
#endif
/*
* New fields for task_struct should be added above here, so that
* they are included in the randomized portion of task_struct.
*/
randomized_struct_fields_end
/* CPU-specific state of this task: */
struct thread_struct		thread;
/*
* WARNING: on x86, 'thread_struct' contains a variable-sized
* structure.  It *MUST* be at the end of 'task_struct'.
*
* Do not put anything below here!
*/
};
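In kernel code the task_struct of the currently running process is reached through the current macro, and the tasks member links every process in the system into one list. A minimal sketch, assuming a loadable-module context and the field names of the kernel version shown above (newer kernels rename state to __state):

#include <linux/module.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/sched/signal.h>

/* Print a few task_struct members of the calling task, then list kernel threads. */
static int __init taskinfo_init(void)
{
	struct task_struct *p;

	pr_info("current: comm=%s pid=%d tgid=%d prio=%d\n",
		current->comm, current->pid, current->tgid, current->prio);

	rcu_read_lock();
	for_each_process(p)		/* walks the tasks list */
		if (p->mm == NULL)	/* kernel threads have no user address space */
			pr_info("kernel thread: %s (pid %d)\n", p->comm, p->pid);
	rcu_read_unlock();

	return 0;
}

static void __exit taskinfo_exit(void) { }

module_init(taskinfo_init);
module_exit(taskinfo_exit);
MODULE_LICENSE("GPL");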

Process Priority and System Calls

Process priority

  • Deadline processes have priority -1;
  • Real-time processes have priorities 1-99; the larger the value, the higher the priority;
  • Normal processes have static priorities 100-139; the smaller the value, the higher the priority. A normal process's priority can be changed by adjusting its nice value: static priority = 120 + nice (see the sketch below).
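The nice-to-priority mapping can be checked directly; the kernel performs the same arithmetic with its NICE_TO_PRIO()/PRIO_TO_NICE() macros. A small userspace sketch:

#include <stdio.h>

/* Static priority of a normal process = 120 + nice, with nice in [-20, 19]. */
static int nice_to_prio(int nice) { return 120 + nice; }

int main(void)
{
	for (int nice = -20; nice <= 19; nice++)
		printf("nice=%3d -> static_prio=%d\n", nice, nice_to_prio(nice));
	return 0;
}

Nice -20 maps to static priority 100 (the highest for a normal process) and nice 19 maps to 139 (the lowest).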


System Calls

When an application calls fork()/vfork()/clone(), it is making a system call. A system call is how an application enters kernel space to have the kernel perform work on its behalf, such as creating a process or doing file I/O. For example:
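A minimal userspace sketch of one such system call, fork(), which asks the kernel to duplicate the calling process:

#include <stdio.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	pid_t pid = fork();	/* system call: the kernel duplicates this process */

	if (pid < 0) {
		perror("fork");
		return 1;
	}
	if (pid == 0) {		/* child */
		printf("child:  pid=%d ppid=%d\n", getpid(), getppid());
		_exit(0);
	}
	printf("parent: pid=%d child=%d\n", getpid(), pid);
	waitpid(pid, NULL, 0);	/* reap the child */
	return 0;
}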


  • How does one study a system call? Take a simple call such as getpid() as an example and look at it from both the kernel side and the user side; a sketch follows below.

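On the kernel side, system calls are defined with the SYSCALL_DEFINEx() macros; getpid, for instance, is roughly SYSCALL_DEFINE0(getpid) { return task_tgid_vnr(current); } in kernel/sys.c. On the user side, the same entry can be reached either through the glibc wrapper or through the raw syscall() function, as in this sketch:

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	long raw = syscall(SYS_getpid);	/* enter the kernel directly by syscall number */
	pid_t wrapped = getpid();	/* glibc wrapper around the same kernel entry */

	printf("syscall(SYS_getpid)=%ld, getpid()=%d\n", raw, (int)wrapped);
	return 0;
}

Running the program under strace also shows each system call as it crosses into the kernel.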

Kernel Threads

A kernel thread is a process that runs entirely inside the kernel. It differs from an ordinary user process in that it has no independent process address space: the mm pointer in its task_struct is NULL. It runs only in kernel space and is often referred to as a daemon thread. Kernel threads are typically used for tasks such as the following (a minimal creation sketch follows this list):

  • periodically synchronizing modified memory pages with the block devices they came from;
  • writing rarely used memory pages out to the swap area;
  • managing deferred actions;
  • implementing filesystem transaction journals.
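A minimal sketch of creating such a kernel thread with the kthread API, assuming a loadable-module context; the thread name and its work are purely illustrative:

#include <linux/delay.h>
#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/module.h>

static struct task_struct *worker;

/* Thread body: loop until someone calls kthread_stop(). */
static int worker_fn(void *data)
{
	while (!kthread_should_stop()) {
		pr_info("kthread-demo: still alive\n");
		ssleep(5);	/* sleep 5 seconds between iterations */
	}
	return 0;
}

static int __init demo_init(void)
{
	worker = kthread_run(worker_fn, NULL, "kthread-demo");
	return IS_ERR(worker) ? PTR_ERR(worker) : 0;
}

static void __exit demo_exit(void)
{
	kthread_stop(worker);	/* ask the thread to exit and wait for it */
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

Such a thread has mm == NULL and appears in ps output with its name in square brackets, like the other kernel threads.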


Process Exit

  • Voluntary termination: returning from the main function (the startup code linked into the program turns the return into an exit system call), or calling the exit() function explicitly.
  • Involuntary termination: being killed by a signal such as SIGKILL, or being terminated because of an exception. (A short sketch showing both paths follows.)
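Both paths can be seen in a few lines. In the sketch below the child is killed involuntarily by SIGKILL, while the parent terminates voluntarily through exit(); the parent also reaps the child with waitpid() so it does not linger in the EXIT_ZOMBIE state described earlier:

#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	pid_t pid = fork();
	int status;

	if (pid == 0) {
		raise(SIGKILL);		/* involuntary: the child is killed by a signal */
		_exit(0);		/* never reached */
	}
	waitpid(pid, &status, 0);	/* reap the child */
	if (WIFSIGNALED(status))
		printf("child killed by signal %d\n", WTERMSIG(status));

	exit(EXIT_SUCCESS);		/* voluntary: explicit exit() */
}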