跳到主要内容

syscall

  • Ring 0 - 内核态
    • 初始化中断或异常处理机制
    • 代码可以直接访问硬件、管理内存和执行CPU的所有指令集
    • 内核和驱动程序
  • Ring 1 & Ring 2 - 很少使用 - 例如 驱动程序
  • Ring 3 - 用户态
    • 不能直接访问硬件或执行某些敏感的CPU指令
    • 通过 系统调用/syscall 访问内核态

  • 汇编指令
    • x86 int 0x80
      • IDT - Interrupt Descriptor Table - 中断描述符表
      • 0x80 - 0x85
      • lidt - load IDT
    • x86-64 syscall
      • MSR - Model Specific Register - 模型特定寄存器
      • rdmsr - read MSR
      • wrmsr - write MSR
    • arm swi 0 - software interrupt
    • aarch64 svc #0 - supervisor call
  • 当 Linux 完成 init 之后，syscall 就是用户态主动进入内核态的唯一方式
    • 设置系统调用表
      • arch/x86/include/generated/uapi/asm/unistd_64.h - 调用号
      • arch/x86/entry/syscalls/syscall_64.tbl
    • 初始化中断或异常处理机制
    • 配置用户空间到内核空间的切换
提示
  • 大多数应用通过 libc 的封装调用 syscall
    • 例如 write, read 而不是 syscall(SYS_write, 1, "Hello, world!\n", 14)
    • 大多动态依赖 libc - 引入环境依赖
    • 不少应用可通过静态链接 musl (一种轻量 libc 实现) 来避免动态依赖系统的 libc (如 glibc)
  • Golang 是直接调用 syscall - https://pkg.go.dev/syscall
    • 不依赖 libc - 跨平台
    • 静态编译
// SYS_write=1
syscall(SYS_write, 1, "Hello, world!\n", 14);
ausyscall 2 # open
ausyscall --dump
N | name | note
0read
1write
2open
3close
4stat
5fstat
6lstat
7poll
8lseek
9mmap
10mprotect
11munmap
12brk
13rt_sigaction
14rt_sigprocmask
15rt_sigreturn
16ioctl
17pread
18pwrite
19readv
20writev
21access
22pipe
23select
24sched_yield
25mremap
26msync
27mincore
28madvise
29shmget
30shmat
31shmctl
32dup
33dup2
34pause
35nanosleep
36getitimer
37alarm
38setitimer
39getpid
40sendfile
41socket
42connect
43accept
44sendto
45recvfrom
46sendmsg
47recvmsg
48shutdown
49bind
50listen
51getsockname
52getpeername
53socketpair
54setsockopt
55getsockopt
56clone
57fork
58vfork
59execve
60exit
61wait4
62kill
63uname
64semget
65semop
66semctl
67shmdt
68msgget
69msgsnd
70msgrcv
71msgctl
72fcntl
73flock
74fsync
75fdatasync
76truncate
77ftruncate
78getdents
79getcwd
80chdir
81fchdir
82rename
83mkdir
84rmdir
85creat
86link
87unlink
88symlink
89readlink
90chmod
91fchmod
92chown
93fchown
94lchown
95umask
96gettimeofday
97getrlimit
98getrusage
99sysinfo
100times
101ptrace
102getuid
103syslog
104getgid
105setuid
106setgid
107geteuid
108getegid
109setpgid
110getppid
111getpgrp
112setsid
113setreuid
114setregid
115getgroups
116setgroups
117setresuid
118getresuid
119setresgid
120getresgid
121getpgid
122setfsuid
123setfsgid
124getsid
125capget
126capset
127rt_sigpending
128rt_sigtimedwait
129rt_sigqueueinfo
130rt_sigsuspend
131sigaltstack
132utime
133mknod
134uselib
135personality
136ustat
137statfs
138fstatfs
139sysfs
140getpriority
141setpriority
142sched_setparam
143sched_getparam
144sched_setscheduler
145sched_getscheduler
146sched_get_priority_max
147sched_get_priority_min
148sched_rr_get_interval
149mlock
150munlock
151mlockall
152munlockall
153vhangup
154modify_ldt
155pivot_root
156_sysctl
157prctl
158arch_prctl
159adjtimex
160setrlimit
161chroot
162sync
163acct
164settimeofday
165mount
166umount2
167swapon
168swapoff
169reboot
170sethostname
171setdomainname
172iopl
173ioperm
174create_module
175init_module
176delete_module
177get_kernel_syms
178query_module
179quotactl
180nfsservctl
181getpmsg
182putpmsg
183afs_syscall
184tuxcall
185security
186gettid
187readahead
188setxattr
189lsetxattr
190fsetxattr
191getxattr
192lgetxattr
193fgetxattr
194listxattr
195llistxattr
196flistxattr
197removexattr
198lremovexattr
199fremovexattr
200tkill
201time
202futex
203sched_setaffinity
204sched_getaffinity
205set_thread_area
206io_setup
207io_destroy
208io_getevents
209io_submit
210io_cancel
211get_thread_area
212lookup_dcookie
213epoll_create
214epoll_ctl_old
215epoll_wait_old
216remap_file_pages
217getdents64
218set_tid_address
219restart_syscall
220semtimedop
221fadvise64
222timer_create
223timer_settime
224timer_gettime
225timer_getoverrun
226timer_delete
227clock_settime
228clock_gettime
229clock_getres
230clock_nanosleep
231exit_group
232epoll_wait
233epoll_ctl
234tgkill
235utimes
236vserver
237mbind
238set_mempolicy
239get_mempolicy
240mq_open
241mq_unlink
242mq_timedsend
243mq_timedreceive
244mq_notify
245mq_getsetattr
246kexec_load
247waitid
248add_key
249request_key
250keyctl
251ioprio_set
252ioprio_get
253inotify_init
254inotify_add_watch
255inotify_rm_watch
256migrate_pages
257openat
258mkdirat
259mknodat
260fchownat
261futimesat
262newfstatat
263unlinkat
264renameat
265linkat
266symlinkat
267readlinkat
268fchmodat
269faccessat
270pselect6
271ppoll
272unshare
273set_robust_list
274get_robust_list
275splice
276tee
277sync_file_range
278vmsplice
279move_pages
280utimensat
281epoll_pwait
282signalfd
283timerfd_create
284eventfd
285fallocate
286timerfd_settime
287timerfd_gettime
288accept4
289signalfd4
290eventfd2
291epoll_create1
292dup3
293pipe2
294inotify_init1
295preadv
296pwritev
297rt_tgsigqueueinfo
298perf_event_open
299recvmmsg
300fanotify_init
301fanotify_mark
302prlimit64
303name_to_handle_at
304open_by_handle_at
305clock_adjtime
306syncfs
307sendmmsg
308setns
309getcpu
310process_vm_readv
311process_vm_writev
312 | kcmp | 比较两个进程是否共享同一内核资源 (注意: 不是 KSM - Kernel Samepage Merging)
313finit_module
314sched_setattr
315sched_getattr
316renameat2
317 | seccomp | Secure Computing Mode
318getrandom
319memfd_create
320kexec_file_load
321bpf
322execveat
323userfaultfd
324membarrier
325mlock2
326copy_file_range
327preadv2
328pwritev2
329pkey_mprotect
330pkey_alloc
331pkey_free
332statx
333io_pgetevents
334rseq
424pidfd_send_signal
425io_uring_setup
426io_uring_enter
427io_uring_register
428open_tree
429move_mount
430fsopen
431fsconfig
432fsmount
433fspick
434pidfd_open
435clone3
436close_range
437openat2
438pidfd_getfd
439faccessat2
440process_madvise
441epoll_pwait2
442mount_setattr
443quotactl_fd
444landlock_create_ruleset
445landlock_add_rule
446landlock_restrict_self
447memfd_secret
448process_mrelease
449futex_waitv
450set_mempolicy_home_node
  • seccomp.2
    • 一种沙箱机制
    • SECCOMP_MODE_FILTER
      • 通过回调函数过滤 syscall
      • 过滤器使用 BPF 语言
  • google/gvisor
    • 拦截 syscall 实现容器隔离

FAQ

BPF vs eBPF

  • BPF - Berkeley Packet Filter
    • 1992年由Steven McCanne和Van Jacobson在BSD上实现
    • 用于过滤网络数据包
  • eBPF - extended Berkeley Packet Filter
    • 2014年由Alexei Starovoitov在Linux内核中实现
    • 不再局限于过滤网络数据包 - 可用于跟踪、网络、安全等场景 (seccomp 过滤 syscall 使用的仍是经典 BPF/cBPF)
    • 提升内核监控和分析能力