process_linux.go 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488
  1. // +build linux
  2. package libcontainer
  3. import (
  4. "encoding/json"
  5. "errors"
  6. "fmt"
  7. "io"
  8. "os"
  9. "os/exec"
  10. "path/filepath"
  11. "strconv"
  12. "syscall"
  13. "github.com/opencontainers/runc/libcontainer/cgroups"
  14. "github.com/opencontainers/runc/libcontainer/configs"
  15. "github.com/opencontainers/runc/libcontainer/system"
  16. "github.com/opencontainers/runc/libcontainer/utils"
  17. )
  18. type parentProcess interface {
  19. // pid returns the pid for the running process.
  20. pid() int
  21. // start starts the process execution.
  22. start() error
  23. // send a SIGKILL to the process and wait for the exit.
  24. terminate() error
  25. // wait waits on the process returning the process state.
  26. wait() (*os.ProcessState, error)
  27. // startTime return's the process start time.
  28. startTime() (string, error)
  29. signal(os.Signal) error
  30. externalDescriptors() []string
  31. setExternalDescriptors(fds []string)
  32. }
  33. type setnsProcess struct {
  34. cmd *exec.Cmd
  35. parentPipe *os.File
  36. childPipe *os.File
  37. cgroupPaths map[string]string
  38. config *initConfig
  39. fds []string
  40. process *Process
  41. bootstrapData io.Reader
  42. }
  43. func (p *setnsProcess) startTime() (string, error) {
  44. return system.GetProcessStartTime(p.pid())
  45. }
  46. func (p *setnsProcess) signal(sig os.Signal) error {
  47. s, ok := sig.(syscall.Signal)
  48. if !ok {
  49. return errors.New("os: unsupported signal type")
  50. }
  51. return syscall.Kill(p.pid(), s)
  52. }
  53. func (p *setnsProcess) start() (err error) {
  54. defer p.parentPipe.Close()
  55. err = p.cmd.Start()
  56. p.childPipe.Close()
  57. if err != nil {
  58. return newSystemError(err)
  59. }
  60. if p.bootstrapData != nil {
  61. if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil {
  62. return newSystemError(err)
  63. }
  64. }
  65. if err = p.execSetns(); err != nil {
  66. return newSystemError(err)
  67. }
  68. if len(p.cgroupPaths) > 0 {
  69. if err := cgroups.EnterPid(p.cgroupPaths, p.pid()); err != nil {
  70. return newSystemError(err)
  71. }
  72. }
  73. // set oom_score_adj
  74. if err := setOomScoreAdj(p.config.Config.OomScoreAdj, p.pid()); err != nil {
  75. return newSystemError(err)
  76. }
  77. // set rlimits, this has to be done here because we lose permissions
  78. // to raise the limits once we enter a user-namespace
  79. if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil {
  80. return newSystemError(err)
  81. }
  82. if err := utils.WriteJSON(p.parentPipe, p.config); err != nil {
  83. return newSystemError(err)
  84. }
  85. if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil {
  86. return newSystemError(err)
  87. }
  88. // wait for the child process to fully complete and receive an error message
  89. // if one was encoutered
  90. var ierr *genericError
  91. if err := json.NewDecoder(p.parentPipe).Decode(&ierr); err != nil && err != io.EOF {
  92. return newSystemError(err)
  93. }
  94. // Must be done after Shutdown so the child will exit and we can wait for it.
  95. if ierr != nil {
  96. p.wait()
  97. return newSystemError(ierr)
  98. }
  99. return nil
  100. }
  101. // execSetns runs the process that executes C code to perform the setns calls
  102. // because setns support requires the C process to fork off a child and perform the setns
  103. // before the go runtime boots, we wait on the process to die and receive the child's pid
  104. // over the provided pipe.
  105. func (p *setnsProcess) execSetns() error {
  106. status, err := p.cmd.Process.Wait()
  107. if err != nil {
  108. p.cmd.Wait()
  109. return newSystemError(err)
  110. }
  111. if !status.Success() {
  112. p.cmd.Wait()
  113. return newSystemError(&exec.ExitError{ProcessState: status})
  114. }
  115. var pid *pid
  116. if err := json.NewDecoder(p.parentPipe).Decode(&pid); err != nil {
  117. p.cmd.Wait()
  118. return newSystemError(err)
  119. }
  120. process, err := os.FindProcess(pid.Pid)
  121. if err != nil {
  122. return err
  123. }
  124. p.cmd.Process = process
  125. p.process.ops = p
  126. return nil
  127. }
  128. // terminate sends a SIGKILL to the forked process for the setns routine then waits to
  129. // avoid the process becomming a zombie.
  130. func (p *setnsProcess) terminate() error {
  131. if p.cmd.Process == nil {
  132. return nil
  133. }
  134. err := p.cmd.Process.Kill()
  135. if _, werr := p.wait(); err == nil {
  136. err = werr
  137. }
  138. return err
  139. }
  140. func (p *setnsProcess) wait() (*os.ProcessState, error) {
  141. err := p.cmd.Wait()
  142. // Return actual ProcessState even on Wait error
  143. return p.cmd.ProcessState, err
  144. }
  145. func (p *setnsProcess) pid() int {
  146. return p.cmd.Process.Pid
  147. }
  148. func (p *setnsProcess) externalDescriptors() []string {
  149. return p.fds
  150. }
  151. func (p *setnsProcess) setExternalDescriptors(newFds []string) {
  152. p.fds = newFds
  153. }
  154. type initProcess struct {
  155. cmd *exec.Cmd
  156. parentPipe *os.File
  157. childPipe *os.File
  158. config *initConfig
  159. manager cgroups.Manager
  160. container *linuxContainer
  161. fds []string
  162. process *Process
  163. bootstrapData io.Reader
  164. sharePidns bool
  165. }
  166. func (p *initProcess) pid() int {
  167. return p.cmd.Process.Pid
  168. }
  169. func (p *initProcess) externalDescriptors() []string {
  170. return p.fds
  171. }
  172. // execSetns runs the process that executes C code to perform the setns calls
  173. // because setns support requires the C process to fork off a child and perform the setns
  174. // before the go runtime boots, we wait on the process to die and receive the child's pid
  175. // over the provided pipe.
  176. // This is called by initProcess.start function
  177. func (p *initProcess) execSetns() error {
  178. status, err := p.cmd.Process.Wait()
  179. if err != nil {
  180. p.cmd.Wait()
  181. return err
  182. }
  183. if !status.Success() {
  184. p.cmd.Wait()
  185. return &exec.ExitError{ProcessState: status}
  186. }
  187. var pid *pid
  188. if err := json.NewDecoder(p.parentPipe).Decode(&pid); err != nil {
  189. p.cmd.Wait()
  190. return err
  191. }
  192. process, err := os.FindProcess(pid.Pid)
  193. if err != nil {
  194. return err
  195. }
  196. p.cmd.Process = process
  197. return nil
  198. }
  199. func (p *initProcess) start() error {
  200. defer p.parentPipe.Close()
  201. err := p.cmd.Start()
  202. p.process.ops = p
  203. p.childPipe.Close()
  204. if err != nil {
  205. p.process.ops = nil
  206. return newSystemError(err)
  207. }
  208. if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil {
  209. return err
  210. }
  211. if err := p.execSetns(); err != nil {
  212. return newSystemError(err)
  213. }
  214. // Save the standard descriptor names before the container process
  215. // can potentially move them (e.g., via dup2()). If we don't do this now,
  216. // we won't know at checkpoint time which file descriptor to look up.
  217. fds, err := getPipeFds(p.pid())
  218. if err != nil {
  219. return newSystemError(err)
  220. }
  221. p.setExternalDescriptors(fds)
  222. // Do this before syncing with child so that no children
  223. // can escape the cgroup
  224. if err := p.manager.Apply(p.pid()); err != nil {
  225. return newSystemError(err)
  226. }
  227. defer func() {
  228. if err != nil {
  229. // TODO: should not be the responsibility to call here
  230. p.manager.Destroy()
  231. }
  232. }()
  233. if err := p.createNetworkInterfaces(); err != nil {
  234. return newSystemError(err)
  235. }
  236. if err := p.sendConfig(); err != nil {
  237. return newSystemError(err)
  238. }
  239. var (
  240. procSync syncT
  241. sentRun bool
  242. sentResume bool
  243. ierr *genericError
  244. )
  245. dec := json.NewDecoder(p.parentPipe)
  246. loop:
  247. for {
  248. if err := dec.Decode(&procSync); err != nil {
  249. if err == io.EOF {
  250. break loop
  251. }
  252. return newSystemError(err)
  253. }
  254. switch procSync.Type {
  255. case procReady:
  256. if err := p.manager.Set(p.config.Config); err != nil {
  257. return newSystemError(err)
  258. }
  259. // set oom_score_adj
  260. if err := setOomScoreAdj(p.config.Config.OomScoreAdj, p.pid()); err != nil {
  261. return newSystemError(err)
  262. }
  263. // set rlimits, this has to be done here because we lose permissions
  264. // to raise the limits once we enter a user-namespace
  265. if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil {
  266. return newSystemError(err)
  267. }
  268. // call prestart hooks
  269. if !p.config.Config.Namespaces.Contains(configs.NEWNS) {
  270. if p.config.Config.Hooks != nil {
  271. s := configs.HookState{
  272. Version: p.container.config.Version,
  273. ID: p.container.id,
  274. Pid: p.pid(),
  275. Root: p.config.Config.Rootfs,
  276. }
  277. for _, hook := range p.config.Config.Hooks.Prestart {
  278. if err := hook.Run(s); err != nil {
  279. return newSystemError(err)
  280. }
  281. }
  282. }
  283. }
  284. // Sync with child.
  285. if err := utils.WriteJSON(p.parentPipe, syncT{procRun}); err != nil {
  286. return newSystemError(err)
  287. }
  288. sentRun = true
  289. case procHooks:
  290. if p.config.Config.Hooks != nil {
  291. s := configs.HookState{
  292. Version: p.container.config.Version,
  293. ID: p.container.id,
  294. Pid: p.pid(),
  295. Root: p.config.Config.Rootfs,
  296. BundlePath: utils.SearchLabels(p.config.Config.Labels, "bundle"),
  297. }
  298. for _, hook := range p.config.Config.Hooks.Prestart {
  299. if err := hook.Run(s); err != nil {
  300. return newSystemError(err)
  301. }
  302. }
  303. }
  304. // Sync with child.
  305. if err := utils.WriteJSON(p.parentPipe, syncT{procResume}); err != nil {
  306. return newSystemError(err)
  307. }
  308. sentResume = true
  309. case procError:
  310. // wait for the child process to fully complete and receive an error message
  311. // if one was encoutered
  312. if err := dec.Decode(&ierr); err != nil && err != io.EOF {
  313. return newSystemError(err)
  314. }
  315. if ierr != nil {
  316. break loop
  317. }
  318. // Programmer error.
  319. panic("No error following JSON procError payload.")
  320. default:
  321. return newSystemError(fmt.Errorf("invalid JSON synchronisation payload from child"))
  322. }
  323. }
  324. if !sentRun {
  325. return newSystemError(fmt.Errorf("could not synchronise with container process: %v", ierr))
  326. }
  327. if p.config.Config.Namespaces.Contains(configs.NEWNS) && !sentResume {
  328. return newSystemError(fmt.Errorf("could not synchronise after executing prestart hooks with container process"))
  329. }
  330. if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil {
  331. return newSystemError(err)
  332. }
  333. // Must be done after Shutdown so the child will exit and we can wait for it.
  334. if ierr != nil {
  335. p.wait()
  336. return newSystemError(ierr)
  337. }
  338. return nil
  339. }
  340. func (p *initProcess) wait() (*os.ProcessState, error) {
  341. err := p.cmd.Wait()
  342. if err != nil {
  343. return p.cmd.ProcessState, err
  344. }
  345. // we should kill all processes in cgroup when init is died if we use host PID namespace
  346. if p.sharePidns {
  347. killCgroupProcesses(p.manager)
  348. }
  349. return p.cmd.ProcessState, nil
  350. }
  351. func (p *initProcess) terminate() error {
  352. if p.cmd.Process == nil {
  353. return nil
  354. }
  355. err := p.cmd.Process.Kill()
  356. if _, werr := p.wait(); err == nil {
  357. err = werr
  358. }
  359. return err
  360. }
  361. func (p *initProcess) startTime() (string, error) {
  362. return system.GetProcessStartTime(p.pid())
  363. }
  364. func (p *initProcess) sendConfig() error {
  365. // send the config to the container's init process, we don't use JSON Encode
  366. // here because there might be a problem in JSON decoder in some cases, see:
  367. // https://github.com/docker/docker/issues/14203#issuecomment-174177790
  368. return utils.WriteJSON(p.parentPipe, p.config)
  369. }
  370. func (p *initProcess) createNetworkInterfaces() error {
  371. for _, config := range p.config.Config.Networks {
  372. strategy, err := getStrategy(config.Type)
  373. if err != nil {
  374. return err
  375. }
  376. n := &network{
  377. Network: *config,
  378. }
  379. if err := strategy.create(n, p.pid()); err != nil {
  380. return err
  381. }
  382. p.config.Networks = append(p.config.Networks, n)
  383. }
  384. return nil
  385. }
  386. func (p *initProcess) signal(sig os.Signal) error {
  387. s, ok := sig.(syscall.Signal)
  388. if !ok {
  389. return errors.New("os: unsupported signal type")
  390. }
  391. return syscall.Kill(p.pid(), s)
  392. }
  393. func (p *initProcess) setExternalDescriptors(newFds []string) {
  394. p.fds = newFds
  395. }
  396. func getPipeFds(pid int) ([]string, error) {
  397. fds := make([]string, 3)
  398. dirPath := filepath.Join("/proc", strconv.Itoa(pid), "/fd")
  399. for i := 0; i < 3; i++ {
  400. f := filepath.Join(dirPath, strconv.Itoa(i))
  401. target, err := os.Readlink(f)
  402. if err != nil {
  403. return fds, err
  404. }
  405. fds[i] = target
  406. }
  407. return fds, nil
  408. }
  409. // InitializeIO creates pipes for use with the process's STDIO
  410. // and returns the opposite side for each
  411. func (p *Process) InitializeIO(rootuid int) (i *IO, err error) {
  412. var fds []uintptr
  413. i = &IO{}
  414. // cleanup in case of an error
  415. defer func() {
  416. if err != nil {
  417. for _, fd := range fds {
  418. syscall.Close(int(fd))
  419. }
  420. }
  421. }()
  422. // STDIN
  423. r, w, err := os.Pipe()
  424. if err != nil {
  425. return nil, err
  426. }
  427. fds = append(fds, r.Fd(), w.Fd())
  428. p.Stdin, i.Stdin = r, w
  429. // STDOUT
  430. if r, w, err = os.Pipe(); err != nil {
  431. return nil, err
  432. }
  433. fds = append(fds, r.Fd(), w.Fd())
  434. p.Stdout, i.Stdout = w, r
  435. // STDERR
  436. if r, w, err = os.Pipe(); err != nil {
  437. return nil, err
  438. }
  439. fds = append(fds, r.Fd(), w.Fd())
  440. p.Stderr, i.Stderr = w, r
  441. // change ownership of the pipes incase we are in a user namespace
  442. for _, fd := range fds {
  443. if err := syscall.Fchown(int(fd), rootuid, rootuid); err != nil {
  444. return nil, err
  445. }
  446. }
  447. return i, nil
  448. }