123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488 |
- // +build linux
- package libcontainer
- import (
- "encoding/json"
- "errors"
- "fmt"
- "io"
- "os"
- "os/exec"
- "path/filepath"
- "strconv"
- "syscall"
- "github.com/opencontainers/runc/libcontainer/cgroups"
- "github.com/opencontainers/runc/libcontainer/configs"
- "github.com/opencontainers/runc/libcontainer/system"
- "github.com/opencontainers/runc/libcontainer/utils"
- )
// parentProcess is the set of operations this file needs from a started
// process. Both setnsProcess and initProcess implement it.
type parentProcess interface {
	// pid returns the pid for the running process.
	pid() int

	// startTime returns the process start time.
	startTime() (string, error)

	// start starts the process execution.
	start() error

	// signal delivers the given signal to the process.
	signal(os.Signal) error

	// wait waits on the process, returning the process state.
	wait() (*os.ProcessState, error)

	// terminate sends a SIGKILL to the process and waits for the exit.
	terminate() error

	// externalDescriptors returns the descriptor names last stored with
	// setExternalDescriptors.
	externalDescriptors() []string

	// setExternalDescriptors replaces the stored descriptor names.
	setExternalDescriptors(fds []string)
}
- type setnsProcess struct {
- cmd *exec.Cmd
- parentPipe *os.File
- childPipe *os.File
- cgroupPaths map[string]string
- config *initConfig
- fds []string
- process *Process
- bootstrapData io.Reader
- }
- func (p *setnsProcess) startTime() (string, error) {
- return system.GetProcessStartTime(p.pid())
- }
- func (p *setnsProcess) signal(sig os.Signal) error {
- s, ok := sig.(syscall.Signal)
- if !ok {
- return errors.New("os: unsupported signal type")
- }
- return syscall.Kill(p.pid(), s)
- }
- func (p *setnsProcess) start() (err error) {
- defer p.parentPipe.Close()
- err = p.cmd.Start()
- p.childPipe.Close()
- if err != nil {
- return newSystemError(err)
- }
- if p.bootstrapData != nil {
- if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil {
- return newSystemError(err)
- }
- }
- if err = p.execSetns(); err != nil {
- return newSystemError(err)
- }
- if len(p.cgroupPaths) > 0 {
- if err := cgroups.EnterPid(p.cgroupPaths, p.pid()); err != nil {
- return newSystemError(err)
- }
- }
- // set oom_score_adj
- if err := setOomScoreAdj(p.config.Config.OomScoreAdj, p.pid()); err != nil {
- return newSystemError(err)
- }
- // set rlimits, this has to be done here because we lose permissions
- // to raise the limits once we enter a user-namespace
- if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil {
- return newSystemError(err)
- }
- if err := utils.WriteJSON(p.parentPipe, p.config); err != nil {
- return newSystemError(err)
- }
- if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil {
- return newSystemError(err)
- }
- // wait for the child process to fully complete and receive an error message
- // if one was encoutered
- var ierr *genericError
- if err := json.NewDecoder(p.parentPipe).Decode(&ierr); err != nil && err != io.EOF {
- return newSystemError(err)
- }
- // Must be done after Shutdown so the child will exit and we can wait for it.
- if ierr != nil {
- p.wait()
- return newSystemError(ierr)
- }
- return nil
- }
- // execSetns runs the process that executes C code to perform the setns calls
- // because setns support requires the C process to fork off a child and perform the setns
- // before the go runtime boots, we wait on the process to die and receive the child's pid
- // over the provided pipe.
- func (p *setnsProcess) execSetns() error {
- status, err := p.cmd.Process.Wait()
- if err != nil {
- p.cmd.Wait()
- return newSystemError(err)
- }
- if !status.Success() {
- p.cmd.Wait()
- return newSystemError(&exec.ExitError{ProcessState: status})
- }
- var pid *pid
- if err := json.NewDecoder(p.parentPipe).Decode(&pid); err != nil {
- p.cmd.Wait()
- return newSystemError(err)
- }
- process, err := os.FindProcess(pid.Pid)
- if err != nil {
- return err
- }
- p.cmd.Process = process
- p.process.ops = p
- return nil
- }
- // terminate sends a SIGKILL to the forked process for the setns routine then waits to
- // avoid the process becomming a zombie.
- func (p *setnsProcess) terminate() error {
- if p.cmd.Process == nil {
- return nil
- }
- err := p.cmd.Process.Kill()
- if _, werr := p.wait(); err == nil {
- err = werr
- }
- return err
- }
- func (p *setnsProcess) wait() (*os.ProcessState, error) {
- err := p.cmd.Wait()
- // Return actual ProcessState even on Wait error
- return p.cmd.ProcessState, err
- }
- func (p *setnsProcess) pid() int {
- return p.cmd.Process.Pid
- }
- func (p *setnsProcess) externalDescriptors() []string {
- return p.fds
- }
- func (p *setnsProcess) setExternalDescriptors(newFds []string) {
- p.fds = newFds
- }
- type initProcess struct {
- cmd *exec.Cmd
- parentPipe *os.File
- childPipe *os.File
- config *initConfig
- manager cgroups.Manager
- container *linuxContainer
- fds []string
- process *Process
- bootstrapData io.Reader
- sharePidns bool
- }
- func (p *initProcess) pid() int {
- return p.cmd.Process.Pid
- }
- func (p *initProcess) externalDescriptors() []string {
- return p.fds
- }
- // execSetns runs the process that executes C code to perform the setns calls
- // because setns support requires the C process to fork off a child and perform the setns
- // before the go runtime boots, we wait on the process to die and receive the child's pid
- // over the provided pipe.
- // This is called by initProcess.start function
- func (p *initProcess) execSetns() error {
- status, err := p.cmd.Process.Wait()
- if err != nil {
- p.cmd.Wait()
- return err
- }
- if !status.Success() {
- p.cmd.Wait()
- return &exec.ExitError{ProcessState: status}
- }
- var pid *pid
- if err := json.NewDecoder(p.parentPipe).Decode(&pid); err != nil {
- p.cmd.Wait()
- return err
- }
- process, err := os.FindProcess(pid.Pid)
- if err != nil {
- return err
- }
- p.cmd.Process = process
- return nil
- }
- func (p *initProcess) start() error {
- defer p.parentPipe.Close()
- err := p.cmd.Start()
- p.process.ops = p
- p.childPipe.Close()
- if err != nil {
- p.process.ops = nil
- return newSystemError(err)
- }
- if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil {
- return err
- }
- if err := p.execSetns(); err != nil {
- return newSystemError(err)
- }
- // Save the standard descriptor names before the container process
- // can potentially move them (e.g., via dup2()). If we don't do this now,
- // we won't know at checkpoint time which file descriptor to look up.
- fds, err := getPipeFds(p.pid())
- if err != nil {
- return newSystemError(err)
- }
- p.setExternalDescriptors(fds)
- // Do this before syncing with child so that no children
- // can escape the cgroup
- if err := p.manager.Apply(p.pid()); err != nil {
- return newSystemError(err)
- }
- defer func() {
- if err != nil {
- // TODO: should not be the responsibility to call here
- p.manager.Destroy()
- }
- }()
- if err := p.createNetworkInterfaces(); err != nil {
- return newSystemError(err)
- }
- if err := p.sendConfig(); err != nil {
- return newSystemError(err)
- }
- var (
- procSync syncT
- sentRun bool
- sentResume bool
- ierr *genericError
- )
- dec := json.NewDecoder(p.parentPipe)
- loop:
- for {
- if err := dec.Decode(&procSync); err != nil {
- if err == io.EOF {
- break loop
- }
- return newSystemError(err)
- }
- switch procSync.Type {
- case procReady:
- if err := p.manager.Set(p.config.Config); err != nil {
- return newSystemError(err)
- }
- // set oom_score_adj
- if err := setOomScoreAdj(p.config.Config.OomScoreAdj, p.pid()); err != nil {
- return newSystemError(err)
- }
- // set rlimits, this has to be done here because we lose permissions
- // to raise the limits once we enter a user-namespace
- if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil {
- return newSystemError(err)
- }
- // call prestart hooks
- if !p.config.Config.Namespaces.Contains(configs.NEWNS) {
- if p.config.Config.Hooks != nil {
- s := configs.HookState{
- Version: p.container.config.Version,
- ID: p.container.id,
- Pid: p.pid(),
- Root: p.config.Config.Rootfs,
- }
- for _, hook := range p.config.Config.Hooks.Prestart {
- if err := hook.Run(s); err != nil {
- return newSystemError(err)
- }
- }
- }
- }
- // Sync with child.
- if err := utils.WriteJSON(p.parentPipe, syncT{procRun}); err != nil {
- return newSystemError(err)
- }
- sentRun = true
- case procHooks:
- if p.config.Config.Hooks != nil {
- s := configs.HookState{
- Version: p.container.config.Version,
- ID: p.container.id,
- Pid: p.pid(),
- Root: p.config.Config.Rootfs,
- BundlePath: utils.SearchLabels(p.config.Config.Labels, "bundle"),
- }
- for _, hook := range p.config.Config.Hooks.Prestart {
- if err := hook.Run(s); err != nil {
- return newSystemError(err)
- }
- }
- }
- // Sync with child.
- if err := utils.WriteJSON(p.parentPipe, syncT{procResume}); err != nil {
- return newSystemError(err)
- }
- sentResume = true
- case procError:
- // wait for the child process to fully complete and receive an error message
- // if one was encoutered
- if err := dec.Decode(&ierr); err != nil && err != io.EOF {
- return newSystemError(err)
- }
- if ierr != nil {
- break loop
- }
- // Programmer error.
- panic("No error following JSON procError payload.")
- default:
- return newSystemError(fmt.Errorf("invalid JSON synchronisation payload from child"))
- }
- }
- if !sentRun {
- return newSystemError(fmt.Errorf("could not synchronise with container process: %v", ierr))
- }
- if p.config.Config.Namespaces.Contains(configs.NEWNS) && !sentResume {
- return newSystemError(fmt.Errorf("could not synchronise after executing prestart hooks with container process"))
- }
- if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil {
- return newSystemError(err)
- }
- // Must be done after Shutdown so the child will exit and we can wait for it.
- if ierr != nil {
- p.wait()
- return newSystemError(ierr)
- }
- return nil
- }
- func (p *initProcess) wait() (*os.ProcessState, error) {
- err := p.cmd.Wait()
- if err != nil {
- return p.cmd.ProcessState, err
- }
- // we should kill all processes in cgroup when init is died if we use host PID namespace
- if p.sharePidns {
- killCgroupProcesses(p.manager)
- }
- return p.cmd.ProcessState, nil
- }
- func (p *initProcess) terminate() error {
- if p.cmd.Process == nil {
- return nil
- }
- err := p.cmd.Process.Kill()
- if _, werr := p.wait(); err == nil {
- err = werr
- }
- return err
- }
- func (p *initProcess) startTime() (string, error) {
- return system.GetProcessStartTime(p.pid())
- }
- func (p *initProcess) sendConfig() error {
- // send the config to the container's init process, we don't use JSON Encode
- // here because there might be a problem in JSON decoder in some cases, see:
- // https://github.com/docker/docker/issues/14203#issuecomment-174177790
- return utils.WriteJSON(p.parentPipe, p.config)
- }
- func (p *initProcess) createNetworkInterfaces() error {
- for _, config := range p.config.Config.Networks {
- strategy, err := getStrategy(config.Type)
- if err != nil {
- return err
- }
- n := &network{
- Network: *config,
- }
- if err := strategy.create(n, p.pid()); err != nil {
- return err
- }
- p.config.Networks = append(p.config.Networks, n)
- }
- return nil
- }
- func (p *initProcess) signal(sig os.Signal) error {
- s, ok := sig.(syscall.Signal)
- if !ok {
- return errors.New("os: unsupported signal type")
- }
- return syscall.Kill(p.pid(), s)
- }
- func (p *initProcess) setExternalDescriptors(newFds []string) {
- p.fds = newFds
- }
// getPipeFds resolves the symlinks /proc/<pid>/fd/0, 1 and 2 and returns their
// targets in that order. On the first Readlink failure it returns the error
// together with the three-element slice as filled in so far.
func getPipeFds(pid int) ([]string, error) {
	const stdioCount = 3

	fdDir := filepath.Join("/proc", strconv.Itoa(pid), "/fd")
	targets := make([]string, stdioCount)
	for fd := range targets {
		link, err := os.Readlink(filepath.Join(fdDir, strconv.Itoa(fd)))
		if err != nil {
			return targets, err
		}
		targets[fd] = link
	}
	return targets, nil
}
- // InitializeIO creates pipes for use with the process's STDIO
- // and returns the opposite side for each
- func (p *Process) InitializeIO(rootuid int) (i *IO, err error) {
- var fds []uintptr
- i = &IO{}
- // cleanup in case of an error
- defer func() {
- if err != nil {
- for _, fd := range fds {
- syscall.Close(int(fd))
- }
- }
- }()
- // STDIN
- r, w, err := os.Pipe()
- if err != nil {
- return nil, err
- }
- fds = append(fds, r.Fd(), w.Fd())
- p.Stdin, i.Stdin = r, w
- // STDOUT
- if r, w, err = os.Pipe(); err != nil {
- return nil, err
- }
- fds = append(fds, r.Fd(), w.Fd())
- p.Stdout, i.Stdout = w, r
- // STDERR
- if r, w, err = os.Pipe(); err != nil {
- return nil, err
- }
- fds = append(fds, r.Fd(), w.Fd())
- p.Stderr, i.Stderr = w, r
- // change ownership of the pipes incase we are in a user namespace
- for _, fd := range fds {
- if err := syscall.Fchown(int(fd), rootuid, rootuid); err != nil {
- return nil, err
- }
- }
- return i, nil
- }
|