container_linux.go 32 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229
  1. // +build linux
  2. package libcontainer
  3. import (
  4. "bytes"
  5. "encoding/json"
  6. "fmt"
  7. "io"
  8. "io/ioutil"
  9. "os"
  10. "os/exec"
  11. "path/filepath"
  12. "reflect"
  13. "strings"
  14. "sync"
  15. "syscall"
  16. "time"
  17. "github.com/Sirupsen/logrus"
  18. "github.com/golang/protobuf/proto"
  19. "github.com/opencontainers/runc/libcontainer/cgroups"
  20. "github.com/opencontainers/runc/libcontainer/configs"
  21. "github.com/opencontainers/runc/libcontainer/criurpc"
  22. "github.com/opencontainers/runc/libcontainer/utils"
  23. "github.com/syndtr/gocapability/capability"
  24. "github.com/vishvananda/netlink/nl"
  25. )
// stdioFdCount is the number of standard I/O descriptors (stdin, stdout and
// stderr). It is added to an index into exec.Cmd.ExtraFiles to compute the
// descriptor number advertised to the child through _LIBCONTAINER_INITPIPE.
const stdioFdCount = 3
// linuxContainer is the Linux implementation behind the Container interface.
type linuxContainer struct {
	// id is the identifier returned by ID().
	id string
	// root is the directory under which per-container files such as
	// "criu.work", "criu-root" and "checkpoint" are created.
	root string
	// config is the container configuration; Set replaces it.
	config *configs.Config
	// cgroupManager is used for stats, pids, freezing and applying cgroups.
	cgroupManager cgroups.Manager
	// initPath and initArgs become the Path and Args of the exec.Cmd used to
	// spawn container processes.
	initPath string
	initArgs []string
	// initProcess is the process that Signal and Checkpoint operate on.
	initProcess parentProcess
	// criuPath is the CRIU binary that is executed for checkpoint/restore.
	criuPath string
	// m serializes the public methods that lock it.
	m sync.Mutex
	// criuVersion is the detected CRIU version encoded as
	// major*10000 + minor*100 + sublevel (set by checkCriuVersion).
	criuVersion int
	// state is the current state object used for transitions and destroy.
	state containerState
	// created is the UTC time recorded by Start.
	created time.Time
}
// State represents a running container's state.
type State struct {
	BaseState

	// Platform specific fields below here

	// CgroupPaths holds the path to every cgroup set up for the container.
	// The key is the cgroup subsystem name and the value is the path.
	CgroupPaths map[string]string `json:"cgroup_paths"`

	// NamespacePaths are filepaths to the container's namespaces. Key is the namespace type
	// with the value as the path.
	NamespacePaths map[configs.NamespaceType]string `json:"namespace_paths"`

	// ExternalDescriptors lists the container's standard descriptors
	// (std{in,out,err}), needed for checkpoint and restore.
	ExternalDescriptors []string `json:"external_descriptors,omitempty"`
}
// Container is a libcontainer container object.
//
// Each container is thread-safe within the same process. Since a container can
// be destroyed by a separate process, any function may return that the container
// was not found.
type Container interface {
	BaseContainer

	// Methods below here are platform specific

	// Checkpoint checkpoints the running container's state to disk using the criu(8) utility.
	//
	// errors:
	// SystemError - System error.
	Checkpoint(criuOpts *CriuOpts) error

	// Restore restores the checkpointed container to a running state using the criu(8) utility.
	//
	// errors:
	// SystemError - System error.
	Restore(process *Process, criuOpts *CriuOpts) error

	// Pause pauses the container.
	// If the Container state is RUNNING or PAUSING, sets the Container state to PAUSING and pauses
	// the execution of any user processes. Asynchronously, when the container finished being paused the
	// state is changed to PAUSED.
	// If the Container state is PAUSED, do nothing.
	//
	// errors:
	// ContainerDestroyed - Container no longer exists,
	// SystemError - System error.
	Pause() error

	// Resume resumes a paused container.
	// If the Container state is PAUSED, resumes the execution of any user processes in the
	// Container before setting the Container state to RUNNING.
	// If the Container state is RUNNING, do nothing.
	//
	// errors:
	// ContainerDestroyed - Container no longer exists,
	// SystemError - System error.
	Resume() error

	// NotifyOOM returns a read-only channel signaling when the container receives an OOM notification.
	//
	// errors:
	// SystemError - System error.
	NotifyOOM() (<-chan struct{}, error)

	// NotifyMemoryPressure returns a read-only channel signaling when the container reaches a given pressure level.
	//
	// errors:
	// SystemError - System error.
	NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error)
}
// ID returns the container's unique ID.
func (c *linuxContainer) ID() string {
	return c.id
}
// Config returns the container's configuration by value, i.e. a shallow copy
// of the configs.Config that the container currently points at.
func (c *linuxContainer) Config() configs.Config {
	return *c.config
}
  108. func (c *linuxContainer) Status() (Status, error) {
  109. c.m.Lock()
  110. defer c.m.Unlock()
  111. return c.currentStatus()
  112. }
  113. func (c *linuxContainer) State() (*State, error) {
  114. c.m.Lock()
  115. defer c.m.Unlock()
  116. return c.currentState()
  117. }
  118. func (c *linuxContainer) Processes() ([]int, error) {
  119. pids, err := c.cgroupManager.GetAllPids()
  120. if err != nil {
  121. return nil, newSystemError(err)
  122. }
  123. return pids, nil
  124. }
  125. func (c *linuxContainer) Stats() (*Stats, error) {
  126. var (
  127. err error
  128. stats = &Stats{}
  129. )
  130. if stats.CgroupStats, err = c.cgroupManager.GetStats(); err != nil {
  131. return stats, newSystemError(err)
  132. }
  133. for _, iface := range c.config.Networks {
  134. switch iface.Type {
  135. case "veth":
  136. istats, err := getNetworkInterfaceStats(iface.HostInterfaceName)
  137. if err != nil {
  138. return stats, newSystemError(err)
  139. }
  140. stats.Interfaces = append(stats.Interfaces, istats)
  141. }
  142. }
  143. return stats, nil
  144. }
  145. func (c *linuxContainer) Set(config configs.Config) error {
  146. c.m.Lock()
  147. defer c.m.Unlock()
  148. c.config = &config
  149. return c.cgroupManager.Set(c.config)
  150. }
  151. func (c *linuxContainer) Start(process *Process) error {
  152. c.m.Lock()
  153. defer c.m.Unlock()
  154. status, err := c.currentStatus()
  155. if err != nil {
  156. return err
  157. }
  158. doInit := status == Destroyed
  159. parent, err := c.newParentProcess(process, doInit)
  160. if err != nil {
  161. return newSystemError(err)
  162. }
  163. if err := parent.start(); err != nil {
  164. // terminate the process to ensure that it properly is reaped.
  165. if err := parent.terminate(); err != nil {
  166. logrus.Warn(err)
  167. }
  168. return newSystemError(err)
  169. }
  170. // generate a timestamp indicating when the container was started
  171. c.created = time.Now().UTC()
  172. c.state = &runningState{
  173. c: c,
  174. }
  175. if doInit {
  176. if err := c.updateState(parent); err != nil {
  177. return err
  178. }
  179. if c.config.Hooks != nil {
  180. s := configs.HookState{
  181. Version: c.config.Version,
  182. ID: c.id,
  183. Pid: parent.pid(),
  184. Root: c.config.Rootfs,
  185. BundlePath: utils.SearchLabels(c.config.Labels, "bundle"),
  186. }
  187. for _, hook := range c.config.Hooks.Poststart {
  188. if err := hook.Run(s); err != nil {
  189. if err := parent.terminate(); err != nil {
  190. logrus.Warn(err)
  191. }
  192. return newSystemError(err)
  193. }
  194. }
  195. }
  196. }
  197. return nil
  198. }
  199. func (c *linuxContainer) Signal(s os.Signal) error {
  200. if err := c.initProcess.signal(s); err != nil {
  201. return newSystemError(err)
  202. }
  203. return nil
  204. }
  205. func (c *linuxContainer) newParentProcess(p *Process, doInit bool) (parentProcess, error) {
  206. parentPipe, childPipe, err := newPipe()
  207. if err != nil {
  208. return nil, newSystemError(err)
  209. }
  210. cmd, err := c.commandTemplate(p, childPipe)
  211. if err != nil {
  212. return nil, newSystemError(err)
  213. }
  214. if !doInit {
  215. return c.newSetnsProcess(p, cmd, parentPipe, childPipe)
  216. }
  217. return c.newInitProcess(p, cmd, parentPipe, childPipe)
  218. }
  219. func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec.Cmd, error) {
  220. cmd := &exec.Cmd{
  221. Path: c.initPath,
  222. Args: c.initArgs,
  223. }
  224. cmd.Stdin = p.Stdin
  225. cmd.Stdout = p.Stdout
  226. cmd.Stderr = p.Stderr
  227. cmd.Dir = c.config.Rootfs
  228. if cmd.SysProcAttr == nil {
  229. cmd.SysProcAttr = &syscall.SysProcAttr{}
  230. }
  231. cmd.ExtraFiles = append(p.ExtraFiles, childPipe)
  232. cmd.Env = append(cmd.Env, fmt.Sprintf("_LIBCONTAINER_INITPIPE=%d", stdioFdCount+len(cmd.ExtraFiles)-1))
  233. // NOTE: when running a container with no PID namespace and the parent process spawning the container is
  234. // PID1 the pdeathsig is being delivered to the container's init process by the kernel for some reason
  235. // even with the parent still running.
  236. if c.config.ParentDeathSignal > 0 {
  237. cmd.SysProcAttr.Pdeathsig = syscall.Signal(c.config.ParentDeathSignal)
  238. }
  239. return cmd, nil
  240. }
  241. func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*initProcess, error) {
  242. cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard))
  243. nsMaps := make(map[configs.NamespaceType]string)
  244. for _, ns := range c.config.Namespaces {
  245. if ns.Path != "" {
  246. nsMaps[ns.Type] = ns.Path
  247. }
  248. }
  249. _, sharePidns := nsMaps[configs.NEWPID]
  250. data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps, "")
  251. if err != nil {
  252. return nil, err
  253. }
  254. return &initProcess{
  255. cmd: cmd,
  256. childPipe: childPipe,
  257. parentPipe: parentPipe,
  258. manager: c.cgroupManager,
  259. config: c.newInitConfig(p),
  260. container: c,
  261. process: p,
  262. bootstrapData: data,
  263. sharePidns: sharePidns,
  264. }, nil
  265. }
  266. func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*setnsProcess, error) {
  267. cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initSetns))
  268. state, err := c.currentState()
  269. if err != nil {
  270. return nil, newSystemError(err)
  271. }
  272. // for setns process, we dont have to set cloneflags as the process namespaces
  273. // will only be set via setns syscall
  274. data, err := c.bootstrapData(0, state.NamespacePaths, p.consolePath)
  275. if err != nil {
  276. return nil, err
  277. }
  278. // TODO: set on container for process management
  279. return &setnsProcess{
  280. cmd: cmd,
  281. cgroupPaths: c.cgroupManager.GetPaths(),
  282. childPipe: childPipe,
  283. parentPipe: parentPipe,
  284. config: c.newInitConfig(p),
  285. process: p,
  286. bootstrapData: data,
  287. }, nil
  288. }
  289. func (c *linuxContainer) newInitConfig(process *Process) *initConfig {
  290. cfg := &initConfig{
  291. Config: c.config,
  292. Args: process.Args,
  293. Env: process.Env,
  294. User: process.User,
  295. Cwd: process.Cwd,
  296. Console: process.consolePath,
  297. Capabilities: process.Capabilities,
  298. PassedFilesCount: len(process.ExtraFiles),
  299. ContainerId: c.ID(),
  300. NoNewPrivileges: c.config.NoNewPrivileges,
  301. AppArmorProfile: c.config.AppArmorProfile,
  302. ProcessLabel: c.config.ProcessLabel,
  303. Rlimits: c.config.Rlimits,
  304. }
  305. if process.NoNewPrivileges != nil {
  306. cfg.NoNewPrivileges = *process.NoNewPrivileges
  307. }
  308. if process.AppArmorProfile != "" {
  309. cfg.AppArmorProfile = process.AppArmorProfile
  310. }
  311. if process.Label != "" {
  312. cfg.ProcessLabel = process.Label
  313. }
  314. if len(process.Rlimits) > 0 {
  315. cfg.Rlimits = process.Rlimits
  316. }
  317. return cfg
  318. }
  319. func newPipe() (parent *os.File, child *os.File, err error) {
  320. fds, err := syscall.Socketpair(syscall.AF_LOCAL, syscall.SOCK_STREAM|syscall.SOCK_CLOEXEC, 0)
  321. if err != nil {
  322. return nil, nil, err
  323. }
  324. return os.NewFile(uintptr(fds[1]), "parent"), os.NewFile(uintptr(fds[0]), "child"), nil
  325. }
// Destroy delegates to the destroy method of the container's current state
// object while holding the container mutex.
func (c *linuxContainer) Destroy() error {
	c.m.Lock()
	defer c.m.Unlock()
	return c.state.destroy()
}
  331. func (c *linuxContainer) Pause() error {
  332. c.m.Lock()
  333. defer c.m.Unlock()
  334. status, err := c.currentStatus()
  335. if err != nil {
  336. return err
  337. }
  338. if status != Running {
  339. return newGenericError(fmt.Errorf("container not running"), ContainerNotRunning)
  340. }
  341. if err := c.cgroupManager.Freeze(configs.Frozen); err != nil {
  342. return err
  343. }
  344. return c.state.transition(&pausedState{
  345. c: c,
  346. })
  347. }
  348. func (c *linuxContainer) Resume() error {
  349. c.m.Lock()
  350. defer c.m.Unlock()
  351. status, err := c.currentStatus()
  352. if err != nil {
  353. return err
  354. }
  355. if status != Paused {
  356. return newGenericError(fmt.Errorf("container not paused"), ContainerNotPaused)
  357. }
  358. if err := c.cgroupManager.Freeze(configs.Thawed); err != nil {
  359. return err
  360. }
  361. return c.state.transition(&runningState{
  362. c: c,
  363. })
  364. }
// NotifyOOM returns the channel produced by notifyOnOOM for the container's
// cgroup paths.
func (c *linuxContainer) NotifyOOM() (<-chan struct{}, error) {
	return notifyOnOOM(c.cgroupManager.GetPaths())
}
// NotifyMemoryPressure returns the channel produced by notifyMemoryPressure
// for the container's cgroup paths and the requested pressure level.
func (c *linuxContainer) NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error) {
	return notifyMemoryPressure(c.cgroupManager.GetPaths(), level)
}
  371. // check Criu version greater than or equal to min_version
  372. func (c *linuxContainer) checkCriuVersion(min_version string) error {
  373. var x, y, z, versionReq int
  374. _, err := fmt.Sscanf(min_version, "%d.%d.%d\n", &x, &y, &z) // 1.5.2
  375. if err != nil {
  376. _, err = fmt.Sscanf(min_version, "Version: %d.%d\n", &x, &y) // 1.6
  377. }
  378. versionReq = x*10000 + y*100 + z
  379. out, err := exec.Command(c.criuPath, "-V").Output()
  380. if err != nil {
  381. return fmt.Errorf("Unable to execute CRIU command: %s", c.criuPath)
  382. }
  383. x = 0
  384. y = 0
  385. z = 0
  386. if ep := strings.Index(string(out), "-"); ep >= 0 {
  387. // criu Git version format
  388. var version string
  389. if sp := strings.Index(string(out), "GitID"); sp > 0 {
  390. version = string(out)[sp:ep]
  391. } else {
  392. return fmt.Errorf("Unable to parse the CRIU version: %s", c.criuPath)
  393. }
  394. n, err := fmt.Sscanf(string(version), "GitID: v%d.%d.%d", &x, &y, &z) // 1.5.2
  395. if err != nil {
  396. n, err = fmt.Sscanf(string(version), "GitID: v%d.%d", &x, &y) // 1.6
  397. y++
  398. } else {
  399. z++
  400. }
  401. if n < 2 || err != nil {
  402. return fmt.Errorf("Unable to parse the CRIU version: %s %d %s", version, n, err)
  403. }
  404. } else {
  405. // criu release version format
  406. n, err := fmt.Sscanf(string(out), "Version: %d.%d.%d\n", &x, &y, &z) // 1.5.2
  407. if err != nil {
  408. n, err = fmt.Sscanf(string(out), "Version: %d.%d\n", &x, &y) // 1.6
  409. }
  410. if n < 2 || err != nil {
  411. return fmt.Errorf("Unable to parse the CRIU version: %s %d %s", out, n, err)
  412. }
  413. }
  414. c.criuVersion = x*10000 + y*100 + z
  415. if c.criuVersion < versionReq {
  416. return fmt.Errorf("CRIU version must be %s or higher", min_version)
  417. }
  418. return nil
  419. }
// descriptorsFilename is the name of the JSON file inside the checkpoint
// images directory that Checkpoint writes the init process's external
// descriptors to and that Restore reads them back from.
const descriptorsFilename = "descriptors.json"
  421. func (c *linuxContainer) addCriuDumpMount(req *criurpc.CriuReq, m *configs.Mount) {
  422. mountDest := m.Destination
  423. if strings.HasPrefix(mountDest, c.config.Rootfs) {
  424. mountDest = mountDest[len(c.config.Rootfs):]
  425. }
  426. extMnt := &criurpc.ExtMountMap{
  427. Key: proto.String(mountDest),
  428. Val: proto.String(mountDest),
  429. }
  430. req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt)
  431. }
  432. func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error {
  433. c.m.Lock()
  434. defer c.m.Unlock()
  435. if err := c.checkCriuVersion("1.5.2"); err != nil {
  436. return err
  437. }
  438. if criuOpts.ImagesDirectory == "" {
  439. return fmt.Errorf("invalid directory to save checkpoint")
  440. }
  441. // Since a container can be C/R'ed multiple times,
  442. // the checkpoint directory may already exist.
  443. if err := os.Mkdir(criuOpts.ImagesDirectory, 0755); err != nil && !os.IsExist(err) {
  444. return err
  445. }
  446. if criuOpts.WorkDirectory == "" {
  447. criuOpts.WorkDirectory = filepath.Join(c.root, "criu.work")
  448. }
  449. if err := os.Mkdir(criuOpts.WorkDirectory, 0755); err != nil && !os.IsExist(err) {
  450. return err
  451. }
  452. workDir, err := os.Open(criuOpts.WorkDirectory)
  453. if err != nil {
  454. return err
  455. }
  456. defer workDir.Close()
  457. imageDir, err := os.Open(criuOpts.ImagesDirectory)
  458. if err != nil {
  459. return err
  460. }
  461. defer imageDir.Close()
  462. rpcOpts := criurpc.CriuOpts{
  463. ImagesDirFd: proto.Int32(int32(imageDir.Fd())),
  464. WorkDirFd: proto.Int32(int32(workDir.Fd())),
  465. LogLevel: proto.Int32(4),
  466. LogFile: proto.String("dump.log"),
  467. Root: proto.String(c.config.Rootfs),
  468. ManageCgroups: proto.Bool(true),
  469. NotifyScripts: proto.Bool(true),
  470. Pid: proto.Int32(int32(c.initProcess.pid())),
  471. ShellJob: proto.Bool(criuOpts.ShellJob),
  472. LeaveRunning: proto.Bool(criuOpts.LeaveRunning),
  473. TcpEstablished: proto.Bool(criuOpts.TcpEstablished),
  474. ExtUnixSk: proto.Bool(criuOpts.ExternalUnixConnections),
  475. FileLocks: proto.Bool(criuOpts.FileLocks),
  476. EmptyNs: proto.Uint32(criuOpts.EmptyNs),
  477. }
  478. // append optional criu opts, e.g., page-server and port
  479. if criuOpts.PageServer.Address != "" && criuOpts.PageServer.Port != 0 {
  480. rpcOpts.Ps = &criurpc.CriuPageServerInfo{
  481. Address: proto.String(criuOpts.PageServer.Address),
  482. Port: proto.Int32(criuOpts.PageServer.Port),
  483. }
  484. }
  485. // append optional manage cgroups mode
  486. if criuOpts.ManageCgroupsMode != 0 {
  487. if err := c.checkCriuVersion("1.7"); err != nil {
  488. return err
  489. }
  490. mode := criurpc.CriuCgMode(criuOpts.ManageCgroupsMode)
  491. rpcOpts.ManageCgroupsMode = &mode
  492. }
  493. t := criurpc.CriuReqType_DUMP
  494. req := &criurpc.CriuReq{
  495. Type: &t,
  496. Opts: &rpcOpts,
  497. }
  498. for _, m := range c.config.Mounts {
  499. switch m.Device {
  500. case "bind":
  501. c.addCriuDumpMount(req, m)
  502. break
  503. case "cgroup":
  504. binds, err := getCgroupMounts(m)
  505. if err != nil {
  506. return err
  507. }
  508. for _, b := range binds {
  509. c.addCriuDumpMount(req, b)
  510. }
  511. break
  512. }
  513. }
  514. // Write the FD info to a file in the image directory
  515. fdsJSON, err := json.Marshal(c.initProcess.externalDescriptors())
  516. if err != nil {
  517. return err
  518. }
  519. err = ioutil.WriteFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename), fdsJSON, 0655)
  520. if err != nil {
  521. return err
  522. }
  523. err = c.criuSwrk(nil, req, criuOpts, false)
  524. if err != nil {
  525. return err
  526. }
  527. return nil
  528. }
  529. func (c *linuxContainer) addCriuRestoreMount(req *criurpc.CriuReq, m *configs.Mount) {
  530. mountDest := m.Destination
  531. if strings.HasPrefix(mountDest, c.config.Rootfs) {
  532. mountDest = mountDest[len(c.config.Rootfs):]
  533. }
  534. extMnt := &criurpc.ExtMountMap{
  535. Key: proto.String(mountDest),
  536. Val: proto.String(m.Source),
  537. }
  538. req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt)
  539. }
  540. func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
  541. c.m.Lock()
  542. defer c.m.Unlock()
  543. if err := c.checkCriuVersion("1.5.2"); err != nil {
  544. return err
  545. }
  546. if criuOpts.WorkDirectory == "" {
  547. criuOpts.WorkDirectory = filepath.Join(c.root, "criu.work")
  548. }
  549. // Since a container can be C/R'ed multiple times,
  550. // the work directory may already exist.
  551. if err := os.Mkdir(criuOpts.WorkDirectory, 0655); err != nil && !os.IsExist(err) {
  552. return err
  553. }
  554. workDir, err := os.Open(criuOpts.WorkDirectory)
  555. if err != nil {
  556. return err
  557. }
  558. defer workDir.Close()
  559. if criuOpts.ImagesDirectory == "" {
  560. return fmt.Errorf("invalid directory to restore checkpoint")
  561. }
  562. imageDir, err := os.Open(criuOpts.ImagesDirectory)
  563. if err != nil {
  564. return err
  565. }
  566. defer imageDir.Close()
  567. // CRIU has a few requirements for a root directory:
  568. // * it must be a mount point
  569. // * its parent must not be overmounted
  570. // c.config.Rootfs is bind-mounted to a temporary directory
  571. // to satisfy these requirements.
  572. root := filepath.Join(c.root, "criu-root")
  573. if err := os.Mkdir(root, 0755); err != nil {
  574. return err
  575. }
  576. defer os.Remove(root)
  577. root, err = filepath.EvalSymlinks(root)
  578. if err != nil {
  579. return err
  580. }
  581. err = syscall.Mount(c.config.Rootfs, root, "", syscall.MS_BIND|syscall.MS_REC, "")
  582. if err != nil {
  583. return err
  584. }
  585. defer syscall.Unmount(root, syscall.MNT_DETACH)
  586. t := criurpc.CriuReqType_RESTORE
  587. req := &criurpc.CriuReq{
  588. Type: &t,
  589. Opts: &criurpc.CriuOpts{
  590. ImagesDirFd: proto.Int32(int32(imageDir.Fd())),
  591. WorkDirFd: proto.Int32(int32(workDir.Fd())),
  592. EvasiveDevices: proto.Bool(true),
  593. LogLevel: proto.Int32(4),
  594. LogFile: proto.String("restore.log"),
  595. RstSibling: proto.Bool(true),
  596. Root: proto.String(root),
  597. ManageCgroups: proto.Bool(true),
  598. NotifyScripts: proto.Bool(true),
  599. ShellJob: proto.Bool(criuOpts.ShellJob),
  600. ExtUnixSk: proto.Bool(criuOpts.ExternalUnixConnections),
  601. TcpEstablished: proto.Bool(criuOpts.TcpEstablished),
  602. FileLocks: proto.Bool(criuOpts.FileLocks),
  603. EmptyNs: proto.Uint32(criuOpts.EmptyNs),
  604. },
  605. }
  606. for _, m := range c.config.Mounts {
  607. switch m.Device {
  608. case "bind":
  609. c.addCriuRestoreMount(req, m)
  610. break
  611. case "cgroup":
  612. binds, err := getCgroupMounts(m)
  613. if err != nil {
  614. return err
  615. }
  616. for _, b := range binds {
  617. c.addCriuRestoreMount(req, b)
  618. }
  619. break
  620. }
  621. }
  622. for _, iface := range c.config.Networks {
  623. switch iface.Type {
  624. case "veth":
  625. veth := new(criurpc.CriuVethPair)
  626. veth.IfOut = proto.String(iface.HostInterfaceName)
  627. veth.IfIn = proto.String(iface.Name)
  628. req.Opts.Veths = append(req.Opts.Veths, veth)
  629. break
  630. case "loopback":
  631. break
  632. }
  633. }
  634. for _, i := range criuOpts.VethPairs {
  635. veth := new(criurpc.CriuVethPair)
  636. veth.IfOut = proto.String(i.HostInterfaceName)
  637. veth.IfIn = proto.String(i.ContainerInterfaceName)
  638. req.Opts.Veths = append(req.Opts.Veths, veth)
  639. }
  640. // append optional manage cgroups mode
  641. if criuOpts.ManageCgroupsMode != 0 {
  642. if err := c.checkCriuVersion("1.7"); err != nil {
  643. return err
  644. }
  645. mode := criurpc.CriuCgMode(criuOpts.ManageCgroupsMode)
  646. req.Opts.ManageCgroupsMode = &mode
  647. }
  648. var (
  649. fds []string
  650. fdJSON []byte
  651. )
  652. if fdJSON, err = ioutil.ReadFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename)); err != nil {
  653. return err
  654. }
  655. if err := json.Unmarshal(fdJSON, &fds); err != nil {
  656. return err
  657. }
  658. for i := range fds {
  659. if s := fds[i]; strings.Contains(s, "pipe:") {
  660. inheritFd := new(criurpc.InheritFd)
  661. inheritFd.Key = proto.String(s)
  662. inheritFd.Fd = proto.Int32(int32(i))
  663. req.Opts.InheritFd = append(req.Opts.InheritFd, inheritFd)
  664. }
  665. }
  666. return c.criuSwrk(process, req, criuOpts, true)
  667. }
  668. func (c *linuxContainer) criuApplyCgroups(pid int, req *criurpc.CriuReq) error {
  669. if err := c.cgroupManager.Apply(pid); err != nil {
  670. return err
  671. }
  672. path := fmt.Sprintf("/proc/%d/cgroup", pid)
  673. cgroupsPaths, err := cgroups.ParseCgroupFile(path)
  674. if err != nil {
  675. return err
  676. }
  677. for c, p := range cgroupsPaths {
  678. cgroupRoot := &criurpc.CgroupRoot{
  679. Ctrl: proto.String(c),
  680. Path: proto.String(p),
  681. }
  682. req.Opts.CgRoot = append(req.Opts.CgRoot, cgroupRoot)
  683. }
  684. return nil
  685. }
// criuSwrk runs CRIU in "swrk" mode, sends it req over a SOCK_SEQPACKET
// socketpair and processes the responses until the dump or restore completes.
//
// process, when non-nil, supplies the standard streams for the CRIU command;
// opts provides the work directory used to build the log path quoted in error
// messages. When applyCgroups is true the CRIU process is placed into the
// container's cgroups before the request is sent. NOTIFY responses are
// dispatched to criuNotifications and acknowledged; a DUMP or RESTORE response
// ends the exchange. Finally the CRIU process is waited for and a
// non-successful exit is reported as an error.
func (c *linuxContainer) criuSwrk(process *Process, req *criurpc.CriuReq, opts *CriuOpts, applyCgroups bool) error {
	fds, err := syscall.Socketpair(syscall.AF_LOCAL, syscall.SOCK_SEQPACKET|syscall.SOCK_CLOEXEC, 0)
	if err != nil {
		return err
	}

	logPath := filepath.Join(opts.WorkDirectory, req.GetOpts().GetLogFile())
	criuClient := os.NewFile(uintptr(fds[0]), "criu-transport-client")
	criuServer := os.NewFile(uintptr(fds[1]), "criu-transport-server")
	defer criuClient.Close()
	defer criuServer.Close()

	// criuServer is the only entry in cmd.ExtraFiles, so it is descriptor 3
	// in the child; that number is what is passed to "swrk".
	args := []string{"swrk", "3"}
	logrus.Debugf("Using CRIU %d at: %s", c.criuVersion, c.criuPath)
	logrus.Debugf("Using CRIU with following args: %s", args)
	cmd := exec.Command(c.criuPath, args...)
	if process != nil {
		cmd.Stdin = process.Stdin
		cmd.Stdout = process.Stdout
		cmd.Stderr = process.Stderr
	}
	cmd.ExtraFiles = append(cmd.ExtraFiles, criuServer)

	if err := cmd.Start(); err != nil {
		return err
	}
	// The child holds its own copy of the server end now.
	criuServer.Close()

	// On every return path: close our end of the transport and wait for the
	// CRIU process. The result of this wait is intentionally discarded.
	defer func() {
		criuClient.Close()
		_, err := cmd.Process.Wait()
		if err != nil {
			return
		}
	}()

	if applyCgroups {
		err := c.criuApplyCgroups(cmd.Process.Pid, req)
		if err != nil {
			return err
		}
	}

	var extFds []string
	if process != nil {
		extFds, err = getPipeFds(cmd.Process.Pid)
		if err != nil {
			return err
		}
	}

	// Log every option of the request (skipping protobuf's XXX_ bookkeeping
	// fields) by calling the generated Get<Field> accessor via reflection.
	logrus.Debugf("Using CRIU in %s mode", req.GetType().String())
	val := reflect.ValueOf(req.GetOpts())
	v := reflect.Indirect(val)
	for i := 0; i < v.NumField(); i++ {
		st := v.Type()
		name := st.Field(i).Name
		if strings.HasPrefix(name, "XXX_") {
			continue
		}
		value := val.MethodByName("Get" + name).Call([]reflect.Value{})
		logrus.Debugf("CRIU option %s with value %v", name, value[0])
	}

	data, err := proto.Marshal(req)
	if err != nil {
		return err
	}
	_, err = criuClient.Write(data)
	if err != nil {
		return err
	}

	buf := make([]byte, 10*4096)
	for true {
		n, err := criuClient.Read(buf)
		if err != nil {
			return err
		}
		if n == 0 {
			return fmt.Errorf("unexpected EOF")
		}
		// A read that fills the whole buffer is treated as a message that
		// did not fit.
		if n == len(buf) {
			return fmt.Errorf("buffer is too small")
		}

		resp := new(criurpc.CriuResp)
		err = proto.Unmarshal(buf[:n], resp)
		if err != nil {
			return err
		}
		if !resp.GetSuccess() {
			typeString := req.GetType().String()
			return fmt.Errorf("criu failed: type %s errno %d\nlog file: %s", typeString, resp.GetCrErrno(), logPath)
		}

		t := resp.GetType()
		switch {
		case t == criurpc.CriuReqType_NOTIFY:
			if err := c.criuNotifications(resp, process, opts, extFds); err != nil {
				return err
			}
			// Acknowledge the notification and keep reading responses.
			t = criurpc.CriuReqType_NOTIFY
			req = &criurpc.CriuReq{
				Type:          &t,
				NotifySuccess: proto.Bool(true),
			}
			data, err = proto.Marshal(req)
			if err != nil {
				return err
			}
			_, err = criuClient.Write(data)
			if err != nil {
				return err
			}
			continue
		case t == criurpc.CriuReqType_RESTORE:
		case t == criurpc.CriuReqType_DUMP:
			break
		default:
			return fmt.Errorf("unable to parse the response %s", resp.String())
		}

		// RESTORE and DUMP responses end the exchange.
		break
	}

	// cmd.Wait() waits cmd.goroutines which are used for proxying file descriptors.
	// Here we want to wait only the CRIU process.
	st, err := cmd.Process.Wait()
	if err != nil {
		return err
	}
	if !st.Success() {
		return fmt.Errorf("criu failed: %s\nlog file: %s", st.String(), logPath)
	}
	return nil
}
  810. // block any external network activity
  811. func lockNetwork(config *configs.Config) error {
  812. for _, config := range config.Networks {
  813. strategy, err := getStrategy(config.Type)
  814. if err != nil {
  815. return err
  816. }
  817. if err := strategy.detach(config); err != nil {
  818. return err
  819. }
  820. }
  821. return nil
  822. }
  823. func unlockNetwork(config *configs.Config) error {
  824. for _, config := range config.Networks {
  825. strategy, err := getStrategy(config.Type)
  826. if err != nil {
  827. return err
  828. }
  829. if err = strategy.attach(config); err != nil {
  830. return err
  831. }
  832. }
  833. return nil
  834. }
  835. func (c *linuxContainer) criuNotifications(resp *criurpc.CriuResp, process *Process, opts *CriuOpts, fds []string) error {
  836. notify := resp.GetNotify()
  837. if notify == nil {
  838. return fmt.Errorf("invalid response: %s", resp.String())
  839. }
  840. switch {
  841. case notify.GetScript() == "post-dump":
  842. f, err := os.Create(filepath.Join(c.root, "checkpoint"))
  843. if err != nil {
  844. return err
  845. }
  846. f.Close()
  847. case notify.GetScript() == "network-unlock":
  848. if err := unlockNetwork(c.config); err != nil {
  849. return err
  850. }
  851. case notify.GetScript() == "network-lock":
  852. if err := lockNetwork(c.config); err != nil {
  853. return err
  854. }
  855. case notify.GetScript() == "setup-namespaces":
  856. if c.config.Hooks != nil {
  857. s := configs.HookState{
  858. Version: c.config.Version,
  859. ID: c.id,
  860. Pid: int(notify.GetPid()),
  861. Root: c.config.Rootfs,
  862. }
  863. for _, hook := range c.config.Hooks.Prestart {
  864. if err := hook.Run(s); err != nil {
  865. return newSystemError(err)
  866. }
  867. }
  868. }
  869. case notify.GetScript() == "post-restore":
  870. pid := notify.GetPid()
  871. r, err := newRestoredProcess(int(pid), fds)
  872. if err != nil {
  873. return err
  874. }
  875. process.ops = r
  876. if err := c.state.transition(&restoredState{
  877. imageDir: opts.ImagesDirectory,
  878. c: c,
  879. }); err != nil {
  880. return err
  881. }
  882. if err := c.updateState(r); err != nil {
  883. return err
  884. }
  885. if err := os.Remove(filepath.Join(c.root, "checkpoint")); err != nil {
  886. if !os.IsNotExist(err) {
  887. logrus.Error(err)
  888. }
  889. }
  890. }
  891. return nil
  892. }
  893. func (c *linuxContainer) updateState(process parentProcess) error {
  894. c.initProcess = process
  895. state, err := c.currentState()
  896. if err != nil {
  897. return err
  898. }
  899. return c.saveState(state)
  900. }
  901. func (c *linuxContainer) saveState(s *State) error {
  902. f, err := os.Create(filepath.Join(c.root, stateFilename))
  903. if err != nil {
  904. return err
  905. }
  906. defer f.Close()
  907. return utils.WriteJSON(f, s)
  908. }
  909. func (c *linuxContainer) deleteState() error {
  910. return os.Remove(filepath.Join(c.root, stateFilename))
  911. }
  912. func (c *linuxContainer) currentStatus() (Status, error) {
  913. if err := c.refreshState(); err != nil {
  914. return -1, err
  915. }
  916. return c.state.status(), nil
  917. }
  918. // refreshState needs to be called to verify that the current state on the
  919. // container is what is true. Because consumers of libcontainer can use it
  920. // out of process we need to verify the container's status based on runtime
  921. // information and not rely on our in process info.
  922. func (c *linuxContainer) refreshState() error {
  923. paused, err := c.isPaused()
  924. if err != nil {
  925. return err
  926. }
  927. if paused {
  928. return c.state.transition(&pausedState{c: c})
  929. }
  930. running, err := c.isRunning()
  931. if err != nil {
  932. return err
  933. }
  934. if running {
  935. return c.state.transition(&runningState{c: c})
  936. }
  937. return c.state.transition(&stoppedState{c: c})
  938. }
  939. func (c *linuxContainer) isRunning() (bool, error) {
  940. if c.initProcess == nil {
  941. return false, nil
  942. }
  943. // return Running if the init process is alive
  944. if err := syscall.Kill(c.initProcess.pid(), 0); err != nil {
  945. if err == syscall.ESRCH {
  946. return false, nil
  947. }
  948. return false, newSystemError(err)
  949. }
  950. return true, nil
  951. }
  952. func (c *linuxContainer) isPaused() (bool, error) {
  953. data, err := ioutil.ReadFile(filepath.Join(c.cgroupManager.GetPaths()["freezer"], "freezer.state"))
  954. if err != nil {
  955. if os.IsNotExist(err) {
  956. return false, nil
  957. }
  958. return false, newSystemError(err)
  959. }
  960. return bytes.Equal(bytes.TrimSpace(data), []byte("FROZEN")), nil
  961. }
  962. func (c *linuxContainer) currentState() (*State, error) {
  963. var (
  964. startTime string
  965. externalDescriptors []string
  966. pid = -1
  967. )
  968. if c.initProcess != nil {
  969. pid = c.initProcess.pid()
  970. startTime, _ = c.initProcess.startTime()
  971. externalDescriptors = c.initProcess.externalDescriptors()
  972. }
  973. state := &State{
  974. BaseState: BaseState{
  975. ID: c.ID(),
  976. Config: *c.config,
  977. InitProcessPid: pid,
  978. InitProcessStartTime: startTime,
  979. Created: c.created,
  980. },
  981. CgroupPaths: c.cgroupManager.GetPaths(),
  982. NamespacePaths: make(map[configs.NamespaceType]string),
  983. ExternalDescriptors: externalDescriptors,
  984. }
  985. if pid > 0 {
  986. for _, ns := range c.config.Namespaces {
  987. state.NamespacePaths[ns.Type] = ns.GetPath(pid)
  988. }
  989. for _, nsType := range configs.NamespaceTypes() {
  990. if !configs.IsNamespaceSupported(nsType) {
  991. continue
  992. }
  993. if _, ok := state.NamespacePaths[nsType]; !ok {
  994. ns := configs.Namespace{Type: nsType}
  995. state.NamespacePaths[ns.Type] = ns.GetPath(pid)
  996. }
  997. }
  998. }
  999. return state, nil
  1000. }
  1001. // orderNamespacePaths sorts namespace paths into a list of paths that we
  1002. // can setns in order.
  1003. func (c *linuxContainer) orderNamespacePaths(namespaces map[configs.NamespaceType]string) ([]string, error) {
  1004. paths := []string{}
  1005. nsTypes := []configs.NamespaceType{
  1006. configs.NEWIPC,
  1007. configs.NEWUTS,
  1008. configs.NEWNET,
  1009. configs.NEWPID,
  1010. configs.NEWNS,
  1011. }
  1012. // join userns if the init process explicitly requires NEWUSER
  1013. if c.config.Namespaces.Contains(configs.NEWUSER) {
  1014. nsTypes = append(nsTypes, configs.NEWUSER)
  1015. }
  1016. for _, nsType := range nsTypes {
  1017. if p, ok := namespaces[nsType]; ok && p != "" {
  1018. // check if the requested namespace is supported
  1019. if !configs.IsNamespaceSupported(nsType) {
  1020. return nil, newSystemError(fmt.Errorf("namespace %s is not supported", nsType))
  1021. }
  1022. // only set to join this namespace if it exists
  1023. if _, err := os.Lstat(p); err != nil {
  1024. return nil, newSystemError(err)
  1025. }
  1026. // do not allow namespace path with comma as we use it to separate
  1027. // the namespace paths
  1028. if strings.ContainsRune(p, ',') {
  1029. return nil, newSystemError(fmt.Errorf("invalid path %s", p))
  1030. }
  1031. paths = append(paths, p)
  1032. }
  1033. }
  1034. return paths, nil
  1035. }
  1036. func encodeIDMapping(idMap []configs.IDMap) ([]byte, error) {
  1037. data := bytes.NewBuffer(nil)
  1038. for _, im := range idMap {
  1039. line := fmt.Sprintf("%d %d %d\n", im.ContainerID, im.HostID, im.Size)
  1040. if _, err := data.WriteString(line); err != nil {
  1041. return nil, err
  1042. }
  1043. }
  1044. return data.Bytes(), nil
  1045. }
  1046. // bootstrapData encodes the necessary data in netlink binary format
  1047. // as a io.Reader.
  1048. // Consumer can write the data to a bootstrap program
  1049. // such as one that uses nsenter package to bootstrap the container's
  1050. // init process correctly, i.e. with correct namespaces, uid/gid
  1051. // mapping etc.
  1052. func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string, consolePath string) (io.Reader, error) {
  1053. // create the netlink message
  1054. r := nl.NewNetlinkRequest(int(InitMsg), 0)
  1055. // write cloneFlags
  1056. r.AddData(&Int32msg{
  1057. Type: CloneFlagsAttr,
  1058. Value: uint32(cloneFlags),
  1059. })
  1060. // write console path
  1061. if consolePath != "" {
  1062. r.AddData(&Bytemsg{
  1063. Type: ConsolePathAttr,
  1064. Value: []byte(consolePath),
  1065. })
  1066. }
  1067. // write custom namespace paths
  1068. if len(nsMaps) > 0 {
  1069. nsPaths, err := c.orderNamespacePaths(nsMaps)
  1070. if err != nil {
  1071. return nil, err
  1072. }
  1073. r.AddData(&Bytemsg{
  1074. Type: NsPathsAttr,
  1075. Value: []byte(strings.Join(nsPaths, ",")),
  1076. })
  1077. }
  1078. // write namespace paths only when we are not joining an existing user ns
  1079. _, joinExistingUser := nsMaps[configs.NEWUSER]
  1080. if !joinExistingUser {
  1081. // write uid mappings
  1082. if len(c.config.UidMappings) > 0 {
  1083. b, err := encodeIDMapping(c.config.UidMappings)
  1084. if err != nil {
  1085. return nil, err
  1086. }
  1087. r.AddData(&Bytemsg{
  1088. Type: UidmapAttr,
  1089. Value: b,
  1090. })
  1091. }
  1092. // write gid mappings
  1093. if len(c.config.GidMappings) > 0 {
  1094. b, err := encodeIDMapping(c.config.GidMappings)
  1095. if err != nil {
  1096. return nil, err
  1097. }
  1098. r.AddData(&Bytemsg{
  1099. Type: GidmapAttr,
  1100. Value: b,
  1101. })
  1102. // check if we have CAP_SETGID to setgroup properly
  1103. pid, err := capability.NewPid(os.Getpid())
  1104. if err != nil {
  1105. return nil, err
  1106. }
  1107. if !pid.Get(capability.EFFECTIVE, capability.CAP_SETGID) {
  1108. r.AddData(&Boolmsg{
  1109. Type: SetgroupAttr,
  1110. Value: true,
  1111. })
  1112. }
  1113. }
  1114. }
  1115. return bytes.NewReader(r.Serialize()), nil
  1116. }