apply_systemd.go 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480
  1. // +build linux
  2. package systemd
  3. import (
  4. "errors"
  5. "fmt"
  6. "io/ioutil"
  7. "os"
  8. "path/filepath"
  9. "strconv"
  10. "strings"
  11. "sync"
  12. "time"
  13. systemdDbus "github.com/coreos/go-systemd/dbus"
  14. systemdUtil "github.com/coreos/go-systemd/util"
  15. "github.com/godbus/dbus"
  16. "github.com/opencontainers/runc/libcontainer/cgroups"
  17. "github.com/opencontainers/runc/libcontainer/cgroups/fs"
  18. "github.com/opencontainers/runc/libcontainer/configs"
  19. )
  20. type Manager struct {
  21. mu sync.Mutex
  22. Cgroups *configs.Cgroup
  23. Paths map[string]string
  24. }
  25. type subsystem interface {
  26. // Name returns the name of the subsystem.
  27. Name() string
  28. // Returns the stats, as 'stats', corresponding to the cgroup under 'path'.
  29. GetStats(path string, stats *cgroups.Stats) error
  30. // Set the cgroup represented by cgroup.
  31. Set(path string, cgroup *configs.Cgroup) error
  32. }
  33. var errSubsystemDoesNotExist = errors.New("cgroup: subsystem does not exist")
  34. type subsystemSet []subsystem
  35. func (s subsystemSet) Get(name string) (subsystem, error) {
  36. for _, ss := range s {
  37. if ss.Name() == name {
  38. return ss, nil
  39. }
  40. }
  41. return nil, errSubsystemDoesNotExist
  42. }
  43. var subsystems = subsystemSet{
  44. &fs.CpusetGroup{},
  45. &fs.DevicesGroup{},
  46. &fs.MemoryGroup{},
  47. &fs.CpuGroup{},
  48. &fs.CpuacctGroup{},
  49. &fs.PidsGroup{},
  50. &fs.BlkioGroup{},
  51. &fs.HugetlbGroup{},
  52. &fs.PerfEventGroup{},
  53. &fs.FreezerGroup{},
  54. &fs.NetPrioGroup{},
  55. &fs.NetClsGroup{},
  56. &fs.NameGroup{GroupName: "name=systemd"},
  57. }
  58. const (
  59. testScopeWait = 4
  60. )
  61. var (
  62. connLock sync.Mutex
  63. theConn *systemdDbus.Conn
  64. hasStartTransientUnit bool
  65. hasTransientDefaultDependencies bool
  66. )
  67. func newProp(name string, units interface{}) systemdDbus.Property {
  68. return systemdDbus.Property{
  69. Name: name,
  70. Value: dbus.MakeVariant(units),
  71. }
  72. }
  73. func UseSystemd() bool {
  74. if !systemdUtil.IsRunningSystemd() {
  75. return false
  76. }
  77. connLock.Lock()
  78. defer connLock.Unlock()
  79. if theConn == nil {
  80. var err error
  81. theConn, err = systemdDbus.New()
  82. if err != nil {
  83. return false
  84. }
  85. // Assume we have StartTransientUnit
  86. hasStartTransientUnit = true
  87. // But if we get UnknownMethod error we don't
  88. if _, err := theConn.StartTransientUnit("test.scope", "invalid", nil, nil); err != nil {
  89. if dbusError, ok := err.(dbus.Error); ok {
  90. if dbusError.Name == "org.freedesktop.DBus.Error.UnknownMethod" {
  91. hasStartTransientUnit = false
  92. return hasStartTransientUnit
  93. }
  94. }
  95. }
  96. // Ensure the scope name we use doesn't exist. Use the Pid to
  97. // avoid collisions between multiple libcontainer users on a
  98. // single host.
  99. scope := fmt.Sprintf("libcontainer-%d-systemd-test-default-dependencies.scope", os.Getpid())
  100. testScopeExists := true
  101. for i := 0; i <= testScopeWait; i++ {
  102. if _, err := theConn.StopUnit(scope, "replace", nil); err != nil {
  103. if dbusError, ok := err.(dbus.Error); ok {
  104. if strings.Contains(dbusError.Name, "org.freedesktop.systemd1.NoSuchUnit") {
  105. testScopeExists = false
  106. break
  107. }
  108. }
  109. }
  110. time.Sleep(time.Millisecond)
  111. }
  112. // Bail out if we can't kill this scope without testing for DefaultDependencies
  113. if testScopeExists {
  114. return hasStartTransientUnit
  115. }
  116. // Assume StartTransientUnit on a scope allows DefaultDependencies
  117. hasTransientDefaultDependencies = true
  118. ddf := newProp("DefaultDependencies", false)
  119. if _, err := theConn.StartTransientUnit(scope, "replace", []systemdDbus.Property{ddf}, nil); err != nil {
  120. if dbusError, ok := err.(dbus.Error); ok {
  121. if strings.Contains(dbusError.Name, "org.freedesktop.DBus.Error.PropertyReadOnly") {
  122. hasTransientDefaultDependencies = false
  123. }
  124. }
  125. }
  126. // Not critical because of the stop unit logic above.
  127. theConn.StopUnit(scope, "replace", nil)
  128. }
  129. return hasStartTransientUnit
  130. }
  131. func (m *Manager) Apply(pid int) error {
  132. var (
  133. c = m.Cgroups
  134. unitName = getUnitName(c)
  135. slice = "system.slice"
  136. properties []systemdDbus.Property
  137. )
  138. if c.Paths != nil {
  139. paths := make(map[string]string)
  140. for name, path := range c.Paths {
  141. _, err := getSubsystemPath(m.Cgroups, name)
  142. if err != nil {
  143. // Don't fail if a cgroup hierarchy was not found, just skip this subsystem
  144. if cgroups.IsNotFound(err) {
  145. continue
  146. }
  147. return err
  148. }
  149. paths[name] = path
  150. }
  151. m.Paths = paths
  152. return cgroups.EnterPid(m.Paths, pid)
  153. }
  154. if c.Parent != "" {
  155. slice = c.Parent
  156. }
  157. properties = append(properties,
  158. systemdDbus.PropSlice(slice),
  159. systemdDbus.PropDescription("docker container "+c.Name),
  160. newProp("PIDs", []uint32{uint32(pid)}),
  161. // This is only supported on systemd versions 218 and above.
  162. newProp("Delegate", true),
  163. )
  164. // Always enable accounting, this gets us the same behaviour as the fs implementation,
  165. // plus the kernel has some problems with joining the memory cgroup at a later time.
  166. properties = append(properties,
  167. newProp("MemoryAccounting", true),
  168. newProp("CPUAccounting", true),
  169. newProp("BlockIOAccounting", true))
  170. if hasTransientDefaultDependencies {
  171. properties = append(properties,
  172. newProp("DefaultDependencies", false))
  173. }
  174. if c.Resources.Memory != 0 {
  175. properties = append(properties,
  176. newProp("MemoryLimit", uint64(c.Resources.Memory)))
  177. }
  178. if c.Resources.CpuShares != 0 {
  179. properties = append(properties,
  180. newProp("CPUShares", uint64(c.Resources.CpuShares)))
  181. }
  182. if c.Resources.BlkioWeight != 0 {
  183. properties = append(properties,
  184. newProp("BlockIOWeight", uint64(c.Resources.BlkioWeight)))
  185. }
  186. // We need to set kernel memory before processes join cgroup because
  187. // kmem.limit_in_bytes can only be set when the cgroup is empty.
  188. // And swap memory limit needs to be set after memory limit, only
  189. // memory limit is handled by systemd, so it's kind of ugly here.
  190. if c.Resources.KernelMemory > 0 {
  191. if err := setKernelMemory(c); err != nil {
  192. return err
  193. }
  194. }
  195. if _, err := theConn.StartTransientUnit(unitName, "replace", properties, nil); err != nil {
  196. return err
  197. }
  198. if err := joinCgroups(c, pid); err != nil {
  199. return err
  200. }
  201. paths := make(map[string]string)
  202. for _, s := range subsystems {
  203. subsystemPath, err := getSubsystemPath(m.Cgroups, s.Name())
  204. if err != nil {
  205. // Don't fail if a cgroup hierarchy was not found, just skip this subsystem
  206. if cgroups.IsNotFound(err) {
  207. continue
  208. }
  209. return err
  210. }
  211. paths[s.Name()] = subsystemPath
  212. }
  213. m.Paths = paths
  214. return nil
  215. }
  216. func (m *Manager) Destroy() error {
  217. if m.Cgroups.Paths != nil {
  218. return nil
  219. }
  220. m.mu.Lock()
  221. defer m.mu.Unlock()
  222. theConn.StopUnit(getUnitName(m.Cgroups), "replace", nil)
  223. if err := cgroups.RemovePaths(m.Paths); err != nil {
  224. return err
  225. }
  226. m.Paths = make(map[string]string)
  227. return nil
  228. }
  229. func (m *Manager) GetPaths() map[string]string {
  230. m.mu.Lock()
  231. paths := m.Paths
  232. m.mu.Unlock()
  233. return paths
  234. }
  235. func writeFile(dir, file, data string) error {
  236. // Normally dir should not be empty, one case is that cgroup subsystem
  237. // is not mounted, we will get empty dir, and we want it fail here.
  238. if dir == "" {
  239. return fmt.Errorf("no such directory for %s.", file)
  240. }
  241. return ioutil.WriteFile(filepath.Join(dir, file), []byte(data), 0700)
  242. }
  243. func join(c *configs.Cgroup, subsystem string, pid int) (string, error) {
  244. path, err := getSubsystemPath(c, subsystem)
  245. if err != nil {
  246. return "", err
  247. }
  248. if err := os.MkdirAll(path, 0755); err != nil {
  249. return "", err
  250. }
  251. if err := writeFile(path, "cgroup.procs", strconv.Itoa(pid)); err != nil {
  252. return "", err
  253. }
  254. return path, nil
  255. }
  256. func joinCgroups(c *configs.Cgroup, pid int) error {
  257. for _, sys := range subsystems {
  258. name := sys.Name()
  259. switch name {
  260. case "name=systemd":
  261. // let systemd handle this
  262. break
  263. case "cpuset":
  264. path, err := getSubsystemPath(c, name)
  265. if err != nil && !cgroups.IsNotFound(err) {
  266. return err
  267. }
  268. s := &fs.CpusetGroup{}
  269. if err := s.ApplyDir(path, c, pid); err != nil {
  270. return err
  271. }
  272. break
  273. default:
  274. _, err := join(c, name, pid)
  275. if err != nil {
  276. // Even if it's `not found` error, we'll return err
  277. // because devices cgroup is hard requirement for
  278. // container security.
  279. if name == "devices" {
  280. return err
  281. }
  282. // For other subsystems, omit the `not found` error
  283. // because they are optional.
  284. if !cgroups.IsNotFound(err) {
  285. return err
  286. }
  287. }
  288. }
  289. }
  290. return nil
  291. }
  292. // systemd represents slice heirarchy using `-`, so we need to follow suit when
  293. // generating the path of slice. Essentially, test-a-b.slice becomes
  294. // test.slice/test-a.slice/test-a-b.slice.
  295. func expandSlice(slice string) (string, error) {
  296. suffix := ".slice"
  297. // Name has to end with ".slice", but can't be just ".slice".
  298. if len(slice) < len(suffix) || !strings.HasSuffix(slice, suffix) {
  299. return "", fmt.Errorf("invalid slice name: %s", slice)
  300. }
  301. // Path-separators are not allowed.
  302. if strings.Contains(slice, "/") {
  303. return "", fmt.Errorf("invalid slice name: %s", slice)
  304. }
  305. var path, prefix string
  306. sliceName := strings.TrimSuffix(slice, suffix)
  307. for _, component := range strings.Split(sliceName, "-") {
  308. // test--a.slice isn't permitted, nor is -test.slice.
  309. if component == "" {
  310. return "", fmt.Errorf("invalid slice name: %s", slice)
  311. }
  312. // Append the component to the path and to the prefix.
  313. path += prefix + component + suffix + "/"
  314. prefix += component + "-"
  315. }
  316. return path, nil
  317. }
  318. func getSubsystemPath(c *configs.Cgroup, subsystem string) (string, error) {
  319. mountpoint, err := cgroups.FindCgroupMountpoint(subsystem)
  320. if err != nil {
  321. return "", err
  322. }
  323. initPath, err := cgroups.GetInitCgroupDir(subsystem)
  324. if err != nil {
  325. return "", err
  326. }
  327. slice := "system.slice"
  328. if c.Parent != "" {
  329. slice = c.Parent
  330. }
  331. slice, err = expandSlice(slice)
  332. if err != nil {
  333. return "", err
  334. }
  335. return filepath.Join(mountpoint, initPath, slice, getUnitName(c)), nil
  336. }
  337. func (m *Manager) Freeze(state configs.FreezerState) error {
  338. path, err := getSubsystemPath(m.Cgroups, "freezer")
  339. if err != nil {
  340. return err
  341. }
  342. prevState := m.Cgroups.Resources.Freezer
  343. m.Cgroups.Resources.Freezer = state
  344. freezer, err := subsystems.Get("freezer")
  345. if err != nil {
  346. return err
  347. }
  348. err = freezer.Set(path, m.Cgroups)
  349. if err != nil {
  350. m.Cgroups.Resources.Freezer = prevState
  351. return err
  352. }
  353. return nil
  354. }
  355. func (m *Manager) GetPids() ([]int, error) {
  356. path, err := getSubsystemPath(m.Cgroups, "devices")
  357. if err != nil {
  358. return nil, err
  359. }
  360. return cgroups.GetPids(path)
  361. }
  362. func (m *Manager) GetAllPids() ([]int, error) {
  363. path, err := getSubsystemPath(m.Cgroups, "devices")
  364. if err != nil {
  365. return nil, err
  366. }
  367. return cgroups.GetAllPids(path)
  368. }
  369. func (m *Manager) GetStats() (*cgroups.Stats, error) {
  370. m.mu.Lock()
  371. defer m.mu.Unlock()
  372. stats := cgroups.NewStats()
  373. for name, path := range m.Paths {
  374. sys, err := subsystems.Get(name)
  375. if err == errSubsystemDoesNotExist || !cgroups.PathExists(path) {
  376. continue
  377. }
  378. if err := sys.GetStats(path, stats); err != nil {
  379. return nil, err
  380. }
  381. }
  382. return stats, nil
  383. }
  384. func (m *Manager) Set(container *configs.Config) error {
  385. for _, sys := range subsystems {
  386. // Get the subsystem path, but don't error out for not found cgroups.
  387. path, err := getSubsystemPath(container.Cgroups, sys.Name())
  388. if err != nil && !cgroups.IsNotFound(err) {
  389. return err
  390. }
  391. if err := sys.Set(path, container.Cgroups); err != nil {
  392. return err
  393. }
  394. }
  395. if m.Paths["cpu"] != "" {
  396. if err := fs.CheckCpushares(m.Paths["cpu"], container.Cgroups.Resources.CpuShares); err != nil {
  397. return err
  398. }
  399. }
  400. return nil
  401. }
  402. func getUnitName(c *configs.Cgroup) string {
  403. return fmt.Sprintf("%s-%s.scope", c.ScopePrefix, c.Name)
  404. }
  405. func setKernelMemory(c *configs.Cgroup) error {
  406. path, err := getSubsystemPath(c, "memory")
  407. if err != nil && !cgroups.IsNotFound(err) {
  408. return err
  409. }
  410. if err := os.MkdirAll(path, 0755); err != nil {
  411. return err
  412. }
  413. // This doesn't get called by manager.Set, so we need to do it here.
  414. s := &fs.MemoryGroup{}
  415. return s.SetKernelMemory(path, c)
  416. }