nsexec.c 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473
  1. #define _GNU_SOURCE
  2. #include <endian.h>
  3. #include <errno.h>
  4. #include <fcntl.h>
  5. #include <linux/limits.h>
  6. #include <sys/socket.h>
  7. #include <linux/netlink.h>
  8. #include <sched.h>
  9. #include <setjmp.h>
  10. #include <signal.h>
  11. #include <stdint.h>
  12. #include <stdio.h>
  13. #include <stdlib.h>
  14. #include <string.h>
  15. #include <sys/ioctl.h>
  16. #include <sys/types.h>
  17. #include <sys/prctl.h>
  18. #include <unistd.h>
  19. #include <grp.h>
  20. #include <bits/sockaddr.h>
  21. #include <linux/types.h>
  /*
   * Argument block handed to clone().
   *
   * The buffer doubles as the child's stack. Because the stack grows down,
   * `stack_ptr` (a zero-sized marker sitting right after the buffer) is the
   * address given to clone() as the stack top, and the real argument (`env`)
   * is stored above that marker.
   */
  struct clone_arg {
      /* Space reserved for clone() to place arguments and the return code. */
      char stack[4096] __attribute__((aligned(16)));
      /* Zero-length marker: first address past the end of `stack`. */
      char stack_ptr[0];
      /* Jump buffer the freshly cloned child jumps back to. */
      jmp_buf *env;
  };
  /* Settings decoded from the bootstrap message read off the init pipe. */
  struct nsenter_config {
      /* Flags passed on to clone()/unshare() when creating the child. */
      uint32_t cloneflags;
      /* uid_map contents to write for the child; NULL when not supplied. */
      char *uidmap;
      /* Number of bytes in `uidmap`. */
      int uidmap_len;
      /* gid_map contents to write for the child; NULL when not supplied. */
      char *gidmap;
      /* Number of bytes in `gidmap`. */
      int gidmap_len;
      /* When 1, "allow" is written to /proc/PID/setgroups before gid_map. */
      uint8_t is_setgroup;
      /* Open descriptor of the console, or -1 when none was requested. */
      int consolefd;
  };
  // Known message and attribute types sent to the bootstrap program.
  // They mirror the definitions in libcontainer/message_linux.go.

  // Netlink message type carrying the bootstrap data.
  enum {
      INIT_MSG = 62000
  };

  // Attribute types that may appear inside an INIT_MSG payload.
  enum {
      CLONE_FLAGS_ATTR = 27281,
      CONSOLE_PATH_ATTR = 27282,
      NS_PATHS_ATTR = 27283,
      UIDMAP_ATTR = 27284,
      GIDMAP_ATTR = 27285,
      SETGROUP_ATTR = 27286
  };
  // Use the raw setns system call for versions of glibc that don't provide
  // a wrapper for it (namely glibc-2.12).
  #if __GLIBC__ == 2 && __GLIBC_MINOR__ < 14
  // _GNU_SOURCE is already defined at the top of the file; defining it again
  // here, after the system headers were included, would have no effect.
  #include <sys/syscall.h>
  #if defined(__NR_setns) && !defined(SYS_setns)
  #define SYS_setns __NR_setns
  #endif
  #ifdef SYS_setns
  /*
   * setns - thin wrapper around the setns system call.
   *
   * fd:     descriptor referring to the namespace to join
   * nstype: namespace type restriction, passed through unchanged
   *
   * Returns whatever syscall() returns for SYS_setns.
   */
  int setns(int fd, int nstype)
  {
      return (int)syscall(SYS_setns, fd, nstype);
  }
  #endif
  #endif
  /*
   * pr_perror - report an error on stderr.
   *
   * Prints "nsenter: ", the printf-style message built from `fmt` and the
   * optional arguments, then ": " and the text for the current errno (via
   * the %m conversion), followed by a newline. `fmt` must be a string
   * literal because it is pasted between two other literals. The macro
   * expands to the fprintf() call itself.
   */
  #define pr_perror(fmt, ...) \
      fprintf(stderr, "nsenter: " fmt ": %m\n", ##__VA_ARGS__)
  67. static int child_func(void *_arg)
  68. {
  69. struct clone_arg *arg = (struct clone_arg *)_arg;
  70. longjmp(*arg->env, 1);
  71. }
  72. static int clone_parent(jmp_buf *env, int flags) __attribute__((noinline));
  73. static int clone_parent(jmp_buf *env, int flags)
  74. {
  75. struct clone_arg ca;
  76. int child;
  77. ca.env = env;
  78. child = clone(child_func, ca.stack_ptr, CLONE_PARENT | SIGCHLD | flags,
  79. &ca);
  80. // On old kernels, CLONE_PARENT cannot work with CLONE_NEWPID,
  81. // unshare before clone to workaround this.
  82. if (child == -1 && errno == EINVAL) {
  83. if (unshare(flags)) {
  84. pr_perror("Unable to unshare namespaces");
  85. return -1;
  86. }
  87. child = clone(child_func, ca.stack_ptr, SIGCHLD | CLONE_PARENT,
  88. &ca);
  89. }
  90. return child;
  91. }
  /*
   * get_init_pipe - fetch the init pipe descriptor handed over by the parent.
   *
   * The pipe is used to read the bootstrap data and to write the child pid to
   * once nsexec has finished setting up the environment. Its number is taken
   * from the _LIBCONTAINER_INITPIPE environment variable.
   *
   * Returns -1 when the variable is not set, otherwise the parsed descriptor
   * number. Exits the process when the value is not a plain decimal integer
   * in canonical form (no leading whitespace, sign prefix '+', leading zeros
   * or trailing characters) or does not fit in an int.
   */
  static int get_init_pipe(void)
  {
      char canonical[32];
      char *initpipe;
      char *end = NULL;
      long value;
      int pipenum;

      initpipe = getenv("_LIBCONTAINER_INITPIPE");
      if (initpipe == NULL) {
          return -1;
      }

      // strtol (unlike atoi) has defined behaviour for out-of-range input and
      // reports where parsing stopped.
      errno = 0;
      value = strtol(initpipe, &end, 10);
      pipenum = (int)value;
      if (errno != 0 || end == initpipe || *end != '\0' || (long)pipenum != value) {
          pr_perror("Unable to parse _LIBCONTAINER_INITPIPE");
          exit(1);
      }

      // Only accept the canonical spelling of the number, exactly as it would
      // be printed back with %d.
      snprintf(canonical, sizeof(canonical), "%d", pipenum);
      if (strcmp(initpipe, canonical) != 0) {
          pr_perror("Unable to parse _LIBCONTAINER_INITPIPE");
          exit(1);
      }
      return pipenum;
  }
  /*
   * num_namespaces - count the entries of a comma-separated path list.
   *
   * nspaths: NUL-terminated string of namespace paths separated by ','.
   *
   * Returns the number of commas plus one, so an empty string yields 1.
   */
  static int num_namespaces(char *nspaths)
  {
      int count = 1;
      const char *cursor;

      for (cursor = nspaths; *cursor != '\0'; cursor++) {
          if (*cursor == ',') {
              count++;
          }
      }
      return count;
  }
  /*
   * readint32 - read a 32-bit unsigned integer, in host byte order, from the
   * first four bytes of `buf`.
   *
   * The bytes are copied with memcpy() instead of dereferencing a cast
   * pointer, so the read is valid for any alignment of `buf` and does not
   * violate the aliasing rules.
   */
  static uint32_t readint32(char *buf)
  {
      uint32_t value;

      memcpy(&value, buf, sizeof(value));
      return value;
  }
  /* readint8 - return the first byte of `buf` as an unsigned 8-bit value. */
  static uint8_t readint8(char *buf)
  {
      const uint8_t *byte = (const uint8_t *)buf;

      return byte[0];
  }
  /*
   * update_process_idmap - write an id mapping into a per-process proc file.
   *
   * pathfmt: printf-style format with a single %d for the pid
   *          (for example "/proc/%d/uid_map")
   * pid:     process whose map is written
   * map:     bytes to write
   * map_len: number of bytes in `map`
   *
   * The whole map must be accepted by a single write(). Any failure (path
   * construction, open, failed or short write) is reported and terminates
   * the process with exit status 1.
   */
  static void update_process_idmap(char *pathfmt, int pid, char *map, int map_len)
  {
      char path[PATH_MAX];
      int len;
      int fd;

      len = snprintf(path, sizeof(path), pathfmt, pid);
      // A result >= sizeof(path) means the path was truncated; treat that as
      // a construction failure rather than opening a wrong file.
      if (len < 0 || (size_t)len >= sizeof(path)) {
          pr_perror("failed to construct '%s' for %d", pathfmt, pid);
          exit(1);
      }

      fd = open(path, O_RDWR);
      if (fd == -1) {
          pr_perror("failed to open %s", path);
          exit(1);
      }

      len = write(fd, map, map_len);
      if (len == -1) {
          pr_perror("failed to write to %s", path);
          close(fd);
          exit(1);
      }
      if (len != map_len) {
          pr_perror("Failed to write data to %s (%d/%d)",
                    path, len, map_len);
          close(fd);
          exit(1);
      }
      close(fd);
  }
  /*
   * update_process_uidmap - write `map` to /proc/<pid>/uid_map.
   *
   * Does nothing when no map was supplied (NULL pointer or a length that is
   * not positive).
   */
  static void update_process_uidmap(int pid, char *map, int map_len)
  {
      if (map != NULL && map_len > 0) {
          update_process_idmap("/proc/%d/uid_map", pid, map, map_len);
      }
  }
  /*
   * update_process_gidmap - write `map` to /proc/<pid>/gid_map.
   *
   * pid:         process whose map is written
   * is_setgroup: when 1, "allow" is first written to /proc/<pid>/setgroups
   * map:         bytes to write
   * map_len:     number of bytes in `map`
   *
   * Does nothing when no map was supplied (NULL pointer or a length that is
   * not positive). Failures are reported and terminate the process with exit
   * status 1, except for ENOENT on the setgroups file, which is tolerated.
   */
  static void update_process_gidmap(int pid, uint8_t is_setgroup, char *map, int map_len)
  {
      if ((map == NULL) || (map_len <= 0)) {
          return;
      }

      if (is_setgroup == 1) {
          char path[PATH_MAX];
          int len;
          int fd;

          len = snprintf(path, sizeof(path), "/proc/%d/setgroups", pid);
          if (len < 0 || (size_t)len >= sizeof(path)) {
              pr_perror("failed to get setgroups path for %d", pid);
              exit(1);
          }

          // If the kernel is too old to support /proc/PID/setgroups the
          // file does not exist, so open() or write() fails with ENOENT;
          // this is OK and the step is simply skipped.
          fd = open(path, O_RDWR);
          if (fd == -1) {
              if (errno != ENOENT) {
                  pr_perror("failed to open %s", path);
                  exit(1);
              }
          } else {
              if (write(fd, "allow", 5) != 5) {
                  if (errno != ENOENT) {
                      pr_perror("failed to write allow to %s", path);
                      close(fd);
                      exit(1);
                  }
              }
              close(fd);
          }
      }

      update_process_idmap("/proc/%d/gid_map", pid, map, map_len);
  }
  200. static void start_child(int pipenum, jmp_buf *env, int syncpipe[2],
  201. struct nsenter_config *config)
  202. {
  203. int len;
  204. int childpid;
  205. char buf[PATH_MAX];
  206. uint8_t syncbyte = 1;
  207. // We must fork to actually enter the PID namespace, use CLONE_PARENT
  208. // so the child can have the right parent, and we don't need to forward
  209. // the child's exit code or resend its death signal.
  210. childpid = clone_parent(env, config->cloneflags);
  211. if (childpid < 0) {
  212. pr_perror("Unable to fork");
  213. exit(1);
  214. }
  215. // update uid_map and gid_map for the child process if they
  216. // were provided
  217. update_process_uidmap(childpid, config->uidmap, config->uidmap_len);
  218. update_process_gidmap(childpid, config->is_setgroup, config->gidmap, config->gidmap_len);
  219. // Send the sync signal to the child
  220. close(syncpipe[0]);
  221. syncbyte = 1;
  222. if (write(syncpipe[1], &syncbyte, 1) != 1) {
  223. pr_perror("failed to write sync byte to child");
  224. exit(1);
  225. }
  226. // Send the child pid back to our parent
  227. len = snprintf(buf, sizeof(buf), "{ \"pid\" : %d }\n", childpid);
  228. if ((len < 0) || (write(pipenum, buf, len) != len)) {
  229. pr_perror("Unable to send a child pid");
  230. kill(childpid, SIGKILL);
  231. exit(1);
  232. }
  233. exit(0);
  234. }
  235. static struct nsenter_config process_nl_attributes(int pipenum, char *data, int data_size)
  236. {
  237. struct nsenter_config config = {0};
  238. struct nlattr *nlattr;
  239. int payload_len;
  240. int start = 0;
  241. config.consolefd = -1;
  242. while (start < data_size) {
  243. nlattr = (struct nlattr *)(data + start);
  244. start += NLA_HDRLEN;
  245. payload_len = nlattr->nla_len - NLA_HDRLEN;
  246. if (nlattr->nla_type == CLONE_FLAGS_ATTR) {
  247. config.cloneflags = readint32(data + start);
  248. } else if (nlattr->nla_type == CONSOLE_PATH_ATTR) {
  249. // get the console path before setns because it may
  250. // change mnt namespace
  251. config.consolefd = open(data + start, O_RDWR);
  252. if (config.consolefd < 0) {
  253. pr_perror("Failed to open console %s",
  254. data + start);
  255. exit(1);
  256. }
  257. } else if (nlattr->nla_type == NS_PATHS_ATTR) {
  258. // if custom namespaces are required, open all
  259. // descriptors and perform setns on them
  260. int i, j;
  261. int nslen = num_namespaces(data + start);
  262. int fds[nslen];
  263. char *nslist[nslen];
  264. char *ns;
  265. char *saveptr;
  266. for (i = 0; i < nslen; i++) {
  267. char *str = NULL;
  268. if (i == 0) {
  269. str = data + start;
  270. }
  271. ns = strtok_r(str, ",", &saveptr);
  272. if (ns == NULL) {
  273. break;
  274. }
  275. fds[i] = open(ns, O_RDONLY);
  276. if (fds[i] == -1) {
  277. for (j = 0; j < i; j++) {
  278. close(fds[j]);
  279. }
  280. pr_perror("Failed to open %s", ns);
  281. exit(1);
  282. }
  283. nslist[i] = ns;
  284. }
  285. for (i = 0; i < nslen; i++) {
  286. if (setns(fds[i], 0) != 0) {
  287. pr_perror("Failed to setns to %s", nslist[i]);
  288. exit(1);
  289. }
  290. close(fds[i]);
  291. }
  292. } else if (nlattr->nla_type == UIDMAP_ATTR) {
  293. config.uidmap = data + start;
  294. config.uidmap_len = payload_len;
  295. } else if (nlattr->nla_type == GIDMAP_ATTR) {
  296. config.gidmap = data + start;
  297. config.gidmap_len = payload_len;
  298. } else if (nlattr->nla_type == SETGROUP_ATTR) {
  299. config.is_setgroup = readint8(data + start);
  300. } else {
  301. pr_perror("Unknown netlink message type %d",
  302. nlattr->nla_type);
  303. exit(1);
  304. }
  305. start += NLA_ALIGN(payload_len);
  306. }
  307. return config;
  308. }
  309. void nsexec(void)
  310. {
  311. int pipenum;
  312. // If we don't have init pipe, then just return to the go routine,
  313. // we'll only have init pipe for start or exec
  314. pipenum = get_init_pipe();
  315. if (pipenum == -1) {
  316. return;
  317. }
  318. // Retrieve the netlink header
  319. struct nlmsghdr nl_msg_hdr;
  320. int len;
  321. if ((len = read(pipenum, &nl_msg_hdr, NLMSG_HDRLEN)) != NLMSG_HDRLEN) {
  322. pr_perror("Invalid netlink header length %d", len);
  323. exit(1);
  324. }
  325. if (nl_msg_hdr.nlmsg_type == NLMSG_ERROR) {
  326. pr_perror("Failed to read netlink message");
  327. exit(1);
  328. }
  329. if (nl_msg_hdr.nlmsg_type != INIT_MSG) {
  330. pr_perror("Unexpected msg type %d", nl_msg_hdr.nlmsg_type);
  331. exit(1);
  332. }
  333. // Retrieve data
  334. int nl_total_size = NLMSG_PAYLOAD(&nl_msg_hdr, 0);
  335. char data[nl_total_size];
  336. if ((len = read(pipenum, data, nl_total_size)) != nl_total_size) {
  337. pr_perror("Failed to read netlink payload, %d != %d", len,
  338. nl_total_size);
  339. exit(1);
  340. }
  341. jmp_buf env;
  342. int syncpipe[2] = {-1, -1};
  343. struct nsenter_config config = process_nl_attributes(pipenum,
  344. data, nl_total_size);
  345. // required clone_flags to be passed
  346. if (config.cloneflags == -1) {
  347. pr_perror("Missing clone_flags");
  348. exit(1);
  349. }
  350. // prepare sync pipe between parent and child. We need this to let the
  351. // child know that the parent has finished setting up
  352. if (pipe(syncpipe) != 0) {
  353. pr_perror("Failed to setup sync pipe between parent and child");
  354. exit(1);
  355. }
  356. if (setjmp(env) == 1) {
  357. // Child
  358. uint8_t s = 0;
  359. int consolefd = config.consolefd;
  360. // close the writing side of pipe
  361. close(syncpipe[1]);
  362. // sync with parent
  363. if ((read(syncpipe[0], &s, 1) != 1) || (s != 1)) {
  364. pr_perror("Failed to read sync byte from parent");
  365. exit(1);
  366. }
  367. if (setsid() == -1) {
  368. pr_perror("setsid failed");
  369. exit(1);
  370. }
  371. if (setuid(0) == -1) {
  372. pr_perror("setuid failed");
  373. exit(1);
  374. }
  375. if (setgid(0) == -1) {
  376. pr_perror("setgid failed");
  377. exit(1);
  378. }
  379. if (setgroups(0, NULL) == -1) {
  380. pr_perror("setgroups failed");
  381. exit(1);
  382. }
  383. if (consolefd != -1) {
  384. if (ioctl(consolefd, TIOCSCTTY, 0) == -1) {
  385. pr_perror("ioctl TIOCSCTTY failed");
  386. exit(1);
  387. }
  388. if (dup3(consolefd, STDIN_FILENO, 0) != STDIN_FILENO) {
  389. pr_perror("Failed to dup stdin");
  390. exit(1);
  391. }
  392. if (dup3(consolefd, STDOUT_FILENO, 0) != STDOUT_FILENO) {
  393. pr_perror("Failed to dup stdout");
  394. exit(1);
  395. }
  396. if (dup3(consolefd, STDERR_FILENO, 0) != STDERR_FILENO) {
  397. pr_perror("Failed to dup stderr");
  398. exit(1);
  399. }
  400. }
  401. // Finish executing, let the Go runtime take over.
  402. return;
  403. }
  404. // Parent
  405. start_child(pipenum, &env, syncpipe, &config);
  406. }