Skip to content

Commit 4437810

Browse files
Merge pull request #208 from ndrewh/cgroupsv2-fix
Setup cgroup.subtree_control controllers when necessary in cgroupsv2
2 parents 90e2854 + 12df56b commit 4437810

File tree

9 files changed

+144
-4
lines changed

9 files changed

+144
-4
lines changed

cgroup2.cc

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@
2828
#include <stdio.h>
2929
#include <string.h>
3030
#include <sys/stat.h>
31+
#include <sys/vfs.h>
32+
#include <linux/magic.h>
3133
#include <unistd.h>
3234

3335
#include <fstream>
@@ -39,9 +41,14 @@
3941

4042
namespace cgroup2 {
4143

44+
static bool addPidToProcList(const std::string &cgroup_path, pid_t pid);
45+
4246
// Builds the path of the per-jail cgroup directory for the given child pid:
// "<cgroupv2_mount>/NSJAIL.<pid>".
static std::string getCgroupPath(nsjconf_t *nsjconf, pid_t pid) {
	std::string path = nsjconf->cgroupv2_mount;
	path += "/NSJAIL.";
	path += std::to_string(pid);
	return path;
}
49+
// Builds the path of the cgroup directory that the nsjail process itself can be
// moved into: "<cgroupv2_mount>/NSJAIL_SELF.<pid of the calling process>".
static std::string getJailCgroupPath(nsjconf_t *nsjconf) {
	std::string path = nsjconf->cgroupv2_mount;
	path += "/NSJAIL_SELF.";
	path += std::to_string(getpid());
	return path;
}
4552

4653
static bool createCgroup(const std::string &cgroup_path, pid_t pid) {
4754
LOG_D("Create '%s' for pid=%d", cgroup_path.c_str(), (int)pid);
@@ -52,6 +59,39 @@ static bool createCgroup(const std::string &cgroup_path, pid_t pid) {
5259
return true;
5360
}
5461

62+
// Creates the "NSJAIL_SELF.<pid>" cgroup under the configured cgroupv2 mount and
// moves the calling process into it.
//
// This is done to avoid the cgroupv2 'No internal processes' rule
// (https://unix.stackexchange.com/a/713343).
//
// Returns true on success, false if the cgroup could not be created or the
// process could not be added to it.
static bool moveSelfIntoChildCgroup(nsjconf_t *nsjconf) {
	const std::string jail_cgroup_path = getJailCgroupPath(nsjconf);
	// No trailing newline here, in line with every other LOG_* call in this file.
	LOG_I("nsjail is moving itself to a new child cgroup: %s", jail_cgroup_path.c_str());
	RETURN_ON_FAILURE(createCgroup(jail_cgroup_path, getpid()));
	// pid 0 is passed instead of getpid(); presumably it designates the calling
	// process when written to the process list -- confirm against addPidToProcList.
	RETURN_ON_FAILURE(addPidToProcList(jail_cgroup_path, 0));
	return true;
}
71+
72+
73+
// Enables `controller` for child cgroups by writing "+<controller>" to
// cgroup.subtree_control inside the configured cgroupv2 mount.
//
// The write is first attempted quietly (error logging disabled). If it fails
// with EBUSY, nsjail moves itself into a child cgroup and the write is retried
// once, this time with error logging enabled.
//
// `pid` is only used for the debug log line.
// Returns true if the controller was enabled, false otherwise.
static bool enableCgroupSubtree(nsjconf_t *nsjconf, const std::string &controller, pid_t pid) {
	const std::string &mount = nsjconf->cgroupv2_mount;
	LOG_D("Enable cgroup.subtree_control +'%s' to '%s' for pid=%d", controller.c_str(), mount.c_str(), pid);

	const std::string request = "+" + controller;
	const std::string control_file = mount + "/cgroup.subtree_control";

	// First attempt: leave the nsjail process where it is.
	bool enabled = util::writeBufToFile(
	    control_file.c_str(), request.c_str(), request.length(), O_WRONLY, false);

	// Second attempt, only on EBUSY: relocate ourselves into a child cgroup first.
	if (!enabled && errno == EBUSY) {
		RETURN_ON_FAILURE(moveSelfIntoChildCgroup(nsjconf));
		enabled = util::writeBufToFile(
		    control_file.c_str(), request.c_str(), request.length(), O_WRONLY);
	}

	if (enabled) {
		return true;
	}
	LOG_E("Could not apply '%s' to cgroup.subtree_control in '%s'. If you are running in Docker, nsjail MUST be the root process to use cgroups.", request.c_str(), mount.c_str());
	return false;
}
94+
5595
static bool writeToCgroup(
5696
const std::string &cgroup_path, const std::string &resource, const std::string &value) {
5797
LOG_I("Setting '%s' to '%s'", resource.c_str(), value.c_str());
@@ -83,6 +123,76 @@ static void removeCgroup(const std::string &cgroup_path) {
83123
}
84124
}
85125

126+
// Tells whether the 'memory' controller is required by the configuration.
//
// The effective swap limit is cgroup_mem_swap_max, unless cgroup_mem_memsw_max is
// set (> 0), in which case it is cgroup_mem_memsw_max - cgroup_mem_max.
// The controller is unnecessary only when there is no memory limit
// (cgroup_mem_max == 0) and the effective swap limit is negative.
// This matches the check in initNsFromParentMem.
static bool needMemoryController(nsjconf_t *nsjconf) {
	ssize_t effective_swap_max = nsjconf->cgroup_mem_swap_max;
	if (nsjconf->cgroup_mem_memsw_max > (size_t)0) {
		effective_swap_max = nsjconf->cgroup_mem_memsw_max - nsjconf->cgroup_mem_max;
	}

	const bool no_mem_limit = (nsjconf->cgroup_mem_max == (size_t)0);
	const bool no_swap_limit = (effective_swap_max < (ssize_t)0);
	return !(no_mem_limit && no_swap_limit);
}
138+
139+
// Tells whether the 'pids' controller is required, i.e. cgroup_pids_max is non-zero.
static bool needPidsController(nsjconf_t *nsjconf) {
	const bool pids_limited = (nsjconf->cgroup_pids_max != 0);
	return pids_limited;
}
142+
143+
// Tells whether the 'cpu' controller is required, i.e. cgroup_cpu_ms_per_sec is non-zero.
static bool needCpuController(nsjconf_t *nsjconf) {
	const bool cpu_limited = (nsjconf->cgroup_cpu_ms_per_sec != 0U);
	return cpu_limited;
}
146+
147+
// We will use this buf to read from cgroup.subtree_control to see if
148+
// the root cgroup has the necessary controllers listed
149+
#define SUBTREE_CONTROL_BUF_LEN 0x40
150+
151+
// Returns true if `name` occurs as a complete whitespace-delimited entry in `list`
// (the contents of a cgroup.subtree_control file). Whole-token comparison is
// required: a plain substring search for "cpu" would also match "cpuset".
static bool subtreeControlHas(const char *list, const char *name) {
	static const char kDelims[] = " \t\n";
	const size_t name_len = strlen(name);
	const char *p = list;
	while (*p != '\0') {
		p += strspn(p, kDelims);
		const size_t tok_len = strcspn(p, kDelims);
		if (tok_len == name_len && strncmp(p, name, name_len) == 0) {
			return true;
		}
		p += tok_len;
	}
	return false;
}

// Prepares the cgroupv2 mount for use by nsjail.
//
// Reads cgroup.subtree_control from the configured cgroupv2 mount and, for every
// controller the configuration needs (memory / pids / cpu) that is not listed
// there yet, enables it via enableCgroupSubtree().
//
// Returns false if cgroup.subtree_control cannot be read or if a required
// controller cannot be enabled; true otherwise.
bool setup(nsjconf_t *nsjconf) {
	const std::string control_path = nsjconf->cgroupv2_mount + "/cgroup.subtree_control";

	char buf[SUBTREE_CONTROL_BUF_LEN];
	// Keep one byte for the terminating NUL.
	const ssize_t len = util::readFromFile(control_path.c_str(), buf, sizeof(buf) - 1);
	if (len < 0) {
		LOG_W("cgroupv2 setup: Could not read root subtree_control");
		return false;
	}
	buf[len] = '\0';

	const struct {
		const char *name;
		bool needed;
	} controllers[] = {
	    {"memory", needMemoryController(nsjconf)},
	    {"pids", needPidsController(nsjconf)},
	    {"cpu", needCpuController(nsjconf)},
	};

	// If the list was truncated by the fixed-size buffer, a controller may look
	// missing even though it is enabled; in that case we just request it again.
	for (const auto &ctrl : controllers) {
		if (ctrl.needed && !subtreeControlHas(buf, ctrl.name)) {
			RETURN_ON_FAILURE(enableCgroupSubtree(nsjconf, ctrl.name, getpid()));
		}
	}
	return true;
}
183+
184+
// Probes the filesystem at nsjconf->cgroupv2_mount and records in
// nsjconf->use_cgroupv2 whether it is a cgroup2 filesystem.
//
// Returns false (and clears use_cgroupv2) if statfs() on the mount path fails;
// returns true whenever the probe itself succeeded, regardless of the result.
bool detectCgroupv2(nsjconf_t *nsjconf) {
	const char *mount_point = nsjconf->cgroupv2_mount.c_str();

	struct statfs fs_info;
	if (statfs(mount_point, &fs_info) != 0) {
		LOG_D("statfs %s failed with %d", mount_point, errno);
		nsjconf->use_cgroupv2 = false;
		return false;
	}

	// The filesystem magic number identifies a cgroup2 mount.
	nsjconf->use_cgroupv2 = (fs_info.f_type == CGROUP2_SUPER_MAGIC);
	return true;
}
195+
86196
static bool initNsFromParentMem(nsjconf_t *nsjconf, pid_t pid) {
87197
ssize_t swap_max = nsjconf->cgroup_mem_swap_max;
88198
if (nsjconf->cgroup_mem_memsw_max > (size_t)0) {

cgroup2.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@ namespace cgroup2 {
3232
bool initNsFromParent(nsjconf_t* nsjconf, pid_t pid);
3333
bool initNs(void);
3434
void finishFromParent(nsjconf_t* nsjconf, pid_t pid);
35+
bool setup(nsjconf_t *nsjconf);
36+
bool detectCgroupv2(nsjconf_t *nsjconf);
3537

3638
} // namespace cgroup2
3739

cmdline.cc

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,7 @@ struct custom_option custom_opts[] = {
158158
{ { "cgroup_cpu_parent", required_argument, NULL, 0x0833 }, "Which pre-existing cpu cgroup to use as a parent (default: 'NSJAIL')" },
159159
{ { "cgroupv2_mount", required_argument, NULL, 0x0834}, "Location of cgroupv2 directory (default: '/sys/fs/cgroup')"},
160160
{ { "use_cgroupv2", no_argument, NULL, 0x0835}, "Use cgroup v2"},
161+
{ { "detect_cgroupv2", no_argument, NULL, 0x0836}, "Use cgroupv2, if it is available. (Specify instead of use_cgroupv2)"},
161162
{ { "iface_no_lo", no_argument, NULL, 0x700 }, "Don't bring the 'lo' interface up" },
162163
{ { "iface_own", required_argument, NULL, 0x704 }, "Move this existing network interface into the new NET namespace. Can be specified multiple times" },
163164
{ { "macvlan_iface", required_argument, NULL, 'I' }, "Interface which will be cloned (MACVLAN) and put inside the subprocess' namespace as 'vs'" },
@@ -473,6 +474,7 @@ std::unique_ptr<nsjconf_t> parseArgs(int argc, char* argv[]) {
473474
nsjconf->cgroup_cpu_ms_per_sec = 0U;
474475
nsjconf->cgroupv2_mount = "/sys/fs/cgroup";
475476
nsjconf->use_cgroupv2 = false;
477+
nsjconf->detect_cgroupv2 = false;
476478
nsjconf->iface_lo = true;
477479
nsjconf->iface_vs_ip = "0.0.0.0";
478480
nsjconf->iface_vs_nm = "255.255.255.0";
@@ -912,6 +914,9 @@ std::unique_ptr<nsjconf_t> parseArgs(int argc, char* argv[]) {
912914
case 0x835:
913915
nsjconf->use_cgroupv2 = true;
914916
break;
917+
case 0x836:
918+
nsjconf->detect_cgroupv2 = true;
919+
break;
915920
case 'P':
916921
nsjconf->kafel_file_path = optarg;
917922
break;

config.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -266,6 +266,7 @@ static bool configParseInternal(nsjconf_t* nsjconf, const nsjail::NsJailConfig&
266266
nsjconf->cgroup_cpu_parent = njc.cgroup_cpu_parent();
267267
nsjconf->cgroupv2_mount = njc.cgroupv2_mount();
268268
nsjconf->use_cgroupv2 = njc.use_cgroupv2();
269+
nsjconf->detect_cgroupv2 = njc.detect_cgroupv2();
269270

270271
nsjconf->iface_lo = !(njc.iface_no_lo());
271272
for (ssize_t i = 0; i < njc.iface_own().size(); i++) {

config.proto

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -272,4 +272,7 @@ message NsJailConfig {
272272
/* Set this to true to forward fatal signals to the child process instead
273273
* of always using SIGKILL. */
274274
optional bool forward_signals = 94 [default = false];
275+
276+
/* Check whether cgroupv2 is available, and use it if available. */
277+
optional bool detect_cgroupv2 = 95 [default = false];
275278
}

nsjail.cc

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@
4646
#include "sandbox.h"
4747
#include "subproc.h"
4848
#include "util.h"
49+
#include "cgroup2.h"
4950

5051
namespace nsjail {
5152

@@ -342,6 +343,19 @@ int main(int argc, char* argv[]) {
342343
if (!nsjail::setTimer(nsjconf.get())) {
343344
LOG_F("nsjail::setTimer() failed");
344345
}
346+
347+
if (nsjconf->detect_cgroupv2) {
348+
cgroup2::detectCgroupv2(nsjconf.get());
349+
LOG_I("Detected cgroups version: %d", nsjconf->use_cgroupv2 ? 2 : 1);
350+
}
351+
352+
if (nsjconf->use_cgroupv2) {
353+
if (!cgroup2::setup(nsjconf.get())) {
354+
LOG_E("Couldn't setup parent cgroup (cgroupv2)");
355+
return -1;
356+
}
357+
}
358+
345359
if (!sandbox::preparePolicy(nsjconf.get())) {
346360
LOG_F("Couldn't prepare sandboxing policy");
347361
}

nsjail.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,7 @@ struct nsjconf_t {
163163
unsigned int cgroup_cpu_ms_per_sec;
164164
std::string cgroupv2_mount;
165165
bool use_cgroupv2;
166+
bool detect_cgroupv2;
166167
std::string kafel_file_path;
167168
std::string kafel_string;
168169
struct sock_fprog seccomp_fprog;

util.cc

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -89,16 +89,20 @@ bool writeToFd(int fd, const void* buf, size_t len) {
8989
return true;
9090
}
9191

92-
bool writeBufToFile(const char* filename, const void* buf, size_t len, int open_flags) {
92+
bool writeBufToFile(const char* filename, const void* buf, size_t len, int open_flags, bool log_errors) {
9393
int fd;
9494
TEMP_FAILURE_RETRY(fd = open(filename, open_flags, 0644));
9595
if (fd == -1) {
96-
PLOG_E("Couldn't open '%s' for writing", filename);
96+
if (log_errors) {
97+
PLOG_E("Couldn't open '%s' for writing", filename);
98+
}
9799
return false;
98100
}
99101

100102
if (!writeToFd(fd, buf, len)) {
101-
PLOG_E("Couldn't write '%zu' bytes to file '%s' (fd='%d')", len, filename, fd);
103+
if (log_errors) {
104+
PLOG_E("Couldn't write '%zu' bytes to file '%s' (fd='%d')", len, filename, fd);
105+
}
102106
close(fd);
103107
if (open_flags & O_CREAT) {
104108
unlink(filename);

util.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ namespace util {
4646
ssize_t readFromFd(int fd, void* buf, size_t len);
4747
ssize_t readFromFile(const char* fname, void* buf, size_t len);
4848
bool writeToFd(int fd, const void* buf, size_t len);
49-
bool writeBufToFile(const char* filename, const void* buf, size_t len, int open_flags);
49+
bool writeBufToFile(const char* filename, const void* buf, size_t len, int open_flags, bool log_errors = true);
5050
bool createDirRecursively(const char* dir);
5151
std::string* StrAppend(std::string* str, const char* format, ...)
5252
__attribute__((format(printf, 2, 3)));

0 commit comments

Comments
 (0)