28
28
#include < stdio.h>
29
29
#include < string.h>
30
30
#include < sys/stat.h>
31
+ #include < sys/vfs.h>
32
+ #include < linux/magic.h>
31
33
#include < unistd.h>
32
34
33
35
#include < fstream>
39
41
40
42
namespace cgroup2 {
41
43
44
+ static bool addPidToProcList (const std::string &cgroup_path, pid_t pid);
45
+
42
46
// Builds the path of the per-process cgroup directory: the configured cgroupv2
// mount point, followed by the NSJAIL marker and the decimal value of 'pid'.
static std::string getCgroupPath(nsjconf_t *nsjconf, pid_t pid) {
	std::string path = nsjconf->cgroupv2_mount;
	path += " /NSJAIL.";
	path += std::to_string(pid);
	return path;
}
49
+ static std::string getJailCgroupPath (nsjconf_t *nsjconf) {
50
+ return nsjconf->cgroupv2_mount + " /NSJAIL_SELF." + std::to_string (getpid ());
51
+ }
45
52
46
53
static bool createCgroup (const std::string &cgroup_path, pid_t pid) {
47
54
LOG_D (" Create '%s' for pid=%d" , cgroup_path.c_str (), (int )pid);
@@ -52,6 +59,39 @@ static bool createCgroup(const std::string &cgroup_path, pid_t pid) {
52
59
return true ;
53
60
}
54
61
62
+ static bool moveSelfIntoChildCgroup (nsjconf_t *nsjconf) {
63
+ // Move ourselves into another group to avoid the 'No internal processes' rule
64
+ // https://unix.stackexchange.com/a/713343
65
+ std::string jail_cgroup_path = getJailCgroupPath (nsjconf);
66
+ LOG_I (" nsjail is moving itself to a new child cgroup: %s\n " , jail_cgroup_path.c_str ());
67
+ RETURN_ON_FAILURE (createCgroup (jail_cgroup_path, getpid ()));
68
+ RETURN_ON_FAILURE (addPidToProcList (jail_cgroup_path, 0 ));
69
+ return true ;
70
+ }
71
+
72
+
73
+ static bool enableCgroupSubtree (nsjconf_t *nsjconf, const std::string &controller, pid_t pid) {
74
+ std::string cgroup_path = nsjconf->cgroupv2_mount ;
75
+ LOG_D (" Enable cgroup.subtree_control +'%s' to '%s' for pid=%d" , controller.c_str (), cgroup_path.c_str (), pid);
76
+ std::string val = " +" + controller;
77
+
78
+ // Try once without moving the nsjail process and if that fails then try moving the nsjail process
79
+ // into a child cgroup before trying a second time.
80
+ if (util::writeBufToFile (
81
+ (cgroup_path + " /cgroup.subtree_control" ).c_str (), val.c_str (), val.length (), O_WRONLY, false )) {
82
+ return true ;
83
+ }
84
+ if (errno == EBUSY) {
85
+ RETURN_ON_FAILURE (moveSelfIntoChildCgroup (nsjconf));
86
+ if (util::writeBufToFile (
87
+ (cgroup_path + " /cgroup.subtree_control" ).c_str (), val.c_str (), val.length (), O_WRONLY)) {
88
+ return true ;
89
+ }
90
+ }
91
+ LOG_E (" Could not apply '%s' to cgroup.subtree_control in '%s'. If you are running in Docker, nsjail MUST be the root process to use cgroups." , val.c_str (), cgroup_path.c_str ());
92
+ return false ;
93
+ }
94
+
55
95
static bool writeToCgroup (
56
96
const std::string &cgroup_path, const std::string &resource, const std::string &value) {
57
97
LOG_I (" Setting '%s' to '%s'" , resource.c_str (), value.c_str ());
@@ -83,6 +123,76 @@ static void removeCgroup(const std::string &cgroup_path) {
83
123
}
84
124
}
85
125
126
+ static bool needMemoryController (nsjconf_t *nsjconf) {
127
+ // Check if we need 'memory'
128
+ // This matches the check in initNsFromParentMem
129
+ ssize_t swap_max = nsjconf->cgroup_mem_swap_max ;
130
+ if (nsjconf->cgroup_mem_memsw_max > (size_t )0 ) {
131
+ swap_max = nsjconf->cgroup_mem_memsw_max - nsjconf->cgroup_mem_max ;
132
+ }
133
+ if (nsjconf->cgroup_mem_max == (size_t )0 && swap_max < (ssize_t )0 ) {
134
+ return false ;
135
+ }
136
+ return true ;
137
+ }
138
+
139
+ static bool needPidsController (nsjconf_t *nsjconf) {
140
+ return nsjconf->cgroup_pids_max != 0 ;
141
+ }
142
+
143
+ static bool needCpuController (nsjconf_t *nsjconf) {
144
+ return nsjconf->cgroup_cpu_ms_per_sec != 0U ;
145
+ }
146
+
147
+ // We will use this buf to read from cgroup.subtree_control to see if
148
+ // the root cgroup has the necessary controllers listed
149
+ #define SUBTREE_CONTROL_BUF_LEN 0x40
150
+
151
+ bool setup (nsjconf_t *nsjconf) {
152
+ // Read from cgroup.subtree_control in the root to see if
153
+ // the controllers we need are there.
154
+ auto p = nsjconf->cgroupv2_mount + " /cgroup.subtree_control" ;
155
+ char buf[SUBTREE_CONTROL_BUF_LEN];
156
+ int read = util::readFromFile (p.c_str (), buf, SUBTREE_CONTROL_BUF_LEN-1 );
157
+ if (read < 0 ) {
158
+ LOG_W (" cgroupv2 setup: Could not read root subtree_control" );
159
+ return false ;
160
+ }
161
+ buf[read] = 0 ;
162
+
163
+ // Are the controllers we need there?
164
+ bool subtree_ok = (!needMemoryController (nsjconf) || strstr (buf, " memory" )) &&
165
+ (!needPidsController (nsjconf) || strstr (buf, " pids" )) &&
166
+ (!needCpuController (nsjconf) || strstr (buf, " cpu" ));
167
+ if (!subtree_ok) {
168
+ // Now we can write to the root cgroup.subtree_control
169
+ if (needMemoryController (nsjconf)) {
170
+ RETURN_ON_FAILURE (enableCgroupSubtree (nsjconf, " memory" , getpid ()));
171
+ }
172
+
173
+ if (needPidsController (nsjconf)) {
174
+ RETURN_ON_FAILURE (enableCgroupSubtree (nsjconf, " pids" , getpid ()));
175
+ }
176
+
177
+ if (needCpuController (nsjconf)) {
178
+ RETURN_ON_FAILURE (enableCgroupSubtree (nsjconf, " cpu" , getpid ()));
179
+ }
180
+ }
181
+ return true ;
182
+ }
183
+
184
+ bool detectCgroupv2 (nsjconf_t *nsjconf) {
185
+ // Check cgroupv2_mount, if it is a cgroup2 mount, use it.
186
+ struct statfs buf;
187
+ if (statfs (nsjconf->cgroupv2_mount .c_str (), &buf)) {
188
+ LOG_D (" statfs %s failed with %d" , nsjconf->cgroupv2_mount .c_str (), errno);
189
+ nsjconf->use_cgroupv2 = false ;
190
+ return false ;
191
+ }
192
+ nsjconf->use_cgroupv2 = (buf.f_type == CGROUP2_SUPER_MAGIC);
193
+ return true ;
194
+ }
195
+
86
196
static bool initNsFromParentMem (nsjconf_t *nsjconf, pid_t pid) {
87
197
ssize_t swap_max = nsjconf->cgroup_mem_swap_max ;
88
198
if (nsjconf->cgroup_mem_memsw_max > (size_t )0 ) {
0 commit comments