diff options
author | Daniel Dao <dqminh89@gmail.com> | 2020-10-12 14:57:47 +0100 |
---|---|---|
committer | Daniel Dao <dqminh89@gmail.com> | 2021-01-26 15:01:21 +0000 |
commit | bd5eb8a9db2bf3154d8bc4231ac0c655c78df3ae (patch) | |
tree | 5715edce61354b4bbe52f8c9dd37eaac44d76d54 | |
parent | f5736fa2bf91e1bb3fd9f9625dba8c800bf2adb5 (diff) |
runsc: check for nested cgroup when generating cgroup paths
In a nested container, /proc/self/cgroup shows paths as seen from the host,
so we need to re-process each path to get a relative path that can be used
inside the container.
Without it, runsc generates ugly paths that may trip up other cgroup
watchers that expect clean paths. An example of an ugly path is:
```
/sys/fs/cgroup/memory/docker/e383892b29290ae8005d535f2dadc4a583bb354d5bb1ba8c10bf900d92c4db93/docker/e383892b29290ae8005d535f2dadc4a583bb354d5bb1ba8c10bf900d92c4db93/cgroupPath
```
Notice the duplication of `docker/e383892b29290ae8005d535f2dadc4a583bb354d5bb1ba8c10bf900d92c4db93`.
`/proc/1/cgroup` looks like:
```
12:perf_event:/docker/e383892b29290ae8005d535f2dadc4a583bb354d5bb1ba8c10bf900d92c4db93
11:blkio:/docker/e383892b29290ae8005d535f2dadc4a583bb354d5bb1ba8c10bf900d92c4db93
10:freezer:/docker/e383892b29290ae8005d535f2dadc4a583bb354d5bb1ba8c10bf900d92c4db93
9:hugetlb:/docker/e383892b29290ae8005d535f2dadc4a583bb354d5bb1ba8c10bf900d92c4db93
8:devices:/docker/e383892b29290ae8005d535f2dadc4a583bb354d5bb1ba8c10bf900d92c4db93
7:rdma:/
6:pids:/docker/e383892b29290ae8005d535f2dadc4a583bb354d5bb1ba8c10bf900d92c4db93
5:cpuset:/docker/e383892b29290ae8005d535f2dadc4a583bb354d5bb1ba8c10bf900d92c4db93
4:cpu,cpuacct:/docker/e383892b29290ae8005d535f2dadc4a583bb354d5bb1ba8c10bf900d92c4db93
3:memory:/docker/e383892b29290ae8005d535f2dadc4a583bb354d5bb1ba8c10bf900d92c4db93
2:net_cls,net_prio:/docker/e383892b29290ae8005d535f2dadc4a583bb354d5bb1ba8c10bf900d92c4db93
1:name=systemd:/docker/e383892b29290ae8005d535f2dadc4a583bb354d5bb1ba8c10bf900d92c4db93
0::/system.slice/containerd.service
```
This is not necessary when the parent container was created with a cgroup
namespace, but that setup is not very common right now.
Signed-off-by: Daniel Dao <dqminh89@gmail.com>
-rw-r--r-- | runsc/cgroup/cgroup.go | 38 | ||||
-rw-r--r-- | runsc/cgroup/cgroup_test.go | 149 |
2 files changed, 165 insertions, 22 deletions
diff --git a/runsc/cgroup/cgroup.go b/runsc/cgroup/cgroup.go index 13c6a16a0..e9ae59a92 100644 --- a/runsc/cgroup/cgroup.go +++ b/runsc/cgroup/cgroup.go @@ -203,6 +203,19 @@ func LoadPaths(pid string) (map[string]string, error) { } func loadPathsHelper(cgroup io.Reader) (map[string]string, error) { + // For nested containers, in /proc/self/cgroup we see paths from host, + // which don't exist in container, so recover the container paths here by + // double-checking with /proc/pid/mountinfo + mountinfo, err := os.Open("/proc/self/mountinfo") + if err != nil { + return nil, err + } + defer mountinfo.Close() + + return loadPathsHelperWithMountinfo(cgroup, mountinfo) +} + +func loadPathsHelperWithMountinfo(cgroup, mountinfo io.Reader) (map[string]string, error) { paths := make(map[string]string) scanner := bufio.NewScanner(cgroup) @@ -225,6 +238,31 @@ func loadPathsHelper(cgroup io.Reader) (map[string]string, error) { if err := scanner.Err(); err != nil { return nil, err } + + mfScanner := bufio.NewScanner(mountinfo) + for mfScanner.Scan() { + txt := mfScanner.Text() + fields := strings.Fields(txt) + if len(fields) < 9 || fields[len(fields)-3] != "cgroup" { + continue + } + for _, opt := range strings.Split(fields[len(fields)-1], ",") { + // Remove prefix for cgroups with no controller, eg. systemd. 
+ opt = strings.TrimPrefix(opt, "name=") + if cgroupPath, ok := paths[opt]; ok { + root := fields[3] + relCgroupPath, err := filepath.Rel(root, cgroupPath) + if err != nil { + return nil, err + } + paths[opt] = relCgroupPath + } + } + } + if err := mfScanner.Err(); err != nil { + return nil, err + } + return paths, nil } diff --git a/runsc/cgroup/cgroup_test.go b/runsc/cgroup/cgroup_test.go index 931144cf9..2ff3c9e69 100644 --- a/runsc/cgroup/cgroup_test.go +++ b/runsc/cgroup/cgroup_test.go @@ -25,6 +25,83 @@ import ( "gvisor.dev/gvisor/pkg/test/testutil" ) +var debianMountinfo = ` +24 31 0:22 / /sys rw,nosuid,nodev,noexec,relatime shared:7 - sysfs sysfs rw +25 31 0:23 / /proc rw,nosuid,nodev,noexec,relatime shared:15 - proc proc rw +26 31 0:5 / /dev rw,nosuid,noexec,relatime shared:2 - devtmpfs udev rw,size=16294760k,nr_inodes=4073690,mode=755 +27 26 0:24 / /dev/pts rw,nosuid,noexec,relatime shared:3 - devpts devpts rw,gid=5,mode=620,ptmxmode=000 +28 31 0:25 / /run rw,nosuid,nodev,noexec,relatime shared:5 - tmpfs tmpfs rw,size=3268816k,mode=755 +31 1 253:1 / / rw,noatime shared:1 - ext4 /dev/mapper/data-root rw,errors=remount-ro +32 24 0:7 / /sys/kernel/security rw,nosuid,nodev,noexec,relatime shared:8 - securityfs securityfs rw +33 26 0:28 / /dev/shm rw,nosuid,nodev shared:4 - tmpfs tmpfs rw +34 28 0:29 / /run/lock rw,nosuid,nodev,noexec,relatime shared:6 - tmpfs tmpfs rw,size=5120k +35 24 0:30 / /sys/fs/cgroup ro,nosuid,nodev,noexec shared:9 - tmpfs tmpfs ro,size=4096k,nr_inodes=1024,mode=755 +36 35 0:31 / /sys/fs/cgroup/unified rw,nosuid,nodev,noexec,relatime shared:10 - cgroup2 cgroup2 rw,nsdelegate +37 35 0:32 / /sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:11 - cgroup cgroup rw,xattr,name=systemd +38 24 0:33 / /sys/fs/pstore rw,nosuid,nodev,noexec,relatime shared:12 - pstore pstore rw +39 24 0:34 / /sys/firmware/efi/efivars rw,nosuid,nodev,noexec,relatime shared:13 - efivarfs efivarfs rw +40 24 0:35 / /sys/fs/bpf 
rw,nosuid,nodev,noexec,relatime shared:14 - bpf none rw,mode=700 +41 35 0:36 / /sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:16 - cgroup cgroup rw,cpu,cpuacct +42 35 0:37 / /sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,freezer +43 35 0:38 / /sys/fs/cgroup/hugetlb rw,nosuid,nodev,noexec,relatime shared:18 - cgroup cgroup rw,hugetlb +44 35 0:39 / /sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime shared:19 - cgroup cgroup rw,cpuset +45 35 0:40 / /sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime shared:20 - cgroup cgroup rw,net_cls,net_prio +46 35 0:41 / /sys/fs/cgroup/pids rw,nosuid,nodev,noexec,relatime shared:21 - cgroup cgroup rw,pids +47 35 0:42 / /sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:22 - cgroup cgroup rw,perf_event +48 35 0:43 / /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:23 - cgroup cgroup rw,memory +49 35 0:44 / /sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:24 - cgroup cgroup rw,blkio +50 35 0:45 / /sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:25 - cgroup cgroup rw,devices +51 35 0:46 / /sys/fs/cgroup/rdma rw,nosuid,nodev,noexec,relatime shared:26 - cgroup cgroup rw,rdma +52 25 0:47 / /proc/sys/fs/binfmt_misc rw,relatime shared:27 - autofs systemd-1 rw,fd=28,pgrp=1,timeout=0,minproto=5,maxproto=5,direct,pipe_ino=23671 +53 26 0:20 / /dev/mqueue rw,nosuid,nodev,noexec,relatime shared:28 - mqueue mqueue rw +54 26 0:48 / /dev/hugepages rw,relatime shared:29 - hugetlbfs hugetlbfs rw,pagesize=2M +55 24 0:6 / /sys/kernel/debug rw,nosuid,nodev,noexec,relatime shared:30 - debugfs debugfs rw +56 24 0:11 / /sys/kernel/tracing rw,nosuid,nodev,noexec,relatime shared:31 - tracefs tracefs rw +57 24 0:49 / /sys/fs/fuse/connections rw,nosuid,nodev,noexec,relatime shared:32 - fusectl fusectl rw +58 24 0:21 / /sys/kernel/config rw,nosuid,nodev,noexec,relatime shared:33 - configfs configfs rw +` + +var dindMountinfo = ` +1300 1252 
0:55 / / rw,relatime master:665 - overlay overlay rw,lowerdir=/var/lib/docker/overlay2/l/4FX5VCS5UM46IN3FMFIQ5Z3UPH:/var/lib/docker/overlay2/l/3LYKDG2G7WMWFN7KKKZJNQB7AO:/var/lib/docker/overlay2/l/X4N4WIO64ERVFM35SGMCXMW5HX:/var/lib/docker/overlay2/l/WLV7ZCKK2OJHEADMAKFKCITYVA:/var/lib/docker/overlay2/l/RB6D5GFMA2JVMWGG5N7ZWEXQII:/var/lib/docker/overlay2/l/U3TWA3AQ6HAGG67SIDEBFJ2JJF:/var/lib/docker/overlay2/l/WC6XFGD7YWGQLOSNQWLPVCCQX2:/var/lib/docker/overlay2/l/DW235S3RJLDSGSNXHL2U3WVCCL:/var/lib/docker/overlay2/l/D4YM6NOOKDBR7QRG6L6LWHQUZK:/var/lib/docker/overlay2/l/YRLU243KN3AMWHZVPNUMGYD75M:/var/lib/docker/overlay2/l/IISAPU47O4JN6JC5I4A43SFWM7:/var/lib/docker/overlay2/l/UVIPA27BMQWS6NRHHU3QEI5YZT,upperdir=/var/lib/docker/overlay2/749721f78c6ec4d47aacbf01f29a4bd495b1b7a2e9b861fb10f14126d359fd04/diff,workdir=/var/lib/docker/overlay2/749721f78c6ec4d47aacbf01f29a4bd495b1b7a2e9b861fb10f14126d359fd04/work +1301 1300 0:59 / /proc rw,nosuid,nodev,noexec,relatime - proc proc rw +1302 1300 0:61 / /dev rw,nosuid - tmpfs tmpfs rw,size=65536k,mode=755 +1303 1302 0:62 / /dev/pts rw,nosuid,noexec,relatime - devpts devpts rw,gid=5,mode=620,ptmxmode=666 +1304 1300 0:63 / /sys ro,nosuid,nodev,noexec,relatime - sysfs sysfs ro +1305 1304 0:64 / /sys/fs/cgroup rw,nosuid,nodev,noexec,relatime - tmpfs tmpfs rw,mode=755 +1306 1305 0:32 /docker/136811d8fa1136e2746d10f6443c4c787c3cfbab5273270cc3aeeb3a94b3cc51 /sys/fs/cgroup/systemd ro,nosuid,nodev,noexec,relatime master:11 - cgroup cgroup rw,xattr,name=systemd +1307 1305 0:36 /docker/136811d8fa1136e2746d10f6443c4c787c3cfbab5273270cc3aeeb3a94b3cc51 /sys/fs/cgroup/cpu,cpuacct ro,nosuid,nodev,noexec,relatime master:16 - cgroup cgroup rw,cpu,cpuacct +1308 1305 0:37 /docker/136811d8fa1136e2746d10f6443c4c787c3cfbab5273270cc3aeeb3a94b3cc51 /sys/fs/cgroup/freezer ro,nosuid,nodev,noexec,relatime master:17 - cgroup cgroup rw,freezer +1309 1305 0:38 /docker/136811d8fa1136e2746d10f6443c4c787c3cfbab5273270cc3aeeb3a94b3cc51 /sys/fs/cgroup/hugetlb 
ro,nosuid,nodev,noexec,relatime master:18 - cgroup cgroup rw,hugetlb +1310 1305 0:39 /docker/136811d8fa1136e2746d10f6443c4c787c3cfbab5273270cc3aeeb3a94b3cc51 /sys/fs/cgroup/cpuset ro,nosuid,nodev,noexec,relatime master:19 - cgroup cgroup rw,cpuset +1311 1305 0:40 /docker/136811d8fa1136e2746d10f6443c4c787c3cfbab5273270cc3aeeb3a94b3cc51 /sys/fs/cgroup/net_cls,net_prio ro,nosuid,nodev,noexec,relatime master:20 - cgroup cgroup rw,net_cls,net_prio +1312 1305 0:41 /docker/136811d8fa1136e2746d10f6443c4c787c3cfbab5273270cc3aeeb3a94b3cc51 /sys/fs/cgroup/pids ro,nosuid,nodev,noexec,relatime master:21 - cgroup cgroup rw,pids +1313 1305 0:42 /docker/136811d8fa1136e2746d10f6443c4c787c3cfbab5273270cc3aeeb3a94b3cc51 /sys/fs/cgroup/perf_event ro,nosuid,nodev,noexec,relatime master:22 - cgroup cgroup rw,perf_event +1314 1305 0:43 /docker/136811d8fa1136e2746d10f6443c4c787c3cfbab5273270cc3aeeb3a94b3cc51 /sys/fs/cgroup/memory ro,nosuid,nodev,noexec,relatime master:23 - cgroup cgroup rw,memory +1316 1305 0:44 /docker/136811d8fa1136e2746d10f6443c4c787c3cfbab5273270cc3aeeb3a94b3cc51 /sys/fs/cgroup/blkio ro,nosuid,nodev,noexec,relatime master:24 - cgroup cgroup rw,blkio +1317 1305 0:45 /docker/136811d8fa1136e2746d10f6443c4c787c3cfbab5273270cc3aeeb3a94b3cc51 /sys/fs/cgroup/devices ro,nosuid,nodev,noexec,relatime master:25 - cgroup cgroup rw,devices +1318 1305 0:46 / /sys/fs/cgroup/rdma ro,nosuid,nodev,noexec,relatime master:26 - cgroup cgroup rw,rdma +1319 1302 0:58 / /dev/mqueue rw,nosuid,nodev,noexec,relatime - mqueue mqueue rw +1320 1302 0:65 / /dev/shm rw,nosuid,nodev,noexec,relatime - tmpfs shm rw,size=65536k +1321 1300 253:1 /var/lib/docker/containers/136811d8fa1136e2746d10f6443c4c787c3cfbab5273270cc3aeeb3a94b3cc51/resolv.conf /etc/resolv.conf rw,noatime - ext4 /dev/mapper/data-root rw,errors=remount-ro +1322 1300 253:1 /var/lib/docker/containers/136811d8fa1136e2746d10f6443c4c787c3cfbab5273270cc3aeeb3a94b3cc51/hostname /etc/hostname rw,noatime - ext4 /dev/mapper/data-root 
rw,errors=remount-ro +1323 1300 253:1 /var/lib/docker/containers/136811d8fa1136e2746d10f6443c4c787c3cfbab5273270cc3aeeb3a94b3cc51/hosts /etc/hosts rw,noatime - ext4 /dev/mapper/data-root rw,errors=remount-ro +1324 1300 253:1 /var/lib/docker/volumes/76f4f27c7bdba8207958a4aed6692c400f98819aa32af1faf38ebb21fcb4bea3/_data /var/lib/docker rw,noatime master:1 - ext4 /dev/mapper/data-root rw,errors=remount-ro +1253 1302 0:62 /0 /dev/console rw,nosuid,noexec,relatime - devpts devpts rw,gid=5,mode=620,ptmxmode=666 +1254 1301 0:59 /bus /proc/bus ro,relatime - proc proc rw +1255 1301 0:59 /fs /proc/fs ro,relatime - proc proc rw +1256 1301 0:59 /irq /proc/irq ro,relatime - proc proc rw +1257 1301 0:59 /sys /proc/sys ro,relatime - proc proc rw +1258 1301 0:59 /sysrq-trigger /proc/sysrq-trigger ro,relatime - proc proc rw +1259 1301 0:66 / /proc/asound ro,relatime - tmpfs tmpfs ro +1260 1301 0:67 / /proc/acpi ro,relatime - tmpfs tmpfs ro +1261 1301 0:61 /null /proc/kcore rw,nosuid - tmpfs tmpfs rw,size=65536k,mode=755 +1262 1301 0:61 /null /proc/keys rw,nosuid - tmpfs tmpfs rw,size=65536k,mode=755 +1263 1301 0:61 /null /proc/timer_list rw,nosuid - tmpfs tmpfs rw,size=65536k,mode=755 +1264 1301 0:61 /null /proc/sched_debug rw,nosuid - tmpfs tmpfs rw,size=65536k,mode=755 +1265 1301 0:68 / /proc/scsi ro,relatime - tmpfs tmpfs ro +1266 1304 0:69 / /sys/firmware ro,relatime - tmpfs tmpfs ro +` + func TestUninstallEnoent(t *testing.T) { c := Cgroup{ // set a non-existent name @@ -653,60 +730,88 @@ func TestPids(t *testing.T) { func TestLoadPaths(t *testing.T) { for _, tc := range []struct { - name string - cgroups string - want map[string]string - err string + name string + cgroups string + mountinfo string + want map[string]string + err string }{ { - name: "abs-path", - cgroups: "0:ctr:/path", - want: map[string]string{"ctr": "/path"}, + name: "abs-path", + cgroups: "0:ctr:/path", + mountinfo: debianMountinfo, + want: map[string]string{"ctr": "/path"}, }, { - name: "rel-path", - 
cgroups: "0:ctr:rel-path", - want: map[string]string{"ctr": "rel-path"}, + name: "rel-path", + cgroups: "0:ctr:rel-path", + mountinfo: debianMountinfo, + want: map[string]string{"ctr": "rel-path"}, }, { - name: "non-controller", - cgroups: "0:name=systemd:/path", - want: map[string]string{"systemd": "/path"}, + name: "non-controller", + cgroups: "0:name=systemd:/path", + mountinfo: debianMountinfo, + want: map[string]string{"systemd": "path"}, }, { - name: "empty", + name: "empty", + mountinfo: debianMountinfo, }, { name: "multiple", cgroups: "0:ctr0:/path0\n" + "1:ctr1:/path1\n" + "2::/empty\n", + mountinfo: debianMountinfo, want: map[string]string{ "ctr0": "/path0", "ctr1": "/path1", }, }, { - name: "missing-field", - cgroups: "0:nopath\n", - err: "invalid cgroups file", + name: "missing-field", + cgroups: "0:nopath\n", + mountinfo: debianMountinfo, + err: "invalid cgroups file", }, { - name: "too-many-fields", - cgroups: "0:ctr:/path:extra\n", - err: "invalid cgroups file", + name: "too-many-fields", + cgroups: "0:ctr:/path:extra\n", + mountinfo: debianMountinfo, + err: "invalid cgroups file", }, { name: "multiple-malformed", cgroups: "0:ctr0:/path0\n" + "1:ctr1:/path1\n" + "2:\n", - err: "invalid cgroups file", + mountinfo: debianMountinfo, + err: "invalid cgroups file", + }, + { + name: "nested-cgroup", + cgroups: `9:memory:/docker/136811d8fa1136e2746d10f6443c4c787c3cfbab5273270cc3aeeb3a94b3cc51 +2:cpu,cpuacct:/docker/136811d8fa1136e2746d10f6443c4c787c3cfbab5273270cc3aeeb3a94b3cc51 +1:name=systemd:/docker/136811d8fa1136e2746d10f6443c4c787c3cfbab5273270cc3aeeb3a94b3cc51 +0::/system.slice/containerd.service`, + mountinfo: dindMountinfo, + // we want relative path to /sys/fs/cgroup inside the nested container. 
+ // Subcroup inside the container will be created at /sys/fs/cgroup/cpu + // This will be /sys/fs/cgroup/cpu/docker/136811d8fa1136e2746d10f6443c4c787c3cfbab5273270cc3aeeb3a94b3cc51/CGROUP_NAME + // outside the container + want: map[string]string{ + "memory": ".", + "cpu": ".", + "cpuacct": ".", + "systemd": ".", + }, }, } { t.Run(tc.name, func(t *testing.T) { r := strings.NewReader(tc.cgroups) - got, err := loadPathsHelper(r) + mountinfo := strings.NewReader(tc.mountinfo) + got, err := loadPathsHelperWithMountinfo(r, mountinfo) if len(tc.err) == 0 { if err != nil { t.Fatalf("Unexpected error: %v", err) |