|
17 | 17 | package manager |
18 | 18 |
|
19 | 19 | import ( |
20 | | - "os" |
21 | | - "os/exec" |
22 | | - "syscall" |
| 20 | + "fmt" |
| 21 | + |
| 22 | + "golang.org/x/sys/unix" |
23 | 23 | ) |
24 | 24 |
|
25 | | -// cloneMntNs configures the child command to start in a new user + mount |
26 | | -// namespace. The user namespace provides mount isolation and grants the |
27 | | -// child capabilities within it, without requiring or granting real host |
28 | | -// capabilities. User namespaces are available unprivileged on many |
29 | | -// distros (since Linux 3.8), but some may gate them via sysctl (e.g. |
30 | | -// kernel.unprivileged_userns_clone). |
31 | | -// |
32 | | -// For a VM-based runtime like nerdbox, the shim does not need real host |
33 | | -// root — it needs /dev/kvm access (checked against mapped host UID) and |
34 | | -// file access (same user). The user namespace is defense-in-depth: it |
35 | | -// limits the shim's host-level capabilities even when the daemon runs as |
36 | | -// root. |
37 | | -// |
38 | | -// We use clone flags instead of unshare(2) because unshare(CLONE_NEWUSER) |
39 | | -// requires the calling process to be single-threaded, which is not |
40 | | -// possible in a Go program (the runtime uses multiple OS threads). |
41 | | -// |
42 | | -// The new mount namespace inherits copies of the parent's mounts with |
43 | | -// the same propagation flags. The shim performs rootfs mounts (overlay / |
44 | | -// bind) inside this namespace. On hosts where / is shared, those mounts |
45 | | -// could in theory propagate back. Because the child also runs in a user |
46 | | -// namespace, it cannot remount / as MS_SLAVE. In practice this is safe: |
47 | | -// the mounts are into bundle-specific paths that are cleaned up on |
48 | | -// container delete, and the VM itself performs all container-visible |
49 | | -// filesystem setup. |
50 | | -func cloneMntNs(cmd *exec.Cmd) { |
51 | | - uid := os.Getuid() |
52 | | - gid := os.Getgid() |
53 | | - cmd.SysProcAttr.Cloneflags |= syscall.CLONE_NEWUSER | syscall.CLONE_NEWNS |
54 | | - cmd.SysProcAttr.UidMappings = []syscall.SysProcIDMap{ |
55 | | - {ContainerID: uid, HostID: uid, Size: 1}, |
| 25 | +func setupMntNs() error { |
| 26 | + err := unix.Unshare(unix.CLONE_NEWNS) |
| 27 | + if err != nil { |
| 28 | + return err |
56 | 29 | } |
57 | | - cmd.SysProcAttr.GidMappings = []syscall.SysProcIDMap{ |
58 | | - {ContainerID: gid, HostID: gid, Size: 1}, |
| 30 | + |
| 31 | + err = unix.Mount("", "/", "", unix.MS_REC|unix.MS_SLAVE, "") |
| 32 | + if err != nil { |
| 33 | + err = fmt.Errorf("failed to mount with slave: %v", err) |
| 34 | + return err |
59 | 35 | } |
| 36 | + |
| 37 | + err = unix.Mount("", "/", "", unix.MS_REC|unix.MS_SHARED, "") |
| 38 | + if err != nil { |
| 39 | + err = fmt.Errorf("failed to mount with shared: %v", err) |
| 40 | + return err |
| 41 | + } |
| 42 | + |
| 43 | + return nil |
60 | 44 | } |
0 commit comments