|
| 1 | +//go:build linux |
| 2 | + |
| 3 | +/* |
| 4 | + Copyright The containerd Authors. |
| 5 | +
|
| 6 | + Licensed under the Apache License, Version 2.0 (the "License"); |
| 7 | + you may not use this file except in compliance with the License. |
| 8 | + You may obtain a copy of the License at |
| 9 | +
|
| 10 | + http://www.apache.org/licenses/LICENSE-2.0 |
| 11 | +
|
| 12 | + Unless required by applicable law or agreed to in writing, software |
| 13 | + distributed under the License is distributed on an "AS IS" BASIS, |
| 14 | + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 15 | + See the License for the specific language governing permissions and |
| 16 | + limitations under the License. |
| 17 | +*/ |
| 18 | + |
| 19 | +// Detects whether user namespaces are restricted by checking if |
| 20 | +// getsockopt(SO_TYPE) returns EACCES when a unix socket fd is inherited |
| 21 | +// by a child spawned with CLONE_NEWUSER + a UID mapping + exec. |
| 22 | +// |
| 23 | +// This reproduces the exact failure path in the nerdbox shim where |
| 24 | +// net.FileListener calls getsockopt(fd, SOL_SOCKET, SO_TYPE) and gets EACCES. |
| 25 | +// |
| 26 | +// The exec is critical: it triggers capability recomputation. With euid != 0 |
| 27 | +// in the new userns, caps drop to zero, and cross-userns socket access fails. |
| 28 | +// |
| 29 | +// Exit codes: |
| 30 | +// |
| 31 | +// 0 — userns NOT restricted (getsockopt succeeded) |
| 32 | +// 1 — userns RESTRICTED (getsockopt got EACCES/EPERM) |
| 33 | +// 2 — unexpected error |
| 34 | +// 77 — skipped (running as root) |
| 35 | +package main |
| 36 | + |
| 37 | +import ( |
| 38 | + "errors" |
| 39 | + "fmt" |
| 40 | + "net" |
| 41 | + "os" |
| 42 | + "os/exec" |
| 43 | + "path/filepath" |
| 44 | + "syscall" |
| 45 | + |
| 46 | + "golang.org/x/sys/unix" |
| 47 | +) |
| 48 | + |
| 49 | +func main() { |
| 50 | + if len(os.Args) > 1 && os.Args[1] == "--child" { |
| 51 | + os.Exit(childMain()) |
| 52 | + } |
| 53 | + os.Exit(parentMain()) |
| 54 | +} |
| 55 | + |
| 56 | +func childMain() int { |
| 57 | + fd := 3 |
| 58 | + fmt.Printf("child: pid=%d uid=%d, calling getsockopt(fd=%d, SO_TYPE)\n", |
| 59 | + os.Getpid(), os.Getuid(), fd) |
| 60 | + |
| 61 | + _, err := unix.GetsockoptInt(fd, syscall.SOL_SOCKET, syscall.SO_TYPE) |
| 62 | + if err != nil { |
| 63 | + fmt.Printf("child: getsockopt FAILED: %v\n", err) |
| 64 | + if errors.Is(err, syscall.EACCES) || errors.Is(err, syscall.EPERM) { |
| 65 | + return 1 |
| 66 | + } |
| 67 | + return 2 |
| 68 | + } |
| 69 | + |
| 70 | + fmt.Printf("child: getsockopt OK\n") |
| 71 | + return 0 |
| 72 | +} |
| 73 | + |
// parentMain sets up the probe and interprets its result. It creates a Unix
// listener socket in a temporary directory, re-executes this binary with
// "--child" inside new user and mount namespaces while passing the socket as
// an extra file, and maps the child's exit status to this process's exit code:
//
//	0  — child exited 0 (getsockopt succeeded)
//	1  — child exited 1 (getsockopt got EACCES/EPERM)
//	2  — setup failed, the child could not be started, or it exited with
//	     any other status
//	77 — skipped because the caller is root
func parentMain() int {
	uid := os.Getuid()
	gid := os.Getgid()

	if uid == 0 {
		fmt.Fprintf(os.Stderr, "run as non-root to reproduce (root gets caps in new userns)\n")
		return 77
	}

	// Create a Unix socket in a temp directory — same idea as the shim's
	// listener socket.
	tmpDir, err := os.MkdirTemp("", "userns-check-*")
	if err != nil {
		fmt.Fprintf(os.Stderr, "mkdirtemp: %v\n", err)
		return 2
	}
	defer os.RemoveAll(tmpDir)

	sockPath := filepath.Join(tmpDir, "test.sock")
	ln, err := net.Listen("unix", sockPath)
	if err != nil {
		fmt.Fprintf(os.Stderr, "listen: %v\n", err)
		return 2
	}
	defer ln.Close()

	// File returns an *os.File for the listener's socket; that file is what
	// gets handed to the child.
	f, err := ln.(*net.UnixListener).File()
	if err != nil {
		fmt.Fprintf(os.Stderr, "file: %v\n", err)
		return 2
	}
	defer f.Close()

	fmt.Printf("parent: created socket fd=%d (uid=%d)\n", f.Fd(), uid)

	// Re-exec ourselves as "--child" with CLONE_NEWUSER|CLONE_NEWNS.
	// This is the same clone+exec pattern Go's ForkExec uses when
	// SysProcAttr.Cloneflags is set — which triggers cap recomputation.
	// The UID/GID mappings mirror the shim's cloneMntNs implementation.
	cmd := exec.Command("/proc/self/exe", "--child")
	cmd.Stdout = os.Stdout
	cmd.Stderr = os.Stderr
	cmd.ExtraFiles = []*os.File{f} // fd 3 in child
	cmd.SysProcAttr = &syscall.SysProcAttr{
		Cloneflags: syscall.CLONE_NEWUSER | syscall.CLONE_NEWNS,
		// Identity mappings: the caller's own IDs are mapped to the same
		// IDs inside the new user namespace.
		UidMappings: []syscall.SysProcIDMap{
			{ContainerID: uid, HostID: uid, Size: 1},
		},
		GidMappings: []syscall.SysProcIDMap{
			{ContainerID: gid, HostID: gid, Size: 1},
		},
	}

	runErr := cmd.Run()
	if runErr == nil {
		fmt.Printf("userns NOT restricted — getsockopt succeeded\n")
		return 0
	}

	// errors.As (rather than a direct type assertion) still finds the exit
	// status if the error is ever wrapped.
	var exitErr *exec.ExitError
	if !errors.As(runErr, &exitErr) {
		// The child never ran to completion (e.g. it could not be started).
		fmt.Fprintf(os.Stderr, "exec: %v\n", runErr)
		return 2
	}

	rc := exitErr.ExitCode()
	if rc == 1 {
		fmt.Printf("USERNS RESTRICTED — child got EACCES/EPERM (CLONE_NEWUSER + exec)\n")
		return 1
	}
	fmt.Printf("unexpected child failure (exit %d)\n", rc)
	return 2
}
0 commit comments