Skip to content

Commit 36d67ed

Browse files
Scrub sensitive paths and emails from deploy telemetry error messages
Adds a scrubber that runs before error messages are sent to telemetry:

1. Replaces the bundle root path with "." to avoid leaking local paths.
2. Redacts remaining home directory paths (/Users/..., /home/..., C:\Users\...).
3. Redacts email addresses (e.g., in workspace paths).

Inspired by VS Code's telemetry path scrubbing and Sentry's @userpath rule.

Co-authored-by: Isaac
1 parent a2fb5dd commit 36d67ed

3 files changed

Lines changed: 373 additions & 0 deletions

File tree

bundle/phases/telemetry.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99
"github.com/databricks/cli/bundle/config"
1010
"github.com/databricks/cli/bundle/libraries"
1111
"github.com/databricks/cli/libs/dyn"
12+
"github.com/databricks/cli/libs/env"
1213
"github.com/databricks/cli/libs/log"
1314
"github.com/databricks/cli/libs/telemetry"
1415
"github.com/databricks/cli/libs/telemetry/protos"
@@ -38,6 +39,9 @@ const maxErrorMessageLength = 500
3839

3940
// LogDeployTelemetry logs a telemetry event for a bundle deploy command.
4041
func LogDeployTelemetry(ctx context.Context, b *bundle.Bundle, errMsg string) {
42+
homeDir, _ := env.UserHomeDir(ctx)
43+
errMsg = scrubForTelemetry(errMsg, b.BundleRootPath, homeDir)
44+
4145
if len(errMsg) > maxErrorMessageLength {
4246
errMsg = errMsg[:maxErrorMessageLength]
4347
}

bundle/phases/telemetry_scrub.go

Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
package phases
2+
3+
import (
4+
"path/filepath"
5+
"regexp"
6+
"strings"
7+
)
8+
9+
// Patterns used to scrub sensitive information from error messages before
// they are sent to telemetry.
// Inspired by VS Code's telemetry path scrubbing and Sentry's @userpath pattern.
//
// References:
//   - VS Code: https://github.com/microsoft/vscode/blob/main/src/vs/platform/telemetry/common/telemetryUtils.ts
//   - Sentry: https://github.com/getsentry/relay (PII rule: @userpath)
var (
	// Matches home directory paths on macOS and Linux.
	// The leading delimiter check avoids matching workspace paths like
	// /Workspace/Users/... where /Users is not a top-level component.
	// The delimiter (if any) is part of the overall match; the path itself is
	// capture group 1.
	unixHomeDirRegexp = regexp.MustCompile(`(?:^|[\s:,"'])(/(?:Users|home)/[^\s:,"']+)`)

	// Matches home directory paths on Windows with either backslashes or
	// forward slashes (C:\Users\xxx\... or C:/Users/xxx/...).
	// Drive letters are matched case-insensitively because Windows tooling
	// reports both "C:" and "c:". The leading \b keeps the last letter of a
	// preceding word from being mistaken for a drive letter (e.g. the "r:" in
	// "error:/Users/...").
	windowsHomeDirRegexp = regexp.MustCompile(`\b[A-Za-z]:[/\\]Users[/\\][^\s:,"']+`)

	// Matches remaining absolute Unix paths with at least two components
	// (e.g., /tmp/foo, /var/folders/xx/..., /etc/databricks/...).
	// Safe prefixes (/Workspace, /Volumes, /dbfs) are filtered in the
	// replacement function since Go regexp lacks negative lookahead.
	remainingAbsPathRegexp = regexp.MustCompile(`(?:^|[\s:,"'])(/[^\s:,"'/]+/[^\s:,"']+)`)

	// Matches email addresses. Workspace paths in Databricks often contain
	// emails (e.g., /Workspace/Users/user@example.com/.bundle/dev).
	emailRegexp = regexp.MustCompile(`[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}`)
)
35+
36+
// scrubForTelemetry is a best-effort scrubber that removes sensitive path and
37+
// PII information from error messages before they are sent to telemetry.
38+
// The error message is treated as PII by the logging infrastructure but we
39+
// scrub to avoid collecting more information than necessary.
40+
func scrubForTelemetry(msg, bundleRoot, homeDir string) string {
41+
// Replace the bundle root path first since it's the most specific match.
42+
// This turns "/Users/shreyas/project/databricks.yml" into "./databricks.yml".
43+
if bundleRoot != "" {
44+
msg = replacePath(msg, bundleRoot, ".")
45+
}
46+
47+
// Replace the user's home directory. This catches paths outside the
48+
// bundle root like "/Users/shreyas/.databricks/..." → "~/.databricks/...".
49+
if homeDir != "" {
50+
msg = replacePath(msg, homeDir, "~")
51+
}
52+
53+
// Regex fallback: redact remaining home directory paths not covered by the
54+
// direct home dir replacement above (e.g., paths from other users or
55+
// non-standard home directory locations).
56+
// Run Windows first to avoid partial matches from the Unix regex on
57+
// paths like C:/Users/...
58+
msg = windowsHomeDirRegexp.ReplaceAllString(msg, "[REDACTED_HOME_PATH]")
59+
msg = replaceUnixRegexpMatch(msg, unixHomeDirRegexp, "[REDACTED_HOME_PATH]")
60+
61+
// Catch-all: redact any remaining absolute paths (e.g., /tmp/...,
62+
// /var/folders/...) that weren't covered by the above replacements.
63+
// Skip known safe prefixes like /Workspace, /Volumes, /dbfs.
64+
msg = remainingAbsPathRegexp.ReplaceAllStringFunc(msg, func(match string) string {
65+
idx := strings.Index(match, "/")
66+
if idx < 0 {
67+
return match
68+
}
69+
path := match[idx:]
70+
for _, prefix := range []string{"/Workspace/", "/Volumes/", "/dbfs/"} {
71+
if strings.HasPrefix(path, prefix) {
72+
return match
73+
}
74+
}
75+
return match[:idx] + "[REDACTED_PATH]"
76+
})
77+
78+
// Redact email addresses.
79+
msg = emailRegexp.ReplaceAllString(msg, "[REDACTED_EMAIL]")
80+
81+
return msg
82+
}
83+
84+
// replacePath replaces occurrences of the directory path in msg with
// replacement, but only where the occurrence stands on its own as a path
// prefix:
//
//   - It must not be the tail of a longer path or word: the byte before it is
//     either the start of msg or something other than an ASCII letter, digit,
//     '_', '-' or '.'. This keeps "/Users/shreyas" from being rewritten inside
//     "/Workspace/Users/shreyas/...".
//   - It must be followed by a path separator ('/' or '\'), a delimiter
//     (space, tab, newline, ':', ',', '"', '\''), or the end of msg. This
//     keeps "/Users/shreyas" from matching inside "/Users/shreyas@..." or
//     "/Users/shreyas2".
//
// The character following the match is preserved, so
// "/home/u/proj/a.yml" becomes "./a.yml" and `D:\proj\a.yml` becomes `.\a.yml`.
//
// Both the slash-normalized form of path (filepath.ToSlash) and the form as
// given are tried, since messages may contain either spelling. An empty path
// leaves msg unchanged.
func replacePath(msg, path, replacement string) string {
	// An empty needle matches at every offset and would never make progress.
	if path == "" {
		return msg
	}

	// Skip the second pass when normalization did not change anything.
	candidates := []string{filepath.ToSlash(path)}
	if candidates[0] != path {
		candidates = append(candidates, path)
	}

	for _, p := range candidates {
		if !strings.Contains(msg, p) {
			continue
		}

		var out strings.Builder
		out.Grow(len(msg))
		pos := 0
		for {
			rel := strings.Index(msg[pos:], p)
			if rel < 0 {
				break
			}
			start := pos + rel
			end := start + len(p)

			out.WriteString(msg[pos:start])
			if pathStartsAtBoundary(msg, start) && pathEndsAtBoundary(msg, end) {
				out.WriteString(replacement)
			} else {
				// Not a standalone occurrence; copy it through unchanged.
				out.WriteString(p)
			}
			pos = end
		}
		out.WriteString(msg[pos:])
		msg = out.String()
	}
	return msg
}

// pathStartsAtBoundary reports whether the byte before msg[start] cannot be
// part of a preceding path component or word.
func pathStartsAtBoundary(msg string, start int) bool {
	if start == 0 {
		return true
	}
	c := msg[start-1]
	switch {
	case 'a' <= c && c <= 'z', 'A' <= c && c <= 'Z', '0' <= c && c <= '9':
		return false
	case c == '_', c == '-', c == '.':
		return false
	}
	return true
}

// pathEndsAtBoundary reports whether msg[end] is the end of the message, a
// path separator, or one of the delimiters that terminate a path.
func pathEndsAtBoundary(msg string, end int) bool {
	return end == len(msg) || strings.IndexByte("/\\ \t\n:,\"'", msg[end]) >= 0
}
118+
119+
// replaceUnixRegexpMatch substitutes replacement for the path portion of every
// match of re in msg. The regex is expected to begin with the delimiter group
// `(?:^|[\s:,"'])`, so a match may start with one delimiter character; that
// delimiter is kept and everything from the first `/` onward is replaced.
// A match that contains no `/` is returned unchanged.
func replaceUnixRegexpMatch(msg string, re *regexp.Regexp, replacement string) string {
	redact := func(found string) string {
		delimiter, _, hasSlash := strings.Cut(found, "/")
		if !hasSlash {
			return found
		}
		return delimiter + replacement
	}
	return re.ReplaceAllStringFunc(msg, redact)
}
Lines changed: 238 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,238 @@
1+
package phases
2+
3+
import (
4+
"testing"
5+
6+
"github.com/stretchr/testify/assert"
7+
)
8+
9+
func TestScrubForTelemetry_BundleRootPath(t *testing.T) {
10+
tests := []struct {
11+
name string
12+
msg string
13+
bundleRoot string
14+
expected string
15+
}{
16+
{
17+
name: "replaces bundle root in file path",
18+
msg: "failed to load /home/user/project/databricks.yml: invalid config",
19+
bundleRoot: "/home/user/project",
20+
expected: "failed to load ./databricks.yml: invalid config",
21+
},
22+
{
23+
name: "replaces bundle root without trailing content",
24+
msg: "error at /home/user/project",
25+
bundleRoot: "/home/user/project",
26+
expected: "error at .",
27+
},
28+
{
29+
name: "replaces multiple occurrences",
30+
msg: "path /home/user/project/a.yml and /home/user/project/b.yml",
31+
bundleRoot: "/home/user/project",
32+
expected: "path ./a.yml and ./b.yml",
33+
},
34+
{
35+
name: "empty bundle root is no-op",
36+
msg: "some error",
37+
bundleRoot: "",
38+
expected: "some error",
39+
},
40+
{
41+
name: "empty message",
42+
msg: "",
43+
bundleRoot: "/home/user/project",
44+
expected: "",
45+
},
46+
}
47+
48+
for _, tt := range tests {
49+
t.Run(tt.name, func(t *testing.T) {
50+
assert.Equal(t, tt.expected, scrubForTelemetry(tt.msg, tt.bundleRoot, ""))
51+
})
52+
}
53+
}
54+
55+
func TestScrubForTelemetry_HomeDir(t *testing.T) {
56+
tests := []struct {
57+
name string
58+
msg string
59+
homeDir string
60+
expected string
61+
}{
62+
{
63+
name: "replaces home dir with tilde",
64+
msg: "failed to read /Users/shreyas/.databricks/config.json",
65+
homeDir: "/Users/shreyas",
66+
expected: "failed to read ~/.databricks/config.json",
67+
},
68+
{
69+
name: "home dir replacement after bundle root",
70+
msg: "error: /Users/shreyas/project/file.yml and /Users/shreyas/.cache/other",
71+
homeDir: "/Users/shreyas",
72+
expected: "error: ~/project/file.yml and ~/.cache/other",
73+
},
74+
{
75+
name: "bundle root takes priority over home dir",
76+
msg: "error at /Users/shreyas/project/databricks.yml",
77+
homeDir: "/Users/shreyas",
78+
expected: "error at ./databricks.yml",
79+
},
80+
}
81+
82+
for _, tt := range tests {
83+
t.Run(tt.name, func(t *testing.T) {
84+
bundleRoot := ""
85+
if tt.name == "bundle root takes priority over home dir" {
86+
bundleRoot = "/Users/shreyas/project"
87+
}
88+
assert.Equal(t, tt.expected, scrubForTelemetry(tt.msg, bundleRoot, tt.homeDir))
89+
})
90+
}
91+
}
92+
93+
func TestScrubForTelemetry_HomeDirRegexFallback(t *testing.T) {
94+
tests := []struct {
95+
name string
96+
msg string
97+
expected string
98+
}{
99+
{
100+
name: "macOS home dir",
101+
msg: "failed to read /Users/otheruser/some-project/file.yml",
102+
expected: "failed to read [REDACTED_HOME_PATH]",
103+
},
104+
{
105+
name: "Linux home dir",
106+
msg: "failed to read /home/runner/work/project/file.yml",
107+
expected: "failed to read [REDACTED_HOME_PATH]",
108+
},
109+
{
110+
name: "home dir in middle of message",
111+
msg: "error: /Users/jane/project/a.yml: not found, try again",
112+
expected: "error: [REDACTED_HOME_PATH]: not found, try again",
113+
},
114+
{
115+
name: "Windows home dir with backslashes",
116+
msg: `error at C:\Users\shreyas\project\file.yml`,
117+
expected: "error at [REDACTED_HOME_PATH]",
118+
},
119+
{
120+
name: "Windows home dir with forward slashes",
121+
msg: "error at C:/Users/shreyas/project/file.yml",
122+
expected: "error at [REDACTED_HOME_PATH]",
123+
},
124+
{
125+
name: "preserves relative paths",
126+
msg: "failed to load ./resources/job.yml",
127+
expected: "failed to load ./resources/job.yml",
128+
},
129+
{
130+
name: "preserves workspace paths without email",
131+
msg: "uploading to /Workspace/.bundle/dev/files",
132+
expected: "uploading to /Workspace/.bundle/dev/files",
133+
},
134+
}
135+
136+
for _, tt := range tests {
137+
t.Run(tt.name, func(t *testing.T) {
138+
assert.Equal(t, tt.expected, scrubForTelemetry(tt.msg, "", ""))
139+
})
140+
}
141+
}
142+
143+
func TestScrubForTelemetry_RemainingAbsolutePaths(t *testing.T) {
144+
tests := []struct {
145+
name string
146+
msg string
147+
expected string
148+
}{
149+
{
150+
name: "tmp path",
151+
msg: "failed to write /tmp/bundle-xyz/state.json",
152+
expected: "failed to write [REDACTED_PATH]",
153+
},
154+
{
155+
name: "var folders path",
156+
msg: "error reading /var/folders/7t/n_tz6x9d4xj91h48pf8md5zh0000gp/T/test123/file",
157+
expected: "error reading [REDACTED_PATH]",
158+
},
159+
{
160+
name: "etc path",
161+
msg: "config at /etc/databricks/config.json not found",
162+
expected: "config at [REDACTED_PATH] not found",
163+
},
164+
{
165+
name: "preserves workspace paths",
166+
msg: "uploading to /Workspace/Users/dev/.bundle/files",
167+
expected: "uploading to /Workspace/Users/dev/.bundle/files",
168+
},
169+
{
170+
name: "preserves volume paths",
171+
msg: "artifact at /Volumes/catalog/schema/volume/artifact.whl",
172+
expected: "artifact at /Volumes/catalog/schema/volume/artifact.whl",
173+
},
174+
{
175+
name: "preserves dbfs paths",
176+
msg: "state at /dbfs/mnt/data/state.json",
177+
expected: "state at /dbfs/mnt/data/state.json",
178+
},
179+
{
180+
name: "single component path is not matched",
181+
msg: "POST /telemetry-ext failed",
182+
expected: "POST /telemetry-ext failed",
183+
},
184+
}
185+
186+
for _, tt := range tests {
187+
t.Run(tt.name, func(t *testing.T) {
188+
assert.Equal(t, tt.expected, scrubForTelemetry(tt.msg, "", ""))
189+
})
190+
}
191+
}
192+
193+
func TestScrubForTelemetry_Emails(t *testing.T) {
194+
tests := []struct {
195+
name string
196+
msg string
197+
expected string
198+
}{
199+
{
200+
name: "email in workspace path",
201+
msg: "/Workspace/Users/user@example.com/.bundle/dev should contain current username",
202+
expected: "/Workspace/Users/[REDACTED_EMAIL]/.bundle/dev should contain current username",
203+
},
204+
{
205+
name: "plain email",
206+
msg: "access denied for user@company.io",
207+
expected: "access denied for [REDACTED_EMAIL]",
208+
},
209+
{
210+
name: "no email present",
211+
msg: "some error without emails",
212+
expected: "some error without emails",
213+
},
214+
}
215+
216+
for _, tt := range tests {
217+
t.Run(tt.name, func(t *testing.T) {
218+
assert.Equal(t, tt.expected, scrubForTelemetry(tt.msg, "", ""))
219+
})
220+
}
221+
}
222+
223+
func TestScrubForTelemetry_Combined(t *testing.T) {
224+
msg := "failed to load /Users/shreyas/myproject/databricks.yml: " +
225+
"workspace /Workspace/Users/shreyas@databricks.com/.bundle is invalid, " +
226+
"also tried /home/other/fallback/config.yml, " +
227+
"temp at /tmp/bundle-cache/state.json"
228+
229+
got := scrubForTelemetry(msg, "/Users/shreyas/myproject", "/Users/shreyas")
230+
231+
assert.Equal(t,
232+
"failed to load ./databricks.yml: "+
233+
"workspace /Workspace/Users/[REDACTED_EMAIL]/.bundle is invalid, "+
234+
"also tried [REDACTED_HOME_PATH], "+
235+
"temp at [REDACTED_PATH]",
236+
got,
237+
)
238+
}

0 commit comments

Comments
 (0)