diff --git a/NEXT_CHANGELOG.md b/NEXT_CHANGELOG.md index 85a50d00b9..e4e33987a9 100644 --- a/NEXT_CHANGELOG.md +++ b/NEXT_CHANGELOG.md @@ -12,6 +12,8 @@ * Deduplicate grant entries with duplicate principals or privileges during initialization ([#4801](https://github.com/databricks/cli/pull/4801)) * engine/direct: Fix unwanted recreation of secret scopes when scope_backend_type is not set ([#4834](https://github.com/databricks/cli/pull/4834)) +* Replace regex-based variable interpolation with a character scanner ([#4747](https://github.com/databricks/cli/pull/4747)). + ### Dependency updates ### API Changes diff --git a/acceptance/bundle/resource_deps/bad_syntax/out.deploy.direct.txt b/acceptance/bundle/resource_deps/bad_syntax/out.deploy.direct.txt index 6bed0296b3..d9790d5c72 100644 --- a/acceptance/bundle/resource_deps/bad_syntax/out.deploy.direct.txt +++ b/acceptance/bundle/resource_deps/bad_syntax/out.deploy.direct.txt @@ -1,3 +1,7 @@ +Warning: invalid variable reference ${resources.volumes.bar.bad..syntax}: invalid path + at resources.volumes.foo.schema_name + in databricks.yml:11:21 + Uploading bundle files to /Workspace/Users/[USERNAME]/.bundle/test-bundle/default/files... Deploying resources... Updating deployment state... 
diff --git a/acceptance/bundle/resource_deps/bad_syntax/out.plan.direct.txt b/acceptance/bundle/resource_deps/bad_syntax/out.plan.direct.txt index 1a40fdbaa3..e00391b668 100644 --- a/acceptance/bundle/resource_deps/bad_syntax/out.plan.direct.txt +++ b/acceptance/bundle/resource_deps/bad_syntax/out.plan.direct.txt @@ -1,3 +1,7 @@ +Warning: invalid variable reference ${resources.volumes.bar.bad..syntax}: invalid path + at resources.volumes.foo.schema_name + in databricks.yml:11:21 + create volumes.bar create volumes.foo diff --git a/acceptance/bundle/resource_deps/bad_syntax/out.plan.terraform.txt b/acceptance/bundle/resource_deps/bad_syntax/out.plan.terraform.txt index b8f8078aba..ec5521761d 100644 --- a/acceptance/bundle/resource_deps/bad_syntax/out.plan.terraform.txt +++ b/acceptance/bundle/resource_deps/bad_syntax/out.plan.terraform.txt @@ -1,3 +1,7 @@ +Warning: invalid variable reference ${resources.volumes.bar.bad..syntax}: invalid path + at resources.volumes.foo.schema_name + in databricks.yml:11:21 + Error: exit status 1 Error: Invalid attribute name diff --git a/acceptance/bundle/resource_deps/bad_syntax/output.txt b/acceptance/bundle/resource_deps/bad_syntax/output.txt index 0e9d83b643..7dedf83402 100644 --- a/acceptance/bundle/resource_deps/bad_syntax/output.txt +++ b/acceptance/bundle/resource_deps/bad_syntax/output.txt @@ -1,5 +1,9 @@ >>> [CLI] bundle validate -o json +Warning: invalid variable reference ${resources.volumes.bar.bad..syntax}: invalid path + at resources.volumes.foo.schema_name + in databricks.yml:11:21 + { "volumes": { "bar": { diff --git a/acceptance/bundle/variables/dollar_before_non_brace/databricks.yml b/acceptance/bundle/variables/dollar_before_non_brace/databricks.yml new file mode 100644 index 0000000000..62fc346040 --- /dev/null +++ b/acceptance/bundle/variables/dollar_before_non_brace/databricks.yml @@ -0,0 +1,6 @@ +bundle: + name: '$foo is not a reference' + +variables: + a: + default: hello diff --git 
a/acceptance/bundle/variables/dollar_before_non_brace/out.test.toml b/acceptance/bundle/variables/dollar_before_non_brace/out.test.toml new file mode 100644 index 0000000000..d560f1de04 --- /dev/null +++ b/acceptance/bundle/variables/dollar_before_non_brace/out.test.toml @@ -0,0 +1,5 @@ +Local = true +Cloud = false + +[EnvMatrix] + DATABRICKS_BUNDLE_ENGINE = ["terraform", "direct"] diff --git a/acceptance/bundle/variables/dollar_before_non_brace/output.txt b/acceptance/bundle/variables/dollar_before_non_brace/output.txt new file mode 100644 index 0000000000..e97599047b --- /dev/null +++ b/acceptance/bundle/variables/dollar_before_non_brace/output.txt @@ -0,0 +1,9 @@ + +>>> [CLI] bundle validate +Name: $foo is not a reference +Target: default +Workspace: + User: [USERNAME] + Path: /Workspace/Users/[USERNAME]/.bundle/$foo is not a reference/default + +Validation OK! diff --git a/acceptance/bundle/variables/dollar_before_non_brace/script b/acceptance/bundle/variables/dollar_before_non_brace/script new file mode 100644 index 0000000000..5350876150 --- /dev/null +++ b/acceptance/bundle/variables/dollar_before_non_brace/script @@ -0,0 +1 @@ +trace $CLI bundle validate diff --git a/acceptance/bundle/variables/empty_ref/databricks.yml b/acceptance/bundle/variables/empty_ref/databricks.yml new file mode 100644 index 0000000000..c6254777bd --- /dev/null +++ b/acceptance/bundle/variables/empty_ref/databricks.yml @@ -0,0 +1,6 @@ +bundle: + name: '${}' + +variables: + a: + default: hello diff --git a/acceptance/bundle/variables/empty_ref/out.test.toml b/acceptance/bundle/variables/empty_ref/out.test.toml new file mode 100644 index 0000000000..d560f1de04 --- /dev/null +++ b/acceptance/bundle/variables/empty_ref/out.test.toml @@ -0,0 +1,5 @@ +Local = true +Cloud = false + +[EnvMatrix] + DATABRICKS_BUNDLE_ENGINE = ["terraform", "direct"] diff --git a/acceptance/bundle/variables/empty_ref/output.txt b/acceptance/bundle/variables/empty_ref/output.txt new file mode 100644 index 
0000000000..beafe20df5 --- /dev/null +++ b/acceptance/bundle/variables/empty_ref/output.txt @@ -0,0 +1,13 @@ + +>>> [CLI] bundle validate +Warning: empty variable reference + at bundle.name + in databricks.yml:2:9 + +Name: ${} +Target: default +Workspace: + User: [USERNAME] + Path: /Workspace/Users/[USERNAME]/.bundle/${}/default + +Found 1 warning diff --git a/acceptance/bundle/variables/empty_ref/script b/acceptance/bundle/variables/empty_ref/script new file mode 100644 index 0000000000..5350876150 --- /dev/null +++ b/acceptance/bundle/variables/empty_ref/script @@ -0,0 +1 @@ +trace $CLI bundle validate diff --git a/acceptance/bundle/variables/leading_digit_ref/databricks.yml b/acceptance/bundle/variables/leading_digit_ref/databricks.yml new file mode 100644 index 0000000000..bdd2e44a4b --- /dev/null +++ b/acceptance/bundle/variables/leading_digit_ref/databricks.yml @@ -0,0 +1,6 @@ +bundle: + name: '${0foo}' + +variables: + a: + default: hello diff --git a/acceptance/bundle/variables/leading_digit_ref/out.test.toml b/acceptance/bundle/variables/leading_digit_ref/out.test.toml new file mode 100644 index 0000000000..d560f1de04 --- /dev/null +++ b/acceptance/bundle/variables/leading_digit_ref/out.test.toml @@ -0,0 +1,5 @@ +Local = true +Cloud = false + +[EnvMatrix] + DATABRICKS_BUNDLE_ENGINE = ["terraform", "direct"] diff --git a/acceptance/bundle/variables/leading_digit_ref/output.txt b/acceptance/bundle/variables/leading_digit_ref/output.txt new file mode 100644 index 0000000000..d43d228eeb --- /dev/null +++ b/acceptance/bundle/variables/leading_digit_ref/output.txt @@ -0,0 +1,13 @@ + +>>> [CLI] bundle validate +Warning: invalid variable reference ${0foo}: invalid key "0foo" + at bundle.name + in databricks.yml:2:9 + +Name: ${0foo} +Target: default +Workspace: + User: [USERNAME] + Path: /Workspace/Users/[USERNAME]/.bundle/${0foo}/default + +Found 1 warning diff --git a/acceptance/bundle/variables/leading_digit_ref/script 
b/acceptance/bundle/variables/leading_digit_ref/script new file mode 100644 index 0000000000..5350876150 --- /dev/null +++ b/acceptance/bundle/variables/leading_digit_ref/script @@ -0,0 +1 @@ +trace $CLI bundle validate diff --git a/acceptance/bundle/variables/malformed_reference/databricks.yml b/acceptance/bundle/variables/malformed_reference/databricks.yml new file mode 100644 index 0000000000..63a0282395 --- /dev/null +++ b/acceptance/bundle/variables/malformed_reference/databricks.yml @@ -0,0 +1,6 @@ +bundle: + name: "${foo.bar-}" + +variables: + a: + default: hello diff --git a/acceptance/bundle/variables/malformed_reference/out.test.toml b/acceptance/bundle/variables/malformed_reference/out.test.toml new file mode 100644 index 0000000000..d560f1de04 --- /dev/null +++ b/acceptance/bundle/variables/malformed_reference/out.test.toml @@ -0,0 +1,5 @@ +Local = true +Cloud = false + +[EnvMatrix] + DATABRICKS_BUNDLE_ENGINE = ["terraform", "direct"] diff --git a/acceptance/bundle/variables/malformed_reference/output.txt b/acceptance/bundle/variables/malformed_reference/output.txt new file mode 100644 index 0000000000..d5b772a731 --- /dev/null +++ b/acceptance/bundle/variables/malformed_reference/output.txt @@ -0,0 +1,13 @@ + +>>> [CLI] bundle validate +Warning: invalid variable reference ${foo.bar-}: invalid key "bar-" + at bundle.name + in databricks.yml:2:9 + +Name: ${foo.bar-} +Target: default +Workspace: + User: [USERNAME] + Path: /Workspace/Users/[USERNAME]/.bundle/${foo.bar-}/default + +Found 1 warning diff --git a/acceptance/bundle/variables/malformed_reference/script b/acceptance/bundle/variables/malformed_reference/script new file mode 100644 index 0000000000..5350876150 --- /dev/null +++ b/acceptance/bundle/variables/malformed_reference/script @@ -0,0 +1 @@ +trace $CLI bundle validate diff --git a/acceptance/bundle/variables/unterminated_ref/databricks.yml b/acceptance/bundle/variables/unterminated_ref/databricks.yml new file mode 100644 index 
0000000000..3ca0d01099 --- /dev/null +++ b/acceptance/bundle/variables/unterminated_ref/databricks.yml @@ -0,0 +1,6 @@ +bundle: + name: '${foo.bar' + +variables: + a: + default: hello diff --git a/acceptance/bundle/variables/unterminated_ref/out.test.toml b/acceptance/bundle/variables/unterminated_ref/out.test.toml new file mode 100644 index 0000000000..d560f1de04 --- /dev/null +++ b/acceptance/bundle/variables/unterminated_ref/out.test.toml @@ -0,0 +1,5 @@ +Local = true +Cloud = false + +[EnvMatrix] + DATABRICKS_BUNDLE_ENGINE = ["terraform", "direct"] diff --git a/acceptance/bundle/variables/unterminated_ref/output.txt b/acceptance/bundle/variables/unterminated_ref/output.txt new file mode 100644 index 0000000000..cb5953f539 --- /dev/null +++ b/acceptance/bundle/variables/unterminated_ref/output.txt @@ -0,0 +1,13 @@ + +>>> [CLI] bundle validate +Warning: unterminated variable reference + at bundle.name + in databricks.yml:2:9 + +Name: ${foo.bar +Target: default +Workspace: + User: [USERNAME] + Path: /Workspace/Users/[USERNAME]/.bundle/${foo.bar/default + +Found 1 warning diff --git a/acceptance/bundle/variables/unterminated_ref/script b/acceptance/bundle/variables/unterminated_ref/script new file mode 100644 index 0000000000..5350876150 --- /dev/null +++ b/acceptance/bundle/variables/unterminated_ref/script @@ -0,0 +1 @@ +trace $CLI bundle validate diff --git a/acceptance/bundle/variables/var_in_var_3level/databricks.yml b/acceptance/bundle/variables/var_in_var_3level/databricks.yml new file mode 100644 index 0000000000..af94bf7d0d --- /dev/null +++ b/acceptance/bundle/variables/var_in_var_3level/databricks.yml @@ -0,0 +1,12 @@ +bundle: + name: test-bundle + +variables: + env: + default: prod + region_prod: + default: us + endpoint_us: + default: https://us.example.com + result: + default: ${var.endpoint_${var.region_${var.env}}} diff --git a/acceptance/bundle/variables/var_in_var_3level/out.test.toml b/acceptance/bundle/variables/var_in_var_3level/out.test.toml 
new file mode 100644 index 0000000000..d560f1de04 --- /dev/null +++ b/acceptance/bundle/variables/var_in_var_3level/out.test.toml @@ -0,0 +1,5 @@ +Local = true +Cloud = false + +[EnvMatrix] + DATABRICKS_BUNDLE_ENGINE = ["terraform", "direct"] diff --git a/acceptance/bundle/variables/var_in_var_3level/output.txt b/acceptance/bundle/variables/var_in_var_3level/output.txt new file mode 100644 index 0000000000..8ddd6e951a --- /dev/null +++ b/acceptance/bundle/variables/var_in_var_3level/output.txt @@ -0,0 +1,20 @@ + +>>> [CLI] bundle validate -o json +{ + "endpoint_us": { + "default": "https://us.example.com", + "value": "https://us.example.com" + }, + "env": { + "default": "prod", + "value": "prod" + }, + "region_prod": { + "default": "us", + "value": "us" + }, + "result": { + "default": "https://us.example.com", + "value": "https://us.example.com" + } +} diff --git a/acceptance/bundle/variables/var_in_var_3level/script b/acceptance/bundle/variables/var_in_var_3level/script new file mode 100644 index 0000000000..1551014050 --- /dev/null +++ b/acceptance/bundle/variables/var_in_var_3level/script @@ -0,0 +1 @@ +trace $CLI bundle validate -o json | jq .variables diff --git a/bundle/config/mutator/warn_malformed_references.go b/bundle/config/mutator/warn_malformed_references.go new file mode 100644 index 0000000000..d13952b64b --- /dev/null +++ b/bundle/config/mutator/warn_malformed_references.go @@ -0,0 +1,72 @@ +package mutator + +import ( + "context" + "errors" + "slices" + + "github.com/databricks/cli/bundle" + "github.com/databricks/cli/libs/diag" + "github.com/databricks/cli/libs/dyn" + "github.com/databricks/cli/libs/interpolation" +) + +type warnMalformedReferences struct{} + +// WarnMalformedReferences returns a mutator that emits warnings for strings +// containing malformed variable references (e.g. "${foo.bar-}"). 
+func WarnMalformedReferences() bundle.Mutator { + return &warnMalformedReferences{} +} + +func (*warnMalformedReferences) Name() string { + return "WarnMalformedReferences" +} + +func (*warnMalformedReferences) Apply(ctx context.Context, b *bundle.Bundle) diag.Diagnostics { + var diags diag.Diagnostics + err := b.Config.Mutate(func(root dyn.Value) (dyn.Value, error) { + _, err := dyn.Walk(root, func(p dyn.Path, v dyn.Value) (dyn.Value, error) { + // Only check values with source locations to avoid false positives + // from synthesized/computed values. + if len(v.Locations()) == 0 { + return v, nil + } + + s, ok := v.AsString() + if !ok { + return v, nil + } + + _, parseErr := interpolation.Parse(s) + if parseErr == nil { + return v, nil + } + + var pe *interpolation.ParseError + if !errors.As(parseErr, &pe) { + return v, nil + } + + // Clone locations and adjust column with the position offset + // so the diagnostic points to the problematic reference. + locs := slices.Clone(v.Locations()) + if len(locs) > 0 { + locs[0].Column += pe.Pos + } + + diags = append(diags, diag.Diagnostic{ + Severity: diag.Warning, + Summary: pe.Msg, + Locations: locs, + Paths: []dyn.Path{p}, + }) + return v, nil + }) + return root, err + }) + if err != nil { + diags = diags.Extend(diag.FromErr(err)) + } + return diags +} diff --git a/bundle/phases/initialize.go b/bundle/phases/initialize.go index 64eb18943a..640fa5193e 100644 --- a/bundle/phases/initialize.go +++ b/bundle/phases/initialize.go @@ -95,6 +95,10 @@ func Initialize(ctx context.Context, b *bundle.Bundle) { // searches for strings with variable references in them. mutator.RewriteWorkspacePrefix(), + // Walks the config tree and emits warnings for malformed variable references + // (e.g. "${foo.bar-}") before variable resolution occurs. 
+ mutator.WarnMalformedReferences(), + // Reads (dynamic): variables.* (checks if there's a value assigned to variable already or if it has lookup reference) // Updates (dynamic): variables.*.value (sets values from environment variables, variable files, or defaults) // Resolves and sets values for bundle variables in the following order: from environment variables, from variable files and then defaults diff --git a/libs/dyn/dynvar/ref.go b/libs/dyn/dynvar/ref.go index a048c80cb8..d52d1fa772 100644 --- a/libs/dyn/dynvar/ref.go +++ b/libs/dyn/dynvar/ref.go @@ -1,23 +1,13 @@ package dynvar import ( - "fmt" - "regexp" - + "github.com/databricks/cli/libs/diag" "github.com/databricks/cli/libs/dyn" -) - -var ( - // !!! Should be in sync with _variable_regex in Python code. - // !!! - // !!! See python/databricks/bundles/core/_transform.py - baseVarDef = `[a-zA-Z]+([-_]*[a-zA-Z0-9]+)*` - re = regexp.MustCompile(fmt.Sprintf(`\$\{(%s(\.%s(\[[0-9]+\])*)*(\[[0-9]+\])*)\}`, baseVarDef, baseVarDef)) + "github.com/databricks/cli/libs/interpolation" ) // Ref represents a variable reference. // It is a string [dyn.Value] contained in a larger [dyn.Value]. -// Its path within the containing [dyn.Value] is also stored. type Ref struct { // Original value. Value dyn.Value @@ -25,13 +15,13 @@ type Ref struct { // String value in the original [dyn.Value]. Str string - // Matches of the variable reference in the string. - Matches [][]string + // Parsed tokens from the interpolation parser. + Tokens []interpolation.Token } // NewRef returns a new Ref if the given [dyn.Value] contains a string // with one or more variable references. It returns false if the given -// [dyn.Value] does not contain variable references. +// [dyn.Value] does not contain variable references or if parsing fails. 
// // Examples of a valid variable references: // - "${a.b}" @@ -39,53 +29,101 @@ type Ref struct { // - "${a.b[0].c}" // - "${a} ${b} ${c}" func NewRef(v dyn.Value) (Ref, bool) { + ref, ok, _ := newRef(v) + return ref, ok +} + +// NewRefWithDiagnostics returns a new Ref along with any diagnostics. +// Parse errors for malformed references (e.g. "${foo.bar-}") are returned +// as warnings. The second return value is false when no valid references +// are found (either no references at all, or a parse error occurred). +func NewRefWithDiagnostics(v dyn.Value) (Ref, bool, diag.Diagnostics) { + return newRef(v) +} + +func newRef(v dyn.Value) (Ref, bool, diag.Diagnostics) { s, ok := v.AsString() if !ok { - return Ref{}, false + return Ref{}, false, nil + } + + tokens, err := interpolation.Parse(s) + if err != nil { + // Return parse error as a warning diagnostic. + return Ref{}, false, diag.Diagnostics{{ + Severity: diag.Warning, + Summary: err.Error(), + Locations: v.Locations(), + }} } - // Check if the string contains any variable references. - m := re.FindAllStringSubmatch(s, -1) - if len(m) == 0 { - return Ref{}, false + // Check if any token is a variable reference. + hasRef := false + for _, t := range tokens { + if t.Kind == interpolation.TokenRef { + hasRef = true + break + } + } + + if !hasRef { + return Ref{}, false, nil } return Ref{ - Value: v, - Str: s, - Matches: m, - }, true + Value: v, + Str: s, + Tokens: tokens, + }, true, nil } // IsPure returns true if the variable reference contains a single // variable reference and nothing more. We need this so we can // interpolate values of non-string types (i.e. it can be substituted). func (v Ref) IsPure() bool { - // Need single match, equal to the incoming string. 
- if len(v.Matches) == 0 || len(v.Matches[0]) == 0 { - panic("invalid variable reference; expect at least one match") - } - return v.Matches[0][0] == v.Str + return len(v.Tokens) == 1 && v.Tokens[0].Kind == interpolation.TokenRef } +// References returns the path strings of all variable references. func (v Ref) References() []string { var out []string - for _, m := range v.Matches { - out = append(out, m[1]) + for _, t := range v.Tokens { + if t.Kind == interpolation.TokenRef { + out = append(out, t.Value) + } } return out } +// IsPureVariableReference returns true if s is a single variable reference +// with no surrounding text. func IsPureVariableReference(s string) bool { - return len(s) > 0 && re.FindString(s) == s + if len(s) == 0 { + return false + } + tokens, err := interpolation.Parse(s) + if err != nil { + return false + } + return len(tokens) == 1 && tokens[0].Kind == interpolation.TokenRef } +// ContainsVariableReference returns true if s contains at least one variable reference. func ContainsVariableReference(s string) bool { - return re.MatchString(s) + tokens, err := interpolation.Parse(s) + if err != nil { + return false + } + for _, t := range tokens { + if t.Kind == interpolation.TokenRef { + return true + } + } + return false } -// If s is a pure variable reference, this function returns the corresponding -// dyn.Path. Otherwise, it returns false. +// PureReferenceToPath returns the corresponding [dyn.Path] if s is a pure +// variable reference. Otherwise, it returns false. 
func PureReferenceToPath(s string) (dyn.Path, bool) { ref, ok := NewRef(dyn.V(s)) if !ok { diff --git a/libs/dyn/dynvar/ref_test.go b/libs/dyn/dynvar/ref_test.go index d59d80cba1..3c98e1e474 100644 --- a/libs/dyn/dynvar/ref_test.go +++ b/libs/dyn/dynvar/ref_test.go @@ -3,6 +3,7 @@ package dynvar import ( "testing" + "github.com/databricks/cli/libs/diag" "github.com/databricks/cli/libs/dyn" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -47,6 +48,23 @@ func TestNewRefInvalidPattern(t *testing.T) { } } +func TestNewRefWithDiagnosticsMalformed(t *testing.T) { + v := dyn.NewValue("${foo.bar-}", []dyn.Location{{File: "test.yml", Line: 1, Column: 1}}) + _, ok, diags := NewRefWithDiagnostics(v) + assert.False(t, ok) + require.Len(t, diags, 1) + assert.Equal(t, diag.Warning, diags[0].Severity) + assert.Contains(t, diags[0].Summary, "invalid") +} + +func TestNewRefWithDiagnosticsValid(t *testing.T) { + v := dyn.V("${foo.bar}") + ref, ok, diags := NewRefWithDiagnostics(v) + assert.True(t, ok) + assert.Empty(t, diags) + assert.Equal(t, []string{"foo.bar"}, ref.References()) +} + func TestIsPureVariableReference(t *testing.T) { assert.False(t, IsPureVariableReference("")) assert.False(t, IsPureVariableReference("${foo.bar} suffix")) diff --git a/libs/dyn/dynvar/resolve.go b/libs/dyn/dynvar/resolve.go index b1366d93bb..f811f25394 100644 --- a/libs/dyn/dynvar/resolve.go +++ b/libs/dyn/dynvar/resolve.go @@ -7,6 +7,7 @@ import ( "strings" "github.com/databricks/cli/libs/dyn" + "github.com/databricks/cli/libs/interpolation" "github.com/databricks/cli/libs/utils" ) @@ -156,30 +157,39 @@ func (r *resolver) resolveRef(ref Ref, seen []string) (dyn.Value, error) { return dyn.NewValue(resolved[0].Value(), ref.Value.Locations()), nil } - // Not pure; perform string interpolation. - for j := range ref.Matches { - // The value is invalid if resolution returned [ErrSkipResolution]. - // We must skip those and leave the original variable reference in place. 
- if !resolved[j].IsValid() { - continue - } - - // Try to turn the resolved value into a string. - s, ok := resolved[j].AsString() - if !ok { - // Only allow primitive types to be converted to string. - switch resolved[j].Kind() { - case dyn.KindString, dyn.KindBool, dyn.KindInt, dyn.KindFloat, dyn.KindTime, dyn.KindNil: - s = fmt.Sprint(resolved[j].AsAny()) - default: - return dyn.InvalidValue, fmt.Errorf("cannot interpolate non-primitive value of type %s into string", resolved[j].Kind()) + // Not pure; perform token-based string interpolation. + var buf strings.Builder + refIdx := 0 + for _, tok := range ref.Tokens { + switch tok.Kind { + case interpolation.TokenLiteral: + buf.WriteString(tok.Value) + case interpolation.TokenRef: + // The value is invalid if resolution returned [ErrSkipResolution]. + // We must skip those and leave the original variable reference in place. + if !resolved[refIdx].IsValid() { + buf.WriteString("${") + buf.WriteString(tok.Value) + buf.WriteByte('}') + } else { + // Try to turn the resolved value into a string. + s, ok := resolved[refIdx].AsString() + if !ok { + // Only allow primitive types to be converted to string. 
+ switch resolved[refIdx].Kind() { + case dyn.KindString, dyn.KindBool, dyn.KindInt, dyn.KindFloat, dyn.KindTime, dyn.KindNil: + s = fmt.Sprint(resolved[refIdx].AsAny()) + default: + return dyn.InvalidValue, fmt.Errorf("cannot interpolate non-primitive value of type %s into string", resolved[refIdx].Kind()) + } + } + buf.WriteString(s) } + refIdx++ } - - ref.Str = strings.Replace(ref.Str, ref.Matches[j][0], s, 1) } - return dyn.NewValue(ref.Str, ref.Value.Locations()), nil + return dyn.NewValue(buf.String(), ref.Value.Locations()), nil } func (r *resolver) resolveKey(key string, seen []string) (dyn.Value, error) { diff --git a/libs/dyn/dynvar/resolve_test.go b/libs/dyn/dynvar/resolve_test.go index 5b64029bae..959cf3344f 100644 --- a/libs/dyn/dynvar/resolve_test.go +++ b/libs/dyn/dynvar/resolve_test.go @@ -393,3 +393,33 @@ func TestResolveSequenceVariable(t *testing.T) { assert.Equal(t, "value1", seq[0].MustString()) assert.Equal(t, "value2", seq[1].MustString()) } + +func TestResolveNestedVariableReference(t *testing.T) { + in := dyn.V(map[string]dyn.Value{ + "tail": dyn.V("x"), + "foo_x": dyn.V("hello"), + "final": dyn.V("${foo_${tail}}"), + }) + + // First pass resolves ${tail} -> "x", producing "${foo_x}" for final. + out, err := dynvar.Resolve(in, dynvar.DefaultLookup(in)) + require.NoError(t, err) + + // After one pass, the inner ref is resolved but the outer is not yet. + assert.Equal(t, "${foo_x}", getByPath(t, out, "final").MustString()) +} + +func TestResolveThreeLevelNestedVariableReference(t *testing.T) { + in := dyn.V(map[string]dyn.Value{ + "c": dyn.V("z"), + "b_z": dyn.V("y"), + "a_y": dyn.V("hello"), + "final": dyn.V("${a_${b_${c}}}"), + }) + + // First pass resolves ${c} -> "z", producing "${a_${b_z}}" for final. 
+ out, err := dynvar.Resolve(in, dynvar.DefaultLookup(in)) + require.NoError(t, err) + assert.Equal(t, "${a_${b_z}}", getByPath(t, out, "final").MustString()) +} + diff --git a/libs/interpolation/parse.go b/libs/interpolation/parse.go new file mode 100644 index 0000000000..ee0bb92ce9 --- /dev/null +++ b/libs/interpolation/parse.go @@ -0,0 +1,223 @@ +package interpolation + +import ( + "errors" + "fmt" + "regexp" + "strings" +) + +// ParseError is returned by [Parse] when the input contains a malformed +// variable reference. Pos is the byte offset in the original string where +// the problematic reference starts. +type ParseError struct { + Msg string + Pos int +} + +func (e *ParseError) Error() string { + return e.Msg +} + +// TokenKind represents the type of a parsed token. +type TokenKind int + +const ( + TokenLiteral TokenKind = iota // Literal text (no interpolation) + TokenRef // Variable reference: content between ${ and } +) + +// Token represents a parsed segment of an interpolation string. +type Token struct { + Kind TokenKind + Value string // For TokenLiteral: the literal text; For TokenRef: the path string (without ${}) + Start int // Start position in original string + End int // End position in original string (exclusive) +} + +const ( + dollarChar = '$' + openBrace = '{' + closeBrace = '}' +) + +// keyPattern validates a single key segment in a variable path. +// Matches: [a-zA-Z]+([-_]*[a-zA-Z0-9]+)* +// Examples: "foo", "my-job", "a_b_c", "abc123" +// +// PyDABs uses a duplicate regex to detect pure variable references +// (python/databricks/bundles/core/_transform.py). The patterns must stay in +// sync. Cross-language test cases live in testdata/variable_references.json +// and are run by both Go (TestParsePureVariableReferences) and Python +// (test_pure_variable_reference). When changing key/index/path validation +// or reference syntax, add cases to that file so both languages are tested. 
+var keyPattern = regexp.MustCompile(`^[a-zA-Z]+([-_]*[a-zA-Z0-9]+)*$`) + +// indexPattern matches one or more [N] index suffixes. +var indexPattern = regexp.MustCompile(`^(\[[0-9]+\])+$`) + +// Parse parses a string that may contain ${...} variable references. +// It returns a slice of tokens representing literal text and variable references. +// +// Nested references like "${a.${b}}" are supported by treating the outer +// "${a." as literal text so that inner references are resolved first. +// After resolution the resulting string (e.g. "${a.x}") is re-parsed. +// +// Examples: +// - "hello" -> [Literal("hello")] +// - "${a.b}" -> [Ref("a.b")] +// - "pre ${a.b} post" -> [Literal("pre "), Ref("a.b"), Literal(" post")] +// - "${a.${b}}" -> [Literal("${a."), Ref("b"), Literal("}")] +func Parse(s string) ([]Token, error) { + if len(s) == 0 { + return nil, nil + } + + var tokens []Token + i := 0 + var buf strings.Builder + litStart := 0 // tracks the start position in the original string for the current literal + + flushLiteral := func(end int) { + if buf.Len() > 0 { + tokens = append(tokens, Token{ + Kind: TokenLiteral, + Value: buf.String(), + Start: litStart, + End: end, + }) + buf.Reset() + } + } + + for i < len(s) { + switch s[i] { + case dollarChar: + // We see '$'. Look ahead. + if i+1 >= len(s) { + // Trailing '$' at end of string: treat as literal. + if buf.Len() == 0 { + litStart = i + } + buf.WriteByte(dollarChar) + i++ + continue + } + + if s[i+1] != openBrace { + // '$' not followed by '{': treat as literal. + if buf.Len() == 0 { + litStart = i + } + buf.WriteByte(dollarChar) + i++ + continue + } + + // Start of variable reference "${". + refStart := i + j := i + 2 // skip "${" + + // Scan until closing '}', handling nested '${'. + pathStart := j + nested := false + for j < len(s) && s[j] != closeBrace { + if s[j] == dollarChar && j+1 < len(s) && s[j+1] == openBrace { + // Nested '${' found. Treat the outer "${..." 
prefix as + // literal so inner references get resolved first. + // E.g. "${a.${b}}" produces: + // [Literal("${a."), Ref("b"), Literal("}")] + nested = true + break + } + j++ + } + + if nested { + if buf.Len() == 0 { + litStart = refStart + } + buf.WriteString(s[refStart:j]) + i = j + continue + } + + if j >= len(s) { + return nil, &ParseError{ + Msg: "unterminated variable reference", + Pos: refStart, + } + } + + path := s[pathStart:j] + j++ // skip '}' + + if path == "" { + return nil, &ParseError{ + Msg: "empty variable reference", + Pos: refStart, + } + } + + if err := validatePath(path); err != nil { + return nil, &ParseError{ + Msg: fmt.Sprintf("invalid variable reference ${%s}: %s", path, err), + Pos: refStart, + } + } + + flushLiteral(i) + tokens = append(tokens, Token{ + Kind: TokenRef, + Value: path, + Start: refStart, + End: j, + }) + i = j + + default: + if buf.Len() == 0 { + litStart = i + } + buf.WriteByte(s[i]) + i++ + } + } + + flushLiteral(i) + return tokens, nil +} + +// validatePath validates the path inside a ${...} reference by splitting on +// '.' and validating each segment individually. +func validatePath(path string) error { + segments := strings.Split(path, ".") + for _, seg := range segments { + if seg == "" { + return errors.New("invalid path") + } + + // Strip trailing [N] index suffixes to get the key part. + key, idx := splitKeyAndIndex(seg) + + if key == "" { + return fmt.Errorf("invalid key %q", seg) + } + if !keyPattern.MatchString(key) { + return fmt.Errorf("invalid key %q", key) + } + if idx != "" && !indexPattern.MatchString(idx) { + return fmt.Errorf("invalid index in %q", seg) + } + } + return nil +} + +// splitKeyAndIndex splits "foo[0][1]" into ("foo", "[0][1]"). 
+func splitKeyAndIndex(seg string) (string, string) {
+	i := strings.IndexByte(seg, '[')
+	if i < 0 {
+		return seg, ""
+	}
+	return seg[:i], seg[i:]
+}
diff --git a/libs/interpolation/parse_test.go b/libs/interpolation/parse_test.go
new file mode 100644
index 0000000000..3c23eb6529
--- /dev/null
+++ b/libs/interpolation/parse_test.go
@@ -0,0 +1,214 @@
+package interpolation
+
+import (
+	"encoding/json"
+	"os"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+func TestParseValidPaths(t *testing.T) {
+	tests := []struct {
+		input string
+		path  string
+	}{
+		{"${a}", "a"},
+		{"${abc}", "abc"},
+		{"${a.b.c}", "a.b.c"},
+		{"${a.b[0]}", "a.b[0]"},
+		{"${a[0]}", "a[0]"},
+		{"${a.b[0][1]}", "a.b[0][1]"},
+		{"${a.b-c}", "a.b-c"},
+		{"${a.b_c}", "a.b_c"},
+		{"${a.b-c-d}", "a.b-c-d"},
+		{"${a.b_c_d}", "a.b_c_d"},
+		{"${abc.def.ghi}", "abc.def.ghi"},
+		{"${a.b123}", "a.b123"},
+		{"${resources.jobs.my-job.id}", "resources.jobs.my-job.id"},
+		{"${var.my_var}", "var.my_var"},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.input, func(t *testing.T) {
+			tokens, err := Parse(tt.input)
+			require.NoError(t, err)
+			require.Len(t, tokens, 1)
+			assert.Equal(t, TokenRef, tokens[0].Kind)
+			assert.Equal(t, tt.path, tokens[0].Value)
+		})
+	}
+}
+
+func TestParse(t *testing.T) {
+	tests := []struct {
+		name   string
+		input  string
+		tokens []Token
+	}{
+		{
+			"empty",
+			"",
+			nil,
+		},
+		{
+			"literal_only",
+			"hello world",
+			[]Token{{Kind: TokenLiteral, Value: "hello world", Start: 0, End: 11}},
+		},
+		{
+			"single_ref",
+			"${a.b}",
+			[]Token{{Kind: TokenRef, Value: "a.b", Start: 0, End: 6}},
+		},
+		{
+			"multiple_refs",
+			"${a} ${b}",
+			[]Token{
+				{Kind: TokenRef, Value: "a", Start: 0, End: 4},
+				{Kind: TokenLiteral, Value: " ", Start: 4, End: 5},
+				{Kind: TokenRef, Value: "b", Start: 5, End: 9},
+			},
+		},
+		{
+			"literal_ref_literal",
+			"pre ${a.b} post",
+			[]Token{
+				{Kind: TokenLiteral, Value: "pre ", Start: 0, End: 4},
+				{Kind: TokenRef, Value: "a.b", Start: 4, End: 10},
+				{Kind: TokenLiteral, Value: " post", Start: 10, End: 15},
+			},
+		},
+		{
+			"adjacent_refs",
+			"${a}${b}",
+			[]Token{
+				{Kind: TokenRef, Value: "a", Start: 0, End: 4},
+				{Kind: TokenRef, Value: "b", Start: 4, End: 8},
+			},
+		},
+		{
+			"dollar_at_end",
+			"abc$",
+			[]Token{{Kind: TokenLiteral, Value: "abc$", Start: 0, End: 4}},
+		},
+		{
+			"dollar_before_non_brace",
+			"$x",
+			[]Token{{Kind: TokenLiteral, Value: "$x", Start: 0, End: 2}},
+		},
+		{
+			"dollar_mid_literal",
+			"a$b",
+			[]Token{{Kind: TokenLiteral, Value: "a$b", Start: 0, End: 3}},
+		},
+		{
+			"backslash_at_end",
+			`abc\`,
+			[]Token{{Kind: TokenLiteral, Value: `abc\`, Start: 0, End: 4}},
+		},
+		{
+			"nested_ref",
+			"${var.foo_${var.tail}}",
+			[]Token{
+				{Kind: TokenLiteral, Value: "${var.foo_", Start: 0, End: 10},
+				{Kind: TokenRef, Value: "var.tail", Start: 10, End: 21},
+				{Kind: TokenLiteral, Value: "}", Start: 21, End: 22},
+			},
+		},
+		{
+			"three_level_nested_ref",
+			"${a_${b_${c}}}",
+			[]Token{
+				{Kind: TokenLiteral, Value: "${a_${b_", Start: 0, End: 8},
+				{Kind: TokenRef, Value: "c", Start: 8, End: 12},
+				{Kind: TokenLiteral, Value: "}}", Start: 12, End: 14},
+			},
+		},
+		{
+			"nested_ref_mid_path",
+			"${a.${b.c}.d}",
+			[]Token{
+				{Kind: TokenLiteral, Value: "${a.", Start: 0, End: 4},
+				{Kind: TokenRef, Value: "b.c", Start: 4, End: 10},
+				{Kind: TokenLiteral, Value: ".d}", Start: 10, End: 13},
+			},
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			tokens, err := Parse(tt.input)
+			require.NoError(t, err)
+			assert.Equal(t, tt.tokens, tokens)
+		})
+	}
+}
+
+func TestParseErrors(t *testing.T) {
+	tests := []struct {
+		name        string
+		input       string
+		errContains string
+	}{
+		{"unterminated_ref", "${a.b", "unterminated"},
+		{"empty_ref", "${}", "empty"},
+		{"trailing_hyphen", "${foo.bar-}", "invalid"},
+		{"double_dot", "${foo..bar}", "invalid"},
+		{"leading_digit", "${0foo}", "invalid"},
+		{"hyphen_start_segment", "${foo.-bar}", "invalid"},
+		{"trailing_dot", "${foo.}", "invalid"},
+		{"leading_dot", "${.foo}", "invalid"},
+		{"space_in_path", "${foo. bar}", "invalid"},
+		{"special_char", "${foo.bar!}", "invalid"},
+		{"just_digits", "${123}", "invalid"},
+		{"trailing_underscore", "${foo.bar_}", "invalid"},
+		{"underscore_start_segment", "${foo._bar}", "invalid"},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			_, err := Parse(tt.input)
+			require.Error(t, err)
+			assert.Contains(t, err.Error(), tt.errContains)
+		})
+	}
+}
+
+// TestParsePureVariableReferences loads shared test cases from
+// testdata/variable_references.json and verifies the Go parser agrees
+// on which strings are pure variable references.
+//
+// The same JSON file is consumed by the Python test suite
+// (python/databricks_tests/core/test_variable_references.py) to
+// verify that the Python regex stays in sync with the Go parser.
+//
+// When modifying the parser (e.g. adding new key patterns or
+// reference syntax), add test cases to the JSON file so both Go
+// and Python are validated.
+func TestParsePureVariableReferences(t *testing.T) {
+	data, err := os.ReadFile("testdata/variable_references.json")
+	require.NoError(t, err)
+
+	var cases []struct {
+		Input     string  `json:"input"`
+		IsPureRef bool    `json:"is_pure_ref"`
+		Path      *string `json:"path,omitempty"`
+		Comment   string  `json:"comment"`
+	}
+	require.NoError(t, json.Unmarshal(data, &cases))
+
+	for _, tc := range cases {
+		t.Run(tc.Comment, func(t *testing.T) {
+			tokens, parseErr := Parse(tc.Input)
+
+			isPure := parseErr == nil && len(tokens) == 1 && tokens[0].Kind == TokenRef
+			assert.Equal(t, tc.IsPureRef, isPure, "input: %s", tc.Input)
+
+			if tc.IsPureRef && tc.Path != nil {
+				assert.Equal(t, *tc.Path, tokens[0].Value)
+			}
+		})
+	}
+}
diff --git a/libs/interpolation/testdata/variable_references.json b/libs/interpolation/testdata/variable_references.json
new file mode 100644
index 0000000000..748f7ef969
--- /dev/null
+++ b/libs/interpolation/testdata/variable_references.json
@@ -0,0 +1,142 @@
+[
+  {
+    "input": "${a.b}",
+    "is_pure_ref": true,
+    "path": "a.b",
+    "comment": "simple two-segment path"
+  },
+  {
+    "input": "${a.b.c}",
+    "is_pure_ref": true,
+    "path": "a.b.c",
+    "comment": "three-segment path"
+  },
+  {
+    "input": "${a.b[0].c}",
+    "is_pure_ref": true,
+    "path": "a.b[0].c",
+    "comment": "path with index"
+  },
+  {
+    "input": "${a[0]}",
+    "is_pure_ref": true,
+    "path": "a[0]",
+    "comment": "single key with index"
+  },
+  {
+    "input": "${a.b[0][1]}",
+    "is_pure_ref": true,
+    "path": "a.b[0][1]",
+    "comment": "path with multiple indices"
+  },
+  {
+    "input": "${a.b-c}",
+    "is_pure_ref": true,
+    "path": "a.b-c",
+    "comment": "hyphen in key"
+  },
+  {
+    "input": "${a.b_c}",
+    "is_pure_ref": true,
+    "path": "a.b_c",
+    "comment": "underscore in key"
+  },
+  {
+    "input": "${resources.jobs.my-job.id}",
+    "is_pure_ref": true,
+    "path": "resources.jobs.my-job.id",
+    "comment": "typical resource reference"
+  },
+  {
+    "input": "${var.my_var}",
+    "is_pure_ref": true,
+    "path": "var.my_var",
"comment": "typical variable reference" + }, + { + "input": "${a}", + "is_pure_ref": true, + "path": "a", + "comment": "single key" + }, + { + "input": "hello", + "is_pure_ref": false, + "comment": "plain string, no reference" + }, + { + "input": "${a} ${b}", + "is_pure_ref": false, + "comment": "multiple references, not pure" + }, + { + "input": "pre ${a.b} post", + "is_pure_ref": false, + "comment": "reference with surrounding text" + }, + { + "input": "${a}${b}", + "is_pure_ref": false, + "comment": "adjacent references, not pure" + }, + { + "input": "", + "is_pure_ref": false, + "comment": "empty string" + }, + { + "input": "${}", + "is_pure_ref": false, + "comment": "empty reference" + }, + { + "input": "${a.b", + "is_pure_ref": false, + "comment": "unterminated reference" + }, + { + "input": "${foo.bar-}", + "is_pure_ref": false, + "comment": "trailing hyphen in key" + }, + { + "input": "${foo..bar}", + "is_pure_ref": false, + "comment": "double dot in path" + }, + { + "input": "${0foo}", + "is_pure_ref": false, + "comment": "leading digit in key" + }, + { + "input": "${foo. 
bar}", + "is_pure_ref": false, + "comment": "space in path" + }, + { + "input": "${foo.bar!}", + "is_pure_ref": false, + "comment": "special char in key" + }, + { + "input": "${foo.bar_}", + "is_pure_ref": false, + "comment": "trailing underscore in key" + }, + { + "input": "${foo._bar}", + "is_pure_ref": false, + "comment": "underscore-prefixed segment" + }, + { + "input": "$x", + "is_pure_ref": false, + "comment": "dollar without brace" + }, + { + "input": "abc$", + "is_pure_ref": false, + "comment": "trailing dollar" + } +] diff --git a/python/databricks/bundles/core/_transform.py b/python/databricks/bundles/core/_transform.py index 8cf639c2c1..05f98417fb 100644 --- a/python/databricks/bundles/core/_transform.py +++ b/python/databricks/bundles/core/_transform.py @@ -272,9 +272,14 @@ def _unwrap_variable(tpe: type) -> Optional[type]: return None -# Regex for string corresponding to variables. +# Regex for detecting pure variable references (entire string is a single ${...}). # -# The source of truth is regex in libs/dyn/dynvar/ref.go +# This regex must stay in sync with the Go parser in libs/interpolation/parse.go. +# The Go parser is the source of truth for interpolation; this regex only needs +# to recognize pure references so PyDABs can wrap them as Variable objects. +# +# Cross-language tests in libs/interpolation/testdata/variable_references.json +# verify that this regex and the Go parser agree. # # Example: # - "${a.b}" diff --git a/python/databricks_tests/core/test_variable_references.py b/python/databricks_tests/core/test_variable_references.py new file mode 100644 index 0000000000..21c41e8db2 --- /dev/null +++ b/python/databricks_tests/core/test_variable_references.py @@ -0,0 +1,45 @@ +"""Cross-language test for variable reference detection. + +Loads shared test cases from libs/interpolation/testdata/variable_references.json +and verifies the Python regex agrees with the Go parser on which strings are pure +variable references. 
+
+The same JSON file is consumed by the Go test suite
+(libs/interpolation/parse_test.go:TestParsePureVariableReferences).
+
+When modifying the Go parser (e.g. adding new key patterns, escape sequences,
+or reference syntax), add test cases to the JSON file so both Go and Python
+are validated.
+"""
+
+import json
+from pathlib import Path
+
+import pytest
+
+from databricks.bundles.core._transform import _unwrap_variable_path
+
+_testdata = (
+    Path(__file__).resolve().parents[3]
+    / "libs"
+    / "interpolation"
+    / "testdata"
+    / "variable_references.json"
+)
+_cases = json.loads(_testdata.read_text())
+
+
+@pytest.mark.parametrize(
+    "tc",
+    _cases,
+    ids=[tc["comment"] for tc in _cases],
+)
+def test_pure_variable_reference(tc):
+    result = _unwrap_variable_path(tc["input"])
+
+    if tc["is_pure_ref"]:
+        assert result == tc.get("path"), (
+            f"expected pure ref with path={tc.get('path')!r}, got {result!r}"
+        )
+    else:
+        assert result is None, f"expected None for non-pure ref, got {result!r}"