Skip to content

Commit afc1b6d

Browse files
committed
fix(tests): environment-agnostic cassette normalization
VCR cassette normalization improvements to eliminate noisy diffs between local Docker and staging recordings: - Add _sort_known_arrays() that sorts only specific non-deterministic API response arrays (referenceProperties, edges, workspaceDataFilterColumns/References, userGroups) during YAML serialization. Unlike blanket sorting, this preserves server order for arrays that tests assert on. - Fix XML group sorting in bytes branch: _sort_xml_groups() was only called for str bodies, skipping bytes bodies from compressed responses. Now applied in both branches of custom_before_request() and custom_before_response(). - Add _stabilize_hash_placeholders() to rename session-global numeric hash indices (EXECUTION_NORMALIZED_3) to format-based names (EXECUTION_NORMALIZED_CSV, EXPORT_NORMALIZED_XLSX) for export cassettes, with per-cassette re-indexing for non-export hashes. - Add firstname/lastname/email to demo_user.json fixture so staging user profile matches local Docker bootstrap. risk: low
1 parent 19e1cc1 commit afc1b6d

2 files changed

Lines changed: 169 additions & 2 deletions

File tree

packages/tests-support/fixtures/demo_user.json

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,10 @@
33
"id": "demo",
44
"type": "user",
55
"attributes": {
6-
"authenticationId": ""
6+
"authenticationId": "",
7+
"firstname": "Demo",
8+
"lastname": "User",
9+
"email": "demo@example.com"
710
},
811
"relationships": {
912
"userGroups": {

packages/tests-support/src/tests_support/vcrpy_utils.py

Lines changed: 165 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -145,9 +145,16 @@
145145

146146
# Indexed hash maps: preserve distinctness across different hashes within a cassette.
# Each unique hash gets a unique index (e.g. EXECUTION_NORMALIZED_1, _2, ...).
# NOTE: indices may be non-sequential within a single cassette because the maps
# accumulate across a test session. _stabilize_hash_placeholders() fixes this at
# serialization time so each cassette file is self-contained (starts from _1).
_exec_hash_map: dict[str, str] = {}
_export_hash_map: dict[str, str] = {}

# Regexes used at serialization time to locate the session-global numeric
# placeholders (e.g. "EXECUTION_NORMALIZED_3") so they can be renamed to
# per-cassette or format-based names. Group 1 captures the numeric index.
_REINDEX_EXEC_RE = re.compile(r"EXECUTION_NORMALIZED_(\d+)")
_REINDEX_EXPORT_RE = re.compile(r"EXPORT_NORMALIZED_(\d+)")
157+
151158

152159
def _exec_hash_replacer(match: re.Match) -> str:
153160
"""Replace execution hash with an indexed placeholder, preserving distinctness."""
@@ -339,6 +346,105 @@ def get_vcr() -> vcr.VCR:
339346
return gd_vcr
340347

341348

349+
def _stabilize_hash_placeholders(cassette_dict: dict[str, Any]) -> dict[str, str]:
    """Build a rename map for hash placeholders based on export format context.

    During recording, execution/export hashes get session-global numeric indices
    (EXECUTION_NORMALIZED_3, EXPORT_NORMALIZED_5). These indices shift when test
    execution order changes, causing spurious cassette diffs.

    This function scans the cassette for export context (POST /export/tabular with
    a ``format`` field) and returns a mapping from old placeholder names to stable,
    format-based names (e.g. EXECUTION_NORMALIZED_CSV, EXPORT_NORMALIZED_XLSX).

    Non-export hashes (pure compute results) are re-indexed per-cassette starting
    from 1.

    Returns:
        Mapping from old placeholder string to its new stable name; execution
        and export renames are merged into a single dict.
    """
    exec_renames: dict[str, str] = {}  # old placeholder → new name
    export_renames: dict[str, str] = {}
    format_counts: dict[str, int] = {}  # format → count, for dedup

    # Pass 1: derive format-based names from export requests/responses.
    for interaction in cassette_dict.get("interactions", []):
        req = interaction["request"]
        resp = interaction["response"]

        if req.get("method") != "POST":
            continue
        uri = req.get("uri", "")
        if "/export/tabular" not in uri:
            continue

        # NOTE(review): assumes request bodies were already parsed to dicts by
        # the serializer before this runs; raw-string bodies are skipped.
        body = req.get("body")
        if not isinstance(body, dict):
            continue
        fmt = (body.get("format") or "").upper()
        if not fmt:
            continue

        # Track how many exports of each format we've seen
        format_counts[fmt] = format_counts.get(fmt, 0) + 1
        suffix = fmt if format_counts[fmt] == 1 else f"{fmt}_{format_counts[fmt]}"

        # Map execution hash placeholder → format name
        exec_ref = body.get("executionResult")
        if isinstance(exec_ref, str) and _REINDEX_EXEC_RE.match(exec_ref):
            exec_renames[exec_ref] = f"{_CANONICAL_EXECUTION_RESULT}_{suffix}"

        # Map export hash placeholder → format name (from response)
        resp_body = resp.get("body", {}).get("string")
        if isinstance(resp_body, dict):
            export_ref = resp_body.get("exportResult")
            if isinstance(export_ref, str) and _REINDEX_EXPORT_RE.match(export_ref):
                export_renames[export_ref] = f"{_CANONICAL_EXPORT_RESULT}_{suffix}"

    # Pass 2: re-index any remaining non-export execution hashes per-cassette,
    # in first-occurrence order, so each cassette starts from _1.
    all_exec_refs: list[str] = []
    all_export_refs: list[str] = []
    for interaction in cassette_dict.get("interactions", []):
        for part in ("request", "response"):
            _collect_placeholders(interaction[part], all_exec_refs, all_export_refs)

    idx = 1
    for ref in dict.fromkeys(all_exec_refs):  # preserves order, deduplicates
        if ref not in exec_renames:
            exec_renames[ref] = f"{_CANONICAL_EXECUTION_RESULT}_{idx}"
            idx += 1

    idx = 1
    for ref in dict.fromkeys(all_export_refs):
        if ref not in export_renames:
            export_renames[ref] = f"{_CANONICAL_EXPORT_RESULT}_{idx}"
            idx += 1

    return {**exec_renames, **export_renames}
420+
421+
422+
def _collect_placeholders(obj: Any, exec_refs: list[str], export_refs: list[str]) -> None:
    """Recursively gather hash placeholder strings from a nested structure.

    Every occurrence of an ``EXECUTION_NORMALIZED_<n>`` placeholder found in
    *obj* is appended to ``exec_refs`` and every ``EXPORT_NORMALIZED_<n>``
    placeholder to ``export_refs``, in document order. Duplicates are kept;
    callers are expected to deduplicate.
    """
    if isinstance(obj, dict):
        for value in obj.values():
            _collect_placeholders(value, exec_refs, export_refs)
    elif isinstance(obj, list):
        for element in obj:
            _collect_placeholders(element, exec_refs, export_refs)
    elif isinstance(obj, str):
        exec_refs += (match.group(0) for match in _REINDEX_EXEC_RE.finditer(obj))
        export_refs += (match.group(0) for match in _REINDEX_EXPORT_RE.finditer(obj))
433+
434+
435+
def _apply_hash_renames(obj: Any, renames: dict[str, str]) -> Any:
436+
"""Recursively apply placeholder renames across a nested structure."""
437+
if isinstance(obj, str):
438+
for old, new in renames.items():
439+
obj = obj.replace(old, new)
440+
return obj
441+
if isinstance(obj, dict):
442+
return {k: _apply_hash_renames(v, renames) for k, v in obj.items()}
443+
if isinstance(obj, list):
444+
return [_apply_hash_renames(item, renames) for item in obj]
445+
return obj
446+
447+
342448
class IndentDumper(yaml.SafeDumper):
343449
@typing.no_type_check
344450
def increase_indent(self, flow: bool = False, indentless: bool = False):
@@ -375,6 +481,52 @@ def _sort_groups(parent: ET.Element) -> None:
375481
return text
376482

377483

484+
# Keys whose array values are returned by the API in non-deterministic order.
# Only these arrays are sorted during serialization — everything else keeps
# the server's original order so that tests replaying from cassettes see the
# same data that was recorded. Consumed by _sort_known_arrays().
_SORT_SAFE_KEYS: frozenset[str] = frozenset(
    {
        # Entity API: dataset reference properties (order varies between environments)
        "referenceProperties",
        # Entity API: workspace data filter columns & references
        "workspaceDataFilterColumns",
        "workspaceDataFilterReferences",
        # Dependent entities graph: edge list (set semantics, no inherent order)
        "edges",
        # Available assignees: user/group lists (order varies)
        "userGroups",
    }
)
501+
502+
503+
def _sort_known_arrays(obj: Any, parent_key: str | None = None) -> Any:
    """Sort only arrays that sit under known non-deterministic keys.

    An array is sorted solely when its dict key is listed in _SORT_SAFE_KEYS;
    every other array keeps the server's original order so that cassette
    replay sees exactly the data that was recorded. The recursion applies
    the same rule to nested sub-arrays (e.g. ``referenceProperties[*].sources``).
    """
    if isinstance(obj, dict):
        # Each value is visited with its own key as the sorting context.
        return {key: _sort_known_arrays(value, parent_key=key) for key, value in obj.items()}
    if not isinstance(obj, list) or not obj:
        return obj
    # Depth-first: normalize the children before deciding whether to sort.
    children = [_sort_known_arrays(element, parent_key=parent_key) for element in obj]
    # Sort only directly under a safe key, and only for "complex" items.
    if parent_key not in _SORT_SAFE_KEYS or not isinstance(children[0], (dict, list)):
        return children
    try:
        # Canonical JSON (sorted keys) yields a deterministic ordering key.
        return sorted(children, key=lambda item: orjson.dumps(item, option=orjson.OPT_SORT_KEYS))
    except TypeError:
        # Unserializable items: keep the recorded order rather than fail.
        return children
528+
529+
378530
class CustomSerializerYaml:
379531
def deserialize(self, cassette_string: str) -> dict[str, Any]:
380532
cassette_dict = yaml.safe_load(cassette_string)
@@ -411,10 +563,18 @@ def serialize(self, cassette_dict: dict[str, Any]) -> str:
411563
interaction["request"]["body"] = request_body
412564
if response_body is not None and response_body["string"] != "":
413565
try:
414-
interaction["response"]["body"]["string"] = orjson.loads(response_body["string"])
566+
parsed = orjson.loads(response_body["string"])
567+
interaction["response"]["body"]["string"] = _sort_known_arrays(parsed)
415568
except (orjson.JSONDecodeError, UnicodeDecodeError):
416569
# these exceptions are expected while getting file content
417570
continue
571+
572+
# Stabilize hash placeholders: rename session-global numeric indices
573+
# to format-based names (EXPORT_NORMALIZED_CSV) or per-cassette indices.
574+
renames = _stabilize_hash_placeholders(cassette_dict)
575+
if renames:
576+
cassette_dict = _apply_hash_renames(cassette_dict, renames)
577+
418578
return yaml.dump(cassette_dict, Dumper=IndentDumper, sort_keys=True)
419579

420580

@@ -441,6 +601,8 @@ def custom_before_request(request, headers_str: str = HEADERS_STR):
441601
if _normalization_replacements:
442602
decoded = _apply_replacements(decoded)
443603
decoded = _normalize_hashes_in_text(decoded)
604+
if decoded.startswith("<?xml"):
605+
decoded = _sort_xml_groups(decoded)
444606
request.body = decoded.encode("utf-8")
445607
elif isinstance(request.body, str):
446608
if _normalization_replacements:
@@ -498,6 +660,8 @@ def custom_before_response(
498660
if _normalization_replacements:
499661
decoded = _apply_replacements(decoded)
500662
decoded = _normalize_hashes_in_text(decoded)
663+
if decoded.startswith("<?xml"):
664+
decoded = _sort_xml_groups(decoded)
501665
body["string"] = decoded.encode("utf-8")
502666
elif isinstance(body_string, str):
503667
if _normalization_replacements:

0 commit comments

Comments
 (0)