# Indexed hash maps: preserve distinctness across different hashes within a cassette.
# Each unique hash gets a unique index (e.g. EXECUTION_NORMALIZED_1, _2, ...).
# NOTE: indices may be non-sequential within a single cassette because the maps
# accumulate across a test session. _stabilize_hash_placeholders() fixes this at
# serialization time so each cassette file is self-contained (starts from _1).
_exec_hash_map: dict[str, str] = {}
_export_hash_map: dict[str, str] = {}

# Regexes for locating indexed placeholders when re-indexing at serialization
# time. The numeric suffix is captured so callers can inspect the index.
_REINDEX_EXEC_RE = re.compile(r"EXECUTION_NORMALIZED_(\d+)")
_REINDEX_EXPORT_RE = re.compile(r"EXPORT_NORMALIZED_(\d+)")

151158
152159def _exec_hash_replacer (match : re .Match ) -> str :
153160 """Replace execution hash with an indexed placeholder, preserving distinctness."""
@@ -339,6 +346,105 @@ def get_vcr() -> vcr.VCR:
339346 return gd_vcr
340347
341348
def _stabilize_hash_placeholders(cassette_dict: dict[str, Any]) -> dict[str, str]:
    """Compute a rename map that gives hash placeholders stable, local names.

    Recording assigns session-global numeric indices to execution/export
    hashes (EXECUTION_NORMALIZED_3, EXPORT_NORMALIZED_5), so the numbers
    depend on test execution order and produce noisy cassette diffs.

    Export interactions (POST to /export/tabular carrying a ``format`` field)
    are renamed to format-based placeholders such as EXECUTION_NORMALIZED_CSV
    or EXPORT_NORMALIZED_XLSX; repeated exports of the same format get a
    numeric disambiguator (CSV_2, CSV_3, ...). Every remaining placeholder is
    re-indexed per cassette, counting from 1 in order of first appearance.

    Returns a mapping from the old placeholder text to its stable replacement.
    """
    exec_renames: dict[str, str] = {}
    export_renames: dict[str, str] = {}
    seen_formats: dict[str, int] = {}  # how many exports per format so far

    interactions = cassette_dict.get("interactions", [])

    # Pass 1: derive format-based names from export interactions.
    for interaction in interactions:
        request = interaction["request"]
        response = interaction["response"]

        if request.get("method") != "POST":
            continue
        if "/export/tabular" not in request.get("uri", ""):
            continue

        payload = request.get("body")
        if not isinstance(payload, dict):
            continue
        fmt = (payload.get("format") or "").upper()
        if not fmt:
            continue

        # Disambiguate repeated exports of the same format.
        seen_formats[fmt] = seen_formats.get(fmt, 0) + 1
        occurrence = seen_formats[fmt]
        suffix = fmt if occurrence == 1 else f"{fmt}_{occurrence}"

        # Request side: execution hash placeholder → format name.
        exec_ref = payload.get("executionResult")
        if isinstance(exec_ref, str) and _REINDEX_EXEC_RE.match(exec_ref):
            exec_renames[exec_ref] = f"{_CANONICAL_EXECUTION_RESULT}_{suffix}"

        # Response side: export hash placeholder → format name.
        response_payload = response.get("body", {}).get("string")
        if isinstance(response_payload, dict):
            export_ref = response_payload.get("exportResult")
            if isinstance(export_ref, str) and _REINDEX_EXPORT_RE.match(export_ref):
                export_renames[export_ref] = f"{_CANONICAL_EXPORT_RESULT}_{suffix}"

    # Pass 2: collect every placeholder occurrence, in document order.
    seen_exec: list[str] = []
    seen_export: list[str] = []
    for interaction in interactions:
        _collect_placeholders(interaction["request"], seen_exec, seen_export)
        _collect_placeholders(interaction["response"], seen_exec, seen_export)

    # Pass 3: re-index whatever pass 1 did not already rename, starting at 1.
    for renames, refs, canonical in (
        (exec_renames, seen_exec, _CANONICAL_EXECUTION_RESULT),
        (export_renames, seen_export, _CANONICAL_EXPORT_RESULT),
    ):
        counter = 1
        for ref in dict.fromkeys(refs):  # ordered de-duplication
            if ref not in renames:
                renames[ref] = f"{canonical}_{counter}"
                counter += 1

    return {**exec_renames, **export_renames}
420+
421+
def _collect_placeholders(obj: Any, exec_refs: list[str], export_refs: list[str]) -> None:
    """Recursively gather every hash placeholder string found in *obj*.

    Matches are appended to ``exec_refs`` / ``export_refs`` in document
    order (depth-first, left to right), which callers rely on for stable
    per-cassette re-indexing.
    """
    if isinstance(obj, str):
        for match in _REINDEX_EXEC_RE.finditer(obj):
            exec_refs.append(match.group(0))
        for match in _REINDEX_EXPORT_RE.finditer(obj):
            export_refs.append(match.group(0))
        return

    # Normalize dicts to their value sequence, then recurse over children.
    children = obj.values() if isinstance(obj, dict) else obj
    if isinstance(obj, (dict, list)):
        for child in children:
            _collect_placeholders(child, exec_refs, export_refs)
433+
434+
435+ def _apply_hash_renames (obj : Any , renames : dict [str , str ]) -> Any :
436+ """Recursively apply placeholder renames across a nested structure."""
437+ if isinstance (obj , str ):
438+ for old , new in renames .items ():
439+ obj = obj .replace (old , new )
440+ return obj
441+ if isinstance (obj , dict ):
442+ return {k : _apply_hash_renames (v , renames ) for k , v in obj .items ()}
443+ if isinstance (obj , list ):
444+ return [_apply_hash_renames (item , renames ) for item in obj ]
445+ return obj
446+
447+
342448class IndentDumper (yaml .SafeDumper ):
343449 @typing .no_type_check
344450 def increase_indent (self , flow : bool = False , indentless : bool = False ):
@@ -375,6 +481,52 @@ def _sort_groups(parent: ET.Element) -> None:
375481 return text
376482
377483
# Keys whose array values are returned by the API in non-deterministic order.
# Only these arrays are sorted during serialization — everything else keeps
# the server's original order so that tests replaying from cassettes see the
# same data that was recorded. Consumed by _sort_known_arrays() below.
_SORT_SAFE_KEYS: frozenset[str] = frozenset(
    {
        # Entity API: dataset reference properties (order varies between environments)
        "referenceProperties",
        # Entity API: workspace data filter columns & references
        "workspaceDataFilterColumns",
        "workspaceDataFilterReferences",
        # Dependent entities graph: edge list (set semantics, no inherent order)
        "edges",
        # Available assignees: user/group lists (order varies)
        "userGroups",
    }
)
501+
502+
def _sort_known_arrays(obj: Any, parent_key: str | None = None) -> Any:
    """Sort arrays only where they sit under a known non-deterministic key.

    A blanket sort of every complex array would diverge from what the server
    actually returned; instead, only arrays whose parent key appears in
    ``_SORT_SAFE_KEYS`` are ordered (by their canonical JSON encoding), so
    cassette replay matches the recording everywhere else.

    Recursion descends first, so nested sub-arrays within a sorted scope
    (e.g. ``referenceProperties[*].sources``) are normalized too.
    """
    if isinstance(obj, list):
        if not obj:
            return obj
        # Normalize children before deciding whether to sort this level.
        normalized = [_sort_known_arrays(child, parent_key=parent_key) for child in obj]
        under_safe_key = parent_key in _SORT_SAFE_KEYS
        if not (under_safe_key and isinstance(normalized[0], (dict, list))):
            return normalized
        try:
            # Canonical JSON bytes give a total, deterministic ordering.
            return sorted(
                normalized,
                key=lambda node: orjson.dumps(node, option=orjson.OPT_SORT_KEYS),
            )
        except TypeError:
            # Heterogeneous/unencodable items: keep the recorded order.
            return normalized
    if isinstance(obj, dict):
        return {key: _sort_known_arrays(value, parent_key=key) for key, value in obj.items()}
    return obj
528+
529+
378530class CustomSerializerYaml :
379531 def deserialize (self , cassette_string : str ) -> dict [str , Any ]:
380532 cassette_dict = yaml .safe_load (cassette_string )
@@ -411,10 +563,18 @@ def serialize(self, cassette_dict: dict[str, Any]) -> str:
411563 interaction ["request" ]["body" ] = request_body
412564 if response_body is not None and response_body ["string" ] != "" :
413565 try :
414- interaction ["response" ]["body" ]["string" ] = orjson .loads (response_body ["string" ])
566+ parsed = orjson .loads (response_body ["string" ])
567+ interaction ["response" ]["body" ]["string" ] = _sort_known_arrays (parsed )
415568 except (orjson .JSONDecodeError , UnicodeDecodeError ):
416569 # these exceptions are expected while getting file content
417570 continue
571+
572+ # Stabilize hash placeholders: rename session-global numeric indices
573+ # to format-based names (EXPORT_NORMALIZED_CSV) or per-cassette indices.
574+ renames = _stabilize_hash_placeholders (cassette_dict )
575+ if renames :
576+ cassette_dict = _apply_hash_renames (cassette_dict , renames )
577+
418578 return yaml .dump (cassette_dict , Dumper = IndentDumper , sort_keys = True )
419579
420580
@@ -441,6 +601,8 @@ def custom_before_request(request, headers_str: str = HEADERS_STR):
441601 if _normalization_replacements :
442602 decoded = _apply_replacements (decoded )
443603 decoded = _normalize_hashes_in_text (decoded )
604+ if decoded .startswith ("<?xml" ):
605+ decoded = _sort_xml_groups (decoded )
444606 request .body = decoded .encode ("utf-8" )
445607 elif isinstance (request .body , str ):
446608 if _normalization_replacements :
@@ -498,6 +660,8 @@ def custom_before_response(
498660 if _normalization_replacements :
499661 decoded = _apply_replacements (decoded )
500662 decoded = _normalize_hashes_in_text (decoded )
663+ if decoded .startswith ("<?xml" ):
664+ decoded = _sort_xml_groups (decoded )
501665 body ["string" ] = decoded .encode ("utf-8" )
502666 elif isinstance (body_string , str ):
503667 if _normalization_replacements :
0 commit comments