44import os
55import re
66import typing
7+ import xml .etree .ElementTree as ET
78from typing import Any
89from urllib .parse import urlparse
910
6970)
7071_CANONICAL_CREATED_AT = "2000-01-01 00:00"
7172
# --- Transient server value normalization ---
# Every pattern must match both the stdlib JSON layout ("key": "val") and the
# compact orjson layout ("key":"val"). YAML variants are not needed because
# normalization runs on JSON bodies before YAML serialization.


def _json_value_re(keys, value_pattern, *, quoted=True):
    """Compile a regex matching the value of any of *keys* in a JSON text.

    For each key, two alternatives are generated: one with a space after the
    colon (stdlib ``json``) and one without (orjson). The key and punctuation
    live in a look-behind (and, for quoted values, the closing quote in a
    look-ahead), so only the value itself is matched and ``sub`` can be called
    with a plain replacement string. Each look-behind is fixed-width, which
    is why the two layouts are separate alternatives rather than ``\\s?``.

    Args:
        keys: JSON key names whose values should be matched.
        value_pattern: Regex source for the value (without surrounding quotes).
        quoted: True when the value is a JSON string, False for bare numbers.

    Returns:
        The compiled pattern.
    """
    open_quote = '"' if quoted else ""
    close_guard = '(?=")' if quoted else ""
    return re.compile(
        "|".join(
            f'(?<="{re.escape(key)}":{separator}{open_quote}){value_pattern}{close_guard}'
            for key in keys
            for separator in (" ", "")  # stdlib layout first, then orjson
        )
    )


# traceId: 32-hex server trace ID (changes every request)
_TRACE_ID_RE = _json_value_re(("traceId",), r"[0-9a-f]{32}")
_CANONICAL_TRACE_ID = "NORMALIZED_TRACE_ID_000000000000"

# authenticationId / authId: base64-encoded user UUID (differs between environments)
# Only matches base64-like values (CiR...), not test-set values (newUser_auth_id)
# The declarative API uses "authId", entity API uses "authenticationId"
_AUTH_ID_RE = _json_value_re(
    ("authenticationId", "authId"),
    r"[A-Za-z0-9+/]{20,}={0,2}",
)
_CANONICAL_AUTH_ID = "NORMALIZED_AUTH_ID"

# bearerToken: base64-encoded token (differs between environments)
_BEARER_TOKEN_RE = _json_value_re(("bearerToken",), r"[A-Za-z0-9+/._-]{20,}")
_CANONICAL_BEARER_TOKEN = "NORMALIZED_BEARER_TOKEN"

# cacheId: 32-hex cache identifier (changes every request)
_CACHE_ID_RE = _json_value_re(("cacheId",), r"[0-9a-f]{32}")
_CANONICAL_CACHE_ID = "NORMALIZED_CACHE_ID_000000000000"

# queryDurationMillis: runtime-dependent timing values, matched through the
# numeric "simpleSelect" and "createCacheTable" entries.
_QUERY_DURATION_RE = _json_value_re(
    ("simpleSelect", "createCacheTable"),
    r"\d+",
    quoted=False,
)
125+
72126# --- Dynamic hash normalization ---
73127# executionResult: 40-hex ":" 64-hex (body uses ":", URI uses "%3A")
74128_EXEC_HASH_BODY_RE = re .compile (r"[0-9a-f]{40}:[0-9a-f]{64}" )
_CANONICAL_EXECUTION_RESULT = "EXECUTION_NORMALIZED"
_CANONICAL_EXPORT_RESULT = "EXPORT_NORMALIZED"

# Indexed hash maps: preserve distinctness across different hashes within a cassette.
# Each unique hash gets a unique index (e.g. EXECUTION_NORMALIZED_1, _2, ...).
_exec_hash_map: dict[str, str] = {}
_export_hash_map: dict[str, str] = {}


def _indexed_placeholder(hash_map: dict[str, str], original: str, prefix: str) -> str:
    """Return the placeholder for *original*, allocating the next index if new.

    Indices start at 1 and follow first-seen order, so the result depends on
    the current contents of *hash_map*, which is mutated in place.
    """
    placeholder = hash_map.get(original)
    if placeholder is None:
        placeholder = f"{prefix}_{len(hash_map) + 1}"
        hash_map[original] = placeholder
    return placeholder


def _exec_hash_replacer(match: re.Match) -> str:
    """Replace execution hash with an indexed placeholder, preserving distinctness.

    Intended as the ``repl`` callable of ``Pattern.sub``. Records new hashes in
    the module-level ``_exec_hash_map``.
    """
    # Normalize URI-encoded variant (%3A → :) so both forms share the same map entry.
    original = match.group(0).replace("%3A", ":").replace("%3a", ":")
    return _indexed_placeholder(_exec_hash_map, original, _CANONICAL_EXECUTION_RESULT)


def _export_hash_replacer(match: re.Match) -> str:
    """Replace export hash with an indexed placeholder, preserving distinctness.

    Intended as the ``repl`` callable of ``Pattern.sub``. Records new hashes in
    the module-level ``_export_hash_map``.
    """
    return _indexed_placeholder(_export_hash_map, match.group(0), _CANONICAL_EXPORT_RESULT)
169+
92170
93171def configure_normalization (test_config : dict [str , Any ]) -> None :
94172 """Build normalization replacements from the active test environment config.
@@ -107,6 +185,8 @@ def configure_normalization(test_config: dict[str, Any]) -> None:
107185 global _normalization_replacements , _password_replacements , _normalization_configured
108186 replacements : list [tuple [str , str ]] = []
109187 _password_replacements = []
188+ _exec_hash_map .clear ()
189+ _export_hash_map .clear ()
110190
111191 parsed = urlparse (test_config .get ("host" , _CANONICAL_HOST ))
112192 active_scheme = parsed .scheme or "http"
@@ -191,16 +271,21 @@ def _apply_replacements(text: str) -> str:
191271
def _normalize_hashes_in_text(text: str) -> str:
    """Replace transient server values with deterministic placeholders.

    Substitutions are applied in a fixed order:

    1. execution and export hashes become indexed placeholders, so distinct
       hashes stay distinct;
    2. ``_CREATED_AT_RE`` matches become ``_CANONICAL_CREATED_AT``;
    3. traceId, authenticationId/authId, bearerToken and cacheId values become
       their fixed canonical strings;
    4. query-duration numbers become ``0``.

    Args:
        text: Body text to normalize.

    Returns:
        The text with every match replaced.
    """
    substitutions = (
        # Callables: the placeholder depends on which hash was matched.
        (_EXEC_HASH_BODY_RE, _exec_hash_replacer),
        (_EXPORT_HASH_BODY_RE, _export_hash_replacer),
        # Plain strings: every match maps to the same canonical value.
        (_CREATED_AT_RE, _CANONICAL_CREATED_AT),
        (_TRACE_ID_RE, _CANONICAL_TRACE_ID),
        (_AUTH_ID_RE, _CANONICAL_AUTH_ID),
        (_BEARER_TOKEN_RE, _CANONICAL_BEARER_TOKEN),
        (_CACHE_ID_RE, _CANONICAL_CACHE_ID),
        (_QUERY_DURATION_RE, "0"),
    )
    for pattern, replacement in substitutions:
        text = pattern.sub(replacement, text)
    return text
198283
199284
def _normalize_hashes_in_uri(uri: str) -> str:
    """Replace executionResult/exportResult hashes in a request URI.

    Uses the same indexed replacers as body normalization. The execution
    replacer folds the URI-encoded separator (``%3A``) to ``:`` before its
    lookup, so a hash seen in both a URI and a body gets one placeholder.

    Args:
        uri: Request URI to normalize.

    Returns:
        The URI with each matched hash replaced by its indexed placeholder.
    """
    uri = _EXEC_HASH_URI_RE.sub(_exec_hash_replacer, uri)
    return _EXPORT_HASH_URI_RE.sub(_export_hash_replacer, uri)
205290
206291
@@ -260,6 +345,36 @@ def increase_indent(self, flow: bool = False, indentless: bool = False):
260345 return super ().increase_indent (flow , False )
261346
262347
def _sort_xml_groups(text: str) -> str:
    """Sort <group> elements in XLIFF/XML localization responses by id attribute.

    The server may return localization groups in non-deterministic order,
    producing spurious diffs when cassettes are re-recorded.

    Under every parent, sibling ``group`` elements (in the XLIFF 2.0 namespace
    or un-namespaced) are reordered by their ``id`` attribute; a missing id
    sorts as the empty string. The sorted groups are re-appended, so they end
    up after any non-group siblings of the same parent.

    Args:
        text: XML document text.

    Returns:
        The re-serialized document (with an XML declaration) when *text*
        parses. *text* is returned unchanged when it contains no ``"<group "``
        or is not well-formed XML.
    """
    if "<group " not in text:
        return text

    ns = "urn:oasis:names:tc:xliff:document:2.0"
    group_tags = (f"{{{ns}}}group", "group")

    # Serialize the XLIFF namespace as the default one (no ns0: prefixes).
    # This updates ElementTree's process-wide prefix registry.
    ET.register_namespace("", ns)
    try:
        root = ET.fromstring(text)
    except ET.ParseError:
        return text

    # Snapshot the elements first: reordering children must not disturb the walk.
    for parent in list(root.iter()):
        groups = [child for child in parent if child.tag in group_tags]
        if len(groups) > 1:
            for group in groups:
                parent.remove(group)
            # sorted() is stable, so groups with equal ids keep their order.
            parent.extend(sorted(groups, key=lambda g: g.get("id", "")))

    return ET.tostring(root, encoding="unicode", xml_declaration=True)
376+
377+
263378class CustomSerializerYaml :
264379 def deserialize (self , cassette_string : str ) -> dict [str , Any ]:
265380 cassette_dict = yaml .safe_load (cassette_string )
@@ -331,6 +446,8 @@ def custom_before_request(request, headers_str: str = HEADERS_STR):
331446 if _normalization_replacements :
332447 request .body = _apply_replacements (request .body )
333448 request .body = _normalize_hashes_in_text (request .body )
449+ if request .body .startswith ("<?xml" ):
450+ request .body = _sort_xml_groups (request .body )
334451
335452 if hasattr (request , headers_str ):
336453 request .headers = {header : request .headers [header ] for header in sorted (request .headers )}
@@ -385,7 +502,10 @@ def custom_before_response(
385502 elif isinstance (body_string , str ):
386503 if _normalization_replacements :
387504 body_string = _apply_replacements (body_string )
388- body ["string" ] = _normalize_hashes_in_text (body_string )
505+ body_string = _normalize_hashes_in_text (body_string )
506+ if body_string .startswith ("<?xml" ):
507+ body_string = _sort_xml_groups (body_string )
508+ body ["string" ] = body_string
389509
390510 return response
391511
0 commit comments