Skip to content

Commit 7609912

Browse files
committed
fix(tests): use indexed placeholders for hash normalization
Blanket replacement of all execution result hashes with the same EXECUTION_NORMALIZED constant collapsed distinct hashes into identical values, breaking tests that assert different result IDs (e.g. test_register_upload_notification). Switch to indexed placeholders (EXECUTION_NORMALIZED_1, _2, ...) so each unique hash gets a distinct canonical value while still removing environment-specific data from cassettes. risk: low
1 parent 9e0d663 commit 7609912

1 file changed

Lines changed: 125 additions & 5 deletions

File tree

packages/tests-support/src/tests_support/vcrpy_utils.py

Lines changed: 125 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import os
55
import re
66
import typing
7+
import xml.etree.ElementTree as ET
78
from typing import Any
89
from urllib.parse import urlparse
910

@@ -69,6 +70,59 @@
6970
)
7071
_CANONICAL_CREATED_AT = "2000-01-01 00:00"
7172

73+
# --- Transient server value normalization ---
74+
# All patterns need both stdlib JSON ("key": "val") and orjson ("key":"val") variants.
75+
# YAML variants are not needed — normalization runs on JSON bodies before YAML serialization.
76+
77+
# traceId: 32-hex server trace ID (changes every request)
78+
_TRACE_ID_RE = re.compile(
79+
r'(?<="traceId": ")[0-9a-f]{32}(?=")' # JSON (stdlib)
80+
r"|"
81+
r'(?<="traceId":")[0-9a-f]{32}(?=")' # JSON (orjson)
82+
)
83+
_CANONICAL_TRACE_ID = "NORMALIZED_TRACE_ID_000000000000"
84+
85+
# authenticationId / authId: base64-encoded user UUID (differs between environments)
86+
# Only matches base64-like values (CiR...), not test-set values (newUser_auth_id)
87+
# The declarative API uses "authId", entity API uses "authenticationId"
88+
_AUTH_ID_RE = re.compile(
89+
r'(?<="authenticationId": ")[A-Za-z0-9+/]{20,}={0,2}(?=")'
90+
r"|"
91+
r'(?<="authenticationId":")[A-Za-z0-9+/]{20,}={0,2}(?=")'
92+
r"|"
93+
r'(?<="authId": ")[A-Za-z0-9+/]{20,}={0,2}(?=")'
94+
r"|"
95+
r'(?<="authId":")[A-Za-z0-9+/]{20,}={0,2}(?=")'
96+
)
97+
_CANONICAL_AUTH_ID = "NORMALIZED_AUTH_ID"
98+
99+
# bearerToken: base64-encoded token (differs between environments)
100+
_BEARER_TOKEN_RE = re.compile(
101+
r'(?<="bearerToken": ")[A-Za-z0-9+/._-]{20,}(?=")'
102+
r"|"
103+
r'(?<="bearerToken":")[A-Za-z0-9+/._-]{20,}(?=")'
104+
)
105+
_CANONICAL_BEARER_TOKEN = "NORMALIZED_BEARER_TOKEN"
106+
107+
# cacheId: 32-hex cache identifier (changes every request)
108+
_CACHE_ID_RE = re.compile(
109+
r'(?<="cacheId": ")[0-9a-f]{32}(?=")'
110+
r"|"
111+
r'(?<="cacheId":")[0-9a-f]{32}(?=")'
112+
)
113+
_CANONICAL_CACHE_ID = "NORMALIZED_CACHE_ID_000000000000"
114+
115+
# queryDurationMillis: runtime-dependent timing values
116+
_QUERY_DURATION_RE = re.compile(
117+
r'(?<="simpleSelect": )\d+' # JSON (stdlib)
118+
r"|"
119+
r'(?<="simpleSelect":)\d+' # JSON (orjson)
120+
r"|"
121+
r'(?<="createCacheTable": )\d+' # JSON (stdlib)
122+
r"|"
123+
r'(?<="createCacheTable":)\d+' # JSON (orjson)
124+
)
125+
72126
# --- Dynamic hash normalization ---
73127
# executionResult: 40-hex ":" 64-hex (body uses ":", URI uses "%3A")
74128
_EXEC_HASH_BODY_RE = re.compile(r"[0-9a-f]{40}:[0-9a-f]{64}")
@@ -89,6 +143,30 @@
89143
_CANONICAL_EXECUTION_RESULT = "EXECUTION_NORMALIZED"
90144
_CANONICAL_EXPORT_RESULT = "EXPORT_NORMALIZED"
91145

146+
# Indexed hash maps: preserve distinctness across different hashes within a cassette.
147+
# Each unique hash gets a unique index (e.g. EXECUTION_NORMALIZED_1, _2, ...).
148+
_exec_hash_map: dict[str, str] = {}
149+
_export_hash_map: dict[str, str] = {}
150+
151+
152+
def _exec_hash_replacer(match: re.Match) -> str:
    """Return the numbered placeholder that stands in for one execution hash.

    Intended as the replacement callable for ``re.sub``. The first time a hash is
    seen it is assigned the next free index (``<canonical>_1``, ``<canonical>_2``,
    ...); later occurrences of the same hash reuse that placeholder, so distinct
    hashes stay distinct after normalization.
    """
    # Fold the URI-encoded separator (%3A / %3a) into ":" so the body and URI
    # spellings of one hash resolve to the same map entry.
    key = match.group(0).replace("%3A", ":").replace("%3a", ":")
    try:
        return _exec_hash_map[key]
    except KeyError:
        placeholder = f"{_CANONICAL_EXECUTION_RESULT}_{len(_exec_hash_map) + 1}"
        _exec_hash_map[key] = placeholder
        return placeholder
160+
161+
162+
def _export_hash_replacer(match: re.Match) -> str:
    """Return the numbered placeholder that stands in for one export hash.

    Intended as the replacement callable for ``re.sub``. Each previously unseen
    hash is assigned the next free index; repeated occurrences of a hash reuse
    the placeholder recorded for it, so distinct hashes stay distinct.
    """
    key = match.group(0)
    placeholder = _export_hash_map.get(key)
    if placeholder is None:
        # First sighting: number it after the entries already recorded.
        placeholder = f"{_CANONICAL_EXPORT_RESULT}_{len(_export_hash_map) + 1}"
        _export_hash_map[key] = placeholder
    return placeholder
169+
92170

93171
def configure_normalization(test_config: dict[str, Any]) -> None:
94172
"""Build normalization replacements from the active test environment config.
@@ -107,6 +185,8 @@ def configure_normalization(test_config: dict[str, Any]) -> None:
107185
global _normalization_replacements, _password_replacements, _normalization_configured
108186
replacements: list[tuple[str, str]] = []
109187
_password_replacements = []
188+
_exec_hash_map.clear()
189+
_export_hash_map.clear()
110190

111191
parsed = urlparse(test_config.get("host", _CANONICAL_HOST))
112192
active_scheme = parsed.scheme or "http"
@@ -191,16 +271,21 @@ def _apply_replacements(text: str) -> str:
191271

192272
def _normalize_hashes_in_text(text: str) -> str:
193273
"""Replace transient server values with deterministic placeholders."""
194-
text = _EXEC_HASH_BODY_RE.sub(_CANONICAL_EXECUTION_RESULT, text)
195-
text = _EXPORT_HASH_BODY_RE.sub(_CANONICAL_EXPORT_RESULT, text)
274+
text = _EXEC_HASH_BODY_RE.sub(_exec_hash_replacer, text)
275+
text = _EXPORT_HASH_BODY_RE.sub(_export_hash_replacer, text)
196276
text = _CREATED_AT_RE.sub(_CANONICAL_CREATED_AT, text)
277+
text = _TRACE_ID_RE.sub(_CANONICAL_TRACE_ID, text)
278+
text = _AUTH_ID_RE.sub(_CANONICAL_AUTH_ID, text)
279+
text = _BEARER_TOKEN_RE.sub(_CANONICAL_BEARER_TOKEN, text)
280+
text = _CACHE_ID_RE.sub(_CANONICAL_CACHE_ID, text)
281+
text = _QUERY_DURATION_RE.sub("0", text)
197282
return text
198283

199284

200285
def _normalize_hashes_in_uri(uri: str) -> str:
201286
"""Replace executionResult/exportResult hashes in a request URI."""
202-
uri = _EXEC_HASH_URI_RE.sub(_CANONICAL_EXECUTION_RESULT, uri)
203-
uri = _EXPORT_HASH_URI_RE.sub(_CANONICAL_EXPORT_RESULT, uri)
287+
uri = _EXEC_HASH_URI_RE.sub(_exec_hash_replacer, uri)
288+
uri = _EXPORT_HASH_URI_RE.sub(_export_hash_replacer, uri)
204289
return uri
205290

206291

@@ -260,6 +345,36 @@ def increase_indent(self, flow: bool = False, indentless: bool = False):
260345
return super().increase_indent(flow, False)
261346

262347

348+
def _sort_xml_groups(text: str) -> str:
349+
"""Sort <group> elements in XLIFF/XML localization responses by id attribute.
350+
351+
The server may return localization groups in non-deterministic order,
352+
producing spurious diffs when cassettes are re-recorded.
353+
"""
354+
if "<group " not in text:
355+
return text
356+
try:
357+
ns = "urn:oasis:names:tc:xliff:document:2.0"
358+
ET.register_namespace("", ns)
359+
root = ET.fromstring(text)
360+
361+
def _sort_groups(parent: ET.Element) -> None:
362+
groups = [c for c in parent if c.tag == f"{{{ns}}}group" or c.tag == "group"]
363+
if len(groups) > 1:
364+
groups_sorted = sorted(groups, key=lambda g: g.get("id", ""))
365+
for g in groups:
366+
parent.remove(g)
367+
for g in groups_sorted:
368+
parent.append(g)
369+
for child in parent:
370+
_sort_groups(child)
371+
372+
_sort_groups(root)
373+
return ET.tostring(root, encoding="unicode", xml_declaration=True)
374+
except ET.ParseError:
375+
return text
376+
377+
263378
class CustomSerializerYaml:
264379
def deserialize(self, cassette_string: str) -> dict[str, Any]:
265380
cassette_dict = yaml.safe_load(cassette_string)
@@ -331,6 +446,8 @@ def custom_before_request(request, headers_str: str = HEADERS_STR):
331446
if _normalization_replacements:
332447
request.body = _apply_replacements(request.body)
333448
request.body = _normalize_hashes_in_text(request.body)
449+
if request.body.startswith("<?xml"):
450+
request.body = _sort_xml_groups(request.body)
334451

335452
if hasattr(request, headers_str):
336453
request.headers = {header: request.headers[header] for header in sorted(request.headers)}
@@ -385,7 +502,10 @@ def custom_before_response(
385502
elif isinstance(body_string, str):
386503
if _normalization_replacements:
387504
body_string = _apply_replacements(body_string)
388-
body["string"] = _normalize_hashes_in_text(body_string)
505+
body_string = _normalize_hashes_in_text(body_string)
506+
if body_string.startswith("<?xml"):
507+
body_string = _sort_xml_groups(body_string)
508+
body["string"] = body_string
389509

390510
return response
391511

0 commit comments

Comments
 (0)