Skip to content

Commit 6828bad

Browse files
committed
feat(gooddata-pandas): use orjson for Arrow schema metadata parsing
Replace stdlib json with orjson in arrow_convertor.py for faster metadata parsing. Add orjson>=3.11.0 to the gooddata-pandas arrow optional dependency group, and align the pyarrow floor in the test groups of gooddata-pandas and gooddata-sdk with the arrow extra (>=23.0.1).
risk: nonprod
1 parent 199fd91 commit 6828bad

4 files changed

Lines changed: 296 additions & 293 deletions

File tree

packages/gooddata-pandas/pyproject.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ classifiers = [
4747
]
4848

4949
[project.optional-dependencies]
50-
arrow = ["pyarrow>=23.0.1"]
50+
arrow = ["pyarrow>=23.0.1", "orjson>=3.11.0"]
5151

5252
[project.urls]
5353
Documentation = "https://gooddata-pandas.readthedocs.io/en/v1.62.0"
@@ -65,7 +65,7 @@ test = [
6565
"python-dotenv~=1.0.0",
6666
"pyyaml",
6767
"tests_support",
68-
"pyarrow>=16.1.0",
68+
"pyarrow>=23.0.1",
6969
]
7070

7171
[tool.ty.analysis]

packages/gooddata-pandas/src/gooddata_pandas/arrow_convertor.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
# (C) 2026 GoodData Corporation
22
from __future__ import annotations
33

4-
import json
54
from typing import Callable
65

6+
import orjson
77
import pandas
88

99
from gooddata_pandas.arrow_types import TypesMapper
@@ -55,7 +55,7 @@ def build_metric_field_index(table: pa.Table) -> dict[int, str]:
5555
result: dict[int, str] = {}
5656
for field in table.schema:
5757
if field.name.startswith(_FIELD_METRIC_GROUP) and field.metadata and b"gdc" in field.metadata:
58-
gdc = json.loads(field.metadata[b"gdc"])
58+
gdc = orjson.loads(field.metadata[b"gdc"])
5959
if "index" not in gdc:
6060
raise ValueError(
6161
f"Metric field {field.name!r} 'gdc' metadata is missing required key 'index'. "
@@ -78,7 +78,7 @@ def _parse_schema_metadata(table: pa.Table) -> dict:
7878
"Arrow table has no schema metadata. Expected GoodData metadata keys: " + ", ".join(_REQUIRED_SCHEMA_KEYS)
7979
)
8080
schema_meta = {
81-
k.decode(): json.loads(v) for k, v in table.schema.metadata.items() if k.decode() in _REQUIRED_SCHEMA_KEYS
81+
k.decode(): orjson.loads(v) for k, v in table.schema.metadata.items() if k.decode() in _REQUIRED_SCHEMA_KEYS
8282
}
8383
missing = [k for k in _REQUIRED_SCHEMA_KEYS if k not in schema_meta]
8484
if missing:
@@ -281,7 +281,7 @@ def _build_field_index(
281281
f"Data field {field.name!r} is missing required 'gdc' field metadata. "
282282
"The Arrow table must originate from the GoodData /binary execution endpoint."
283283
)
284-
gdc = json.loads(field.metadata[b"gdc"])
284+
gdc = orjson.loads(field.metadata[b"gdc"])
285285
label_values: list = list(gdc.get("label_values", []))
286286

287287
gdc_type = gdc.get("type")
@@ -416,9 +416,9 @@ def _label_ids_in_dim(dim: dict) -> set:
416416
{},
417417
)
418418

419-
# Pre-parse gdc metadata once to avoid O(N×M) json.loads calls in the header loop.
419+
# Pre-parse gdc metadata once to avoid O(N×M) orjson.loads calls in the header loop.
420420
parsed_gdcs: list[dict | None] = [
421-
json.loads(f.metadata[b"gdc"]) if f.metadata and b"gdc" in f.metadata else None for f in all_data_fields
421+
orjson.loads(f.metadata[b"gdc"]) if f.metadata and b"gdc" in f.metadata else None for f in all_data_fields
422422
]
423423

424424
result: list[list[int]] = []
@@ -499,9 +499,9 @@ def _label_ids_in_dim(dim: dict) -> set:
499499
f for f in table.schema if f.name.startswith(_FIELD_METRIC_GROUP) or f.name.startswith(_FIELD_GRAND_TOTAL)
500500
]
501501

502-
# Pre-parse gdc metadata once to avoid O(N×M) json.loads calls in the header loop.
502+
# Pre-parse gdc metadata once to avoid O(N×M) orjson.loads calls in the header loop.
503503
parsed_gdcs: list[dict | None] = [
504-
json.loads(f.metadata[b"gdc"]) if f.metadata and b"gdc" in f.metadata else None for f in all_data_fields
504+
orjson.loads(f.metadata[b"gdc"]) if f.metadata and b"gdc" in f.metadata else None for f in all_data_fields
505505
]
506506

507507
result: list[list[int]] = []
@@ -610,7 +610,7 @@ def _compute_primary_labels_from_fields(
610610
for field in all_data_fields:
611611
if not field.metadata or b"gdc" not in field.metadata:
612612
continue
613-
gdc = json.loads(field.metadata[b"gdc"])
613+
gdc = orjson.loads(field.metadata[b"gdc"])
614614
if gdc["type"] != _GDC_TYPE_METRIC:
615615
continue
616616
label_values: list = gdc.get("label_values", [])

packages/gooddata-sdk/pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ test = [
7272
"python-dotenv~=1.0.0",
7373
"deepdiff~=8.5.0",
7474
"tests_support",
75-
"pyarrow>=16.1.0",
75+
"pyarrow>=23.0.1",
7676
]
7777

7878
[tool.ty.analysis]

0 commit comments

Comments
 (0)