Skip to content
This repository was archived by the owner on Mar 6, 2026. It is now read-only.

Commit 739a382

Browse files
committed
don't convert json in rowiterator._columns
1 parent df87a70 commit 739a382

4 files changed

Lines changed: 27 additions & 6 deletions

File tree

google/cloud/bigquery/_pandas_helpers.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -759,6 +759,9 @@ def _row_iterator_page_to_arrow(page, column_names, arrow_types):
759759

760760
arrays = []
761761
for column_index, arrow_type in enumerate(arrow_types):
762+
# RowIterator parses JSON, but for arrow, we actually want to keep them
763+
# as strings.
764+
# TODO: Support STRUCT<JSON> and ARRAY<JSON>.
762765
arrays.append(pyarrow.array(page._columns[column_index], type=arrow_type))
763766

764767
if isinstance(column_names, pyarrow.Schema):

google/cloud/bigquery/_pyarrow_helpers.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -66,9 +66,9 @@ def pyarrow_timestamp():
6666
# Prefer JSON type built-in to pyarrow (adding in 19.0.0), if available.
6767
# Otherwise, fallback to db-dtypes, where the JSONArrowType was added in 1.4.0,
6868
# but since they might have an older db-dtypes, have string as a fallback for that.
69-
if hasattr(pyarrow, "json_"):
70-
json_arrow_type = pyarrow.json_(pyarrow.string())
71-
elif hasattr(db_dtypes, "JSONArrowType"):
69+
# TODO(https://github.com/pandas-dev/pandas/issues/60958): switch to
70+
# pyarrow.json_(pyarrow.string()) if available and supported by pandas.
71+
if hasattr(db_dtypes, "JSONArrowType"):
7272
json_arrow_type = db_dtypes.JSONArrowType()
7373
else:
7474
json_arrow_type = pyarrow.string()

google/cloud/bigquery/exceptions.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,3 +33,11 @@ class BigQueryStorageNotFoundError(BigQueryError):
3333

3434
class LegacyPandasError(BigQueryError):
3535
"""Raised when too old a version of pandas package is detected at runtime."""
36+
37+
38+
class BigQueryWarning(UserWarning):
39+
"""Base class for all custom warnings defined by the BigQuery client."""
40+
41+
42+
class JSONDtypeWarning(BigQueryWarning):
43+
"""Raised when JSON is used in to_dataframe() API."""

google/cloud/bigquery/table.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3531,12 +3531,22 @@ def _row_iterator_page_columns(schema, response):
35313531
columns = []
35323532
rows = response.get("rows", [])
35333533

3534-
def get_column_data(field_index, field):
3534+
def get_column_data(field_index):
35353535
for row in rows:
3536-
yield _helpers._field_from_json(row["f"][field_index]["v"], field)
3536+
yield row["f"][field_index]["v"]
3537+
3538+
def parse_column_data(column, field):
3539+
# pyarrow.json_() type needs to keep the data as a string, not parsed.
3540+
# TODO: support STRUCT<JSON> and ARRAY<JSON>
3541+
if field.field_type.casefold() == "json":
3542+
for value in column:
3543+
yield value
3544+
3545+
for value in column:
3546+
yield _helpers._field_from_json(value, field)
35373547

35383548
for field_index, field in enumerate(schema):
3539-
columns.append(get_column_data(field_index, field))
3549+
columns.append(parse_column_data(get_column_data(field_index), field))
35403550

35413551
return columns
35423552

0 commit comments

Comments (0)