From a32ef8919f6406d2605918df62cd10a09daea15c Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Sat, 23 May 2026 18:45:07 -0500 Subject: [PATCH 01/18] feat(waterdata): replace per-page logger.info with a single progress line MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Paginated and chunked Water Data queries narrated their progress with per-page `logger.info` calls ("Requesting", "Next URL", "Remaining requests this hour"), which scrolled the console and were silent unless logging was configured. Replace them with one self-updating status line on stderr, rewritten in place as data arrives: waterdata · chunk 2/5 · 14 pages · 8,421 rows · 4,870 requests left - New `_progress.py`: a `ProgressReporter` (chunks / pages / rows / rate-limit) plus a `progress_context()` contextmanager and `current()` accessor. The active reporter lives in a ContextVar so the `chunked` decorator (chunk counts) and the page-walking loops (page/row/rate-limit counts) update it without threading a param through every signature. - Shown only on an interactive terminal by default so notebooks, logs, and CI stay clean; `DATARETRIEVAL_PROGRESS=1/0` forces it on/off. - `_next_req_url` is now a pure helper; rate-limit + page reporting moved into `_walk_pages` and `get_stats_data`. Surviving request-URL logs drop to `debug`. Warnings/errors on failure are unchanged. Co-Authored-By: Claude Opus 4.7 (1M context) --- dataretrieval/waterdata/_progress.py | 151 +++++++++++++++++++++ dataretrieval/waterdata/api.py | 4 +- dataretrieval/waterdata/filters.py | 10 +- dataretrieval/waterdata/utils.py | 146 +++++++++++--------- tests/waterdata_progress_test.py | 195 +++++++++++++++++++++++++++ 5 files changed, 438 insertions(+), 68 deletions(-) create mode 100644 dataretrieval/waterdata/_progress.py create mode 100644 tests/waterdata_progress_test.py diff --git a/dataretrieval/waterdata/_progress.py b/dataretrieval/waterdata/_progress.py new file mode 100644 index 00000000..7e367558 --- /dev/null +++ b/dataretrieval/waterdata/_progress.py @@ -0,0 +1,151 @@ +"""A single self-updating status line for paginated / chunked Water Data queries. + +Water Data getters fan out two ways the caller can't see: long CQL filters are +split into URL-length-safe *chunks* (``filters.chunked``), and each request +follows ``next`` links across an unknown number of *pages* (``utils._walk_pages`` +and ``utils.get_stats_data``). This module surfaces that work as one line on +stderr, rewritten in place as data arrives:: + + waterdata · chunk 2/5 · 14 pages · 8,421 rows · 4,870 requests left + +It replaces the per-page ``logger.info`` calls that previously narrated the same +events one line at a time. + +The active reporter lives in a :class:`~contextvars.ContextVar` rather than being +threaded through every signature: progress is a cross-cutting concern that the +``chunked`` decorator (outer, chunk counts) and the page-walking loops (inner, +page/row/rate-limit counts) both update without knowing about each other. Call +:func:`progress_context` to activate one and :func:`current` to reach it. + +By default the line is shown only on an interactive terminal, so notebooks, +redirected logs, and CI stay clean. ``DATARETRIEVAL_PROGRESS`` forces it on +(``1``/``true``) or off (``0``/``false``). +""" + +from __future__ import annotations + +import contextvars +import os +import sys +from collections.abc import Iterator +from contextlib import contextmanager +from typing import TextIO + +# The reporter active for the current query. A ContextVar (not a module global) +# so concurrent queries — threads or async tasks sharing a client — each track +# their own progress line. +_active: contextvars.ContextVar[ProgressReporter | None] = contextvars.ContextVar( + "waterdata_progress", default=None +) + + +def _enabled_default(stream: TextIO) -> bool: + """Whether to draw the line: ``DATARETRIEVAL_PROGRESS`` wins, else TTY-only.""" + override = os.getenv("DATARETRIEVAL_PROGRESS") + if override is not None: + return override.strip().lower() not in {"", "0", "false", "no", "off"} + return hasattr(stream, "isatty") and stream.isatty() + + +class ProgressReporter: + """Accumulates query progress and rewrites a single status line in place. + + Every update method is a no-op when the reporter is disabled, so call sites + need no ``if enabled`` guards. The line is redrawn with a leading carriage + return and padded to erase the previous (possibly longer) contents; + :meth:`close` terminates it with a newline so the final state persists. + """ + + def __init__( + self, *, stream: TextIO | None = None, enabled: bool | None = None + ) -> None: + self._stream = stream if stream is not None else sys.stderr + self.enabled = _enabled_default(self._stream) if enabled is None else enabled + self.total_chunks = 1 + self.current_chunk = 0 + self.pages = 0 + self.rows = 0 + self.rate_remaining: str | None = None + self._last_len = 0 + self._closed = False + + def set_chunks(self, total: int) -> None: + """Record how many filter chunks this query was split into.""" + self.total_chunks = max(int(total), 1) + + def start_chunk(self, index: int) -> None: + """Mark the start of chunk ``index`` (1-based) and redraw.""" + self.current_chunk = index + self._render() + + def add_page(self, rows: int = 0) -> None: + """Record one fetched page carrying ``rows`` rows and redraw.""" + self.pages += 1 + self.rows += int(rows) + self._render() + + def set_rate_remaining(self, value: str | int | None) -> None: + """Update the remaining-requests count from an ``x-ratelimit-remaining`` header. + + Ignores empty/missing values so a page that omits the header doesn't + blank out the last known count. + """ + if value not in (None, ""): + self.rate_remaining = str(value) + + def _format(self) -> str: + parts = ["waterdata"] + if self.total_chunks > 1: + parts.append(f"chunk {self.current_chunk}/{self.total_chunks}") + parts.append(f"{self.pages} page" + ("" if self.pages == 1 else "s")) + if self.rows: + parts.append(f"{self.rows:,} rows") + if self.rate_remaining is not None: + parts.append(f"{self.rate_remaining} requests left") + return " · ".join(parts) + + def _render(self) -> None: + if not self.enabled or self._closed: + return + line = self._format() + pad = max(self._last_len - len(line), 0) + self._stream.write("\r" + line + " " * pad) + self._stream.flush() + self._last_len = len(line) + + def close(self) -> None: + """Finalize the line with a trailing newline so it persists on screen.""" + if self._closed: + return + self._closed = True + if self.enabled and (self.pages or self.current_chunk): + self._stream.write("\n") + self._stream.flush() + + +@contextmanager +def progress_context( + *, stream: TextIO | None = None, enabled: bool | None = None +) -> Iterator[ProgressReporter]: + """Activate a :class:`ProgressReporter` for the duration of a query. + + If a reporter is already active (a nested call), the existing one is yielded + unchanged so the outermost query owns the single line; only the outermost + context closes it. + """ + existing = _active.get() + if existing is not None: + yield existing + return + reporter = ProgressReporter(stream=stream, enabled=enabled) + token = _active.set(reporter) + try: + yield reporter + finally: + _active.reset(token) + reporter.close() + + +def current() -> ProgressReporter | None: + """Return the reporter active for the current query, or ``None``.""" + return _active.get() diff --git a/dataretrieval/waterdata/api.py b/dataretrieval/waterdata/api.py index ad268194..b9226423 100644 --- a/dataretrieval/waterdata/api.py +++ b/dataretrieval/waterdata/api.py @@ -2323,7 +2323,7 @@ def get_samples( req = PreparedRequest() req.prepare_url(url, params=params) - logger.info("Request: %s", req.url) + logger.debug("Request: %s", req.url) response = requests.get( url, params=params, verify=ssl_check, headers=_default_headers() @@ -2395,7 +2395,7 @@ def get_samples_summary( req = PreparedRequest() req.prepare_url(url, params=params) - logger.info("Request: %s", req.url) + logger.debug("Request: %s", req.url) response = requests.get( url, params=params, verify=ssl_check, headers=_default_headers() diff --git a/dataretrieval/waterdata/filters.py b/dataretrieval/waterdata/filters.py index 4c136b82..d68188a3 100644 --- a/dataretrieval/waterdata/filters.py +++ b/dataretrieval/waterdata/filters.py @@ -24,6 +24,8 @@ import pandas as pd import requests +from dataretrieval.waterdata import _progress + FILTER_LANG = Literal["cql-text", "cql-json"] # Conservative fallback budget when ``_chunk_cql_or`` is called without @@ -320,9 +322,15 @@ def wrapper( budget = _effective_filter_budget(args, filter_expr, build_request) chunks = _chunk_cql_or(filter_expr, max_len=budget) + reporter = _progress.current() + if reporter is not None: + reporter.set_chunks(len(chunks)) + frames: list[pd.DataFrame] = [] responses: list[requests.Response] = [] - for chunk in chunks: + for index, chunk in enumerate(chunks, start=1): + if reporter is not None: + reporter.start_chunk(index) frame, response = fetch_once({**args, "filter": chunk}) frames.append(frame) responses.append(response) diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py index 91228357..b7457e4c 100644 --- a/dataretrieval/waterdata/utils.py +++ b/dataretrieval/waterdata/utils.py @@ -14,7 +14,7 @@ from dataretrieval import __version__ from dataretrieval.utils import BaseMetadata -from dataretrieval.waterdata import filters +from dataretrieval.waterdata import _progress, filters from dataretrieval.waterdata.types import ( PROFILE_LOOKUP, PROFILES, @@ -545,9 +545,7 @@ def _next_req_url(resp: requests.Response) -> str | None: Notes ----- - - If the environment variable "API_USGS_PAT" is set, logs the remaining - requests for the current hour. - - Logs the next URL if found at info level. + - Returns None when the response carries no features. - Expects the response JSON to contain a "links" list with objects having "rel" and "href" keys. - Checks for the "next" relation in the "links" to determine the next URL. @@ -555,17 +553,9 @@ def _next_req_url(resp: requests.Response) -> str | None: body = resp.json() if not body.get("numberReturned"): return None - header_info = resp.headers - if os.getenv("API_USGS_PAT", ""): - logger.info( - "Remaining requests this hour: %s", - header_info.get("x-ratelimit-remaining", ""), - ) for link in body.get("links", []): if link.get("rel") == "next": - next_url = link.get("href") - logger.info("Next URL: %s", next_url) - return next_url + return link.get("href") return None @@ -650,7 +640,7 @@ def _walk_pages( Exception If a request fails/returns a non-200 status code. """ - logger.info("Requesting: %s", req.url) + logger.debug("Requesting: %s", req.url) if not geopd: logger.warning( @@ -658,6 +648,8 @@ def _walk_pages( "into pandas DataFrames." ) + reporter = _progress.current() + # Get first response from client # using GET or POST call close_client = client is None @@ -676,7 +668,11 @@ def _walk_pages( content = req.body if method == "POST" else None # List to collect dataframes from each page - dfs = [_get_resp_data(resp, geopd=geopd)] + page = _get_resp_data(resp, geopd=geopd) + dfs = [page] + if reporter is not None: + reporter.set_rate_remaining(resp.headers.get("x-ratelimit-remaining")) + reporter.add_page(rows=len(page)) curr_url = _next_req_url(resp) while curr_url: try: @@ -687,7 +683,13 @@ def _walk_pages( data=content if method == "POST" else None, ) _raise_for_non_200(resp) - dfs.append(_get_resp_data(resp, geopd=geopd)) + page = _get_resp_data(resp, geopd=geopd) + dfs.append(page) + if reporter is not None: + reporter.set_rate_remaining( + resp.headers.get("x-ratelimit-remaining") + ) + reporter.add_page(rows=len(page)) curr_url = _next_req_url(resp) except Exception as e: # noqa: BLE001 logger.error("Request incomplete: %s", e) @@ -913,7 +915,8 @@ def get_ogc_data( convert_type = args.pop("convert_type", False) args = {k: v for k, v in args.items() if v is not None} - return_list, response = _fetch_once(args) + with _progress.progress_context(): + return_list, response = _fetch_once(args) return_list = _deal_with_empty(return_list, properties, service) if convert_type: return_list = _type_cols(return_list) @@ -1114,7 +1117,7 @@ def get_stats_data( params=args, ) req = request.prepare() - logger.info("Request: %s", req.url) + logger.debug("Request: %s", req.url) # create temp client if not provided # and close it after the request is done @@ -1122,55 +1125,68 @@ def get_stats_data( client = client or requests.Session() try: - resp = client.send(req) - _raise_for_non_200(resp) - - # Store the initial response for metadata - initial_response = resp - - # Grab some aspects of the original request: headers and the - # request type (GET or POST) - method = req.method.upper() - headers = dict(req.headers) - - body = resp.json() - all_dfs = [_handle_stats_nesting(body, geopd=GEOPANDAS)] - - # Look for a next code in the response body - next_token = body["next"] - - while next_token: - args["next_token"] = next_token - - try: - resp = client.request( - method, - url=url, - params=args, - headers=headers, - ) - _raise_for_non_200(resp) - body = resp.json() - all_dfs.append(_handle_stats_nesting(body, geopd=GEOPANDAS)) - next_token = body["next"] - except Exception as e: # noqa: BLE001 - logger.error("Request incomplete: %s", e) - logger.warning( - "Request failed for URL: %s (next_token=%s). " - "Data download interrupted.", - url, - next_token, - ) - next_token = None - - dfs = pd.concat(all_dfs, ignore_index=True) if len(all_dfs) > 1 else all_dfs[0] + with _progress.progress_context() as reporter: + resp = client.send(req) + _raise_for_non_200(resp) + + # Store the initial response for metadata + initial_response = resp + + # Grab some aspects of the original request: headers and the + # request type (GET or POST) + method = req.method.upper() + headers = dict(req.headers) + + body = resp.json() + page = _handle_stats_nesting(body, geopd=GEOPANDAS) + all_dfs = [page] + reporter.set_rate_remaining(resp.headers.get("x-ratelimit-remaining")) + reporter.add_page(rows=len(page)) + + # Look for a next code in the response body + next_token = body["next"] + + while next_token: + args["next_token"] = next_token + + try: + resp = client.request( + method, + url=url, + params=args, + headers=headers, + ) + _raise_for_non_200(resp) + body = resp.json() + page = _handle_stats_nesting(body, geopd=GEOPANDAS) + all_dfs.append(page) + reporter.set_rate_remaining( + resp.headers.get("x-ratelimit-remaining") + ) + reporter.add_page(rows=len(page)) + next_token = body["next"] + except Exception as e: # noqa: BLE001 + logger.error("Request incomplete: %s", e) + logger.warning( + "Request failed for URL: %s (next_token=%s). " + "Data download interrupted.", + url, + next_token, + ) + next_token = None + + dfs = ( + pd.concat(all_dfs, ignore_index=True) + if len(all_dfs) > 1 + else all_dfs[0] + ) - # . If expand percentiles is True, make each percentile - # its own row in the returned dataset. - if expand_percentiles: - dfs = _expand_percentiles(dfs) + # . If expand percentiles is True, make each percentile + # its own row in the returned dataset. + if expand_percentiles: + dfs = _expand_percentiles(dfs) - return dfs, BaseMetadata(initial_response) + return dfs, BaseMetadata(initial_response) finally: if close_client: client.close() diff --git a/tests/waterdata_progress_test.py b/tests/waterdata_progress_test.py new file mode 100644 index 00000000..5710803c --- /dev/null +++ b/tests/waterdata_progress_test.py @@ -0,0 +1,195 @@ +"""Tests for the Water Data single-line progress reporter. + +Covers ProgressReporter rendering / no-op behavior, TTY + environment-variable +gating, progress_context nesting, and that the pagination loop in +``_walk_pages`` reports pages and the rate-limit header through an active +reporter. +""" + +import io +from unittest import mock + +import requests + +from dataretrieval.waterdata._progress import ( + ProgressReporter, + current, + progress_context, +) +from dataretrieval.waterdata.utils import _walk_pages + +# -- ProgressReporter rendering ------------------------------------------------ + + +def test_disabled_reporter_writes_nothing(): + stream = io.StringIO() + reporter = ProgressReporter(stream=stream, enabled=False) + reporter.set_chunks(3) + reporter.start_chunk(1) + reporter.add_page(rows=5) + reporter.set_rate_remaining("100") + reporter.close() + assert stream.getvalue() == "" + + +def test_renders_pages_rows_and_rate_limit(): + stream = io.StringIO() + reporter = ProgressReporter(stream=stream, enabled=True) + reporter.set_rate_remaining("4870") + reporter.add_page(rows=1234) + out = stream.getvalue() + assert "1 page" in out + assert "1,234 rows" in out + assert "4870 requests left" in out + + +def test_page_count_is_pluralized(): + stream = io.StringIO() + reporter = ProgressReporter(stream=stream, enabled=True) + reporter.add_page() + assert "1 page" in stream.getvalue() and "1 pages" not in stream.getvalue() + reporter.add_page() + assert "2 pages" in stream.getvalue() + + +def test_chunk_segment_only_shown_when_multiple_chunks(): + single = io.StringIO() + reporter = ProgressReporter(stream=single, enabled=True) + reporter.set_chunks(1) + reporter.add_page() + assert "chunk" not in single.getvalue() + + many = io.StringIO() + reporter = ProgressReporter(stream=many, enabled=True) + reporter.set_chunks(5) + reporter.start_chunk(2) + assert "chunk 2/5" in many.getvalue() + + +def test_missing_rate_limit_does_not_blank_last_known_value(): + stream = io.StringIO() + reporter = ProgressReporter(stream=stream, enabled=True) + reporter.set_rate_remaining("500") + reporter.set_rate_remaining(None) + reporter.set_rate_remaining("") + reporter.add_page() + assert "500 requests left" in stream.getvalue() + + +def test_close_terminates_active_line_with_newline(): + stream = io.StringIO() + reporter = ProgressReporter(stream=stream, enabled=True) + reporter.add_page() + reporter.close() + assert stream.getvalue().endswith("\n") + + +def test_close_without_activity_writes_nothing(): + stream = io.StringIO() + reporter = ProgressReporter(stream=stream, enabled=True) + reporter.close() + assert stream.getvalue() == "" + + +# -- enable/disable gating ----------------------------------------------------- + + +def test_default_disabled_for_non_tty(monkeypatch): + monkeypatch.delenv("DATARETRIEVAL_PROGRESS", raising=False) + # io.StringIO.isatty() returns False. + assert ProgressReporter(stream=io.StringIO()).enabled is False + + +def test_env_var_forces_on(monkeypatch): + monkeypatch.setenv("DATARETRIEVAL_PROGRESS", "1") + assert ProgressReporter(stream=io.StringIO()).enabled is True + + +def test_env_var_forces_off_even_on_tty(monkeypatch): + monkeypatch.setenv("DATARETRIEVAL_PROGRESS", "0") + tty = mock.MagicMock() + tty.isatty.return_value = True + assert ProgressReporter(stream=tty).enabled is False + + +# -- progress_context ---------------------------------------------------------- + + +def test_progress_context_sets_and_clears_current(monkeypatch): + monkeypatch.delenv("DATARETRIEVAL_PROGRESS", raising=False) + assert current() is None + with progress_context(enabled=False) as reporter: + assert current() is reporter + assert current() is None + + +def test_nested_context_reuses_outer_reporter(): + with progress_context(enabled=False) as outer: + with progress_context(enabled=False) as inner: + assert inner is outer + # Inner exit must not deactivate the outer reporter. + assert current() is outer + assert current() is None + + +# -- integration with _walk_pages --------------------------------------------- + + +def _resp(features, *, next_url=None, rate_remaining=None): + resp = mock.MagicMock() + links = [{"rel": "next", "href": next_url}] if next_url else [] + resp.json.return_value = { + "numberReturned": len(features), + "features": features, + "links": links, + } + headers = {} + if rate_remaining is not None: + headers["x-ratelimit-remaining"] = rate_remaining + resp.headers = headers + resp.status_code = 200 + return resp + + +def test_walk_pages_reports_pages_and_rate_limit(): + resp1 = _resp( + [{"id": "1", "properties": {"v": "a"}}], + next_url="https://example.com/p2", + rate_remaining="4999", + ) + resp2 = _resp([{"id": "2", "properties": {"v": "b"}}], rate_remaining="4998") + + client = mock.MagicMock(spec=requests.Session) + client.send.return_value = resp1 + client.request.return_value = resp2 + + req = mock.MagicMock(spec=requests.PreparedRequest) + req.method = "GET" + req.headers = {} + req.url = "https://example.com/p1" + + stream = io.StringIO() + with progress_context(stream=stream, enabled=True): + df, _ = _walk_pages(geopd=False, req=req, client=client) + + assert len(df) == 2 + out = stream.getvalue() + assert "2 pages" in out + assert "4998 requests left" in out + assert out.endswith("\n") + + +def test_walk_pages_without_context_does_not_error(): + # No active reporter: pagination must still work and stay silent. + resp = _resp([{"id": "1", "properties": {"v": "a"}}]) + client = mock.MagicMock(spec=requests.Session) + client.send.return_value = resp + + req = mock.MagicMock(spec=requests.PreparedRequest) + req.method = "GET" + req.headers = {} + req.url = "https://example.com/p1" + + df, _ = _walk_pages(geopd=False, req=req, client=client) + assert len(df) == 1 + assert current() is None From ba86615440109ec36d8592bc3fc6cae6dd46e2f8 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Sat, 23 May 2026 18:57:02 -0500 Subject: [PATCH 02/18] style(waterdata): group the rate-limit count with thousands separators Render the x-ratelimit-remaining value like the row count (e.g. "4,998 requests left") when it's a plain integer, so the status-line segments read consistently. Non-numeric header values pass through unchanged. Co-Authored-By: Claude Opus 4.7 (1M context) --- dataretrieval/waterdata/_progress.py | 6 +++++- tests/waterdata_progress_test.py | 4 ++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/dataretrieval/waterdata/_progress.py b/dataretrieval/waterdata/_progress.py index 7e367558..a678f758 100644 --- a/dataretrieval/waterdata/_progress.py +++ b/dataretrieval/waterdata/_progress.py @@ -101,7 +101,11 @@ def _format(self) -> str: if self.rows: parts.append(f"{self.rows:,} rows") if self.rate_remaining is not None: - parts.append(f"{self.rate_remaining} requests left") + # The header is a string; group it like the row count when it's a + # plain integer, otherwise show it verbatim. + rate = self.rate_remaining + rate = f"{int(rate):,}" if rate.isdigit() else rate + parts.append(f"{rate} requests left") return " · ".join(parts) def _render(self) -> None: diff --git a/tests/waterdata_progress_test.py b/tests/waterdata_progress_test.py index 5710803c..6192cf85 100644 --- a/tests/waterdata_progress_test.py +++ b/tests/waterdata_progress_test.py @@ -40,7 +40,7 @@ def test_renders_pages_rows_and_rate_limit(): out = stream.getvalue() assert "1 page" in out assert "1,234 rows" in out - assert "4870 requests left" in out + assert "4,870 requests left" in out def test_page_count_is_pluralized(): @@ -175,7 +175,7 @@ def test_walk_pages_reports_pages_and_rate_limit(): assert len(df) == 2 out = stream.getvalue() assert "2 pages" in out - assert "4998 requests left" in out + assert "4,998 requests left" in out assert out.endswith("\n") From 77ae027d5ea35e8b01efe62abf8743daab25eb18 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Sat, 23 May 2026 19:04:50 -0500 Subject: [PATCH 03/18] refactor(waterdata): rename progress env var; warn consistently on missing geopandas - Rename the progress toggle env var DATARETRIEVAL_PROGRESS -> API_USGS_PROGRESS, matching the existing API_USGS_PAT convention. - The "Geopandas not installed" advisory in the stats path was logged at INFO (silent by default) while the OGC path logged it at WARNING. Bump the stats one to WARNING so both paths surface it. Kept as logging rather than warnings.warn: logging's last-resort handler already prints WARNING+ to stderr without any logging setup, so the message is visible by default. - Add tests asserting both paths warn when geopandas is absent. Co-Authored-By: Claude Opus 4.7 (1M context) --- dataretrieval/waterdata/_progress.py | 6 ++-- dataretrieval/waterdata/utils.py | 2 +- tests/waterdata_progress_test.py | 8 ++--- tests/waterdata_utils_test.py | 48 ++++++++++++++++++++++++++++ 4 files changed, 56 insertions(+), 8 deletions(-) diff --git a/dataretrieval/waterdata/_progress.py b/dataretrieval/waterdata/_progress.py index a678f758..c269f7ec 100644 --- a/dataretrieval/waterdata/_progress.py +++ b/dataretrieval/waterdata/_progress.py @@ -18,7 +18,7 @@ :func:`progress_context` to activate one and :func:`current` to reach it. By default the line is shown only on an interactive terminal, so notebooks, -redirected logs, and CI stay clean. ``DATARETRIEVAL_PROGRESS`` forces it on +redirected logs, and CI stay clean. ``API_USGS_PROGRESS`` forces it on (``1``/``true``) or off (``0``/``false``). """ @@ -40,8 +40,8 @@ def _enabled_default(stream: TextIO) -> bool: - """Whether to draw the line: ``DATARETRIEVAL_PROGRESS`` wins, else TTY-only.""" - override = os.getenv("DATARETRIEVAL_PROGRESS") + """Whether to draw the line: ``API_USGS_PROGRESS`` wins, else TTY-only.""" + override = os.getenv("API_USGS_PROGRESS") if override is not None: return override.strip().lower() not in {"", "0", "false", "no", "off"} return hasattr(stream, "isatty") and stream.isatty() diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py index b7457e4c..18d3c9a6 100644 --- a/dataretrieval/waterdata/utils.py +++ b/dataretrieval/waterdata/utils.py @@ -964,7 +964,7 @@ def _handle_stats_nesting( return pd.DataFrame() if not geopd: - logger.info( + logger.warning( "Geopandas not installed. Geometries will be flattened " "into pandas DataFrames." ) diff --git a/tests/waterdata_progress_test.py b/tests/waterdata_progress_test.py index 6192cf85..d41a1f55 100644 --- a/tests/waterdata_progress_test.py +++ b/tests/waterdata_progress_test.py @@ -95,18 +95,18 @@ def test_close_without_activity_writes_nothing(): def test_default_disabled_for_non_tty(monkeypatch): - monkeypatch.delenv("DATARETRIEVAL_PROGRESS", raising=False) + monkeypatch.delenv("API_USGS_PROGRESS", raising=False) # io.StringIO.isatty() returns False. assert ProgressReporter(stream=io.StringIO()).enabled is False def test_env_var_forces_on(monkeypatch): - monkeypatch.setenv("DATARETRIEVAL_PROGRESS", "1") + monkeypatch.setenv("API_USGS_PROGRESS", "1") assert ProgressReporter(stream=io.StringIO()).enabled is True def test_env_var_forces_off_even_on_tty(monkeypatch): - monkeypatch.setenv("DATARETRIEVAL_PROGRESS", "0") + monkeypatch.setenv("API_USGS_PROGRESS", "0") tty = mock.MagicMock() tty.isatty.return_value = True assert ProgressReporter(stream=tty).enabled is False @@ -116,7 +116,7 @@ def test_env_var_forces_off_even_on_tty(monkeypatch): def test_progress_context_sets_and_clears_current(monkeypatch): - monkeypatch.delenv("DATARETRIEVAL_PROGRESS", raising=False) + monkeypatch.delenv("API_USGS_PROGRESS", raising=False) assert current() is None with progress_context(enabled=False) as reporter: assert current() is reporter diff --git a/tests/waterdata_utils_test.py b/tests/waterdata_utils_test.py index b05587e2..604634e8 100644 --- a/tests/waterdata_utils_test.py +++ b/tests/waterdata_utils_test.py @@ -286,6 +286,54 @@ def test_handle_stats_nesting_tolerates_missing_drop_columns(): assert df["monitoring_location_id"].iloc[0] == "USGS-12345" +def _warning_messages(caplog): + """Pull WARNING-level message strings out of caplog.""" + return [r.getMessage() for r in caplog.records if r.levelno == logging.WARNING] + + +def test_walk_pages_warns_when_geopandas_missing(caplog): + caplog.set_level(logging.WARNING, logger=_LOGGER_NAME) + resp = _resp_ok([]) # single empty page, no "next" link + client = mock.MagicMock(spec=requests.Session) + client.send.return_value = resp + + req = mock.MagicMock(spec=requests.PreparedRequest) + req.method = "GET" + req.headers = {} + req.url = "https://example.com/p1" + + _walk_pages(geopd=False, req=req, client=client) + + assert any("Geopandas not installed" in m for m in _warning_messages(caplog)) + + +def test_handle_stats_nesting_warns_when_geopandas_missing(caplog): + # This path previously logged at INFO (silent by default); it must warn. + caplog.set_level(logging.WARNING, logger=_LOGGER_NAME) + body = { + "next": None, + "features": [ + { + "properties": { + "monitoring_location_id": "USGS-12345", + "data": [ + { + "parameter_code": "00060", + "unit_of_measure": "ft^3/s", + "parent_time_series_id": "ts-1", + "values": [{"statistic_id": "mean", "value": 10.0}], + } + ], + }, + } + ], + } + + _handle_stats_nesting(body, geopd=False) + + assert any("Geopandas not installed" in m for m in _warning_messages(caplog)) + + # --- _arrange_cols ---------------------------------------------------------- From 6f4e26a222c20257e64cfded59846d89ba9e4ecc Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Sat, 23 May 2026 19:08:56 -0500 Subject: [PATCH 04/18] docs(waterdata): replace stale INFO-logging note with progress-line docs The README told users `logging.basicConfig(level=logging.INFO)` would show request URLs and the hourly remaining-requests count. After the progress-line change neither is true: request URLs moved to DEBUG and the remaining-requests count is now part of the progress line, so INFO shows nothing. Replace that note with an accurate "Tracking progress" section documenting the self-updating status line and the API_USGS_PROGRESS toggle, plus a pointer to DEBUG logging for the request URL. Add a NEWS.md entry. Co-Authored-By: Claude Opus 4.7 (1M context) --- NEWS.md | 2 ++ README.md | 29 ++++++++++++++++------------- 2 files changed, 18 insertions(+), 13 deletions(-) diff --git a/NEWS.md b/NEWS.md index 18fb12b5..c58f54a5 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,5 @@ +**05/23/2026:** Paginated and chunked `waterdata` queries now report progress on a single, self-updating line on `stderr` (chunk and page counts, rows retrieved, and `x-ratelimit-remaining`), replacing the per-page `INFO` log messages that previously narrated each request. The line is shown automatically on an interactive terminal and can be forced on/off with the `API_USGS_PROGRESS` environment variable (`1`/`0`). The full request URLs that were logged at `INFO` are now at `DEBUG` (enable with `logging.basicConfig(level=logging.DEBUG)`); the per-hour "remaining requests" message is now part of the progress line. Failure-path warnings/errors are unchanged, and the "Geopandas not installed" advisory — previously `INFO` (silent) in the statistics path — is now logged at `WARNING` consistently with the other getters. + **05/14/2026:** Fixed two latent bugs in the paginated `waterdata` request loop (`_walk_pages` and `get_stats_data`). Previously, when `requests.Session.request(...)` itself raised mid-pagination (network error, timeout), the except block called `_error_body()` on the *prior page's* response, so the logged "error" described the wrong request and could itself crash on non-JSON bodies. Separately, no status-code check was performed on subsequent paginated responses, so a 5xx body that didn't include `numberReturned` was silently treated as an empty page — pagination quietly stopped and the user got truncated data with no error logged. The loop now status-checks each page like the initial request and reports the actual exception. The "best-effort" behavior (return whatever pages were collected) is preserved. **05/07/2026:** Bumped the declared minimum Python version from **3.8** to **3.9** (`pyproject.toml`'s `requires-python` and the ruff target). This brings the manifest in line with what was already being tested — CI's matrix has long covered only 3.9, 3.13, and 3.14, the `waterdata` test module already skipped itself on Python < 3.10, and several modules already use 3.9-only stdlib (e.g. `zoneinfo`). Users on 3.8 will no longer be able to install the package; please upgrade. diff --git a/README.md b/README.md index 3d8af5d3..24bb423f 100644 --- a/README.md +++ b/README.md @@ -120,22 +120,25 @@ Visit the [API Reference](https://doi-usgs.github.io/dataretrieval-python/reference/waterdata.html) for more information and examples on available services and input parameters. -**NEW:** This new module implements -[logging](https://docs.python.org/3/howto/logging.html#logging-basic-tutorial) -in which users can view the URL requests sent to the USGS Water Data APIs -and the number of requests they have remaining each hour. These messages can -be helpful for troubleshooting and support. To enable logging in your python -console or notebook: +**Tracking progress:** Paginated and chunked `waterdata` queries report their +progress on a single, self-updating line on `stderr` — showing the chunk and +page counts, rows retrieved so far, and the API requests remaining for the hour: -```python -import logging -logging.basicConfig(level=logging.INFO) -``` -To log messages to a file, you can specify a filename in the -`basicConfig` call: +```text +waterdata · chunk 2/5 · 14 pages · 8,421 rows · 4,870 requests left +``` + +The line appears automatically when `stderr` is an interactive terminal. Set the +`API_USGS_PROGRESS` environment variable to `1` to force it on (for example, when +redirecting output to a file) or to `0` to turn it off. + +For verbose troubleshooting and support — including the request URL sent to the +API — enable debug-level +[logging](https://docs.python.org/3/howto/logging.html#logging-basic-tutorial): ```python -logging.basicConfig(filename='waterdata.log', level=logging.INFO) +import logging +logging.basicConfig(level=logging.DEBUG) ``` ### Legacy NWIS Services (Deprecated but still functional) From 532ce097fd3f80f3a2ae073f48d52672ec163d4d Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Sat, 23 May 2026 19:13:50 -0500 Subject: [PATCH 05/18] feat(waterdata): point unauthenticated users to API-key signup; hide env var from README MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a query finishes without ever seeing an x-ratelimit-remaining header — which typically means no API_USGS_PAT is configured — the progress line now appends a one-time pointer to API-key registration. Guarded so it never nags a caller who has a key, and shown at most once per process. Also stop documenting the API_USGS_PROGRESS toggle in the README (it stays in the module docstring); there is no environment-variable section in the docs to host it. README's "Tracking progress" note now just describes the line and its interactive-terminal default. Co-Authored-By: Claude Opus 4.7 (1M context) --- NEWS.md | 2 +- README.md | 4 +- dataretrieval/waterdata/_progress.py | 36 +++++++++++++-- tests/waterdata_progress_test.py | 66 ++++++++++++++++++++++++++++ 4 files changed, 100 insertions(+), 8 deletions(-) diff --git a/NEWS.md b/NEWS.md index c58f54a5..71b8caec 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,4 @@ -**05/23/2026:** Paginated and chunked `waterdata` queries now report progress on a single, self-updating line on `stderr` (chunk and page counts, rows retrieved, and `x-ratelimit-remaining`), replacing the per-page `INFO` log messages that previously narrated each request. The line is shown automatically on an interactive terminal and can be forced on/off with the `API_USGS_PROGRESS` environment variable (`1`/`0`). The full request URLs that were logged at `INFO` are now at `DEBUG` (enable with `logging.basicConfig(level=logging.DEBUG)`); the per-hour "remaining requests" message is now part of the progress line. Failure-path warnings/errors are unchanged, and the "Geopandas not installed" advisory — previously `INFO` (silent) in the statistics path — is now logged at `WARNING` consistently with the other getters. +**05/23/2026:** Paginated and chunked `waterdata` queries now report progress on a single, self-updating line on `stderr` (chunk and page counts, rows retrieved, and the remaining hourly request count from `x-ratelimit-remaining`), replacing the per-page `INFO` log messages that previously narrated each request. The line is shown automatically when `stderr` is an interactive terminal; if a query completes without ever seeing a rate-limit header — typically an unauthenticated caller — it points to API-key registration. The request URL that was logged at `INFO` is now at `DEBUG` (enable with `logging.basicConfig(level=logging.DEBUG)`), and the per-hour "remaining requests" message is now part of the progress line. Failure-path warnings/errors are unchanged, and the "Geopandas not installed" advisory — previously `INFO` (silent) in the statistics path — is now logged at `WARNING`, consistent with the other getters. **05/14/2026:** Fixed two latent bugs in the paginated `waterdata` request loop (`_walk_pages` and `get_stats_data`). Previously, when `requests.Session.request(...)` itself raised mid-pagination (network error, timeout), the except block called `_error_body()` on the *prior page's* response, so the logged "error" described the wrong request and could itself crash on non-JSON bodies. Separately, no status-code check was performed on subsequent paginated responses, so a 5xx body that didn't include `numberReturned` was silently treated as an empty page — pagination quietly stopped and the user got truncated data with no error logged. The loop now status-checks each page like the initial request and reports the actual exception. The "best-effort" behavior (return whatever pages were collected) is preserved. diff --git a/README.md b/README.md index 24bb423f..43f23bb1 100644 --- a/README.md +++ b/README.md @@ -128,9 +128,7 @@ page counts, rows retrieved so far, and the API requests remaining for the hour: waterdata · chunk 2/5 · 14 pages · 8,421 rows · 4,870 requests left ``` -The line appears automatically when `stderr` is an interactive terminal. Set the -`API_USGS_PROGRESS` environment variable to `1` to force it on (for example, when -redirecting output to a file) or to `0` to turn it off. +The line appears automatically when `stderr` is an interactive terminal. For verbose troubleshooting and support — including the request URL sent to the API — enable debug-level diff --git a/dataretrieval/waterdata/_progress.py b/dataretrieval/waterdata/_progress.py index c269f7ec..1b6c4e24 100644 --- a/dataretrieval/waterdata/_progress.py +++ b/dataretrieval/waterdata/_progress.py @@ -38,6 +38,14 @@ "waterdata_progress", default=None ) +# Where to register for an API key. Surfaced once when a query completes without +# ever seeing a rate-limit header, which usually means the caller is +# unauthenticated (see the API_USGS_PAT note in the README). +SIGNUP_URL = "https://api.waterdata.usgs.gov/signup/" + +# Process-level latch so the "no API key" pointer is shown at most once. +_api_key_hint_shown = False + def _enabled_default(stream: TextIO) -> bool: """Whether to draw the line: ``API_USGS_PROGRESS`` wins, else TTY-only.""" @@ -118,13 +126,33 @@ def _render(self) -> None: self._last_len = len(line) def close(self) -> None: - """Finalize the line with a trailing newline so it persists on screen.""" + """Finalize the line with a trailing newline so it persists on screen. + + If the query never observed a rate-limit header — which usually means no + API key is configured — append a one-time pointer to API-key + registration, since unauthenticated callers hit much lower limits. + """ if self._closed: return self._closed = True - if self.enabled and (self.pages or self.current_chunk): - self._stream.write("\n") - self._stream.flush() + if not (self.enabled and (self.pages or self.current_chunk)): + return + self._stream.write("\n") + self._maybe_hint_api_key() + self._stream.flush() + + def _maybe_hint_api_key(self) -> None: + global _api_key_hint_shown + if ( + _api_key_hint_shown + or self.rate_remaining is not None + or os.getenv("API_USGS_PAT") + ): + return + _api_key_hint_shown = True + self._stream.write( + f"No API key detected — register for higher rate limits at {SIGNUP_URL}\n" + ) @contextmanager diff --git a/tests/waterdata_progress_test.py b/tests/waterdata_progress_test.py index d41a1f55..9c7fe080 100644 --- a/tests/waterdata_progress_test.py +++ b/tests/waterdata_progress_test.py @@ -9,8 +9,10 @@ import io from unittest import mock +import pytest import requests +from dataretrieval.waterdata import _progress from dataretrieval.waterdata._progress import ( ProgressReporter, current, @@ -18,6 +20,14 @@ ) from dataretrieval.waterdata.utils import _walk_pages + +@pytest.fixture(autouse=True) +def _reset_api_key_hint_latch(monkeypatch): + """The 'no API key' pointer is latched once per process; reset it so each + test sees a clean slate regardless of order.""" + monkeypatch.setattr(_progress, "_api_key_hint_shown", False) + + # -- ProgressReporter rendering ------------------------------------------------ @@ -91,6 +101,62 @@ def test_close_without_activity_writes_nothing(): assert stream.getvalue() == "" +# -- API-key pointer ----------------------------------------------------------- + + +def test_hints_api_key_when_no_rate_limit_seen(monkeypatch): + monkeypatch.delenv("API_USGS_PAT", raising=False) + stream = io.StringIO() + reporter = ProgressReporter(stream=stream, enabled=True) + reporter.add_page(rows=5) # never set a rate-limit -> likely unauthenticated + reporter.close() + assert _progress.SIGNUP_URL in stream.getvalue() + + +def test_no_hint_when_rate_limit_was_seen(monkeypatch): + monkeypatch.delenv("API_USGS_PAT", raising=False) + stream = io.StringIO() + reporter = ProgressReporter(stream=stream, enabled=True) + reporter.set_rate_remaining("4999") + reporter.add_page(rows=5) + reporter.close() + assert _progress.SIGNUP_URL not in stream.getvalue() + + +def test_no_hint_when_api_key_present(monkeypatch): + monkeypatch.setenv("API_USGS_PAT", "secret") + stream = io.StringIO() + reporter = ProgressReporter(stream=stream, enabled=True) + reporter.add_page(rows=5) # no rate-limit, but a key is configured + reporter.close() + assert _progress.SIGNUP_URL not in stream.getvalue() + + +def test_no_hint_when_disabled(monkeypatch): + monkeypatch.delenv("API_USGS_PAT", raising=False) + stream = io.StringIO() + reporter = ProgressReporter(stream=stream, enabled=False) + reporter.add_page(rows=5) + reporter.close() + assert stream.getvalue() == "" + + +def test_api_key_hint_shown_at_most_once(monkeypatch): + monkeypatch.delenv("API_USGS_PAT", raising=False) + + first = io.StringIO() + r1 = ProgressReporter(stream=first, enabled=True) + r1.add_page(rows=5) + r1.close() + assert _progress.SIGNUP_URL in first.getvalue() + + second = io.StringIO() + r2 = ProgressReporter(stream=second, enabled=True) + r2.add_page(rows=5) + r2.close() + assert _progress.SIGNUP_URL not in second.getvalue() + + # -- enable/disable gating ----------------------------------------------------- From f8334dcf8ab579b8b7c33e6e1a2609b9b2c2c5a4 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Sat, 23 May 2026 19:28:37 -0500 Subject: [PATCH 06/18] fix(waterdata): trigger API-key pointer on missing key, not missing rate-limit header MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Anonymous requests still return an `x-ratelimit-remaining` header on some endpoints (and omit it on others), so "no rate-limit header seen" is an unreliable proxy for "no API key" — the pointer would fire inconsistently or not at all. Key the one-time pointer directly off the absence of `API_USGS_PAT` instead, which is exactly the condition it's nudging about. Co-Authored-By: Claude Opus 4.7 (1M context) --- NEWS.md | 2 +- dataretrieval/waterdata/_progress.py | 18 +++++++----------- tests/waterdata_progress_test.py | 12 +++++++----- 3 files changed, 15 insertions(+), 17 deletions(-) diff --git a/NEWS.md b/NEWS.md index 71b8caec..ff136276 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,4 @@ -**05/23/2026:** Paginated and chunked `waterdata` queries now report progress on a single, self-updating line on `stderr` (chunk and page counts, rows retrieved, and the remaining hourly request count from `x-ratelimit-remaining`), replacing the per-page `INFO` log messages that previously narrated each request. The line is shown automatically when `stderr` is an interactive terminal; if a query completes without ever seeing a rate-limit header — typically an unauthenticated caller — it points to API-key registration. The request URL that was logged at `INFO` is now at `DEBUG` (enable with `logging.basicConfig(level=logging.DEBUG)`), and the per-hour "remaining requests" message is now part of the progress line. Failure-path warnings/errors are unchanged, and the "Geopandas not installed" advisory — previously `INFO` (silent) in the statistics path — is now logged at `WARNING`, consistent with the other getters. +**05/23/2026:** Paginated and chunked `waterdata` queries now report progress on a single, self-updating line on `stderr` (chunk and page counts, rows retrieved, and the remaining hourly request count from `x-ratelimit-remaining`), replacing the per-page `INFO` log messages that previously narrated each request. The line is shown automatically when `stderr` is an interactive terminal; when no API key is configured it also points (once per process) to API-key registration, since unauthenticated callers hit much lower limits. The request URL that was logged at `INFO` is now at `DEBUG` (enable with `logging.basicConfig(level=logging.DEBUG)`), and the per-hour "remaining requests" message is now part of the progress line. Failure-path warnings/errors are unchanged, and the "Geopandas not installed" advisory — previously `INFO` (silent) in the statistics path — is now logged at `WARNING`, consistent with the other getters. **05/14/2026:** Fixed two latent bugs in the paginated `waterdata` request loop (`_walk_pages` and `get_stats_data`). Previously, when `requests.Session.request(...)` itself raised mid-pagination (network error, timeout), the except block called `_error_body()` on the *prior page's* response, so the logged "error" described the wrong request and could itself crash on non-JSON bodies. Separately, no status-code check was performed on subsequent paginated responses, so a 5xx body that didn't include `numberReturned` was silently treated as an empty page — pagination quietly stopped and the user got truncated data with no error logged. The loop now status-checks each page like the initial request and reports the actual exception. The "best-effort" behavior (return whatever pages were collected) is preserved. diff --git a/dataretrieval/waterdata/_progress.py b/dataretrieval/waterdata/_progress.py index 1b6c4e24..a0cf3049 100644 --- a/dataretrieval/waterdata/_progress.py +++ b/dataretrieval/waterdata/_progress.py @@ -38,9 +38,9 @@ "waterdata_progress", default=None ) -# Where to register for an API key. Surfaced once when a query completes without -# ever seeing a rate-limit header, which usually means the caller is -# unauthenticated (see the API_USGS_PAT note in the README). +# Where to register for an API key. Surfaced once when a query runs without an +# API key configured (no API_USGS_PAT), since unauthenticated callers hit much +# lower rate limits (see the API_USGS_PAT note in the README). SIGNUP_URL = "https://api.waterdata.usgs.gov/signup/" # Process-level latch so the "no API key" pointer is shown at most once. @@ -128,9 +128,9 @@ def _render(self) -> None: def close(self) -> None: """Finalize the line with a trailing newline so it persists on screen. - If the query never observed a rate-limit header — which usually means no - API key is configured — append a one-time pointer to API-key - registration, since unauthenticated callers hit much lower limits. + If no API key is configured (no ``API_USGS_PAT``), append a one-time + pointer to API-key registration, since unauthenticated callers hit much + lower rate limits. """ if self._closed: return @@ -143,11 +143,7 @@ def close(self) -> None: def _maybe_hint_api_key(self) -> None: global _api_key_hint_shown - if ( - _api_key_hint_shown - or self.rate_remaining is not None - or os.getenv("API_USGS_PAT") - ): + if _api_key_hint_shown or os.getenv("API_USGS_PAT"): return _api_key_hint_shown = True self._stream.write( diff --git a/tests/waterdata_progress_test.py b/tests/waterdata_progress_test.py index 9c7fe080..3e98e467 100644 --- a/tests/waterdata_progress_test.py +++ b/tests/waterdata_progress_test.py @@ -104,23 +104,25 @@ def test_close_without_activity_writes_nothing(): # -- API-key pointer ----------------------------------------------------------- -def test_hints_api_key_when_no_rate_limit_seen(monkeypatch): +def test_hints_api_key_when_no_key_configured(monkeypatch): monkeypatch.delenv("API_USGS_PAT", raising=False) stream = io.StringIO() reporter = ProgressReporter(stream=stream, enabled=True) - reporter.add_page(rows=5) # never set a rate-limit -> likely unauthenticated + reporter.add_page(rows=5) reporter.close() assert _progress.SIGNUP_URL in stream.getvalue() -def test_no_hint_when_rate_limit_was_seen(monkeypatch): +def test_hint_fires_even_when_rate_limit_was_seen(monkeypatch): + # Anonymous responses still carry a rate-limit header, so absence of a key + # — not absence of the header — is what drives the pointer. monkeypatch.delenv("API_USGS_PAT", raising=False) stream = io.StringIO() reporter = ProgressReporter(stream=stream, enabled=True) - reporter.set_rate_remaining("4999") + reporter.set_rate_remaining("891") reporter.add_page(rows=5) reporter.close() - assert _progress.SIGNUP_URL not in stream.getvalue() + assert _progress.SIGNUP_URL in stream.getvalue() def test_no_hint_when_api_key_present(monkeypatch): From 4a0a3a8bf8fe1d979d360a781c3eb2119cdaa231 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Sat, 23 May 2026 19:29:55 -0500 Subject: [PATCH 07/18] =?UTF-8?q?docs:=20drop=20NEWS.md=20entry=20?= =?UTF-8?q?=E2=80=94=20this=20is=20a=20minor=20PR?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Opus 4.7 (1M context) --- NEWS.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/NEWS.md b/NEWS.md index ff136276..18fb12b5 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,3 @@ -**05/23/2026:** Paginated and chunked `waterdata` queries now report progress on a single, self-updating line on `stderr` (chunk and page counts, rows retrieved, and the remaining hourly request count from `x-ratelimit-remaining`), replacing the per-page `INFO` log messages that previously narrated each request. The line is shown automatically when `stderr` is an interactive terminal; when no API key is configured it also points (once per process) to API-key registration, since unauthenticated callers hit much lower limits. The request URL that was logged at `INFO` is now at `DEBUG` (enable with `logging.basicConfig(level=logging.DEBUG)`), and the per-hour "remaining requests" message is now part of the progress line. Failure-path warnings/errors are unchanged, and the "Geopandas not installed" advisory — previously `INFO` (silent) in the statistics path — is now logged at `WARNING`, consistent with the other getters. - **05/14/2026:** Fixed two latent bugs in the paginated `waterdata` request loop (`_walk_pages` and `get_stats_data`). Previously, when `requests.Session.request(...)` itself raised mid-pagination (network error, timeout), the except block called `_error_body()` on the *prior page's* response, so the logged "error" described the wrong request and could itself crash on non-JSON bodies. Separately, no status-code check was performed on subsequent paginated responses, so a 5xx body that didn't include `numberReturned` was silently treated as an empty page — pagination quietly stopped and the user got truncated data with no error logged. The loop now status-checks each page like the initial request and reports the actual exception. The "best-effort" behavior (return whatever pages were collected) is preserved. **05/07/2026:** Bumped the declared minimum Python version from **3.8** to **3.9** (`pyproject.toml`'s `requires-python` and the ruff target). This brings the manifest in line with what was already being tested — CI's matrix has long covered only 3.9, 3.13, and 3.14, the `waterdata` test module already skipped itself on Python < 3.10, and several modules already use 3.9-only stdlib (e.g. `zoneinfo`). Users on 3.8 will no longer be able to install the package; please upgrade. From c940fcc8eb95dc0feb9b90afde7383757141b7c7 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Sat, 23 May 2026 19:33:35 -0500 Subject: [PATCH 08/18] style(waterdata): label the status line "Progress:" instead of "waterdata" Co-Authored-By: Claude Opus 4.7 (1M context) --- README.md | 2 +- dataretrieval/waterdata/_progress.py | 6 +++--- tests/waterdata_progress_test.py | 1 + 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 43f23bb1..5915ebb1 100644 --- a/README.md +++ b/README.md @@ -125,7 +125,7 @@ progress on a single, self-updating line on `stderr` — showing the chunk and page counts, rows retrieved so far, and the API requests remaining for the hour: ```text -waterdata · chunk 2/5 · 14 pages · 8,421 rows · 4,870 requests left +Progress: chunk 2/5 · 14 pages · 8,421 rows · 4,870 requests left ``` The line appears automatically when `stderr` is an interactive terminal. diff --git a/dataretrieval/waterdata/_progress.py b/dataretrieval/waterdata/_progress.py index a0cf3049..b77d1497 100644 --- a/dataretrieval/waterdata/_progress.py +++ b/dataretrieval/waterdata/_progress.py @@ -6,7 +6,7 @@ and ``utils.get_stats_data``). This module surfaces that work as one line on stderr, rewritten in place as data arrives:: - waterdata · chunk 2/5 · 14 pages · 8,421 rows · 4,870 requests left + Progress: chunk 2/5 · 14 pages · 8,421 rows · 4,870 requests left It replaces the per-page ``logger.info`` calls that previously narrated the same events one line at a time. @@ -102,7 +102,7 @@ def set_rate_remaining(self, value: str | int | None) -> None: self.rate_remaining = str(value) def _format(self) -> str: - parts = ["waterdata"] + parts: list[str] = [] if self.total_chunks > 1: parts.append(f"chunk {self.current_chunk}/{self.total_chunks}") parts.append(f"{self.pages} page" + ("" if self.pages == 1 else "s")) @@ -114,7 +114,7 @@ def _format(self) -> str: rate = self.rate_remaining rate = f"{int(rate):,}" if rate.isdigit() else rate parts.append(f"{rate} requests left") - return " · ".join(parts) + return "Progress: " + " · ".join(parts) def _render(self) -> None: if not self.enabled or self._closed: diff --git a/tests/waterdata_progress_test.py b/tests/waterdata_progress_test.py index 3e98e467..a641de86 100644 --- a/tests/waterdata_progress_test.py +++ b/tests/waterdata_progress_test.py @@ -48,6 +48,7 @@ def test_renders_pages_rows_and_rate_limit(): reporter.set_rate_remaining("4870") reporter.add_page(rows=1234) out = stream.getvalue() + assert out.lstrip("\r").startswith("Progress: ") assert "1 page" in out assert "1,234 rows" in out assert "4,870 requests left" in out From 16fc92068e6ee3dc58b246b622f5e04a8db304f6 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Sat, 23 May 2026 19:41:22 -0500 Subject: [PATCH 09/18] fix(waterdata): never let the progress line crash or truncate a query MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Code review surfaced two real defects: - ProgressReporter rendering wrote to stderr unguarded. Because the per-page `reporter.add_page()` call sits inside `_walk_pages`' broad `except Exception`, a render failure — BrokenPipeError when output is piped to `head`, or UnicodeEncodeError for the `·` separator on a non-UTF-8 stderr — was misread as a failed request and silently truncated the result; on the first (pre-loop) page it crashed the query outright. `_render` and `close` now swallow stream errors and disable further rendering, so progress is strictly best-effort. - The "Geopandas not installed" advisory lived in the per-page `_handle_stats_nesting`, so a multi-page stats query (now at WARNING) emitted one warning per page. Moved it to `get_stats_data` so it fires once, matching `_walk_pages`. Also harden rate-limit formatting against non-decimal unicode digits (`isascii() and isdigit()`), and add regression tests: a broken progress stream must not truncate pagination, and the geopandas advisory fires once. Co-Authored-By: Claude Opus 4.7 (1M context) --- dataretrieval/waterdata/_progress.py | 31 +++++++++++------ dataretrieval/waterdata/utils.py | 12 +++---- tests/waterdata_progress_test.py | 40 ++++++++++++++++++++++ tests/waterdata_utils_test.py | 50 ++++++++++++++++------------ 4 files changed, 95 insertions(+), 38 deletions(-) diff --git a/dataretrieval/waterdata/_progress.py b/dataretrieval/waterdata/_progress.py index b77d1497..51574353 100644 --- a/dataretrieval/waterdata/_progress.py +++ b/dataretrieval/waterdata/_progress.py @@ -110,20 +110,28 @@ def _format(self) -> str: parts.append(f"{self.rows:,} rows") if self.rate_remaining is not None: # The header is a string; group it like the row count when it's a - # plain integer, otherwise show it verbatim. + # plain ASCII integer, otherwise show it verbatim. (``str.isdigit`` + # alone is True for non-decimal unicode digits that ``int`` rejects.) rate = self.rate_remaining - rate = f"{int(rate):,}" if rate.isdigit() else rate + rate = f"{int(rate):,}" if rate.isascii() and rate.isdigit() else rate parts.append(f"{rate} requests left") return "Progress: " + " · ".join(parts) def _render(self) -> None: if not self.enabled or self._closed: return - line = self._format() - pad = max(self._last_len - len(line), 0) - self._stream.write("\r" + line + " " * pad) - self._stream.flush() - self._last_len = len(line) + try: + line = self._format() + pad = max(self._last_len - len(line), 0) + self._stream.write("\r" + line + " " * pad) + self._stream.flush() + self._last_len = len(line) + except Exception: # noqa: BLE001 + # Progress output is best-effort cosmetics; a broken pipe (output + # piped to ``head``), a closed stream, or an encoding error must + # never disturb — let alone truncate — the query. Disable so we + # don't retry on every subsequent page. + self.enabled = False def close(self) -> None: """Finalize the line with a trailing newline so it persists on screen. @@ -137,9 +145,12 @@ def close(self) -> None: self._closed = True if not (self.enabled and (self.pages or self.current_chunk)): return - self._stream.write("\n") - self._maybe_hint_api_key() - self._stream.flush() + try: + self._stream.write("\n") + self._maybe_hint_api_key() + self._stream.flush() + except Exception: # noqa: BLE001 + self.enabled = False def _maybe_hint_api_key(self) -> None: global _api_key_hint_shown diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py index 18d3c9a6..ee447a77 100644 --- a/dataretrieval/waterdata/utils.py +++ b/dataretrieval/waterdata/utils.py @@ -963,12 +963,6 @@ def _handle_stats_nesting( if body is None: return pd.DataFrame() - if not geopd: - logger.warning( - "Geopandas not installed. Geometries will be flattened " - "into pandas DataFrames." - ) - # If geopandas not installed, return a pandas dataframe # otherwise return a geodataframe if not geopd: @@ -1119,6 +1113,12 @@ def get_stats_data( req = request.prepare() logger.debug("Request: %s", req.url) + if not GEOPANDAS: + logger.warning( + "Geopandas not installed. Geometries will be flattened " + "into pandas DataFrames." + ) + # create temp client if not provided # and close it after the request is done close_client = client is None diff --git a/tests/waterdata_progress_test.py b/tests/waterdata_progress_test.py index a641de86..c3beed15 100644 --- a/tests/waterdata_progress_test.py +++ b/tests/waterdata_progress_test.py @@ -102,6 +102,24 @@ def test_close_without_activity_writes_nothing(): assert stream.getvalue() == "" +class _RaisingStream: + """A stream whose writes always fail, e.g. a broken pipe (output | head).""" + + def write(self, *_): + raise BrokenPipeError("broken pipe") + + def flush(self): + pass + + +def test_reporter_swallows_stream_errors_and_disables(monkeypatch): + monkeypatch.delenv("API_USGS_PAT", raising=False) + reporter = ProgressReporter(stream=_RaisingStream(), enabled=True) + reporter.add_page(rows=1) # render write raises -> must be swallowed + reporter.close() # newline + hint writes raise -> must be swallowed + assert reporter.enabled is False + + # -- API-key pointer ----------------------------------------------------------- @@ -262,3 +280,25 @@ def test_walk_pages_without_context_does_not_error(): df, _ = _walk_pages(geopd=False, req=req, client=client) assert len(df) == 1 assert current() is None + + +def test_broken_progress_stream_does_not_truncate_pagination(): + # A render failure (broken pipe) lands inside _walk_pages' per-page try; + # it must NOT be mistaken for a failed request and silently drop pages. + resp1 = _resp( + [{"id": "1", "properties": {"v": "a"}}], next_url="https://example.com/p2" + ) + resp2 = _resp([{"id": "2", "properties": {"v": "b"}}]) + client = mock.MagicMock(spec=requests.Session) + client.send.return_value = resp1 + client.request.return_value = resp2 + + req = mock.MagicMock(spec=requests.PreparedRequest) + req.method = "GET" + req.headers = {} + req.url = "https://example.com/p1" + + with progress_context(stream=_RaisingStream(), enabled=True): + df, _ = _walk_pages(geopd=False, req=req, client=client) + + assert len(df) == 2 # both pages returned despite the broken progress stream diff --git a/tests/waterdata_utils_test.py b/tests/waterdata_utils_test.py index 604634e8..92c4dc26 100644 --- a/tests/waterdata_utils_test.py +++ b/tests/waterdata_utils_test.py @@ -307,31 +307,37 @@ def test_walk_pages_warns_when_geopandas_missing(caplog): assert any("Geopandas not installed" in m for m in _warning_messages(caplog)) -def test_handle_stats_nesting_warns_when_geopandas_missing(caplog): - # This path previously logged at INFO (silent by default); it must warn. +def test_get_stats_data_warns_once_when_geopandas_missing(caplog, monkeypatch): + # The advisory must fire once per query, not once per paginated page + # (it previously lived in the per-page _handle_stats_nesting helper). + from dataretrieval.waterdata.utils import get_stats_data + + monkeypatch.setattr(_utils_module, "GEOPANDAS", False) + monkeypatch.setattr( + _utils_module, + "_handle_stats_nesting", + mock.MagicMock(return_value=pd.DataFrame()), + ) caplog.set_level(logging.WARNING, logger=_LOGGER_NAME) - body = { - "next": None, - "features": [ - { - "properties": { - "monitoring_location_id": "USGS-12345", - "data": [ - { - "parameter_code": "00060", - "unit_of_measure": "ft^3/s", - "parent_time_series_id": "ts-1", - "values": [{"statistic_id": "mean", "value": 10.0}], - } - ], - }, - } - ], - } - _handle_stats_nesting(body, geopd=False) + page1 = mock.MagicMock(status_code=200, headers={}) + page1.json.return_value = {"next": "tok", "features": []} + page2 = mock.MagicMock(status_code=200, headers={}) + page2.json.return_value = {"next": None, "features": []} - assert any("Geopandas not installed" in m for m in _warning_messages(caplog)) + client = mock.MagicMock(spec=requests.Session) + client.send.return_value = page1 + client.request.return_value = page2 + + get_stats_data( + args={"monitoring_location_id": "USGS-1"}, + service="observationNormals", + expand_percentiles=False, + client=client, + ) + + geo = [m for m in _warning_messages(caplog) if "Geopandas not installed" in m] + assert len(geo) == 1 # --- _arrange_cols ---------------------------------------------------------- From 7bd41a1c9ca9a83b2d11dbfbbcfa3b58e99da8ff Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Sat, 23 May 2026 21:00:46 -0500 Subject: [PATCH 10/18] fix(waterdata): show requests as remaining/limit, drop unavailable reset; review fixes The live USGS API returns X-Ratelimit-Limit + X-Ratelimit-Remaining but no reset header, so the earlier "resets in X" countdown read a header that never arrives. Replace it with the available " / requests remaining" (e.g. "995 / 1,000"); the reset parsing/countdown and its tests are removed. Also fixes two issues from code review: - close() used current_chunk as a proxy for "drew something", but start_chunk sets it even when it doesn't render, so a query that failed before any page printed a stray newline and burned the once-per-process API-key hint. Now gated on a _rendered flag set only after a successful write. - _maybe_hint_api_key set its latch before the write, so a failed write burned the hint process-wide; the latch is now set only after a successful write. Plus doc-comment accuracy: the ContextVar isolates one query's reporter (it does not give concurrent queries on one stderr separate lines), and references to the removed ``filters.chunked`` decorator now point at ``chunking`` / ``_paginate``. Co-Authored-By: Claude Opus 4.7 (1M context) --- README.md | 7 ++- dataretrieval/waterdata/_progress.py | 91 ++++++++++++++-------------- dataretrieval/waterdata/utils.py | 4 +- tests/waterdata_progress_test.py | 33 ++-------- 4 files changed, 56 insertions(+), 79 deletions(-) diff --git a/README.md b/README.md index 1a5050e4..b950df5c 100644 --- a/README.md +++ b/README.md @@ -111,11 +111,12 @@ for more information and examples on available services and input parameters. **Tracking progress:** Paginated and chunked `waterdata` queries report their progress on a single, self-updating line on `stderr` — showing the chunk and -page counts, rows retrieved so far, and the API requests remaining (with the -time until the hourly limit resets, when the server reports it): +page counts, rows retrieved so far, and the hourly API quota remaining (the +``remaining / limit`` from the server's rate-limit headers, shown when you +have an API key set): ```text -Progress: chunk 2/5 · 14 pages · 8,421 rows · 4,870 requests remaining, resets in 47m +Progress: chunk 2/5 · 14 pages · 8,421 rows · 4,870 / 5,000 requests remaining ``` The line appears automatically when `stderr` is an interactive terminal. diff --git a/dataretrieval/waterdata/_progress.py b/dataretrieval/waterdata/_progress.py index 2712d4bc..d0b0c356 100644 --- a/dataretrieval/waterdata/_progress.py +++ b/dataretrieval/waterdata/_progress.py @@ -1,19 +1,19 @@ """A single self-updating status line for paginated / chunked Water Data queries. -Water Data getters fan out two ways the caller can't see: long CQL filters are -split into URL-length-safe *chunks* (``filters.chunked``), and each request -follows ``next`` links across an unknown number of *pages* (``utils._walk_pages`` -and ``utils.get_stats_data``). This module surfaces that work as one line on -stderr, rewritten in place as data arrives:: +Water Data getters fan out two ways the caller can't see: large multi-value +requests are split into URL-length-safe *chunks* (``chunking`` module), and each +request follows ``next`` links across an unknown number of *pages* +(``utils._paginate``). This module surfaces that work as one line on stderr, +rewritten in place as data arrives:: - Progress: chunk 2/5 · 14 pages · 8,421 rows · 4,870 requests remaining + Progress: chunk 2/5 · 14 pages · 8,421 rows · 4,870 / 5,000 requests remaining It replaces the per-page ``logger.info`` calls that previously narrated the same events one line at a time. The active reporter lives in a :class:`~contextvars.ContextVar` rather than being threaded through every signature: progress is a cross-cutting concern that the -``chunked`` decorator (outer, chunk counts) and the page-walking loops (inner, +chunk orchestrator (outer, chunk counts) and the page-walking loop (inner, page/row/rate-limit counts) both update without knowing about each other. Call :func:`progress_context` to activate one and :func:`current` to reach it. @@ -27,27 +27,25 @@ import contextvars import os import sys -import time from collections.abc import Iterator from contextlib import contextmanager from typing import TextIO -def _format_duration(seconds: float) -> str: - """Compact human duration: ``45s``, ``12m``, ``1h03m`` (clamped at 0).""" - secs = int(max(0, seconds)) - if secs < 60: - return f"{secs}s" - if secs < 3600: - return f"{secs // 60}m" - hours, rem = divmod(secs, 3600) - minutes = rem // 60 - return f"{hours}h{minutes:02d}m" if minutes else f"{hours}h" +def _group_int(value: str) -> str: + """Comma-group a plain ASCII integer string; pass anything else through. + + (``str.isdigit`` alone is True for non-decimal unicode digits that ``int`` + rejects, hence the ``isascii`` guard.) + """ + return f"{int(value):,}" if value.isascii() and value.isdigit() else value # The reporter active for the current query. A ContextVar (not a module global) -# so concurrent queries — threads or async tasks sharing a client — each track -# their own progress line. +# so the chunk orchestrator and the page loop resolve to the same reporter +# within one query, and an unrelated query in another context can't clobber its +# state. (It does not give concurrent queries sharing one stderr separate +# lines — they would still interleave.) _active: contextvars.ContextVar[ProgressReporter | None] = contextvars.ContextVar( "waterdata_progress", default=None ) @@ -88,10 +86,14 @@ def __init__( self.pages = 0 self.rows = 0 self.rate_remaining: str | None = None - # Absolute epoch second when the rate-limit window resets, derived from - # the server's reset header so the rendered countdown stays live. - self._reset_at: float | None = None + # The hourly request quota (``x-ratelimit-limit``), shown as the + # denominator when the server reports it. + self.rate_limit: str | None = None self._last_len = 0 + # Whether anything was actually written to the stream — drives whether + # close() needs a terminating newline. (``current_chunk`` is a poor + # proxy: ``start_chunk`` sets it even when it doesn't render.) + self._rendered = False self._closed = False def set_chunks(self, total: int) -> None: @@ -116,25 +118,19 @@ def add_page(self, rows: int = 0) -> None: self._render() def set_rate_remaining( - self, value: str | int | None, reset: str | int | None = None + self, value: str | int | None, limit: str | int | None = None ) -> None: """Update the rate-limit display from the response headers. - ``value`` is ``x-ratelimit-remaining``; ``reset`` is the optional - ``x-ratelimit-reset`` companion. Empty/missing values are ignored so a - page that omits a header doesn't blank out the last known value. The - reset value is interpreted as an absolute epoch second when large - (the conventional form) and as seconds-until-reset otherwise; either - way it's stored as an absolute deadline so the countdown stays live. + ``value`` is ``x-ratelimit-remaining``; ``limit`` is the optional + ``x-ratelimit-limit`` quota, shown as the denominator. Empty/missing + values are ignored so a page that omits a header doesn't blank out the + last known value. """ if value not in (None, ""): self.rate_remaining = str(value) - if reset not in (None, ""): - try: - secs = float(reset) - except (TypeError, ValueError): - return - self._reset_at = secs if secs > 1_000_000 else time.time() + secs + if limit not in (None, ""): + self.rate_limit = str(limit) def _format(self) -> str: parts: list[str] = [] @@ -144,15 +140,12 @@ def _format(self) -> str: if self.rows: parts.append(f"{self.rows:,} rows") if self.rate_remaining is not None: - # The header is a string; group it like the row count when it's a - # plain ASCII integer, otherwise show it verbatim. (``str.isdigit`` - # alone is True for non-decimal unicode digits that ``int`` rejects.) - rate = self.rate_remaining - rate = f"{int(rate):,}" if rate.isascii() and rate.isdigit() else rate - segment = f"{rate} requests remaining" - if self._reset_at is not None: - eta = _format_duration(self._reset_at - time.time()) - segment += f", resets in {eta}" + remaining = _group_int(self.rate_remaining) + if self.rate_limit is not None: + limit = _group_int(self.rate_limit) + segment = f"{remaining} / {limit} requests remaining" + else: + segment = f"{remaining} requests remaining" parts.append(segment) return "Progress: " + " · ".join(parts) @@ -165,6 +158,7 @@ def _render(self) -> None: self._stream.write("\r" + line + " " * pad) self._stream.flush() self._last_len = len(line) + self._rendered = True except Exception: # noqa: BLE001 # Progress output is best-effort cosmetics; a broken pipe (output # piped to ``head``), a closed stream, or an encoding error must @@ -182,7 +176,7 @@ def close(self) -> None: if self._closed: return self._closed = True - if not (self.enabled and (self.pages or self.current_chunk)): + if not (self.enabled and self._rendered): return try: self._stream.write("\n") @@ -195,10 +189,13 @@ def _maybe_hint_api_key(self) -> None: global _api_key_hint_shown if _api_key_hint_shown or os.getenv("API_USGS_PAT"): return - _api_key_hint_shown = True + # Set the once-per-process latch only after a successful write, so a + # failed write (broken pipe) doesn't silently burn the hint for every + # later query in the process. self._stream.write( f"No API key detected — register for higher rate limits at {SIGNUP_URL}\n" ) + _api_key_hint_shown = True @contextmanager diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py index d552de12..cb85729e 100644 --- a/dataretrieval/waterdata/utils.py +++ b/dataretrieval/waterdata/utils.py @@ -938,7 +938,7 @@ def _paginate( if reporter is not None: reporter.set_rate_remaining( resp.headers.get(_QUOTA_HEADER), - reset=resp.headers.get("x-ratelimit-reset"), + limit=resp.headers.get("x-ratelimit-limit"), ) reporter.add_page(rows=len(df)) while cursor is not None: @@ -951,7 +951,7 @@ def _paginate( if reporter is not None: reporter.set_rate_remaining( resp.headers.get(_QUOTA_HEADER), - reset=resp.headers.get("x-ratelimit-reset"), + limit=resp.headers.get("x-ratelimit-limit"), ) reporter.add_page(rows=len(df)) except Exception as e: # noqa: BLE001 diff --git a/tests/waterdata_progress_test.py b/tests/waterdata_progress_test.py index acf78080..cdff34a9 100644 --- a/tests/waterdata_progress_test.py +++ b/tests/waterdata_progress_test.py @@ -87,43 +87,22 @@ def test_missing_rate_limit_does_not_blank_last_known_value(): assert "500 requests remaining" in stream.getvalue() -def test_format_duration(): - assert _progress._format_duration(0) == "0s" - assert _progress._format_duration(45) == "45s" - assert _progress._format_duration(600) == "10m" - assert _progress._format_duration(3600) == "1h" - assert _progress._format_duration(3700) == "1h01m" - assert _progress._format_duration(-5) == "0s" # clamped - - -def test_reset_countdown_rendered(monkeypatch): - # Pin the clock so the countdown is deterministic. - monkeypatch.setattr(_progress.time, "time", lambda: 1_000_000_000.0) +def test_renders_remaining_over_limit(): stream = io.StringIO() reporter = ProgressReporter(stream=stream, enabled=True) - reporter.set_rate_remaining("4870", reset="600") # delta seconds -> 10m + reporter.set_rate_remaining("952", limit="1000") reporter.add_page(rows=1) - out = stream.getvalue() - assert "4,870 requests remaining, resets in 10m" in out - - -def test_reset_accepts_absolute_epoch(monkeypatch): - monkeypatch.setattr(_progress.time, "time", lambda: 1_000_000_000.0) - stream = io.StringIO() - reporter = ProgressReporter(stream=stream, enabled=True) - reporter.set_rate_remaining("10", reset="1000000900") # epoch: 900s out -> 15m - reporter.add_page() - assert "resets in 15m" in stream.getvalue() + assert "952 / 1,000 requests remaining" in stream.getvalue() -def test_no_reset_segment_without_reset_header(): +def test_no_slash_when_limit_absent(): stream = io.StringIO() reporter = ProgressReporter(stream=stream, enabled=True) - reporter.set_rate_remaining("4870") # remaining only, no reset + reporter.set_rate_remaining("4870") # remaining only, no limit header reporter.add_page() out = stream.getvalue() assert "4,870 requests remaining" in out - assert "resets in" not in out + assert "/" not in out def test_close_terminates_active_line_with_newline(): From 1a453d261be77e0fdf8c00bde9fd78c14bcabb3d Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Sun, 24 May 2026 08:25:49 -0500 Subject: [PATCH 11/18] feat(waterdata): label the progress line with the service being retrieved MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The status line now leads with "Retrieving: " (e.g. "Retrieving: continuous · 3 pages · 1,345 rows · 998 / 1,000 requests remaining"). The service string each getter already passes to get_ogc_data / get_stats_data is threaded into the reporter via progress_context(service=...). With no service (bare reporter), it falls back to the "Progress:" label. Co-Authored-By: Claude Opus 4.7 (1M context) --- README.md | 10 +++++----- dataretrieval/waterdata/_progress.py | 27 ++++++++++++++++++++------- dataretrieval/waterdata/utils.py | 4 ++-- tests/waterdata_progress_test.py | 18 +++++++++++++++++- 4 files changed, 44 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index b950df5c..0843e8e2 100644 --- a/README.md +++ b/README.md @@ -110,13 +110,13 @@ Visit the for more information and examples on available services and input parameters. **Tracking progress:** Paginated and chunked `waterdata` queries report their -progress on a single, self-updating line on `stderr` — showing the chunk and -page counts, rows retrieved so far, and the hourly API quota remaining (the -``remaining / limit`` from the server's rate-limit headers, shown when you -have an API key set): +progress on a single, self-updating line on `stderr` — the service being +retrieved, the chunk and page counts, rows retrieved so far, and the hourly API +quota remaining (the ``remaining / limit`` from the server's rate-limit headers, +shown when you have an API key set): ```text -Progress: chunk 2/5 · 14 pages · 8,421 rows · 4,870 / 5,000 requests remaining +Retrieving: daily · chunk 2/5 · 14 pages · 8,421 rows · 4,870 / 5,000 requests remaining ``` The line appears automatically when `stderr` is an interactive terminal. diff --git a/dataretrieval/waterdata/_progress.py b/dataretrieval/waterdata/_progress.py index d0b0c356..b8e4ede8 100644 --- a/dataretrieval/waterdata/_progress.py +++ b/dataretrieval/waterdata/_progress.py @@ -6,7 +6,7 @@ (``utils._paginate``). This module surfaces that work as one line on stderr, rewritten in place as data arrives:: - Progress: chunk 2/5 · 14 pages · 8,421 rows · 4,870 / 5,000 requests remaining + Retrieving: daily · 6 pages · 2,881 rows · 995 / 1,000 requests remaining It replaces the per-page ``logger.info`` calls that previously narrated the same events one line at a time. @@ -77,10 +77,17 @@ class ProgressReporter: """ def __init__( - self, *, stream: TextIO | None = None, enabled: bool | None = None + self, + *, + service: str | None = None, + stream: TextIO | None = None, + enabled: bool | None = None, ) -> None: self._stream = stream if stream is not None else sys.stderr self.enabled = _enabled_default(self._stream) if enabled is None else enabled + # The service/collection being retrieved (e.g. "daily", "peaks"), + # shown as the line's leading label. + self.service = service self.total_chunks = 1 self.current_chunk = 0 self.pages = 0 @@ -147,6 +154,8 @@ def _format(self) -> str: else: segment = f"{remaining} requests remaining" parts.append(segment) + if self.service: + return f"Retrieving: {self.service} · " + " · ".join(parts) return "Progress: " + " · ".join(parts) def _render(self) -> None: @@ -200,19 +209,23 @@ def _maybe_hint_api_key(self) -> None: @contextmanager def progress_context( - *, stream: TextIO | None = None, enabled: bool | None = None + *, + service: str | None = None, + stream: TextIO | None = None, + enabled: bool | None = None, ) -> Iterator[ProgressReporter]: """Activate a :class:`ProgressReporter` for the duration of a query. - If a reporter is already active (a nested call), the existing one is yielded - unchanged so the outermost query owns the single line; only the outermost - context closes it. + ``service`` labels the line (e.g. ``"Retrieving: daily ..."``). If a reporter + is already active (a nested call), the existing one is yielded unchanged so + the outermost query owns the single line; only the outermost context closes + it (and ``service``/``stream``/``enabled`` of a nested call are ignored). """ existing = _active.get() if existing is not None: yield existing return - reporter = ProgressReporter(stream=stream, enabled=enabled) + reporter = ProgressReporter(service=service, stream=stream, enabled=enabled) token = _active.set(reporter) try: yield reporter diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py index cb85729e..1e747262 100644 --- a/dataretrieval/waterdata/utils.py +++ b/dataretrieval/waterdata/utils.py @@ -1247,7 +1247,7 @@ def get_ogc_data( convert_type = args.pop("convert_type", False) args = {k: v for k, v in args.items() if v is not None} - with _progress.progress_context(): + with _progress.progress_context(service=service): return_list, response = _fetch_once(args) return_list = _deal_with_empty(return_list, properties, service) if convert_type: @@ -1494,7 +1494,7 @@ def follow_up(cursor: str, sess: requests.Session) -> requests.Response: # The stats path doesn't go through ``multi_value_chunked``, so it opens # its own progress context; ``_paginate`` reports pages/rate-limit into it. - with _progress.progress_context(): + with _progress.progress_context(service=service): df, response = _paginate( req, geopd=GEOPANDAS, diff --git a/tests/waterdata_progress_test.py b/tests/waterdata_progress_test.py index cdff34a9..36ca5b05 100644 --- a/tests/waterdata_progress_test.py +++ b/tests/waterdata_progress_test.py @@ -105,6 +105,20 @@ def test_no_slash_when_limit_absent(): assert "/" not in out +def test_service_label_leads_the_line(): + stream = io.StringIO() + reporter = ProgressReporter(service="daily", stream=stream, enabled=True) + reporter.add_page(rows=5) + assert stream.getvalue().lstrip("\r").startswith("Retrieving: daily · ") + + +def test_no_service_falls_back_to_progress_label(): + stream = io.StringIO() + reporter = ProgressReporter(stream=stream, enabled=True) + reporter.add_page(rows=5) + assert stream.getvalue().lstrip("\r").startswith("Progress: ") + + def test_close_terminates_active_line_with_newline(): stream = io.StringIO() reporter = ProgressReporter(stream=stream, enabled=True) @@ -274,11 +288,13 @@ def test_walk_pages_reports_pages_and_rate_limit(): req.url = "https://example.com/p1" stream = io.StringIO() - with progress_context(stream=stream, enabled=True): + with progress_context(service="daily", stream=stream, enabled=True): df, _ = _walk_pages(geopd=False, req=req, client=client) assert len(df) == 2 out = stream.getvalue() + # The service set on the context reaches _paginate's render via the contextvar. + assert "Retrieving: daily ·" in out assert "2 pages" in out assert "4,998 requests remaining" in out assert out.endswith("\n") From f4881492132f45622e50df8d3863a57950b5fa22 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Sun, 24 May 2026 09:06:34 -0500 Subject: [PATCH 12/18] feat(waterdata): show the progress line in Jupyter (like tqdm), not just TTYs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The line gated on stderr.isatty(), which is False in a Jupyter kernel, so it never appeared in notebooks — even though the feature is mainly for interactive use and a kernel's stderr honors carriage-return rewrites (the same mechanism tqdm relies on). _enabled_default now also returns True when running inside a Jupyter/IPython ZMQ kernel (detected without importing IPython). API_USGS_PROGRESS still overrides either way. Note: the demo venv and notebook live inside the worktree and are not committed. Co-Authored-By: Claude Opus 4.7 (1M context) --- README.md | 4 +++- dataretrieval/waterdata/_progress.py | 33 +++++++++++++++++++++++---- tests/waterdata_progress_test.py | 34 ++++++++++++++++++++++++++++ 3 files changed, 66 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 0843e8e2..71c65d08 100644 --- a/README.md +++ b/README.md @@ -119,7 +119,9 @@ shown when you have an API key set): Retrieving: daily · chunk 2/5 · 14 pages · 8,421 rows · 4,870 / 5,000 requests remaining ``` -The line appears automatically when `stderr` is an interactive terminal. +The line appears automatically for interactive use — an interactive terminal or +a Jupyter/IPython notebook (like `tqdm`). Set `API_USGS_PROGRESS=0` to silence +it, or `=1` to force it on elsewhere. For verbose troubleshooting and support — including the request URL sent to the API — enable debug-level diff --git a/dataretrieval/waterdata/_progress.py b/dataretrieval/waterdata/_progress.py index b8e4ede8..db2a3ff1 100644 --- a/dataretrieval/waterdata/_progress.py +++ b/dataretrieval/waterdata/_progress.py @@ -17,9 +17,9 @@ page/row/rate-limit counts) both update without knowing about each other. Call :func:`progress_context` to activate one and :func:`current` to reach it. -By default the line is shown only on an interactive terminal, so notebooks, -redirected logs, and CI stay clean. ``API_USGS_PROGRESS`` forces it on -(``1``/``true``) or off (``0``/``false``). +By default the line is shown for interactive use — an interactive terminal or a +Jupyter/IPython kernel (like ``tqdm``) — while redirected logs and CI stay clean. +``API_USGS_PROGRESS`` forces it on (``1``/``true``) or off (``0``/``false``). """ from __future__ import annotations @@ -59,11 +59,36 @@ def _group_int(value: str) -> str: _api_key_hint_shown = False +def _in_jupyter_kernel() -> bool: + """True when running inside a Jupyter/IPython *kernel* (notebook, lab, + qtconsole). + + A kernel's ``stderr`` isn't a TTY, but it honors carriage-return rewrites in + the cell output area — the same mechanism ``tqdm`` rides on — so the line is + worth showing there. The plain IPython terminal REPL is a + ``TerminalInteractiveShell`` (already a TTY), so only the ZMQ kernel needs + this extra signal. Detected without importing IPython: if it isn't already + imported, we aren't in a shell. + """ + ipython = sys.modules.get("IPython") + if ipython is None: + return False + shell = ipython.get_ipython() + return shell is not None and type(shell).__name__ == "ZMQInteractiveShell" + + def _enabled_default(stream: TextIO) -> bool: - """Whether to draw the line: ``API_USGS_PROGRESS`` wins, else TTY-only.""" + """Whether to draw the line by default. + + ``API_USGS_PROGRESS`` wins when set. Otherwise show it for interactive use — + a TTY or a Jupyter/IPython kernel — and stay quiet for redirected output, + logs, and CI. + """ override = os.getenv("API_USGS_PROGRESS") if override is not None: return override.strip().lower() not in {"", "0", "false", "no", "off"} + if _in_jupyter_kernel(): + return True return hasattr(stream, "isatty") and stream.isatty() diff --git a/tests/waterdata_progress_test.py b/tests/waterdata_progress_test.py index 36ca5b05..2f51bff8 100644 --- a/tests/waterdata_progress_test.py +++ b/tests/waterdata_progress_test.py @@ -7,6 +7,8 @@ """ import io +import sys +import types from unittest import mock import pytest @@ -215,6 +217,7 @@ def test_api_key_hint_shown_at_most_once(monkeypatch): def test_default_disabled_for_non_tty(monkeypatch): monkeypatch.delenv("API_USGS_PROGRESS", raising=False) + monkeypatch.setattr(_progress, "_in_jupyter_kernel", lambda: False) # io.StringIO.isatty() returns False. assert ProgressReporter(stream=io.StringIO()).enabled is False @@ -231,6 +234,37 @@ def test_env_var_forces_off_even_on_tty(monkeypatch): assert ProgressReporter(stream=tty).enabled is False +def _fake_ipython(shell_class_name): + """A stand-in IPython module whose get_ipython() returns a shell of the + given class name (e.g. 'ZMQInteractiveShell' for a Jupyter kernel).""" + shell = type(shell_class_name, (), {})() + return types.SimpleNamespace(get_ipython=lambda: shell) + + +def test_enabled_in_jupyter_kernel(monkeypatch): + # A Jupyter kernel's stderr isn't a TTY, but the line should still show + # (it honors \r in the cell output, like tqdm). + monkeypatch.delenv("API_USGS_PROGRESS", raising=False) + monkeypatch.setitem(sys.modules, "IPython", _fake_ipython("ZMQInteractiveShell")) + assert ProgressReporter(stream=io.StringIO()).enabled is True + + +def test_terminal_ipython_without_tty_stays_disabled(monkeypatch): + # The terminal REPL is its own TTY; the kernel signal must not force the + # line on for a non-TTY (e.g. redirected) stream. + monkeypatch.delenv("API_USGS_PROGRESS", raising=False) + monkeypatch.setitem( + sys.modules, "IPython", _fake_ipython("TerminalInteractiveShell") + ) + assert ProgressReporter(stream=io.StringIO()).enabled is False + + +def test_env_var_off_overrides_jupyter_kernel(monkeypatch): + monkeypatch.setenv("API_USGS_PROGRESS", "0") + monkeypatch.setitem(sys.modules, "IPython", _fake_ipython("ZMQInteractiveShell")) + assert ProgressReporter(stream=io.StringIO()).enabled is False + + # -- progress_context ---------------------------------------------------------- From 81b5a00b82083c4c003253e2ce7c528f4e20f090 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Sun, 24 May 2026 09:20:51 -0500 Subject: [PATCH 13/18] fix(waterdata): emit the geopandas advisory once per process, before the progress line MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously the "Geopandas not installed" warning fired inside _paginate — once per query (and once per chunk for chunked OGC requests). In a notebook that meant the warning repeated and interleaved with the self-updating status line, forcing callers to silence the logger. Move it to a process-latched _warn_geopandas_once() called at the get_ogc_data / get_stats_data entry, before the progress context opens. Now it prints at most once, cleanly above the line. Co-Authored-By: Claude Opus 4.7 (1M context) --- dataretrieval/waterdata/utils.py | 30 +++++++++++++++++++--------- tests/waterdata_utils_test.py | 34 ++++++++++++++++++++------------ 2 files changed, 42 insertions(+), 22 deletions(-) diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py index 1e747262..3088d956 100644 --- a/dataretrieval/waterdata/utils.py +++ b/dataretrieval/waterdata/utils.py @@ -47,6 +47,22 @@ STATISTICS_API_VERSION = "v0" STATISTICS_API_URL = f"{BASE_URL}/statistics/{STATISTICS_API_VERSION}" +# "Geopandas not installed" is a static, environment-level fact. Emit it at most +# once per process (and before any progress line opens) so it neither repeats +# per query/chunk nor interleaves with the status line's carriage-return rewrites. +_geopandas_warned = False + + +def _warn_geopandas_once() -> None: + """Warn (once per process) that geometries are flattened without geopandas.""" + global _geopandas_warned + if not GEOPANDAS and not _geopandas_warned: + _geopandas_warned = True + logger.warning( + "Geopandas not installed. Geometries will be flattened " + "into pandas DataFrames." + ) + def _switch_arg_id(ls: dict[str, Any], id_name: str, service: str): """ @@ -909,12 +925,6 @@ def _paginate( on subsequent pages are wrapped per above. """ logger.debug("Requesting: %s", initial_req.url) - if not geopd: - logger.warning( - "Geopandas not installed. Geometries will be flattened " - "into pandas DataFrames." - ) - reporter = _progress.current() with _session(client) as sess: resp = sess.send(initial_req) @@ -1247,6 +1257,7 @@ def get_ogc_data( convert_type = args.pop("convert_type", False) args = {k: v for k, v in args.items() if v is not None} + _warn_geopandas_once() # before the progress line, so it never interleaves with _progress.progress_context(service=service): return_list, response = _fetch_once(args) return_list = _deal_with_empty(return_list, properties, service) @@ -1321,9 +1332,9 @@ def _handle_stats_nesting( if not features: return gpd.GeoDataFrame() if geopd else pd.DataFrame() - # The geopd-missing warning is emitted once at the top of - # ``get_stats_data`` (parallel to ``_walk_pages``); doing it here - # would log per page. + # The geopd-missing warning is emitted once per process by + # ``_warn_geopandas_once`` at the getter entry; doing it here would log + # per page. if not geopd: outer_props = [ {k: v for k, v in (f.get("properties") or {}).items() if k != "data"} @@ -1494,6 +1505,7 @@ def follow_up(cursor: str, sess: requests.Session) -> requests.Response: # The stats path doesn't go through ``multi_value_chunked``, so it opens # its own progress context; ``_paginate`` reports pages/rate-limit into it. + _warn_geopandas_once() # before the progress line, so it never interleaves with _progress.progress_context(service=service): df, response = _paginate( req, diff --git a/tests/waterdata_utils_test.py b/tests/waterdata_utils_test.py index f2e28be4..a0cba917 100644 --- a/tests/waterdata_utils_test.py +++ b/tests/waterdata_utils_test.py @@ -411,25 +411,33 @@ def _warning_messages(caplog): return [r.getMessage() for r in caplog.records if r.levelno == logging.WARNING] -def test_walk_pages_warns_when_geopandas_missing(caplog): - caplog.set_level(logging.WARNING, logger=_LOGGER_NAME) - resp = _resp_ok([]) # single empty page, no "next" link - client = mock.MagicMock(spec=requests.Session) - client.send.return_value = resp +@pytest.fixture(autouse=True) +def _reset_geopandas_warned(monkeypatch): + """The geopandas advisory is latched once per process; reset it so these + warning tests don't depend on order.""" + monkeypatch.setattr(_utils_module, "_geopandas_warned", False) - req = mock.MagicMock(spec=requests.PreparedRequest) - req.method = "GET" - req.headers = {} - req.url = "https://example.com/p1" - _walk_pages(geopd=False, req=req, client=client) +def test_warn_geopandas_once_is_latched(monkeypatch, caplog): + # Static environment fact -> warn at most once per process. + monkeypatch.setattr(_utils_module, "GEOPANDAS", False) + caplog.set_level(logging.WARNING, logger=_LOGGER_NAME) + _utils_module._warn_geopandas_once() + _utils_module._warn_geopandas_once() # second call must not re-warn + geo = [m for m in _warning_messages(caplog) if "Geopandas not installed" in m] + assert len(geo) == 1 + - assert any("Geopandas not installed" in m for m in _warning_messages(caplog)) +def test_warn_geopandas_once_silent_when_installed(monkeypatch, caplog): + monkeypatch.setattr(_utils_module, "GEOPANDAS", True) + caplog.set_level(logging.WARNING, logger=_LOGGER_NAME) + _utils_module._warn_geopandas_once() + assert not [m for m in _warning_messages(caplog) if "Geopandas not installed" in m] def test_get_stats_data_warns_once_when_geopandas_missing(caplog, monkeypatch): - # The advisory must fire once per query, not once per paginated page - # (it previously lived in the per-page _handle_stats_nesting helper). + # The advisory fires once per process (via _warn_geopandas_once) before the + # progress line — not once per paginated page, and not interleaved with it. from dataretrieval.waterdata.utils import get_stats_data monkeypatch.setattr(_utils_module, "GEOPANDAS", False) From 37a6b8657b80c5aafba56237ed053d6baf49ff3f Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Sun, 24 May 2026 09:27:13 -0500 Subject: [PATCH 14/18] style(waterdata): drop spaces around the / in "remaining/limit" progress segment Co-Authored-By: Claude Opus 4.7 (1M context) --- README.md | 2 +- dataretrieval/waterdata/_progress.py | 4 ++-- tests/waterdata_progress_test.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 71c65d08..408a1807 100644 --- a/README.md +++ b/README.md @@ -116,7 +116,7 @@ quota remaining (the ``remaining / limit`` from the server's rate-limit headers, shown when you have an API key set): ```text -Retrieving: daily · chunk 2/5 · 14 pages · 8,421 rows · 4,870 / 5,000 requests remaining +Retrieving: daily · chunk 2/5 · 14 pages · 8,421 rows · 4,870/5,000 requests remaining ``` The line appears automatically for interactive use — an interactive terminal or diff --git a/dataretrieval/waterdata/_progress.py b/dataretrieval/waterdata/_progress.py index db2a3ff1..7263d555 100644 --- a/dataretrieval/waterdata/_progress.py +++ b/dataretrieval/waterdata/_progress.py @@ -6,7 +6,7 @@ (``utils._paginate``). This module surfaces that work as one line on stderr, rewritten in place as data arrives:: - Retrieving: daily · 6 pages · 2,881 rows · 995 / 1,000 requests remaining + Retrieving: daily · 6 pages · 2,881 rows · 995/1,000 requests remaining It replaces the per-page ``logger.info`` calls that previously narrated the same events one line at a time. @@ -175,7 +175,7 @@ def _format(self) -> str: remaining = _group_int(self.rate_remaining) if self.rate_limit is not None: limit = _group_int(self.rate_limit) - segment = f"{remaining} / {limit} requests remaining" + segment = f"{remaining}/{limit} requests remaining" else: segment = f"{remaining} requests remaining" parts.append(segment) diff --git a/tests/waterdata_progress_test.py b/tests/waterdata_progress_test.py index 2f51bff8..dc700a81 100644 --- a/tests/waterdata_progress_test.py +++ b/tests/waterdata_progress_test.py @@ -94,7 +94,7 @@ def test_renders_remaining_over_limit(): reporter = ProgressReporter(stream=stream, enabled=True) reporter.set_rate_remaining("952", limit="1000") reporter.add_page(rows=1) - assert "952 / 1,000 requests remaining" in stream.getvalue() + assert "952/1,000 requests remaining" in stream.getvalue() def test_no_slash_when_limit_absent(): From 5135bb514b7ca0ac65b9a2359424b4863b9c3217 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Sun, 24 May 2026 09:27:14 -0500 Subject: [PATCH 15/18] refactor(waterdata): emit the geopandas advisory once at import, not per call MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Geopandas availability is a static, environment-level fact, so warn once at module import (right where GEOPANDAS is determined) instead of inside the getters/_paginate. This removes the per-call _warn_geopandas_once() latch and guarantees the advisory prints exactly once — above any progress line — without repeating per query/chunk or interleaving with the status line. Co-Authored-By: Claude Opus 4.7 (1M context) --- dataretrieval/waterdata/utils.py | 32 +++++--------- tests/waterdata_utils_test.py | 72 +++++++------------------------- 2 files changed, 25 insertions(+), 79 deletions(-) diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py index 3088d956..34df2de7 100644 --- a/dataretrieval/waterdata/utils.py +++ b/dataretrieval/waterdata/utils.py @@ -40,6 +40,15 @@ # Set up logger for this module logger = logging.getLogger(__name__) +# Whether geopandas is present is a static, environment-level fact, so warn once +# here at import time rather than per query/chunk. That avoids the warning +# repeating on every call and avoids it interleaving with the progress line's +# carriage-return rewrites. +if not GEOPANDAS: + logger.warning( + "Geopandas not installed. Geometries will be flattened into pandas DataFrames." + ) + BASE_URL = "https://api.waterdata.usgs.gov" OGC_API_VERSION = "v0" OGC_API_URL = f"{BASE_URL}/ogcapi/{OGC_API_VERSION}" @@ -47,22 +56,6 @@ STATISTICS_API_VERSION = "v0" STATISTICS_API_URL = f"{BASE_URL}/statistics/{STATISTICS_API_VERSION}" -# "Geopandas not installed" is a static, environment-level fact. Emit it at most -# once per process (and before any progress line opens) so it neither repeats -# per query/chunk nor interleaves with the status line's carriage-return rewrites. -_geopandas_warned = False - - -def _warn_geopandas_once() -> None: - """Warn (once per process) that geometries are flattened without geopandas.""" - global _geopandas_warned - if not GEOPANDAS and not _geopandas_warned: - _geopandas_warned = True - logger.warning( - "Geopandas not installed. Geometries will be flattened " - "into pandas DataFrames." - ) - def _switch_arg_id(ls: dict[str, Any], id_name: str, service: str): """ @@ -1257,7 +1250,6 @@ def get_ogc_data( convert_type = args.pop("convert_type", False) args = {k: v for k, v in args.items() if v is not None} - _warn_geopandas_once() # before the progress line, so it never interleaves with _progress.progress_context(service=service): return_list, response = _fetch_once(args) return_list = _deal_with_empty(return_list, properties, service) @@ -1332,9 +1324,8 @@ def _handle_stats_nesting( if not features: return gpd.GeoDataFrame() if geopd else pd.DataFrame() - # The geopd-missing warning is emitted once per process by - # ``_warn_geopandas_once`` at the getter entry; doing it here would log - # per page. + # The geopd-missing warning is emitted once at import (see top of module); + # doing it here would log per page. if not geopd: outer_props = [ {k: v for k, v in (f.get("properties") or {}).items() if k != "data"} @@ -1505,7 +1496,6 @@ def follow_up(cursor: str, sess: requests.Session) -> requests.Response: # The stats path doesn't go through ``multi_value_chunked``, so it opens # its own progress context; ``_paginate`` reports pages/rate-limit into it. - _warn_geopandas_once() # before the progress line, so it never interleaves with _progress.progress_context(service=service): df, response = _paginate( req, diff --git a/tests/waterdata_utils_test.py b/tests/waterdata_utils_test.py index a0cba917..1f03772a 100644 --- a/tests/waterdata_utils_test.py +++ b/tests/waterdata_utils_test.py @@ -406,66 +406,22 @@ def test_handle_stats_nesting_tolerates_missing_drop_columns(): assert df["monitoring_location_id"].iloc[0] == "USGS-12345" -def _warning_messages(caplog): - """Pull WARNING-level message strings out of caplog.""" - return [r.getMessage() for r in caplog.records if r.levelno == logging.WARNING] - - -@pytest.fixture(autouse=True) -def _reset_geopandas_warned(monkeypatch): - """The geopandas advisory is latched once per process; reset it so these - warning tests don't depend on order.""" - monkeypatch.setattr(_utils_module, "_geopandas_warned", False) - - -def test_warn_geopandas_once_is_latched(monkeypatch, caplog): - # Static environment fact -> warn at most once per process. - monkeypatch.setattr(_utils_module, "GEOPANDAS", False) - caplog.set_level(logging.WARNING, logger=_LOGGER_NAME) - _utils_module._warn_geopandas_once() - _utils_module._warn_geopandas_once() # second call must not re-warn - geo = [m for m in _warning_messages(caplog) if "Geopandas not installed" in m] - assert len(geo) == 1 - - -def test_warn_geopandas_once_silent_when_installed(monkeypatch, caplog): - monkeypatch.setattr(_utils_module, "GEOPANDAS", True) - caplog.set_level(logging.WARNING, logger=_LOGGER_NAME) - _utils_module._warn_geopandas_once() - assert not [m for m in _warning_messages(caplog) if "Geopandas not installed" in m] - - -def test_get_stats_data_warns_once_when_geopandas_missing(caplog, monkeypatch): - # The advisory fires once per process (via _warn_geopandas_once) before the - # progress line — not once per paginated page, and not interleaved with it. - from dataretrieval.waterdata.utils import get_stats_data - - monkeypatch.setattr(_utils_module, "GEOPANDAS", False) - monkeypatch.setattr( - _utils_module, - "_handle_stats_nesting", - mock.MagicMock(return_value=pd.DataFrame()), +def test_geopandas_advisory_emitted_once_on_import(): + # The advisory is a one-time module-import side effect, not per call. Import + # the module in a subprocess with geopandas blocked and confirm exactly one + # warning reaches stderr. + import subprocess + import sys + + code = ( + "import sys; sys.modules['geopandas'] = None\n" + "import dataretrieval.waterdata.utils\n" ) - caplog.set_level(logging.WARNING, logger=_LOGGER_NAME) - - page1 = mock.MagicMock(status_code=200, headers={}) - page1.json.return_value = {"next": "tok", "features": []} - page2 = mock.MagicMock(status_code=200, headers={}) - page2.json.return_value = {"next": None, "features": []} - - client = mock.MagicMock(spec=requests.Session) - client.send.return_value = page1 - client.request.return_value = page2 - - get_stats_data( - args={"monitoring_location_id": "USGS-1"}, - service="observationNormals", - expand_percentiles=False, - client=client, + result = subprocess.run( + [sys.executable, "-c", code], capture_output=True, text=True ) - - geo = [m for m in _warning_messages(caplog) if "Geopandas not installed" in m] - assert len(geo) == 1 + assert result.returncode == 0, result.stderr + assert result.stderr.count("Geopandas not installed") == 1 def test_handle_stats_nesting_returns_empty_on_empty_features(): From 3967c8e622e6789773f8bbab0dbe6a9fec6d42cb Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Sun, 24 May 2026 09:40:48 -0500 Subject: [PATCH 16/18] refactor(waterdata): drop now-unused geopd param from _paginate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Moving the geopandas advisory to import time removed _paginate's only use of its `geopd` argument; the param was dead (callers still passed it but the body ignored it). Drop it from the signature, docstring, and both call sites (_walk_pages, get_stats_data). _walk_pages still takes geopd — it uses it to build its parse_response closure. Co-Authored-By: Claude Opus 4.7 (1M context) --- dataretrieval/waterdata/utils.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/dataretrieval/waterdata/utils.py b/dataretrieval/waterdata/utils.py index 34df2de7..dd908143 100644 --- a/dataretrieval/waterdata/utils.py +++ b/dataretrieval/waterdata/utils.py @@ -854,7 +854,6 @@ def _aggregate_paginated_response( def _paginate( initial_req: requests.PreparedRequest, *, - geopd: bool, parse_response: Callable[[requests.Response], tuple[pd.DataFrame, _Cursor | None]], follow_up: Callable[[_Cursor, requests.Session], requests.Response], client: requests.Session | None = None, @@ -873,10 +872,6 @@ def _paginate( ---------- initial_req : requests.PreparedRequest First-page request to send. - geopd : bool - Whether ``geopandas`` is available — logged once at WARNING - level when ``False`` (matches historical behavior of both - callers). parse_response : callable ``resp -> (df, next_cursor_or_None)``. Returns the page's DataFrame and the cursor (URL, token, …) used to drive @@ -1033,7 +1028,6 @@ def follow_up(cursor: str, sess: requests.Session) -> requests.Response: return _paginate( req, - geopd=geopd, parse_response=parse_response, follow_up=follow_up, client=client, @@ -1499,7 +1493,6 @@ def follow_up(cursor: str, sess: requests.Session) -> requests.Response: with _progress.progress_context(service=service): df, response = _paginate( req, - geopd=GEOPANDAS, parse_response=parse_response, follow_up=follow_up, client=client, From a13946cfdf02927f7e5c2f672eab17dba9f91fb4 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Sun, 24 May 2026 09:59:11 -0500 Subject: [PATCH 17/18] docs(waterdata): drop progress-line section from README; remove a redundant test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - README: remove the "Tracking progress" section (progress line is no longer documented there); keep the accurate DEBUG-logging troubleshooting note. - tests: drop test_no_service_falls_back_to_progress_label — its only assertion (the "Progress:" prefix with no service) is already covered by test_renders_pages_rows_and_rate_limit. Co-Authored-By: Claude Opus 4.7 (1M context) --- README.md | 14 -------------- tests/waterdata_progress_test.py | 7 ------- 2 files changed, 21 deletions(-) diff --git a/README.md b/README.md index 408a1807..a468c652 100644 --- a/README.md +++ b/README.md @@ -109,20 +109,6 @@ Visit the [API Reference](https://doi-usgs.github.io/dataretrieval-python/reference/waterdata.html) for more information and examples on available services and input parameters. -**Tracking progress:** Paginated and chunked `waterdata` queries report their -progress on a single, self-updating line on `stderr` — the service being -retrieved, the chunk and page counts, rows retrieved so far, and the hourly API -quota remaining (the ``remaining / limit`` from the server's rate-limit headers, -shown when you have an API key set): - -```text -Retrieving: daily · chunk 2/5 · 14 pages · 8,421 rows · 4,870/5,000 requests remaining -``` - -The line appears automatically for interactive use — an interactive terminal or -a Jupyter/IPython notebook (like `tqdm`). Set `API_USGS_PROGRESS=0` to silence -it, or `=1` to force it on elsewhere. - For verbose troubleshooting and support — including the request URL sent to the API — enable debug-level [logging](https://docs.python.org/3/howto/logging.html#logging-basic-tutorial): diff --git a/tests/waterdata_progress_test.py b/tests/waterdata_progress_test.py index dc700a81..14a98839 100644 --- a/tests/waterdata_progress_test.py +++ b/tests/waterdata_progress_test.py @@ -114,13 +114,6 @@ def test_service_label_leads_the_line(): assert stream.getvalue().lstrip("\r").startswith("Retrieving: daily · ") -def test_no_service_falls_back_to_progress_label(): - stream = io.StringIO() - reporter = ProgressReporter(stream=stream, enabled=True) - reporter.add_page(rows=5) - assert stream.getvalue().lstrip("\r").startswith("Progress: ") - - def test_close_terminates_active_line_with_newline(): stream = io.StringIO() reporter = ProgressReporter(stream=stream, enabled=True) From 5f75715e597ede4e1f404358074523350ec9e7a7 Mon Sep 17 00:00:00 2001 From: thodson-usgs Date: Sun, 24 May 2026 10:14:46 -0500 Subject: [PATCH 18/18] test(waterdata): remove test_geopandas_advisory_emitted_once_on_import Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/waterdata_utils_test.py | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/tests/waterdata_utils_test.py b/tests/waterdata_utils_test.py index 1f03772a..c135115c 100644 --- a/tests/waterdata_utils_test.py +++ b/tests/waterdata_utils_test.py @@ -406,24 +406,6 @@ def test_handle_stats_nesting_tolerates_missing_drop_columns(): assert df["monitoring_location_id"].iloc[0] == "USGS-12345" -def test_geopandas_advisory_emitted_once_on_import(): - # The advisory is a one-time module-import side effect, not per call. Import - # the module in a subprocess with geopandas blocked and confirm exactly one - # warning reaches stderr. - import subprocess - import sys - - code = ( - "import sys; sys.modules['geopandas'] = None\n" - "import dataretrieval.waterdata.utils\n" - ) - result = subprocess.run( - [sys.executable, "-c", code], capture_output=True, text=True - ) - assert result.returncode == 0, result.stderr - assert result.stderr.count("Geopandas not installed") == 1 - - def test_handle_stats_nesting_returns_empty_on_empty_features(): """A mid-pagination empty page ({\"features\": [], \"next\": }) must not crash the downstream merge with