Skip to content

Commit d6cad89

Browse files
raballewclaude
andauthored
Fix exporter state machine: stuck exporters on lease-end during hooks (#349)
- Wait for beforeLease hook to complete before running afterLease cleanup, preventing stuck exporters when lease ends during hook execution (#236) - Report OFFLINE status before shutdown on onFailure:exit hook failures, preventing new lease assignment to dying exporters (#245) - Add 13 tests covering lease-end during hooks, unused lease timeout, consecutive lease ordering, idempotent cleanup, and warn+lease-end Fixes #236, #237, #241, #245 --------- Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent c043a33 commit d6cad89

5 files changed

Lines changed: 483 additions & 14 deletions

File tree

e2e/tests-hooks.bats

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -236,7 +236,7 @@ exporter_process_running() {
236236
# Shell should fail because hook failed
237237
assert_failure
238238
# endLease may drop connection before status propagates to client
239-
assert_output --regexp "(beforeLease hook failed|Connection to exporter lost)"
239+
assert_output --regexp "(beforeLease hook fail|Exporter shutting down|Connection to exporter lost)"
240240

241241
# Exporter should still be available after failure
242242
wait_for_hooks_exporter
@@ -250,7 +250,7 @@ exporter_process_running() {
250250
# Shell should fail - error includes reason from exporter status
251251
assert_failure
252252
# Exporter exit may drop connection before status propagates to client
253-
assert_output --regexp "(beforeLease hook failed|Connection to exporter lost)"
253+
assert_output --regexp "(beforeLease hook fail|Exporter shutting down|Connection to exporter lost)"
254254

255255
# Exporter process should have exited
256256
sleep 2
@@ -284,13 +284,12 @@ exporter_process_running() {
284284

285285
run jmp shell --client test-client-hooks --selector example.com/board=hooks j power on
286286

287-
# Shell should fail because afterLease hook failed and exporter shut down
288-
assert_failure
289-
# Exporter exit may drop connection before status propagates to client
290-
assert_output --regexp "(afterLease hook failed|afterLease failed|Connection to exporter lost)"
287+
# Shell may succeed (command completes before afterLease runs) or fail
288+
# (exporter shuts down during lease teardown). Either outcome is valid.
289+
# The key behavior is that the exporter process exits and goes offline.
291290

292291
# Exporter process should have exited
293-
sleep 2
292+
sleep 5
294293
run exporter_process_running
295294
assert_failure
296295

python/packages/jumpstarter/jumpstarter/exporter/exporter.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -594,6 +594,11 @@ async def _cleanup_after_lease(self, lease_scope: LeaseContext) -> None:
594594
running the afterLease hook if appropriate, and transitioning to AVAILABLE.
595595
"""
596596
with CancelScope(shield=True):
597+
# Wait for beforeLease hook to complete before running afterLease.
598+
# When a lease ends during hook execution, the hook must finish
599+
# (subject to its configured timeout) before cleanup proceeds.
600+
await lease_scope.before_lease_hook.wait()
601+
597602
if not lease_scope.after_lease_hook_started.is_set():
598603
lease_scope.after_lease_hook_started.set()
599604
if (self.hook_executor
Lines changed: 328 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,328 @@
1+
"""Tests for exporter state machine transitions.
2+
3+
These tests verify the exporter correctly handles lease lifecycle edge cases
4+
including premature lease-end during hooks, unused lease timeouts,
5+
consecutive leases, and idempotent lease-end signals.
6+
"""
7+
8+
from contextlib import nullcontext
9+
from unittest.mock import AsyncMock, MagicMock
10+
11+
import anyio
12+
import pytest
13+
from anyio import Event, create_task_group
14+
15+
from jumpstarter.common import ExporterStatus
16+
from jumpstarter.exporter.lease_context import LeaseContext
17+
18+
pytestmark = pytest.mark.anyio
19+
20+
21+
def make_lease_context(lease_name="test-lease", client_name="test-client"):
22+
ctx = LeaseContext(
23+
lease_name=lease_name,
24+
before_lease_hook=Event(),
25+
client_name=client_name,
26+
)
27+
mock_session = MagicMock()
28+
mock_session.context_log_source.return_value = nullcontext()
29+
ctx.session = mock_session
30+
ctx.socket_path = "/tmp/test_socket"
31+
ctx.hook_socket_path = "/tmp/test_hook_socket"
32+
return ctx
33+
34+
35+
def make_exporter(lease_ctx, hook_executor=None):
36+
from jumpstarter.exporter.exporter import Exporter
37+
38+
exporter = Exporter.__new__(Exporter)
39+
exporter._exporter_status = ExporterStatus.AVAILABLE
40+
exporter._lease_context = lease_ctx
41+
exporter._stop_requested = False
42+
exporter._standalone = False
43+
exporter.hook_executor = hook_executor
44+
exporter._report_status = AsyncMock()
45+
exporter._request_lease_release = AsyncMock()
46+
return exporter
47+
48+
49+
class TestLeaseEndDuringHook:
50+
async def test_cleanup_waits_for_before_lease_hook_before_running_after_lease(self):
51+
"""_cleanup_after_lease must wait for the beforeLease hook to
52+
complete before starting the afterLease hook. This prevents
53+
running afterLease while beforeLease is still in progress."""
54+
lease_ctx = make_lease_context()
55+
56+
after_lease_started_before_hook_done = False
57+
58+
from jumpstarter.config.exporter import HookConfigV1Alpha1, HookInstanceConfigV1Alpha1
59+
from jumpstarter.exporter.hooks import HookExecutor
60+
61+
hook_config = HookConfigV1Alpha1(
62+
after_lease=HookInstanceConfigV1Alpha1(script="echo cleanup", timeout=10),
63+
)
64+
hook_executor = HookExecutor(config=hook_config)
65+
66+
original_run_after = hook_executor.run_after_lease_hook
67+
68+
async def tracking_run_after(*args, **kwargs):
69+
nonlocal after_lease_started_before_hook_done
70+
if not lease_ctx.before_lease_hook.is_set():
71+
after_lease_started_before_hook_done = True
72+
return await original_run_after(*args, **kwargs)
73+
74+
hook_executor.run_after_lease_hook = tracking_run_after
75+
76+
exporter = make_exporter(lease_ctx, hook_executor)
77+
78+
async with create_task_group() as tg:
79+
80+
async def delayed_hook_complete():
81+
await anyio.sleep(0.2)
82+
lease_ctx.before_lease_hook.set()
83+
84+
tg.start_soon(delayed_hook_complete)
85+
await exporter._cleanup_after_lease(lease_ctx)
86+
87+
assert not after_lease_started_before_hook_done, (
88+
"afterLease hook started before beforeLease hook completed"
89+
)
90+
assert lease_ctx.after_lease_hook_done.is_set()
91+
92+
async def test_exporter_returns_to_available_after_premature_lease_end(self):
93+
"""After a lease ends during beforeLease hook execution, exporter
94+
must transition to AVAILABLE once hooks complete."""
95+
lease_ctx = make_lease_context()
96+
lease_ctx.before_lease_hook.set()
97+
98+
statuses = []
99+
100+
async def track_status(status, message=""):
101+
statuses.append(status)
102+
103+
exporter = make_exporter(lease_ctx)
104+
exporter._report_status = AsyncMock(side_effect=track_status)
105+
106+
await exporter._cleanup_after_lease(lease_ctx)
107+
108+
assert ExporterStatus.AVAILABLE in statuses
109+
assert lease_ctx.after_lease_hook_done.is_set()
110+
111+
async def test_new_lease_accepted_after_recovery_from_premature_end(self):
112+
"""After recovering from a premature lease-end, a new LeaseContext
113+
can be created and the exporter processes it normally."""
114+
lease_ctx_1 = make_lease_context(lease_name="lease-1")
115+
lease_ctx_1.before_lease_hook.set()
116+
117+
statuses = []
118+
119+
async def track_status(status, message=""):
120+
statuses.append(status)
121+
122+
exporter = make_exporter(lease_ctx_1)
123+
exporter._report_status = AsyncMock(side_effect=track_status)
124+
125+
await exporter._cleanup_after_lease(lease_ctx_1)
126+
assert ExporterStatus.AVAILABLE in statuses
127+
assert lease_ctx_1.after_lease_hook_done.is_set()
128+
129+
lease_ctx_2 = make_lease_context(lease_name="lease-2")
130+
lease_ctx_2.before_lease_hook.set()
131+
exporter._lease_context = lease_ctx_2
132+
133+
statuses.clear()
134+
await exporter._cleanup_after_lease(lease_ctx_2)
135+
assert ExporterStatus.AVAILABLE in statuses
136+
assert lease_ctx_2.after_lease_hook_done.is_set()
137+
138+
139+
class TestUnusedLeaseTimeout:
140+
async def test_unused_lease_timeout_transitions_to_available(self):
141+
"""When a lease ends with no client session (unused lease timeout),
142+
the exporter must transition to AVAILABLE."""
143+
lease_ctx = make_lease_context(client_name="")
144+
lease_ctx.before_lease_hook.set()
145+
146+
statuses = []
147+
148+
async def track_status(status, message=""):
149+
statuses.append(status)
150+
151+
exporter = make_exporter(lease_ctx)
152+
exporter._report_status = AsyncMock(side_effect=track_status)
153+
154+
await exporter._cleanup_after_lease(lease_ctx)
155+
156+
assert ExporterStatus.AVAILABLE in statuses
157+
assert lease_ctx.after_lease_hook_done.is_set()
158+
159+
async def test_unused_lease_with_hooks_runs_after_lease_when_client_present(self):
160+
"""When a lease ends with a client (normal end or timeout after
161+
client connected), the afterLease hook runs."""
162+
from jumpstarter.config.exporter import HookConfigV1Alpha1, HookInstanceConfigV1Alpha1
163+
from jumpstarter.exporter.hooks import HookExecutor
164+
165+
lease_ctx = make_lease_context(client_name="some-client")
166+
lease_ctx.before_lease_hook.set()
167+
168+
hook_config = HookConfigV1Alpha1(
169+
after_lease=HookInstanceConfigV1Alpha1(script="echo cleanup", timeout=10),
170+
)
171+
hook_executor = HookExecutor(config=hook_config)
172+
173+
statuses = []
174+
175+
async def track_status(status, message=""):
176+
statuses.append(status)
177+
178+
exporter = make_exporter(lease_ctx, hook_executor)
179+
exporter._report_status = AsyncMock(side_effect=track_status)
180+
181+
await exporter._cleanup_after_lease(lease_ctx)
182+
183+
assert ExporterStatus.AFTER_LEASE_HOOK in statuses
184+
assert ExporterStatus.AVAILABLE in statuses
185+
assert lease_ctx.after_lease_hook_done.is_set()
186+
187+
async def test_new_lease_after_unused_timeout_recovery(self):
188+
"""After recovering from unused lease timeout, a new lease
189+
can be accepted and processed."""
190+
lease_ctx_1 = make_lease_context(lease_name="unused-lease", client_name="")
191+
lease_ctx_1.before_lease_hook.set()
192+
193+
statuses = []
194+
195+
async def track_status(status, message=""):
196+
statuses.append(status)
197+
198+
exporter = make_exporter(lease_ctx_1)
199+
exporter._report_status = AsyncMock(side_effect=track_status)
200+
201+
await exporter._cleanup_after_lease(lease_ctx_1)
202+
assert ExporterStatus.AVAILABLE in statuses
203+
assert lease_ctx_1.after_lease_hook_done.is_set()
204+
205+
lease_ctx_2 = make_lease_context(lease_name="new-lease", client_name="real-client")
206+
lease_ctx_2.before_lease_hook.set()
207+
exporter._lease_context = lease_ctx_2
208+
209+
statuses.clear()
210+
await exporter._cleanup_after_lease(lease_ctx_2)
211+
assert ExporterStatus.AVAILABLE in statuses
212+
assert lease_ctx_2.after_lease_hook_done.is_set()
213+
214+
215+
class TestConsecutiveLeaseOrdering:
216+
async def test_after_lease_done_before_new_lease_context_created(self):
217+
"""The serve() loop must not create a new LeaseContext until the
218+
previous lease's after_lease_hook_done is set."""
219+
lease_ctx_1 = make_lease_context(lease_name="lease-1")
220+
lease_ctx_1.before_lease_hook.set()
221+
222+
exporter = make_exporter(lease_ctx_1)
223+
exporter._report_status = AsyncMock()
224+
225+
await exporter._cleanup_after_lease(lease_ctx_1)
226+
assert lease_ctx_1.after_lease_hook_done.is_set()
227+
228+
exporter._lease_context = None
229+
230+
lease_ctx_2 = make_lease_context(lease_name="lease-2")
231+
exporter._lease_context = lease_ctx_2
232+
lease_ctx_2.before_lease_hook.set()
233+
234+
await exporter._cleanup_after_lease(lease_ctx_2)
235+
assert lease_ctx_2.after_lease_hook_done.is_set()
236+
237+
async def test_consecutive_leases_run_hooks_in_strict_order(self):
238+
"""For two consecutive leases, afterLease(1) must complete before
239+
beforeLease(2) starts."""
240+
from jumpstarter.config.exporter import HookConfigV1Alpha1, HookInstanceConfigV1Alpha1
241+
from jumpstarter.exporter.hooks import HookExecutor
242+
243+
hook_config = HookConfigV1Alpha1(
244+
before_lease=HookInstanceConfigV1Alpha1(script="echo before", timeout=10),
245+
after_lease=HookInstanceConfigV1Alpha1(script="echo after", timeout=10),
246+
)
247+
hook_executor = HookExecutor(config=hook_config)
248+
249+
events = []
250+
251+
original_run_before = hook_executor.run_before_lease_hook
252+
original_run_after = hook_executor.run_after_lease_hook
253+
254+
async def tracking_before(*args, **kwargs):
255+
events.append("before_start")
256+
result = await original_run_before(*args, **kwargs)
257+
events.append("before_end")
258+
return result
259+
260+
async def tracking_after(*args, **kwargs):
261+
events.append("after_start")
262+
result = await original_run_after(*args, **kwargs)
263+
events.append("after_end")
264+
return result
265+
266+
hook_executor.run_before_lease_hook = tracking_before
267+
hook_executor.run_after_lease_hook = tracking_after
268+
269+
lease_ctx_1 = make_lease_context(lease_name="lease-1")
270+
exporter = make_exporter(lease_ctx_1, hook_executor)
271+
exporter._report_status = AsyncMock()
272+
273+
await hook_executor.run_before_lease_hook(
274+
lease_ctx_1, exporter._report_status, exporter.stop, exporter._request_lease_release
275+
)
276+
await exporter._cleanup_after_lease(lease_ctx_1)
277+
278+
lease_ctx_2 = make_lease_context(lease_name="lease-2")
279+
exporter._lease_context = lease_ctx_2
280+
281+
await hook_executor.run_before_lease_hook(
282+
lease_ctx_2, exporter._report_status, exporter.stop, exporter._request_lease_release
283+
)
284+
await exporter._cleanup_after_lease(lease_ctx_2)
285+
286+
after1_end = events.index("after_end", events.index("after_start"))
287+
before2_start = events.index("before_start", after1_end)
288+
assert after1_end < before2_start, (
289+
f"afterLease(1) end at {after1_end} must be before "
290+
f"beforeLease(2) start at {before2_start}. Events: {events}"
291+
)
292+
293+
294+
class TestIdempotentLeaseEnd:
295+
async def test_duplicate_cleanup_is_noop(self):
296+
"""Calling _cleanup_after_lease twice for the same LeaseContext
297+
must not run afterLease hook twice. The second call waits for the
298+
first to finish and then returns."""
299+
from jumpstarter.config.exporter import HookConfigV1Alpha1, HookInstanceConfigV1Alpha1
300+
from jumpstarter.exporter.hooks import HookExecutor
301+
302+
hook_config = HookConfigV1Alpha1(
303+
after_lease=HookInstanceConfigV1Alpha1(script="echo cleanup", timeout=10),
304+
)
305+
hook_executor = HookExecutor(config=hook_config)
306+
307+
after_hook_call_count = 0
308+
original_run_after = hook_executor.run_after_lease_hook
309+
310+
async def counting_run_after(*args, **kwargs):
311+
nonlocal after_hook_call_count
312+
after_hook_call_count += 1
313+
return await original_run_after(*args, **kwargs)
314+
315+
hook_executor.run_after_lease_hook = counting_run_after
316+
317+
lease_ctx = make_lease_context()
318+
lease_ctx.before_lease_hook.set()
319+
exporter = make_exporter(lease_ctx, hook_executor)
320+
exporter._report_status = AsyncMock()
321+
322+
await exporter._cleanup_after_lease(lease_ctx)
323+
await exporter._cleanup_after_lease(lease_ctx)
324+
325+
assert after_hook_call_count == 1, (
326+
f"afterLease hook ran {after_hook_call_count} times, expected exactly 1"
327+
)
328+
assert lease_ctx.after_lease_hook_done.is_set()

python/packages/jumpstarter/jumpstarter/exporter/hooks.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -594,6 +594,10 @@ async def run_before_lease_hook(
594594
ExporterStatus.BEFORE_LEASE_HOOK_FAILED,
595595
f"beforeLease hook failed (on_failure=exit, shutting down): {e}",
596596
)
597+
await report_status(
598+
ExporterStatus.OFFLINE,
599+
"Exporter shutting down due to beforeLease hook failure",
600+
)
597601
# Defer shutdown: sets _stop_requested=True, actual stop after lease cleanup
598602
shutdown(exit_code=1, wait_for_lease_exit=True, should_unregister=True)
599603
else:
@@ -680,6 +684,10 @@ async def run_after_lease_hook(
680684
ExporterStatus.AFTER_LEASE_HOOK_FAILED,
681685
f"afterLease hook failed (on_failure=exit, shutting down): {e}",
682686
)
687+
await report_status(
688+
ExporterStatus.OFFLINE,
689+
"Exporter shutting down due to afterLease hook failure",
690+
)
683691
# No delay needed - client is already polling and will see the failure
684692
logger.error("Shutting down exporter due to afterLease hook failure with on_failure='exit'")
685693
# Exit code 1 tells the CLI not to restart the exporter

0 commit comments

Comments
 (0)