Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
4ee369f
feat: add scheduled snapshots with retention cleanup
sjmiller609 Mar 9, 2026
c16caac
fix: aggregate scheduled snapshot runner errors
sjmiller609 Mar 9, 2026
c13fcdc
perf: scan schedule index and reduce lock contention
sjmiller609 Mar 9, 2026
51c97f3
fix: validate scheduled snapshot name prefix length
sjmiller609 Mar 9, 2026
12c7c55
refactor: move scheduled snapshot domain logic to lib package
sjmiller609 Mar 9, 2026
b8cca6c
fix: ignore missing snapshots during retention cleanup
sjmiller609 Mar 9, 2026
d172b0a
chore: remove dead wrapper and align retention schema
sjmiller609 Mar 9, 2026
e8a4a3d
fix: tighten schedule run due-check under write lock
sjmiller609 Mar 9, 2026
666e14c
feat: auto-select scheduled snapshot kind and retain post-delete sche…
sjmiller609 Mar 9, 2026
84bd548
fix: return 404 when scheduling a missing instance
sjmiller609 Mar 9, 2026
0430e6c
fix: retire converged deleted-instance schedules
sjmiller609 Mar 9, 2026
77e55a1
chore: remove test-only snapshot schedule aliases
sjmiller609 Mar 9, 2026
6aa101f
fix: preserve schedule runtime history on updates
sjmiller609 Mar 9, 2026
c346a1f
fix: tighten schedule retention request handling
sjmiller609 Mar 9, 2026
84ef7c8
fix: honor cancellation in schedule runner loop
sjmiller609 Mar 9, 2026
1108cd8
fix: preserve explicit zero retention settings
sjmiller609 Mar 9, 2026
b9013e7
Merge origin/main into codex/snapshot-schedule-retention
sjmiller609 Mar 10, 2026
6db9b1c
fix: evaluate schedule timing per-instance
sjmiller609 Mar 10, 2026
c94c292
fix: make schedule interval step arithmetic explicit
sjmiller609 Mar 10, 2026
1104580
Merge origin/main into codex/snapshot-schedule-retention
sjmiller609 Mar 10, 2026
670413c
simplify snapshot scheduling: extract error helper, inline thin wrapp…
sjmiller609 Mar 19, 2026
7e59da6
deslop: remove redundant nil guards and unnecessary make
sjmiller609 Mar 19, 2026
59dd46d
Merge remote-tracking branch 'origin/main' into codex/snapshot-schedu…
sjmiller609 Mar 19, 2026
3f2e442
add snapshot-schedule routes to scope mappings
sjmiller609 Mar 19, 2026
208b092
fix gofmt alignment in scope mappings
sjmiller609 Mar 19, 2026
970212b
Merge origin/main into codex/snapshot-schedule-retention
sjmiller609 Mar 24, 2026
2097524
Harden snapshot schedule persistence and timing
sjmiller609 Mar 24, 2026
dac8b77
Fix manager formatting for CI
sjmiller609 Mar 24, 2026
b3110f7
Stabilize scheduled snapshot cleanup tests
sjmiller609 Mar 24, 2026
52e78bf
Add snapshot schedule routes to Stainless config
sjmiller609 Mar 24, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
146 changes: 146 additions & 0 deletions cmd/api/api/snapshots.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package api
import (
"context"
"errors"
"time"

"github.com/kernel/hypeman/lib/hypervisor"
"github.com/kernel/hypeman/lib/instances"
Expand Down Expand Up @@ -203,6 +204,124 @@ func (s *ApiService) ForkSnapshot(ctx context.Context, request oapi.ForkSnapshot
return oapi.ForkSnapshot201JSONResponse(instanceToOAPI(*result)), nil
}

// GetInstanceSnapshotSchedule gets a snapshot schedule for the resolved instance.
func (s *ApiService) GetInstanceSnapshotSchedule(ctx context.Context, request oapi.GetInstanceSnapshotScheduleRequestObject) (oapi.GetInstanceSnapshotScheduleResponseObject, error) {
inst := mw.GetResolvedInstance[instances.Instance](ctx)
if inst == nil {
return oapi.GetInstanceSnapshotSchedule500JSONResponse{Code: "internal_error", Message: "resource not resolved"}, nil
}

scheduleManager, ok := s.InstanceManager.(instances.SnapshotScheduleManager)
if !ok {
return oapi.GetInstanceSnapshotSchedule500JSONResponse{Code: "internal_error", Message: "snapshot scheduling is not available"}, nil
}

schedule, err := scheduleManager.GetSnapshotSchedule(ctx, inst.Id)
if err != nil {
log := logger.FromContext(ctx)
switch {
case errors.Is(err, instances.ErrSnapshotScheduleNotFound):
return oapi.GetInstanceSnapshotSchedule404JSONResponse{Code: "not_found", Message: "snapshot schedule not found"}, nil
case errors.Is(err, instances.ErrNotFound):
return oapi.GetInstanceSnapshotSchedule404JSONResponse{Code: "not_found", Message: "instance not found"}, nil
default:
log.ErrorContext(ctx, "failed to get snapshot schedule", "error", err)
return oapi.GetInstanceSnapshotSchedule500JSONResponse{Code: "internal_error", Message: "failed to get snapshot schedule"}, nil
}
}

return oapi.GetInstanceSnapshotSchedule200JSONResponse(snapshotScheduleToOAPI(*schedule)), nil
}

// SetInstanceSnapshotSchedule creates or updates a snapshot schedule for the resolved instance.
func (s *ApiService) SetInstanceSnapshotSchedule(ctx context.Context, request oapi.SetInstanceSnapshotScheduleRequestObject) (oapi.SetInstanceSnapshotScheduleResponseObject, error) {
inst := mw.GetResolvedInstance[instances.Instance](ctx)
if inst == nil {
return oapi.SetInstanceSnapshotSchedule500JSONResponse{Code: "internal_error", Message: "resource not resolved"}, nil
}
if request.Body == nil {
return oapi.SetInstanceSnapshotSchedule400JSONResponse{Code: "invalid_request", Message: "request body is required"}, nil
}

scheduleManager, ok := s.InstanceManager.(instances.SnapshotScheduleManager)
if !ok {
return oapi.SetInstanceSnapshotSchedule500JSONResponse{Code: "internal_error", Message: "snapshot scheduling is not available"}, nil
}

interval, err := time.ParseDuration(request.Body.Interval)
if err != nil {
return oapi.SetInstanceSnapshotSchedule400JSONResponse{Code: "invalid_request", Message: "interval must be a valid duration"}, nil
}
if request.Body.Retention.MaxCount == nil && request.Body.Retention.MaxAge == nil {
return oapi.SetInstanceSnapshotSchedule400JSONResponse{Code: "invalid_request", Message: "retention must include max_count or max_age"}, nil
}

retention := instances.SnapshotScheduleRetention{}
if request.Body.Retention.MaxCount != nil {
retention.MaxCount = *request.Body.Retention.MaxCount
}
if request.Body.Retention.MaxAge != nil {
maxAge, parseErr := time.ParseDuration(*request.Body.Retention.MaxAge)
if parseErr != nil {
return oapi.SetInstanceSnapshotSchedule400JSONResponse{Code: "invalid_request", Message: "retention.max_age must be a valid duration"}, nil
}
retention.MaxAge = maxAge
}
req := instances.SetSnapshotScheduleRequest{
Interval: interval,
Metadata: toMapTags(request.Body.Metadata),
Retention: retention,
}
if request.Body.NamePrefix != nil {
req.NamePrefix = *request.Body.NamePrefix
}

schedule, err := scheduleManager.SetSnapshotSchedule(ctx, inst.Id, req)
if err != nil {
log := logger.FromContext(ctx)
switch {
case errors.Is(err, instances.ErrInvalidRequest):
return oapi.SetInstanceSnapshotSchedule400JSONResponse{Code: "invalid_request", Message: err.Error()}, nil
case errors.Is(err, instances.ErrNotFound):
return oapi.SetInstanceSnapshotSchedule404JSONResponse{Code: "not_found", Message: "instance not found"}, nil
default:
log.ErrorContext(ctx, "failed to set snapshot schedule", "error", err)
return oapi.SetInstanceSnapshotSchedule500JSONResponse{Code: "internal_error", Message: "failed to set snapshot schedule"}, nil
}
}

return oapi.SetInstanceSnapshotSchedule200JSONResponse(snapshotScheduleToOAPI(*schedule)), nil
}

// DeleteInstanceSnapshotSchedule deletes a snapshot schedule for the resolved instance.
func (s *ApiService) DeleteInstanceSnapshotSchedule(ctx context.Context, request oapi.DeleteInstanceSnapshotScheduleRequestObject) (oapi.DeleteInstanceSnapshotScheduleResponseObject, error) {
inst := mw.GetResolvedInstance[instances.Instance](ctx)
if inst == nil {
return oapi.DeleteInstanceSnapshotSchedule500JSONResponse{Code: "internal_error", Message: "resource not resolved"}, nil
}

scheduleManager, ok := s.InstanceManager.(instances.SnapshotScheduleManager)
if !ok {
return oapi.DeleteInstanceSnapshotSchedule500JSONResponse{Code: "internal_error", Message: "snapshot scheduling is not available"}, nil
}

err := scheduleManager.DeleteSnapshotSchedule(ctx, inst.Id)
if err != nil {
log := logger.FromContext(ctx)
switch {
case errors.Is(err, instances.ErrSnapshotScheduleNotFound):
return oapi.DeleteInstanceSnapshotSchedule404JSONResponse{Code: "not_found", Message: "snapshot schedule not found"}, nil
case errors.Is(err, instances.ErrNotFound):
return oapi.DeleteInstanceSnapshotSchedule404JSONResponse{Code: "not_found", Message: "instance not found"}, nil
default:
log.ErrorContext(ctx, "failed to delete snapshot schedule", "error", err)
return oapi.DeleteInstanceSnapshotSchedule500JSONResponse{Code: "internal_error", Message: "failed to delete snapshot schedule"}, nil
}
}

return oapi.DeleteInstanceSnapshotSchedule204Response{}, nil
}

func snapshotToOAPI(snapshot instances.Snapshot) oapi.Snapshot {
kind := oapi.SnapshotKind(snapshot.Kind)
sourceHypervisor := oapi.SnapshotSourceHypervisor(snapshot.SourceHypervisor)
Expand Down Expand Up @@ -239,3 +358,30 @@ func snapshotToOAPI(snapshot instances.Snapshot) oapi.Snapshot {
}
return out
}

func snapshotScheduleToOAPI(schedule instances.SnapshotSchedule) oapi.SnapshotSchedule {
retention := oapi.SnapshotScheduleRetention{
MaxCount: lo.ToPtr(schedule.Retention.MaxCount),
}
if schedule.Retention.MaxAge > 0 {
maxAge := schedule.Retention.MaxAge.String()
retention.MaxAge = &maxAge
}

out := oapi.SnapshotSchedule{
InstanceId: schedule.InstanceID,
Interval: schedule.Interval.String(),
Metadata: toOAPITags(schedule.Metadata),
Retention: retention,
NextRunAt: schedule.NextRunAt,
CreatedAt: schedule.CreatedAt,
UpdatedAt: schedule.UpdatedAt,
}
if schedule.NamePrefix != "" {
out.NamePrefix = lo.ToPtr(schedule.NamePrefix)
}
out.LastRunAt = schedule.LastRunAt
out.LastSnapshotId = schedule.LastSnapshotID
out.LastError = schedule.LastError
return out
}
32 changes: 32 additions & 0 deletions cmd/api/api/snapshots_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
package api

import (
"testing"
"time"

"github.com/kernel/hypeman/lib/instances"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)

func TestSnapshotScheduleToOAPIPreservesZeroMaxCount(t *testing.T) {
t.Parallel()

schedule := instances.SnapshotSchedule{
InstanceID: "inst-1",
Interval: time.Hour,
Retention: instances.SnapshotScheduleRetention{
MaxCount: 0,
MaxAge: 24 * time.Hour,
},
NextRunAt: time.Now().UTC().Add(time.Hour),
CreatedAt: time.Now().UTC(),
UpdatedAt: time.Now().UTC(),
}

out := snapshotScheduleToOAPI(schedule)
require.NotNil(t, out.Retention.MaxCount)
assert.Equal(t, 0, *out.Retention.MaxCount)
require.NotNil(t, out.Retention.MaxAge)
assert.Equal(t, "24h0m0s", *out.Retention.MaxAge)
}
23 changes: 23 additions & 0 deletions cmd/api/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -510,6 +510,29 @@ func run() error {
}
})

// Snapshot schedule scheduler
if scheduleManager, ok := app.InstanceManager.(instances.SnapshotScheduleManager); ok {
const snapshotSchedulePollInterval = time.Minute
grp.Go(func() error {
ticker := time.NewTicker(snapshotSchedulePollInterval)
defer ticker.Stop()

logger.Info("snapshot schedule scheduler started", "interval", snapshotSchedulePollInterval)
for {
select {
case <-gctx.Done():
return nil
case <-ticker.C:
if err := scheduleManager.RunSnapshotSchedules(gctx); err != nil {
logger.Error("snapshot schedule run completed with errors", "error", err)
}
}
}
})
} else {
logger.Warn("snapshot schedule manager unavailable; scheduled snapshots disabled")
}

err = grp.Wait()
slog.Info("all goroutines finished")
return err
Expand Down
22 changes: 22 additions & 0 deletions lib/instances/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,28 @@ Any State → Stopped
- Don't prefault pages (lazy loading)
- Parallel with TAP device setup

## Scheduled Snapshot Behavior

- Schedules are configured per instance and persisted in the server data store (outside snapshot payloads).
- A background scheduler evaluates due schedules every minute.
- Each due run chooses snapshot behavior from current source state:
- `Running`/`Standby` sources use `Standby` snapshots.
- `Stopped` sources use `Stopped` snapshots.
- `Standby` runs from `Running` sources perform a brief pause/resume cycle during capture.
- The minimum interval is `1m`, but larger intervals are recommended for heavier or latency-sensitive workloads because running captures pause/resume the guest.
- Scheduled snapshot `name_prefix` is optional and capped at 47 chars so generated names stay within the 63-char snapshot name limit.
- New schedules establish cadence at `now + interval + deterministic jitter` derived from the instance ID.
- Updating only retention, metadata, or `name_prefix` preserves `next_run_at`; changing `interval` establishes a new cadence.
- Schedule runs advance to the next future interval (no backfill flood after downtime).
- Each schedule stores operational status:
- `next_run_at`
- `last_run_at`
- `last_snapshot_id`
- `last_error`
- Retention cleanup runs after successful scheduled snapshot creation and only affects scheduled snapshots for that instance.
- If an instance is deleted, its schedule is retained so retention can continue cleaning existing scheduled snapshots.
- Once the deleted instance has no scheduled snapshots left, the scheduler removes that schedule automatically.

## Reference Handling

Instances use OCI image references directly:
Expand Down
3 changes: 3 additions & 0 deletions lib/instances/errors.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,4 +29,7 @@ var (

// ErrSnapshotNotFound is returned when a snapshot is not found.
ErrSnapshotNotFound = errors.New("snapshot not found")

// ErrSnapshotScheduleNotFound is returned when a snapshot schedule is not found.
ErrSnapshotScheduleNotFound = errors.New("snapshot schedule not found")
)
5 changes: 5 additions & 0 deletions lib/instances/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package instances
import (
"context"
"fmt"
"os"
"sync"
"time"

Expand Down Expand Up @@ -88,6 +89,8 @@ type manager struct {
meter metric.Meter
tracer trace.Tracer
now func() time.Time
writeFile func(string, []byte, os.FileMode) error
deleteSnapshotFn func(context.Context, string) error
egressProxy *egressproxy.Service
egressProxyServiceOptions egressproxy.ServiceOptions
egressProxyMu sync.Mutex
Expand Down Expand Up @@ -141,13 +144,15 @@ func NewManager(p *paths.Paths, imageManager images.Manager, systemManager syste
vmStarters: vmStarters,
defaultHypervisor: defaultHypervisor,
now: time.Now,
writeFile: os.WriteFile,
meter: meter,
tracer: tracer,
guestMemoryPolicy: policy,
snapshotDefaults: snapshotDefaults,
compressionJobs: make(map[string]*compressionJob),
nativeCodecPaths: make(map[string]string),
}
m.deleteSnapshotFn = m.deleteSnapshot

// Initialize metrics if meter is provided
if meter != nil {
Expand Down
Loading
Loading