Add trainer container

Linked-Liszt · Linked-Liszt · commit f5ed98c0f1a6 · 2026-01-14T17:45:26.000-06:00
diff --git a/.gitignore b/.gitignore
@@ -13,6 +13,9 @@ og
 /src/trainer/outputs
 /src/trainer/mlruns
 
+# Docker training outputs (./outputs is bind-mounted into the trainer container)
+/outputs/
+
 # Temp uv setup
 .python-version
 pyproject.toml
diff --git a/compose.yaml b/compose.yaml
@@ -26,6 +26,33 @@ services:
     command: python -m simulator.diffraction_generator --config /app/configs/simulator.yaml
     restart: "no"
 
+  # One-shot GPU training job: generates dataset manifests, then runs training.
+  trainer:
+    build:
+      context: .
+      dockerfile: docker/trainer.Dockerfile
+    volumes:
+      - ./data/:/data/
+      - ./configs:/configs:ro
+      - ./outputs:/outputs
+      - ./src/trainer:/app:ro  # overrides the source copied into the image
+    environment:
+      - PYTHONUNBUFFERED=1
+      - MPLCONFIGDIR=/tmp/matplotlib
+      - MKL_THREADING_LAYER=GNU
+      - MKL_SERVICE_FORCE_INTEL=1
+      - NVIDIA_VISIBLE_DEVICES=all
+      - NVIDIA_DRIVER_CAPABILITIES=compute,utility
+    runtime: nvidia
+    # DataLoader workers hand batches over through /dev/shm; Docker's 64 MB
+    # default is easily exhausted when num_workers > 0.
+    shm_size: "2gb"
+    # Bash does not export UID and never sets GID, so without a fallback
+    # Compose would interpolate empty strings here.
+    user: "${UID:-1000}:${GID:-1000}"
+    command: python /app/run_train_with_manifests.py /configs/trainer.docker.yaml
+    restart: "no"
+
   ui:
     build:
       context: .
@@ -38,4 +65,5 @@ services:
     environment:
       - PYTHONUNBUFFERED=1
       - PORT=7860
-    restart: unless-stopped
+    restart: unless-stopped
+
diff --git a/configs/trainer.docker.yaml b/configs/trainer.docker.yaml
@@ -0,0 +1,121 @@
+data:  # extra_val_file is omitted here; train.py reads it with data_cfg.get(), i.e. None
+  manifest_dir: "/data/manifests"  # container path; compose.yaml bind-mounts ./data at /data
+  dataset_root: "/data/dataset"  # container path under the same /data mount
+  auto_generate_manifests: true  # NOTE(review): run_train_with_manifests.py does not read this key; it generates manifests unless --skip-manifests is given
+  train_ratio: 0.8  # the three ratios and the seed are forwarded to generate_manifests
+  val_ratio: 0.1
+  test_ratio: 0.1
+  seed: 42
+
+  loader:
+    # --- DataLoader ---
+    batch_size: 64
+    num_workers: 8
+    pin_memory: true
+    persistent_workers: true
+    prefetch_factor: 2
+    train_file: "train.jsonl"
+    val_file: "val.jsonl"
+    test_file: "test.jsonl"
+
+  preprocessing:
+    validate_paths: false
+    extract_labels: true
+    allow_pickle: true
+    labels_key_map:
+      x: "dp"
+      cs: "cs"
+      sg: "sg"
+      lattice_params: null
+      lp_a: "_cell_length_a"
+      lp_b: "_cell_length_b"
+      lp_c: "_cell_length_c"
+      lp_alpha: "_cell_angle_alpha"
+      lp_beta: "_cell_angle_beta"
+      lp_gamma: "_cell_angle_gamma"
+    dtype: "float32"
+    mmap_mode: null
+    floor_at_zero: true
+    normalize_log1p: false
+    shift_labels: true
+
+  augmentation:
+    noise_poisson_range: [1.0, 100.0]  # train.py converts this to a tuple; null is passed through as None
+    noise_gaussian_range: [0.001, 0.1]  # train.py converts this to a tuple; null is passed through as None
+    standardize_to: [0.0, 100.0]
+
+model:
+  type: "multiscale"
+
+  backbone:
+    dim_in: 8192
+    dims: [80, 80, 80]
+    kernel_sizes: [100, 50, 25]
+    strides: [5, 5, 5]
+    dropout_rate: 0.3
+    layer_scale_init_value: 0.0
+    drop_path_rate: 0.3
+    ramped_dropout_rate: false
+    block_type: "convnext"
+    pooling_type: "average"
+    final_pool: true
+    use_batchnorm: false
+    activation: "leaky_relu"
+    output_type: "flatten"
+
+  heads:
+    head_dropout: 0.5
+    cs_hidden: [2300, 1150]
+    sg_hidden: [2300, 1150]
+    lp_hidden: [512, 256]
+
+  tasks:
+    num_cs_classes: 7
+    num_sg_classes: 230
+    num_lp_outputs: 6
+
+    lp_bounds_min: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
+    lp_bounds_max: [300.0, 300.0, 300.0, 180.0, 180.0, 180.0]
+    bound_lp_with_sigmoid: true
+
+  loss:
+    lambda_cs: 1.0
+    lambda_sg: 1.0
+    lambda_lp: 1.0
+
+    gemd_mu: 0.0
+    gemd_distance_matrix_path: null
+
+optimizer:
+  lr: 0.0002
+  weight_decay: 0.01
+  use_adamw: true
+  gradient_clip_val: 1.0
+  gradient_clip_algorithm: "norm"
+
+trainer:
+  default_root_dir: "/outputs/convnext_paper"  # under /outputs, bind-mounted from ./outputs by compose.yaml
+  max_epochs: 100
+  accumulate_grad_batches: 1
+  precision: "32"
+  accelerator: "gpu"  # the compose trainer service runs with runtime: nvidia
+  devices: 1
+  log_every_n_steps: 200
+  deterministic: false
+  benchmark: true
+
+logging:
+  logger: "mlflow"
+  csv_logger_name: "model_logs_convnext_paper"
+  mlflow_experiment_name: "AlphaDiffract_Paper_ConvNeXt"
+  mlflow_tracking_uri: "file:/outputs/mlruns"  # local path under the /outputs bind mount
+  mlflow_run_name: "ConvNeXt_Paper_Run"
+
+checkpointing:
+  monitor: "val/loss"
+  mode: "min"
+  save_top_k: 1
+  every_n_epochs: 1
+
+  resume_from: null
+  test_after_train: true
diff --git a/configs/trainer.local.yaml b/configs/trainer.local.yaml
@@ -1,6 +1,6 @@
 data:
-  manifest_dir: "../../../ad_data/manifests"
-  dataset_root: "../../../ad_data/data/dataset"
+  manifest_dir: "data/manifests"
+  dataset_root: "data/dataset"
   extra_val_file: "rruff.jsonl"
   auto_generate_manifests: true
   train_ratio: 0.8
diff --git a/docker/trainer.Dockerfile b/docker/trainer.Dockerfile
@@ -0,0 +1,18 @@
+FROM pytorch/pytorch:2.2.2-cuda11.8-cudnn8-runtime
+
+ENV PYTHONDONTWRITEBYTECODE=1
+ENV PYTHONUNBUFFERED=1
+ENV MPLCONFIGDIR=/tmp/matplotlib
+
+WORKDIR /app
+
+# Install the trainer's Python dependencies before copying the source, so this layer stays cached when only source files change
+COPY src/trainer/requirements.txt /tmp/requirements.txt
+RUN pip install --no-cache-dir -r /tmp/requirements.txt
+
+# Trainer source baked into the image; compose.yaml bind-mounts ./src/trainer over /app read-only
+COPY src/trainer /app
+
+ENV PYTHONPATH=/app
+
+CMD ["python", "/app/run_train_with_manifests.py", "/configs/trainer.docker.yaml"]
diff --git a/src/trainer/requirements.txt b/src/trainer/requirements.txt
@@ -0,0 +1,7 @@
+pytorch-lightning>=2.1,<3
+PyYAML>=6.0
+numpy>=1.24,<2  # torch 2.2.2 from the base image is built against the NumPy 1.x ABI
+tqdm>=4.66
+matplotlib>=3.7
+scikit-learn>=1.3
+mlflow>=2.8
diff --git a/src/trainer/run_train_with_manifests.py b/src/trainer/run_train_with_manifests.py
@@ -0,0 +1,124 @@
+"""Generate dataset manifests (if needed), then launch ``train.py`` on the same config."""
+
+from __future__ import annotations
+
+import argparse
+import os
+import subprocess
+import sys
+from typing import Any, Dict
+
+import yaml
+
+from dataset.manifest_utils import generate_manifests
+
+
+def _parse_args() -> argparse.Namespace:
+    """Parse the command line.
+
+    Returns:
+        Namespace with ``config`` (str), ``skip_manifests`` (bool) and
+        ``only_manifests`` (bool).
+    """
+    p = argparse.ArgumentParser(
+        description="Generate dataset manifests (if needed) then run training."
+    )
+    p.add_argument(
+        "config",
+        type=str,
+        help="Path to trainer config YAML (e.g., /configs/trainer.docker.yaml)",
+    )
+    # The two flags contradict each other (combined, the script would do
+    # nothing), so let argparse reject the combination with a usage error.
+    mode = p.add_mutually_exclusive_group()
+    mode.add_argument(
+        "--skip-manifests",
+        action="store_true",
+        help="Skip manifest generation step.",
+    )
+    mode.add_argument(
+        "--only-manifests",
+        action="store_true",
+        help="Only generate manifests, then exit.",
+    )
+    return p.parse_args()
+
+
+def _load_config(path: str) -> Dict[str, Any]:
+    """Load the YAML config at ``path`` and return it as a dict.
+
+    Raises:
+        FileNotFoundError: ``path`` is not an existing file.
+        ValueError: the top level of the document is not a mapping.
+    """
+    if not os.path.isfile(path):
+        raise FileNotFoundError(f"Config file not found: {path}")
+    with open(path, "r", encoding="utf-8") as f:
+        cfg = yaml.safe_load(f)
+    if not isinstance(cfg, dict):
+        raise ValueError(f"Config must be a mapping (YAML dict), got: {type(cfg)}")
+    return cfg
+
+
+def _resolve_from_cwd(path: str) -> str:
+    """Return absolute paths unchanged; anchor relative ones at the current directory."""
+    return path if os.path.isabs(path) else os.path.normpath(os.path.join(os.getcwd(), path))
+
+
+def _generate_from_config(cfg: Dict[str, Any]) -> None:
+    """Call ``generate_manifests`` with the values found in ``cfg["data"]``.
+
+    Raises:
+        KeyError: the ``data`` section or one of its required keys is missing.
+        TypeError: the ``data`` section is not a mapping.
+    """
+    if "data" not in cfg:
+        raise KeyError("Config missing required 'data' section")
+
+    data_cfg = cfg["data"]
+    # A bare ``data:`` key loads as None; report that clearly instead of the
+    # opaque "argument of type 'NoneType' is not iterable" from the loop below.
+    if not isinstance(data_cfg, dict):
+        raise TypeError(f"Config 'data' section must be a mapping, got: {type(data_cfg)}")
+
+    required = ["dataset_root", "manifest_dir", "train_ratio", "val_ratio", "test_ratio", "seed"]
+    for key in required:
+        if key not in data_cfg:
+            raise KeyError(f"Config data.{key} is required")
+
+    # Relative paths in the config are interpreted against the working directory.
+    dataset_root = _resolve_from_cwd(str(data_cfg["dataset_root"]))
+    manifest_dir = _resolve_from_cwd(str(data_cfg["manifest_dir"]))
+
+    generate_manifests(
+        dataset_root=dataset_root,
+        manifest_dir=manifest_dir,
+        train_ratio=float(data_cfg["train_ratio"]),
+        val_ratio=float(data_cfg["val_ratio"]),
+        test_ratio=float(data_cfg["test_ratio"]),
+        seed=int(data_cfg["seed"]),
+    )
+
+
+def main() -> None:
+    """Generate manifests unless skipped, then run ``train.py`` on the same config.
+
+    Raises:
+        subprocess.CalledProcessError: ``train.py`` exited with a non-zero status.
+    """
+    args = _parse_args()
+    cfg = _load_config(args.config)
+
+    if not args.skip_manifests:
+        _generate_from_config(cfg)
+
+    if args.only_manifests:
+        return
+
+    # train.py is expected next to this file; run it with the same interpreter.
+    train_path = os.path.join(os.path.dirname(__file__), "train.py")
+    subprocess.check_call([sys.executable, train_path, args.config])
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/trainer/train.py b/src/trainer/train.py
@@ -83,7 +83,7 @@ def build_datamodule_from_cfg(cfg: Dict[str, Any]) -> NpyDataModule:
         val_ratio=data_cfg["val_ratio"],
         test_ratio=data_cfg["test_ratio"],
         seed=data_cfg["seed"],
-        extra_val_file=data_cfg["extra_val_file"],
+        extra_val_file=data_cfg.get("extra_val_file"),
         # Optional noise augmentation: apply to training split only
         noise_poisson_range=tuple(aug_cfg["noise_poisson_range"]) if aug_cfg["noise_poisson_range"] is not None else None,
         noise_gaussian_range=tuple(aug_cfg["noise_gaussian_range"]) if aug_cfg["noise_gaussian_range"] is not None else None,