Skip to content

Commit ce2b49a

Browse files
committed
feat: Logging and sim changes to make the simulator more HPC friendly
1 parent f56baf8 commit ce2b49a

3 files changed

Lines changed: 226 additions & 101 deletions

File tree

configs/simulator.yaml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,12 @@ instprm_file: "/app/configs/instruments/11_id.prm.instprm"
1212
worker_base_dir: "/data/workers"
1313

1414
# --- Execution Control ---
15-
parallel_jobs: 10
16-
sims_per_file: 2
15+
parallel_jobs: 256
16+
sims_per_file: 100
1717
master_seed: 42 # Reproducible parameter sampling
1818
cleanup_worker_dirs: true
19+
progress_step_pct: 0.1 # Progress logging interval in percent
20+
log_to_console: false # If false, log to a file instead of stdout (useful in HPC settings)
1921

2022
# If true, read SG from "# _original_symmetry_space_group_name_H-M" comment in CIF
2123
# If using custom CIFs without this comment, set to false to use standard tags

src/simulator/diffraction_generator.py

Lines changed: 185 additions & 88 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,10 @@
55
from multiprocessing import Pool
66
import time
77
import shutil
8-
from tqdm import tqdm
98
import argparse
109
import yaml
1110
from typing import Dict, Any
11+
import logging
1212

1313
# Container: insert parent GSAS-II directory and import package
1414
GSAS_II_PARENT = os.environ.get("GSAS_II_PATH", "/opt/conda/envs/sim/GSAS-II")
@@ -17,6 +17,90 @@
1717

1818
from . import simulation_worker
1919

20+
21+
def setup_logging(output_dir: Path, level: int = logging.INFO, log_to_console: bool = False) -> Path:
22+
"""Configure root logger to write to a file in the output (data) directory.
23+
24+
Returns the log file path being used.
25+
"""
26+
if not isinstance(output_dir, Path):
27+
output_dir = Path(output_dir)
28+
log_file = output_dir / "simulator.log"
29+
30+
# Clear existing handlers to avoid duplicate logs on re-entry
31+
root_logger = logging.getLogger()
32+
for h in list(root_logger.handlers):
33+
root_logger.removeHandler(h)
34+
35+
root_logger.setLevel(level)
36+
37+
formatter = logging.Formatter(
38+
fmt="%(asctime)s | %(levelname)s | %(processName)s | %(name)s | %(message)s",
39+
datefmt="%Y-%m-%d %H:%M:%S",
40+
)
41+
if not log_to_console:
42+
# File logging
43+
output_dir.mkdir(parents=True, exist_ok=True)
44+
file_handler = logging.FileHandler(log_file, mode="a", encoding="utf-8")
45+
file_handler.setLevel(level)
46+
file_handler.setFormatter(formatter)
47+
root_logger.addHandler(file_handler)
48+
49+
# Console handler is always attached: full level when log_to_console is set, otherwise WARNING+ so the console stays lean while INFO goes to the file
50+
console = logging.StreamHandler(stream=sys.stdout)
51+
console.setLevel(level if log_to_console else logging.WARNING)
52+
console.setFormatter(formatter)
53+
root_logger.addHandler(console)
54+
55+
logging.info("Logging initialized.")
56+
if not log_to_console:
57+
logging.info(f"Log file: {log_file}")
58+
return log_file
59+
else:
60+
logging.info("Logging to console")
61+
return Path("CONSOLE")
62+
63+
64+
class ProgressLogger:
65+
"""Helper to log progress at fixed percentage steps and report rate (ips)."""
66+
67+
def __init__(self, total: int, label: str, step_pct: float = 10.0):
68+
self.total = max(1, int(total))
69+
self.label = label
70+
self.step_pct = float(step_pct)
71+
self.completed = 0
72+
self._tick_idx = 0
73+
# lightweight rate tracking
74+
self._start_time = time.monotonic()
75+
self._last_time = self._start_time
76+
self._last_count = 0
77+
logging.info(f"{self.label}: 0% complete (0/{self.total}) | ips=0.00 avg_ips=0.00")
78+
79+
def update(self, n: int = 1) -> None:
80+
self.completed += n
81+
pct = (self.completed / self.total) * 100.0
82+
# compute next threshold based on tick index, guard at 100%
83+
threshold = min(100.0, (self._tick_idx + 1) * self.step_pct)
84+
while pct >= threshold:
85+
now = time.monotonic()
86+
dt_inst = max(now - self._last_time, 1e-9)
87+
dt_avg = max(now - self._start_time, 1e-9)
88+
inst_ips = (self.completed - self._last_count) / dt_inst
89+
avg_ips = self.completed / dt_avg
90+
# choose percent formatting based on step size
91+
if self.step_pct >= 1.0:
92+
pct_str = f"{int(threshold)}%"
93+
else:
94+
pct_str = f"{threshold:.1f}%"
95+
logging.info(
96+
f"{self.label}: {pct_str} complete ({self.completed}/{self.total}) | ips={inst_ips:.2f} avg_ips={avg_ips:.2f}"
97+
)
98+
# reset window
99+
self._last_time = now
100+
self._last_count = self.completed
101+
self._tick_idx += 1
102+
threshold = min(100.0, (self._tick_idx + 1) * self.step_pct)
103+
20104
class DiffractionGenerator:
21105
"""
22106
Library for large-scale diffraction simulation.
@@ -35,115 +119,113 @@ def __init__(self, input_dir, output_dir, instprm_file, n_parallel_sims=70, erro
35119
self.worker_base_dir = Path(worker_base_dir).resolve() if worker_base_dir else self.output_dir / 'worker_temp_dirs'
36120
for dirname in [self.data_dir, self.error_dir, self.worker_base_dir]:
37121
dirname.mkdir(parents=True, exist_ok=True)
38-
print(f"Using instrument parameter file: {self.instprm_file}")
122+
logging.info(f"Using instrument parameter file: {self.instprm_file}")
39123

40124
def find_files(self):
41-
print("Searching for simulation input files...")
125+
logging.info("Searching for simulation input files...")
42126
start_time = time.time()
43127
self.file_list = sorted(list(set(self.input_dir.glob('*.cif'))))
44128
elapsed = time.time() - start_time
45-
print(f"--> Found {len(self.file_list)} unique files in {elapsed:.2f} seconds.")
129+
logging.info(f"--> Found {len(self.file_list)} unique files in {elapsed:.2f} seconds.")
46130

47131

48132
def _generate_simulation_tasks(self, n_sims_per_file, master_seed, cleanup_worker_dirs, **kwargs):
49133
"""Generator that yields tasks, skips completed jobs, and includes noise parameters."""
50134
if not self.file_list: self.find_files()
51-
52-
param_ranges = { # Defaults
53-
'strain_range': (0.0, 0.0), 'size_range': (0.0, 0.0), 'U_range': (0.0, 0.0),
54-
'V_range': (0.0, 0.0), 'W_range': (0.0, 0.0), 'st_range': (5.0, 20.0),
55-
'en_range': (20.0, 20.0), 'Npoints_range': (8192, 8192), 'scaler_range': (1.0, 1.0),
56-
'wl_range': (0.6199, 0.6199), 'proportional_noise_range': (0.0, 0.0),
57-
'constant_noise_range': (0, 0),
58-
}
135+
136+
param_ranges = {}
59137
param_ranges.update(kwargs)
60138

61139
rng = np.random.default_rng(master_seed)
62140
job_id_counter = 0
63-
tasks_to_run, tasks_to_skip = [], 0
64-
141+
tasks_to_skip = 0
142+
processed_count = 0
143+
65144
total_planned = len(self.file_list) * int(n_sims_per_file)
66-
disable_bar = not sys.stderr.isatty()
67-
with tqdm(
68-
total=total_planned,
69-
desc="Preparing tasks",
70-
unit="task",
71-
mininterval=1.0,
72-
disable=disable_bar,
73-
) as pbar:
74-
for file_path in self.file_list:
75-
stem = Path(file_path).stem
76-
for variation_index in range(1, n_sims_per_file + 1):
77-
final_output_path = self.data_dir / f"{stem}-{variation_index}.npy"
78-
if final_output_path.exists():
79-
tasks_to_skip += 1
80-
job_id_counter += 1
81-
pbar.update(1)
82-
continue
83-
84-
worker_dir = self.worker_base_dir / f'job_{job_id_counter:09d}'
85-
params = {
86-
'job_id': job_id_counter, 'worker_dir': str(worker_dir), 'input_file': str(file_path),
87-
'output_data_dir': str(self.data_dir), 'error_dir': str(self.error_dir),
88-
'instprm_file': str(self.instprm_file), 'noise_seed': rng.integers(1e9),
89-
'strain': rng.uniform(*param_ranges['strain_range']), 'size': rng.uniform(*param_ranges['size_range']),
90-
'U': rng.uniform(*param_ranges['U_range']), 'V': rng.uniform(*param_ranges['V_range']),
91-
'W': rng.uniform(*param_ranges['W_range']), 'st': rng.uniform(*param_ranges['st_range']),
92-
'en': rng.uniform(*param_ranges['en_range']), 'Npoints': int(rng.uniform(*param_ranges['Npoints_range'])),
93-
'scaler': rng.uniform(*param_ranges['scaler_range']), 'wl': rng.uniform(*param_ranges['wl_range']),
145+
step_pct = float(kwargs['progress_step_pct'])
146+
prog_logger = ProgressLogger(total_planned, label="Task preparation", step_pct=step_pct)
147+
for file_path in self.file_list:
148+
stem = Path(file_path).stem
149+
for variation_index in range(1, n_sims_per_file + 1):
150+
worker_dir = self.worker_base_dir / f'job_{job_id_counter:09d}'
151+
params = {
152+
'job_id': job_id_counter,
153+
'worker_dir': str(worker_dir),
154+
'input_file': str(file_path),
155+
'output_data_dir': str(self.data_dir),
156+
'error_dir': str(self.error_dir),
157+
'instprm_file': str(self.instprm_file),
158+
# Ranges for worker-side parameter generation
159+
'param_ranges': {
160+
'strain_range': param_ranges['strain_range'],
161+
'size_range': param_ranges['size_range'],
162+
'U_range': param_ranges['U_range'],
163+
'V_range': param_ranges['V_range'],
164+
'W_range': param_ranges['W_range'],
165+
'st_range': param_ranges['st_range'],
166+
'en_range': param_ranges['en_range'],
167+
'Npoints_range': param_ranges['Npoints_range'],
168+
'scaler_range': param_ranges['scaler_range'],
169+
'wl_range': param_ranges['wl_range'],
94170
'proportional_noise_range': param_ranges['proportional_noise_range'],
95171
'constant_noise_range': param_ranges['constant_noise_range'],
96-
'cleanup_worker_dir': cleanup_worker_dirs,
97-
'output_filename': f"{stem}-{variation_index}.npy",
98-
'parse_from_comment': bool(kwargs.get('parse_from_comment', False)),
99-
}
100-
tasks_to_run.append(params)
101-
job_id_counter += 1
102-
pbar.update(1)
103-
104-
if tasks_to_skip > 0: print(f"--> Found and skipped {tasks_to_skip} previously completed jobs.")
105-
for task in tasks_to_run: yield task
172+
},
173+
'seed': int(rng.integers(1_000_000_000)),
174+
'cleanup_worker_dir': cleanup_worker_dirs,
175+
'output_filename': f"{stem}-{variation_index}.npy",
176+
'parse_from_comment': bool(kwargs['parse_from_comment']),
177+
}
178+
job_id_counter += 1
179+
processed_count += 1
180+
prog_logger.update(1)
181+
yield params
106182

107183
def run(self, n_sims_per_file, master_seed=12345, cleanup_worker_dirs=True, **kwargs):
108-
tasks = list(self._generate_simulation_tasks(n_sims_per_file, master_seed, cleanup_worker_dirs, **kwargs))
109-
if not tasks:
110-
print("All simulation jobs are already complete. Nothing to do.")
184+
# Prepare counts but stream tasks to pool to avoid huge memory/time
185+
if not self.file_list:
186+
self.find_files()
187+
total_jobs_to_run = len(self.file_list) * int(n_sims_per_file)
188+
progress_step_pct = float(kwargs['progress_step_pct'])
189+
tasks_iter = self._generate_simulation_tasks(n_sims_per_file, master_seed, cleanup_worker_dirs, **kwargs)
190+
if total_jobs_to_run == 0:
191+
logging.info("All simulation jobs are already complete. Nothing to do.")
111192
return
112193

113-
total_jobs_to_run = len(tasks)
114-
print(f"Starting {total_jobs_to_run} new simulations with {self.n_parallel_sims} parallel processes...")
115-
194+
logging.info(f"Starting {total_jobs_to_run} new simulations with {self.n_parallel_sims} parallel processes...")
195+
116196
simulation_start_time = time.time()
117197
success_count = 0
118-
198+
prog_logger = ProgressLogger(total_jobs_to_run, label="Simulation progress", step_pct=progress_step_pct)
199+
119200
with Pool(self.n_parallel_sims, maxtasksperchild=1000, initializer=simulation_worker.suppress_worker_stdout) as p:
120-
results = tqdm(p.imap_unordered(simulation_worker.run_single_simulation, tasks), total=total_jobs_to_run)
121-
for result in results:
122-
if result: success_count += 1
201+
for result in p.imap_unordered(simulation_worker.run_single_simulation, tasks_iter, chunksize=1000):
202+
if result:
203+
success_count += 1
204+
prog_logger.update(1)
123205

124206
simulation_end_time = time.time()
125207

126208
cleanup_duration = 0
127209
if cleanup_worker_dirs:
128-
print("Cleaning up temporary worker directories...")
210+
logging.info("Cleaning up temporary worker directories...")
129211
cleanup_start_time = time.time()
130212
shutil.rmtree(self.worker_base_dir)
131213
cleanup_duration = time.time() - cleanup_start_time
132-
print(f"--> Cleanup complete in {cleanup_duration:.2f} seconds.")
214+
logging.info(f"--> Cleanup complete in {cleanup_duration:.2f} seconds.")
133215

134216
simulation_duration = simulation_end_time - simulation_start_time
135217
total_duration = simulation_duration + cleanup_duration
136218
failure_count = total_jobs_to_run - success_count
137219
sims_per_sec = (success_count / simulation_duration) if simulation_duration > 0 else 0
138-
avg_time_per_sim = (success_count / total_jobs_to_run) if total_jobs_to_run > 0 else 0
220+
avg_time_per_sim = (simulation_duration / success_count) if success_count > 0 else 0
139221

140-
print("\n--- Performance Summary ---")
141-
print(f"Jobs to Run: {total_jobs_to_run} | Succeeded: {success_count} | Failed: {failure_count}")
142-
print(f"Total Simulation Time: {time.strftime('%H:%M:%S', time.gmtime(simulation_duration))}")
143-
print(f"Overall Run Time: {time.strftime('%H:%M:%S', time.gmtime(total_duration))}")
144-
print(f"Throughput: {sims_per_sec:.2f} simulations/sec")
145-
print(f"Avg. Time/Simulation: {avg_time_per_sim:.3f} seconds")
146-
print("---------------------------")
222+
logging.info("\n--- Performance Summary ---")
223+
logging.info(f"Jobs to Run: {total_jobs_to_run} | Succeeded: {success_count} | Failed: {failure_count}")
224+
logging.info(f"Total Simulation Time: {time.strftime('%H:%M:%S', time.gmtime(simulation_duration))}")
225+
logging.info(f"Overall Run Time: {time.strftime('%H:%M:%S', time.gmtime(total_duration))}")
226+
logging.info(f"Throughput: {sims_per_sec:.2f} simulations/sec")
227+
logging.info(f"Avg. Time/Simulation: {avg_time_per_sim:.3f} seconds")
228+
logging.info("---------------------------")
147229

148230

149231
def load_config(path: Path) -> Dict[str, Any]:
@@ -171,12 +253,23 @@ def main() -> None:
171253
error_directory = str(Path(cfg["error_directory"]))
172254
worker_base_dir = str(Path(cfg["worker_base_dir"]))
173255

256+
# Initialize logging (default to file in /data mount)
257+
log_to_console = bool(cfg["log_to_console"])
258+
try:
259+
log_path = setup_logging('/data', log_to_console=log_to_console)
260+
except Exception as e:
261+
# Fallback to a writable temp directory if /data cannot be used
262+
fallback = Path('/tmp')
263+
log_path = setup_logging(fallback, log_to_console=log_to_console)
264+
logging.warning(f"Failed to initialize logging in /data: {e}. Using fallback: {fallback}")
265+
174266
# Execution controls
175267
parallel_jobs = int(cfg["parallel_jobs"])
176268
sims_per_file = int(cfg["sims_per_file"])
177269
master_seed = int(cfg["master_seed"])
178270
cleanup_worker_dirs = bool(cfg["cleanup_worker_dirs"])
179-
parse_from_comment = bool(cfg.get("parse_from_comment", False))
271+
parse_from_comment = bool(cfg["parse_from_comment"])
272+
progress_step_pct = float(cfg["progress_step_pct"])
180273

181274
# Parameter ranges
182275
ranges = {
@@ -195,21 +288,24 @@ def main() -> None:
195288
}
196289

197290
# Log planned operation
198-
print("\n--- Simulator configuration ---")
199-
print(f"Input directory: {input_directory}")
200-
print(f"Output directory: {output_directory}")
201-
print(f"Error directory: {error_directory}")
202-
print(f"Worker base dir: {worker_base_dir}")
203-
print(f"Instrument file: {instprm_file}")
204-
print(f"Parallel jobs: {parallel_jobs}")
205-
print(f"Sims per file: {sims_per_file}")
206-
print(f"Master seed: {master_seed}")
207-
print(f"Cleanup worker dirs: {cleanup_worker_dirs}")
208-
print(f"Parse from comment: {parse_from_comment}")
209-
print("Parameter ranges:")
291+
logging.info("\n--- Simulator configuration ---")
292+
logging.info(f"Log file: {log_path}")
293+
logging.info(f"Input directory: {input_directory}")
294+
logging.info(f"Output directory: {output_directory}")
295+
logging.info(f"Error directory: {error_directory}")
296+
logging.info(f"Worker base dir: {worker_base_dir}")
297+
logging.info(f"Instrument file: {instprm_file}")
298+
logging.info(f"Parallel jobs: {parallel_jobs}")
299+
logging.info(f"Sims per file: {sims_per_file}")
300+
logging.info(f"Master seed: {master_seed}")
301+
logging.info(f"Cleanup worker dirs: {cleanup_worker_dirs}")
302+
logging.info(f"Parse from comment: {parse_from_comment}")
303+
logging.info(f"Progress step percent: {progress_step_pct}")
304+
logging.info(f"Log to console: {log_to_console}")
305+
logging.info("Parameter ranges:")
210306
for k, v in ranges.items():
211-
print(f" {k}: {v}")
212-
print("--------------------------------\n")
307+
logging.info(f" {k}: {v}")
308+
logging.info("--------------------------------\n")
213309

214310
# Run simulations
215311
try:
@@ -226,10 +322,11 @@ def main() -> None:
226322
master_seed=master_seed,
227323
cleanup_worker_dirs=cleanup_worker_dirs,
228324
parse_from_comment=parse_from_comment,
325+
progress_step_pct=progress_step_pct,
229326
**ranges,
230327
)
231328
except Exception as e:
232-
print(f"FATAL: Simulator run failed: {e}", file=sys.stderr)
329+
logging.exception(f"FATAL: Simulator run failed: {e}")
233330
sys.exit(1)
234331

235332

0 commit comments

Comments
 (0)