|
""" |
|
utils for submitting to clusters, such as slurm |
|
""" |
|
|
|
import os
import subprocess
import sys
from datetime import datetime
from pathlib import Path

from omegaconf import DictConfig, OmegaConf

from utils.print_utils import cyan
|
|
|
|
|
# Module-level placeholder, initialised to None. Nothing in the visible part of
# this file reads or assigns it.
# NOTE(review): presumably set or consumed by other modules -- confirm before removing.
REPO_DIR = None
|
|
|
|
|
def submit_slurm_job(
    cfg: DictConfig,
    python_args: str,
    project_root: Path,
) -> Path:
    """Render the cluster launch template into a script and submit it with ``sbatch``.

    A log directory ``<project_root>/slurm_logs/<timestamp>-<cfg.name>`` is
    created, the ``slurm_logs/latest`` symlink is re-pointed at it, and the
    rendered script is written there as ``job.slurm`` before being submitted.

    Args:
        cfg: Config providing ``name``, ``cluster.launch_template`` (a
            ``str.format`` template) and ``cluster.params`` (extra template
            fields, which override the built-in ones on a key clash).
        python_args: Made available to the template as ``{python_args}``.
        project_root: Directory under which ``slurm_logs`` is created; also
            made available to the template as ``{project_root}``.

    Returns:
        The log directory created for this submission.

    Note:
        A failing or missing ``sbatch`` does not raise; a warning is printed
        to stderr and the log directory is still returned.
    """
    logs_root = project_root / "slurm_logs"
    timestamp = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    log_dir = logs_root / f"{timestamp}-{cfg.name}"
    log_dir.mkdir(exist_ok=True, parents=True)

    # Re-point "latest" at this run. The target is made absolute because a
    # relative symlink target is resolved against the link's own directory,
    # which would leave the link dangling when project_root is relative.
    latest_link = logs_root / "latest"
    latest_link.unlink(missing_ok=True)
    latest_link.symlink_to(log_dir.absolute(), target_is_directory=True)

    params = dict(name=cfg.name, log_dir=log_dir, project_root=project_root, python_args=python_args)
    params.update(cfg.cluster.params)

    slurm_script = cfg.cluster.launch_template.format(**params)

    slurm_script_path = log_dir / "job.slurm"
    slurm_script_path.write_text(slurm_script)

    # Equivalent of `chmod +x` without going through a shell: grant execute
    # permission to every class that already has read permission.
    mode = slurm_script_path.stat().st_mode
    slurm_script_path.chmod(mode | ((mode & 0o444) >> 2))

    # Pass an argv list instead of a shell string so that paths containing
    # spaces or shell metacharacters (e.g. from cfg.name) are handled safely.
    try:
        completed = subprocess.run(["sbatch", str(slurm_script_path)], check=False)
    except OSError as err:
        print(f"warning: could not run sbatch: {err}", file=sys.stderr)
    else:
        if completed.returncode != 0:
            print(
                f"warning: sbatch exited with status {completed.returncode}; "
                "the job may not have been submitted",
                file=sys.stderr,
            )

    print(f"\n{cyan('script:')} {slurm_script_path}\n{cyan('slurm errors and logs:')} {log_dir}\n")

    return log_dir
|
|