import os import gc import sys import numpy as np from sklearn.model_selection import train_test_split, KFold from sklearn.metrics import r2_score import tensorflow as tf import logging import psutil import subprocess import matplotlib.pyplot as plt from tensorflow.keras.mixed_precision import set_global_policy set_global_policy('mixed_float16') BATCHSIZE = int(sys.argv[1]) EPOCHS = int(sys.argv[2]) lr = float(sys.argv[3]) fps_file = sys.argv[4] y_true_file = sys.argv[5] ################################################################## model_name = sys.argv[6] if len(sys.argv) > 6 else None target_path = sys.argv[7] if len(sys.argv) > 7 else None cv = int(sys.argv[8]) if len(sys.argv) > 8 and sys.argv[8] != 'None' else None test_size = float(sys.argv[9]) if len(sys.argv) > 9 else 0.1 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') def print_cpu_memory(): memory_info = psutil.virtual_memory() logging.info(f"Total Memory: {memory_info.total / (1024 ** 3):.2f} GB") logging.info(f"Available Memory: {memory_info.available / (1024 ** 3):.2f} GB") logging.info(f"Used Memory: {memory_info.used / (1024 ** 3):.2f} GB") logging.info(f"Memory Usage: {memory_info.percent}%") def print_gpu_memory(status=""): try: result = subprocess.run(['nvidia-smi', '--query-gpu=memory.used,memory.total', '--format=csv,nounits,noheader'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) if result.returncode == 0: lines = result.stdout.strip().split('\n') for idx, line in enumerate(lines): used, total = line.split(', ') logging.info(f"[{status}] GPU {idx}: Memory Usage: {used} MB / {total} MB") except Exception as e: logging.error(f"Error executing nvidia-smi: {e}") def save_history_plot(history, target_path, model_name, test_size, fold=None): plt.figure(figsize=(12, 8)) plt.subplot(2, 1, 1) plt.plot(history.history['loss'], label='Training Loss') if 'val_loss' in history.history: plt.plot(history.history['val_loss'], label='Validation Loss') plt.title(f'Model Loss (test_size={test_size})') plt.ylabel('Loss') plt.xlabel('Epoch') plt.legend() plt.subplot(2, 1, 2) for metric in history.history: if metric.startswith('val_'): continue plt.plot(history.history[metric], label=f'Training {metric}') val_metric = f'val_{metric}' if val_metric in history.history: plt.plot(history.history[val_metric], label=f'Validation {metric}') plt.title(f'Model Metrics (test_size={test_size})') plt.ylabel('Value') plt.xlabel('Epoch') plt.legend() plt.tight_layout() file_name = f"{model_name}_history{'_fold'+str(fold) if fold else ''}_test_size[{test_size}].png" plt.savefig(os.path.join(target_path, model_name, file_name), dpi=300) plt.close() def load_model(target_path, model_name, test_size, cv=None): model_path = f"{target_path}/{model_name}/{model_name}_full_model{'_cv'+str(cv) if cv else ''}_test_size[{test_size}].keras" try: if os.path.exists(model_path): model = tf.keras.models.load_model(model_path, compile=False) logging.info(f"Model successfully loaded from {model_path}") return model else: logging.error(f"Model path does not exist: {model_path}") return None except Exception as e: logging.error(f"Error loading model: {e}") return None def preprocess_data(xtr, ytr, use_parallel=False): dataset = tf.data.Dataset.from_tensor_slices((xtr, ytr)) if use_parallel: dataset = dataset.map(lambda x, y: (x, y), num_parallel_calls=tf.data.AUTOTUNE) dataset = dataset.shuffle(buffer_size=len(xtr)).batch(BATCHSIZE).cache().prefetch(tf.data.AUTOTUNE) return dataset # def train_model(model, train_dataset, target_path, model_name, fold=None): def train_model(model, train_dataset, valid_dataset, target_path, model_name, fold=None): checkpoint_dir = f"{target_path}/checkpoints/{model_name}" os.makedirs(checkpoint_dir, exist_ok=True) checkpoint_path = os.path.join(checkpoint_dir, f"model{'_fold'+str(fold) if fold else ''}.keras") cp = tf.keras.callbacks.ModelCheckpoint( filepath=checkpoint_path, save_weights_only=False, save_best_only=True, monitor='val_loss', mode='min', verbose=1, ) # es = tf.keras.callbacks.EarlyStopping( # monitor='val_loss', # patience=EPOCHS, # restore_best_weights=True, # mode='min', # verbose=0, # ) history = model.fit( train_dataset, epochs=EPOCHS, validation_data=valid_dataset, # callbacks=[cp, es], callbacks=[cp], #, es], verbose=0, ) save_history_plot(history, target_path, model_name, fold) del train_dataset gc.collect() def clear_gpu_memory(): tf.keras.backend.clear_session() gc.collect() logging.info("GPU memory cleared.") def main(): try: os.makedirs(f"{target_path}/{model_name}", exist_ok=True) model = load_model(target_path, model_name, test_size, cv) if model is None: raise ValueError("Failed to load model") model.compile( optimizer=tf.keras.optimizers.Adam(learning_rate=lr), loss=tf.keras.losses.MeanSquaredError(), metrics=[ tf.keras.metrics.MeanSquaredError(), tf.keras.metrics.MeanAbsoluteError(), tf.keras.metrics.RootMeanSquaredError() ] ) fps = np.load(fps_file) y_true = np.load(y_true_file) model_input_shape = model.input_shape if model_input_shape[1] != fps.shape[1]: raise ValueError(f"Model input dimension ({model_input_shape[1]}) does not match data dimension ({fps.shape[1]})") if cv is not None and cv > 1: xtr, xte, ytr, yte = train_test_split(fps, y_true, test_size=test_size, random_state=42) kf = KFold(n_splits=cv, shuffle=True, random_state=42) avg_r2_score = [] for fold, (train_index, test_index) in enumerate(kf.split(xtr), 1): xtr_cv, xte_cv = xtr[train_index], xtr[test_index] ytr_cv, yte_cv = ytr[train_index], ytr[test_index] train_dataset = preprocess_data(xtr_cv, ytr_cv, use_parallel=True) train_model(model, train_dataset, target_path, model_name, fold) ypred = model.predict(xte_cv, verbose=0) r2_scores = r2_score(yte_cv, ypred) if np.isnan(r2_scores) or np.isinf(r2_scores) or r2_scores <= 0: logging.warning(f"[cv][{fold}th] : R2 score : 0.000000 (prune)") else: logging.info(f"[cv][{fold}th] : R2 score : {r2_scores:.6f}") avg_r2_score.append(r2_scores) clear_gpu_memory() print_cpu_memory() print_gpu_memory(f"Fold {fold}") r2_result_res_avg = np.mean(avg_r2_score) logging.info(f"[cv][{fold}th][Avg] : R2 score : {r2_result_res_avg:.6f}") ypred = model.predict(xte, verbose=0) r2_result = r2_score(yte, ypred) os.makedirs(f"save_model/{model_name}", exist_ok=True) model.save(f"save_model/{model_name}/{model_name}_full_model{'_cv'+str(cv) if cv else ''}_test_size[{test_size}]_r2score[{r2_result:<.4f}].keras") del model logging.info(f"[cv][{fold}th][Result] : R2 score : {r2_result:.6f}") print(f"{r2_result:.6f}") else: xtr, xte, ytr, yte = train_test_split(fps, y_true, test_size=test_size, random_state=42) xtr, xtev, ytr, ytev = train_test_split(xtr, ytr, test_size=0.1, random_state=42) train_dataset = preprocess_data(xtr, ytr, use_parallel=True) valid_dataset = preprocess_data(xtev, ytev, use_parallel=True) train_model(model, train_dataset, valid_dataset, target_path, model_name) # train_model(model, train_dataset, target_path, model_name) ypred = model.predict(xte, verbose=0) r2_result = r2_score(yte, ypred) os.makedirs(f"save_model/{model_name}", exist_ok=True) model.save(f"save_model/{model_name}/{model_name}_full_model{'_cv'+str(cv) if cv else ''}_test_size[{test_size}]_r2score[{r2_result:<.4f}].keras") del model if np.isnan(r2_result) or np.isinf(r2_result) or r2_result <= 0: logging.warning("R2: 0.000000 (prune)") else: logging.info(f"R2: {r2_result:.6f}") print(f"{r2_result:.6f}") except Exception as e: logging.error(f"Error in learning process: {e}") print("0.000000") finally: clear_gpu_memory() print_cpu_memory() print_gpu_memory("Final") if __name__ == "__main__": main()