본문 바로가기
App Programming/MLops

[MLops] 학습 결과 기록하기

by goatlab 2024. 8. 12.
728x90
반응형
SMALL

wandb

 

W&B (Weights & Biases)는 실험 그룹 혹은 실험 단위로 실험 이력 요소들을 관리할 수 있다. https://wandb.ai/site 에 접속하여 가입한 후 API 키를 발급받는다.

 

pip install wandb

 

그리고 다음 경로에서 API 키를 저장하기 위해 환경 변수 파일인 .env를 생성한다. 아래 코드는 WANDB_API_KEY라는 이름으로 키를 읽으므로, .env 파일에는 WANDB_API_KEY=<발급받은 API 키> 형식으로 한 줄을 기록한다.

 

 

src/utils/utils.py

 

Run name을 자동으로 지정하기 위해 다음 함수를 추가한다.

 

def auto_increment_run_suffix(name: str, pad=3):
    """Return ``name`` with its trailing numeric suffix incremented by one.

    The suffix is whatever follows the last ``-`` in ``name`` (the whole
    string when there is no ``-``). Only that trailing segment is rewritten;
    identical digit sequences earlier in the name are left untouched.

    Args:
        name: Run name such as ``"movie-predictor-000"``.
        pad: Minimum width of the new suffix; shorter numbers are
            left-padded with zeros.

    Returns:
        The run name with the incremented, zero-padded suffix.

    Raises:
        ValueError: If the trailing segment is not an integer.
    """
    # rpartition splits on the LAST "-" only, so rebuilding from its parts
    # cannot touch earlier occurrences of the same digits (str.replace would).
    prefix, separator, suffix = name.rpartition("-")
    next_suffix = str(int(suffix) + 1).zfill(pad)
    return f"{prefix}{separator}{next_suffix}"

 

src/main.py

 

import os
import sys

sys.path.append(
    os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
)

import fire
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from tqdm import tqdm
from src.dataset.watch_log import get_datasets
from src.model.movie_predictor import MoviePredictor
from src.utils.utils import init_seed, model_dir, auto_increment_run_suffix
from src.train.train import train
from src.evaluate.evaluate import evaluate
from src.model.movie_predictor import MoviePredictor, model_save
from utils.constant import Optimizers, Models
import wandb
from dotenv import load_dotenv

# Load variables from a .env file into the process environment; run_train
# later reads WANDB_API_KEY from os.environ.
load_dotenv()
# Project helper from src.utils.utils, called once at import time.
# NOTE(review): presumably seeds the RNGs for reproducibility — confirm there.
init_seed()

def get_runs(project_name):
    """Query the W&B public API for the runs stored under ``project_name``.

    The runs are requested with ``order="-created_at"``, i.e. sorted by
    creation time in descending order.
    """
    api = wandb.Api()
    return api.runs(path=project_name, order="-created_at")


def get_latest_run(project_name):
    """Return the name of the first run reported for ``project_name``.

    When the project has no runs, the placeholder ``"<project_name>-000"``
    is returned instead so that a numeric suffix is always present.
    """
    project_runs = get_runs(project_name)
    if project_runs:
        # get_runs orders by "-created_at", so index 0 is the newest run.
        return project_runs[0].name
    return f"{project_name}-000"

def run_train(model_name, optimizer, num_epochs=10, lr=0.001, model_ext="pth"):
    """Train, validate, save and test a model while logging losses to W&B.

    Args:
        model_name: Name of a ``Models`` member (looked up upper-cased). With
            ``_`` replaced by ``-`` it is also used as the W&B project name.
        optimizer: Name of an ``Optimizers`` member (looked up upper-cased).
        num_epochs: Number of training epochs to run.
        lr: Learning rate handed to the optimizer.
        model_ext: Value forwarded to ``model_save`` as ``ext``
            (the original code mentions "pth" and "onnx").

    Raises:
        KeyError: If ``WANDB_API_KEY`` is missing from the environment.
    """
    # Check the names before any W&B run is created, so a typo does not leave
    # an empty run behind.
    # NOTE(review): assumes validation() raises on unknown names — confirm.
    Models.validation(model_name)
    Optimizers.validation(optimizer)

    api_key = os.environ["WANDB_API_KEY"]
    wandb.login(key=api_key)

    # Build the config explicitly: locals() would also upload api_key.
    run_config = {
        "model_name": model_name,
        "optimizer": optimizer,
        "num_epochs": num_epochs,
        "lr": lr,
        "model_ext": model_ext,
    }

    project_name = model_name.replace("_", "-")
    # NOTE(review): get_latest_run / auto_increment_run_suffix exist in this
    # project but are not wired in here; pass name=... to wandb.init if
    # sequential run names are wanted.
    # The context manager closes the run on exit, including on exceptions.
    with wandb.init(
        project=project_name,
        notes="content-based movie recommend model",
        tags=["content-based", "movie", "recommend"],
        config=run_config,
    ):
        # Build the datasets and their DataLoaders
        train_dataset, val_dataset, test_dataset = get_datasets()
        loader_kwargs = {"batch_size": 64, "num_workers": 0, "pin_memory": False}
        train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
        val_loader = DataLoader(val_dataset, shuffle=False, **loader_kwargs)
        test_loader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)

        # Initialise the model
        model_params = {
            "input_dim": train_dataset.features_dim,
            "num_classes": train_dataset.num_classes,
        }
        model_class = Models[model_name.upper()].value
        model = model_class(**model_params)

        # Define the loss function and the optimizer
        criterion = nn.CrossEntropyLoss()
        optimizer_class = Optimizers[optimizer.upper()].value
        optimizer_instance = optimizer_class(model.parameters(), lr=lr)

        # Training loop. train_loss keeps a defined value for model_save even
        # when num_epochs is 0 and the loop body never runs.
        train_loss = 0
        for epoch in tqdm(range(num_epochs)):
            train_loss = train(model, train_loader, criterion, optimizer_instance)
            val_loss, _ = evaluate(model, val_loader, criterion)
            # One log call per epoch keeps both losses on the same W&B step.
            wandb.log({"Loss/Train": train_loss, "Loss/Valid": val_loss})
            print(f"Epoch {epoch + 1}/{num_epochs}, "
                  f"Train Loss: {train_loss:.4f}, "
                  f"Val Loss: {val_loss:.4f}, "
                  f"Val-Train Loss : {val_loss-train_loss:.4f}")

        model_save(
            model=model,
            model_params=model_params,
            epoch=num_epochs,
            optimizer=optimizer_instance,
            loss=train_loss,
            scaler=train_dataset.scaler,
            contents_id_map=train_dataset.contents_id_map,
            ext=model_ext,
        )

        # Test
        model.eval()
        test_loss, predictions = evaluate(model, test_loader, criterion)
        print(f"Test Loss : {test_loss:.4f}")

if __name__ == '__main__':
    # Expose run_train on the command line as the "train" sub-command, e.g.
    # `python src/main.py train --model_name ... --optimizer ...`.
    cli_commands = {"train": run_train}
    fire.Fire(cli_commands)
python src/main.py train --model_name movie_predictor --optimizer adam --num_epochs 20 --lr 0.002

728x90
반응형
LIST

'App Programming > MLops' 카테고리의 다른 글

[MLops] 데이터베이스  (0) 2024.08.13
[MLops] 모델 추론  (0) 2024.08.13
[MLops] 모델 저장하기  (0) 2024.08.12
[MLops] 모델 학습 및 평가  (0) 2024.08.12
[MLops] 모델 훈련  (0) 2024.08.09