App Programming/MLops

[MLops] TMDB API 데이터 수집 및 전처리

goatlab 2024. 8. 9. 14:59
728x90
반응형
SMALL

TMDB API

 

https://developer.themoviedb.org/reference/intro/getting-started에 회원가입 후 아래와 같이 개발자용 API 토큰을 발급받는다.

 

 

 

opt 디렉토리로 이동하여 실습 디렉토리를 생성한다.

 

 

그 다음, 러스트 언어로 개발된 uv 패키지 관리자를 설치한다. uv는 pip보다 패키지 다운로드 및 의존성 해석 속도가 훨씬 빨라 패키지 설치 시간을 줄여준다.

 

pip install uv

 

사용법은 기존 pip 명령어 앞에 uv를 붙여주면 된다.

 

uv pip install requests pandas numpy matplotlib python-dotenv

 

Vim 설정

 

vi ~/.vimrc
syntax on
set expandtab
set autoindent
set ts=4
set shiftwidth=4
set nu
set cursorline

 

.env

 

https://developer.themoviedb.org/reference/movie-popular-list에서 base url을 얻는다.

 

vi .env
TMDB_BASE_URL=https://api.themoviedb.org/3/movie
TMDB_API_KEY=API 키

 

crawler.py

 

TMDB API를 활용하여 데이터를 크롤링하는 코드를 작성한다.

 

mkdir result
vi crawler.py
import os
import json
import time
import requests

class TMDBCrawler:
    """Small client for the TMDB "popular movies" endpoint.

    The base URL and the API key are read from the ``TMDB_BASE_URL`` and
    ``TMDB_API_KEY`` environment variables at construction time.
    """

    def __init__(
        self,
        region="KR",
        language="ko-KR",
        image_language="ko",
        request_interval_seconds=0.4,
        timeout_seconds=10,
    ):
        """Store the request settings.

        Args:
            region: Region code sent as the ``region`` query parameter.
            language: Language code sent as the ``language`` query parameter.
            image_language: Stored on the instance; no request in this class
                uses it yet.
            request_interval_seconds: Stored on the instance; no method in
                this class sleeps between requests yet.
            timeout_seconds: Timeout handed to ``requests.get`` so that a
                stalled connection cannot block the caller forever.
        """
        self._base_url = os.environ.get("TMDB_BASE_URL")
        self._api_key = os.environ.get("TMDB_API_KEY")
        self._region = region
        self._language = language
        self._image_language = image_language
        self._request_interval_seconds = request_interval_seconds
        self._timeout_seconds = timeout_seconds

    def get_popular_movies(self, page):
        """Fetch one page of the popular-movies list.

        Args:
            page: Page number sent as the ``page`` query parameter.

        Returns:
            The ``"results"`` value of the JSON response, or ``None`` when
            the HTTP status code is not 200.
        """
        params = {
            "api_key": self._api_key,
            "language": self._language,
            "region": self._region,
            "page": page,
        }
        response = requests.get(
            f"{self._base_url}/popular",
            params=params,
            timeout=self._timeout_seconds,
        )

        if response.status_code != 200:
            return None

        return json.loads(response.text)["results"]

    @staticmethod
    def save_movies_to_json_file(movies, dst="./result", filename="popular"):
        """Write ``{"movies": movies}`` to ``<dst>/<filename>.json`` as UTF-8.

        Args:
            movies: JSON-serialisable movie data.
            dst: Target directory; created if it does not exist.
            filename: File name without the ``.json`` extension.
        """
        # An empty dst means "current directory"; os.makedirs("") would raise.
        if dst:
            os.makedirs(dst, exist_ok=True)

        data = {"movies": movies}
        with open(f"{os.path.join(dst, filename)}.json", "w", encoding="utf-8") as f:
            # ensure_ascii=False keeps non-ASCII text (e.g. Korean titles)
            # readable in the file instead of \uXXXX escapes.
            json.dump(data, f, ensure_ascii=False)

 

preprocessing.py

 

TMDB 원천 데이터를 전처리하여 모델링에 필요한 데이터 구성으로 변환하는 코드를 작성한다.

 

import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

class TMDBPreProcessor:
    """Build a synthetic per-user watch log from TMDB movie records.

    Each movie is expected to be a dict with the keys ``"id"``,
    ``"vote_average"`` and ``"popularity"``. The result is a DataFrame with
    the columns ``user_id``, ``content_id``, ``watch_seconds``, ``rating``
    and ``popularity``.
    """

    def __init__(self, movies: list, user_count=100, max_select_count=20):
        """Store the input movies and the simulation settings.

        Args:
            movies: List of movie dicts. ``None`` is treated as an empty list.
            user_count: Number of simulated users (ids ``1..user_count``).
            max_select_count: Upper bound (inclusive) on the number of
                contents picked per user.
        """
        # Instance-local, fixed-seed generators: both the selection and the
        # watch-time noise become reproducible, without reseeding the global
        # `random` / `numpy.random` state shared with the rest of the program.
        self._rng = random.Random(0)
        self._np_rng = np.random.default_rng(0)
        self._movies = movies if movies is not None else []
        self._features = pd.DataFrame()
        self._users = list(range(1, user_count + 1))
        self._max_select_count = max_select_count
        self._max_runtime_seconds = 120 * 60

    @staticmethod
    def augmentation(movie):
        """Repeat a movie record ``int(2 ** vote_average)`` times.

        The repetition makes higher-rated movies proportionally more likely
        to be drawn when sampling uniformly from the combined pool.

        Returns:
            A list holding the same record dict repeated; the entries share
            one dict object, so they must not be mutated individually.
        """
        rating = movie["vote_average"]
        count = int(pow(2, rating))
        data = {
            "content_id": movie["id"],
            "rating": rating,
            "popularity": movie["popularity"],
        }
        return [data] * count

    def generate_watch_second(self, rating):
        """Simulate a watch time in seconds for a given rating.

        The noise-free watch time grows exponentially (base 1.1) with the
        rating and is scaled so that rating 0 maps to 0 and rating 10 maps to
        the maximum runtime. Gaussian noise with a standard deviation of 10%
        of that value is added, and the result is clipped to
        ``[0, max runtime]``.
        """
        base = 1.1
        noise_level = 0.1
        base_time = (
            self._max_runtime_seconds
            * (base ** (rating - 5) - base ** -5)
            / (base ** 5 - base ** -5)
        )
        noise = self._np_rng.normal(0, noise_level * base_time)
        watch_second = base_time + noise

        watch_second = int(np.clip(watch_second, 0, self._max_runtime_seconds))
        print(f"{rating}/{watch_second}")
        return watch_second

    def selection(self, user_id, features):
        """Pick a random number of contents for one user.

        Args:
            user_id: Id of the simulated user.
            features: Pool of records produced by ``augmentation``.

        Returns:
            A list of watch-log rows (dicts); empty when the user selects
            nothing or when the pool itself is empty.
        """
        select_count = self._rng.randint(0, self._max_select_count)
        print(f"user [{user_id}] is select [{select_count}] contents")
        # random.choices raises IndexError on an empty population.
        if select_count == 0 or not features:
            return []

        selected_feature = self._rng.choices(features, k=select_count)

        result = [
            {
                "user_id": str(user_id),
                "content_id": str(feature["content_id"]),
                "watch_seconds": self.generate_watch_second(feature["rating"]),
                "rating": feature["rating"],
                "popularity": feature["popularity"],
            } for feature in selected_feature
        ]
        return result

    def run(self):
        """Generate the watch log for all users and store it in ``features``."""
        features = []
        selected_features = []
        for movie in self._movies:
            features.extend(self.augmentation(movie))

        for user_id in self._users:
            selected_features.extend(self.selection(user_id, features))

        df = pd.DataFrame.from_records(selected_features)

        self._features = df

    def plot(self):
        """Scatter-plot rating against watch seconds; no-op when empty."""
        if not self._features.empty:
            plt.figure()
            plt.scatter(self._features["rating"], self._features["watch_seconds"])
            plt.xlim(0, 10)
            # Same bound that generate_watch_second clips to.
            plt.ylim(0, self._max_runtime_seconds)
            plt.show()  # or plt.savefig("chart.png")

    def save(self, filename):
        """Write the watch log to ``./result/<filename>.csv``; no-op when empty."""
        if not self._features.empty:
            import os  # local import: this module's header does not import os

            os.makedirs("./result", exist_ok=True)
            self._features.to_csv(f"./result/{filename}.csv", header=True, index=False)

    @property
    def features(self):
        """The generated watch-log DataFrame (empty until ``run`` is called)."""
        return self._features

 

main.py

 

import pandas as pd
from dotenv import load_dotenv
from crawler import TMDBCrawler
from preprocessing import TMDBPreProcessor

# Load the .env file into the process environment before any TMDBCrawler is
# created; the crawler reads TMDB_BASE_URL and TMDB_API_KEY from os.environ.
load_dotenv()

def run_popular_movie_crawler():
    """Crawl page 1 of popular movies, save the raw data and the watch log.

    Steps: fetch the movie list, write it to ``./result/popular.json``,
    generate the synthetic watch log, plot it, and write it to
    ``./result/watch_log.csv``.

    Raises:
        RuntimeError: If the crawler returns ``None``, which it does when the
            TMDB API answers with a non-200 status code.
    """
    tmdb_crawler = TMDBCrawler()
    result = tmdb_crawler.get_popular_movies(page=1)
    if result is None:
        # Fail here with a clear message instead of writing a useless JSON
        # file and crashing later inside the preprocessor.
        raise RuntimeError(
            "TMDB popular movie request failed; check TMDB_BASE_URL and TMDB_API_KEY"
        )

    # `result` is already the list of movies; wrapping it in another list
    # would store a nested list under the "movies" key.
    tmdb_crawler.save_movies_to_json_file(result, "./result", "popular")

    tmdb_preprocessor = TMDBPreProcessor(result)
    tmdb_preprocessor.run()
    tmdb_preprocessor.plot()
    tmdb_preprocessor.save("watch_log")

# Run the crawl-and-preprocess pipeline only when executed as a script.
if __name__ == "__main__":
    run_popular_movie_crawler()

728x90
반응형
LIST