App Programming/MLops
[MLops] TMDB API 데이터 수집 및 전처리
goatlab
2024. 8. 9. 14:59
728x90
반응형
SMALL
TMDB API
https://developer.themoviedb.org/reference/intro/getting-started에 회원가입 후 아래와 같이 개발자용 API 토큰을 발급받는다.
opt 디렉토리로 이동하여 실습 디렉토리를 생성한다.
그 다음, Rust로 개발된 패키지 관리자 uv를 설치한다. uv는 pip보다 패키지 설치와 의존성 해석 속도가 훨씬 빠르다.
pip install uv
사용법은 기존 pip 명령 앞에 uv를 붙이면 된다.
uv pip install requests pandas numpy matplotlib python-dotenv
Vim 설정
vi ~/.vimrc
syntax on
set expandtab
set autoindent
set ts=4
set shiftwidth=4
set nu
set cursorline
.env
https://developer.themoviedb.org/reference/movie-popular-list에서 base url을 얻는다.
vi .env
TMDB_BASE_URL=https://api.themoviedb.org/3/movie
TMDB_API_KEY=API 키
crawler.py
TMDB API를 활용하여 데이터를 크롤링하는 코드를 작성한다.
mkdir result
vi crawler.py
import os
import json
import time
import requests
class TMDBCrawler:
    """Small client for the "popular" movie listing of the TMDB API.

    The base URL and API key are read from the ``TMDB_BASE_URL`` and
    ``TMDB_API_KEY`` environment variables when the instance is created.
    """

    def __init__(
        self,
        region="KR",
        language="ko-KR",
        image_language="ko",
        request_interval_seconds=0.4,
        request_timeout_seconds=10,
    ):
        """Store the request settings.

        Args:
            region: Value sent as the ``region`` query parameter.
            language: Value sent as the ``language`` query parameter.
            image_language: Kept on the instance; not used by any request
                in this class.
            request_interval_seconds: Kept on the instance; this class does
                not sleep by itself, so callers issuing several requests
                are expected to throttle with this value.
            request_timeout_seconds: Timeout passed to ``requests.get`` so a
                stalled connection cannot block forever.
        """
        self._base_url = os.environ.get("TMDB_BASE_URL")
        self._api_key = os.environ.get("TMDB_API_KEY")
        self._region = region
        self._language = language
        self._image_language = image_language
        self._request_interval_seconds = request_interval_seconds
        self._request_timeout_seconds = request_timeout_seconds

    def get_popular_movies(self, page):
        """Fetch one page of ``{base_url}/popular``.

        Args:
            page: Page number sent as the ``page`` query parameter.

        Returns:
            The ``"results"`` entry of the JSON response, or ``None`` when
            the HTTP status code is not 200.
        """
        params = {
            "api_key": self._api_key,
            "language": self._language,
            "region": self._region,
            "page": page,
        }
        response = requests.get(
            f"{self._base_url}/popular",
            params=params,
            timeout=self._request_timeout_seconds,
        )
        if response.status_code != 200:
            return None
        return response.json()["results"]

    @staticmethod
    def save_movies_to_json_file(movies, dst="./result", filename="popular"):
        """Write ``{"movies": movies}`` to ``<dst>/<filename>.json``.

        Args:
            movies: JSON-serialisable object stored under the ``"movies"`` key.
            dst: Target directory; created if it does not exist yet.
            filename: File name without the ``.json`` extension.
        """
        if dst:
            os.makedirs(dst, exist_ok=True)
        path = f"{os.path.join(dst, filename)}.json"
        with open(path, "w", encoding="utf-8") as f:
            # ensure_ascii=False keeps non-ASCII text (e.g. Korean titles)
            # readable in the UTF-8 file instead of \uXXXX escapes.
            json.dump({"movies": movies}, f, ensure_ascii=False)
preprocessing.py
TMDB 원천 데이터를 전처리하여 모델링에 필요한 데이터 구성으로 변환하는 코드를 작성한다.
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
class TMDBPreProcessor:
    """Build a synthetic per-user watch log from raw TMDB movie records.

    Each movie is expected to be a mapping with the keys ``"id"``,
    ``"vote_average"`` and ``"popularity"``. Movies are weighted by rating,
    every simulated user draws a random number of them, and a noisy watch
    time is generated for each draw.
    """

    def __init__(self, movies: list, user_count=100, max_select_count=20):
        """Store the inputs and seed the random generators.

        Args:
            movies: Movie records to sample from. ``None`` or an empty list
                yields an empty feature table.
            user_count: Number of simulated users (ids ``1..user_count``).
            max_select_count: Upper bound (inclusive) on how many contents a
                single user selects.
        """
        # Seed both generators used below: ``random`` drives the selection
        # and NumPy drives the watch-time noise. Seeding only one of them
        # leaves the output non-reproducible.
        random.seed(0)
        np.random.seed(0)
        self._movies = movies
        self._features = pd.DataFrame()
        self._users = list(range(1, user_count + 1))
        self._max_select_count = max_select_count
        self._max_runtime_seconds = 120 * 60

    @staticmethod
    def augmentation(movie):
        """Repeat a movie's feature record ``int(2 ** rating)`` times.

        The repetition makes highly rated movies proportionally more likely
        to be drawn later. All entries of the returned list refer to the
        same dict object.
        """
        rating = movie["vote_average"]
        count = int(pow(2, rating))
        data = {
            "content_id": movie["id"],
            "rating": rating,
            "popularity": movie["popularity"],
        }
        return [data] * count

    def generate_watch_second(self, rating):
        """Return a noisy watch time in seconds for the given rating.

        The rating is mapped exponentially so that 0 gives 0 seconds and 10
        gives the maximum runtime; Gaussian noise with a standard deviation
        of 10% of that base time is added and the result is clipped to
        ``[0, max_runtime_seconds]``.
        """
        base = 1.1
        noise_level = 0.1
        base_time = (
            self._max_runtime_seconds
            * (base ** (rating - 5) - base ** -5)
            / (base ** 5 - base ** -5)
        )
        # abs(): a negative rating would give a negative base time, and
        # np.random.normal rejects a negative standard deviation.
        noise = np.random.normal(0, abs(noise_level * base_time))
        watch_second = base_time + noise
        watch_second = int(np.clip(watch_second, 0, self._max_runtime_seconds))
        print(f"{rating}/{watch_second}")
        return watch_second

    def selection(self, user_id, features):
        """Draw a random number of feature records for one user.

        Args:
            user_id: Identifier written (as a string) into each record.
            features: Weighted pool produced by :meth:`augmentation`.

        Returns:
            A list of watch-log records; empty when nothing is selected or
            the pool is empty.
        """
        select_count = random.randint(0, self._max_select_count)
        print(f"user [{user_id}] is select [{select_count}] contents")
        # random.choices raises IndexError on an empty population.
        if select_count == 0 or not features:
            return []
        selected_feature = random.choices(features, k=select_count)
        return [
            {
                "user_id": str(user_id),
                "content_id": str(feature["content_id"]),
                "watch_seconds": self.generate_watch_second(feature["rating"]),
                "rating": feature["rating"],
                "popularity": feature["popularity"],
            }
            for feature in selected_feature
        ]

    def run(self):
        """Generate the watch log for all users and store it as a DataFrame."""
        features = []
        selected_features = []
        # ``or []`` tolerates a missing movie list (e.g. a failed request).
        for movie in self._movies or []:
            features.extend(self.augmentation(movie))
        for user_id in self._users:
            selected_features.extend(self.selection(user_id, features))
        self._features = pd.DataFrame.from_records(selected_features)

    def plot(self):
        """Show a rating vs. watch-time scatter plot if there is data."""
        if not self._features.empty:
            plt.figure()
            plt.scatter(self._features["rating"], self._features["watch_seconds"])
            plt.xlim(0, 10)
            # Same bound generate_watch_second clips to.
            plt.ylim(0, self._max_runtime_seconds)
            plt.show()  # or plt.savefig("chart.png")

    def save(self, filename, dst="./result"):
        """Write the feature table to ``<dst>/<filename>.csv`` if non-empty.

        Args:
            filename: File name without the ``.csv`` extension.
            dst: Target directory; created if it does not exist yet.
        """
        import os  # local import: this module's import list does not include os

        if not self._features.empty:
            if dst:
                os.makedirs(dst, exist_ok=True)
            self._features.to_csv(
                os.path.join(dst, f"{filename}.csv"), header=True, index=False
            )

    @property
    def features(self):
        """The generated watch log (empty DataFrame before :meth:`run`)."""
        return self._features
main.py
import pandas as pd
from dotenv import load_dotenv
from crawler import TMDBCrawler
from preprocessing import TMDBPreProcessor
load_dotenv()
def run_popular_movie_crawler():
    """Fetch page 1 of popular movies, save them, and build the watch log.

    The raw movies are written to ``./result/popular.json``; the generated
    watch log is plotted and written to ``./result/watch_log.csv``.

    Raises:
        RuntimeError: If the crawler returned no movies (``None`` on a
            non-200 response, or an empty list).
    """
    tmdb_crawler = TMDBCrawler()
    result = tmdb_crawler.get_popular_movies(page=1)
    if not result:
        # Fail here with a clear message instead of letting None reach the
        # preprocessor and surface as an unrelated TypeError.
        raise RuntimeError(
            "TMDB returned no popular movies; check TMDB_BASE_URL and TMDB_API_KEY"
        )

    # Pass the list itself: the crawler already stores it under "movies",
    # so wrapping it in another list would produce {"movies": [[...]]}.
    tmdb_crawler.save_movies_to_json_file(result, "./result", "popular")

    tmdb_preprocessor = TMDBPreProcessor(result)
    tmdb_preprocessor.run()
    tmdb_preprocessor.plot()
    tmdb_preprocessor.save("watch_log")


if __name__ == '__main__':
    run_popular_movie_crawler()
728x90
반응형
LIST