본문 바로가기
Python Library/Selenium

[Selenium] 유튜브 댓글 크롤링

by goatlab 2023. 3. 14.
728x90
반응형
SMALL

라이브러리

 

import time
import warnings

import pandas as pd
from bs4 import BeautifulSoup
from openpyxl import Workbook
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

# Silence every warning for the rest of the run.
# NOTE: the 'ignore' action applies to all categories, so deprecation
# warnings raised by the libraries used below are hidden as well.
warnings.filterwarnings('ignore')

 

URL 입력 및 브라우저 실행

 

# Ask for the video URL; strip stray whitespace that tends to come along
# with a pasted link so it is not passed on to the browser.
url = input("링크 입력 (예: https://www.youtube.com/) : ").strip()

# NOTE(review): this write-only workbook and its sheet are created here but
# nothing in the visible script writes to or saves them (the results are
# exported through pandas) — confirm whether they can be removed.
wb = Workbook(write_only=True)
ws = wb.create_sheet()

# Selenium 4 expects the chromedriver location to be wrapped in a Service
# object; passing the path as the first positional argument is no longer
# supported by current releases.
driver = webdriver.Chrome(service=Service("/Users/댓글크롤링/chromedriver"))
driver.get(url)

# Let element lookups poll for up to 3 seconds before giving up.
driver.implicitly_wait(3)

# Fixed pause to give the page time to load.
time.sleep(3)

스크롤

 

# Keep jumping to the bottom of the page until the document height stops
# changing between two consecutive scrolls.
PAGE_HEIGHT_JS = "return document.documentElement.scrollHeight"

last_height = driver.execute_script(PAGE_HEIGHT_JS)
reached_bottom = False

while not reached_bottom:
    driver.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);")
    # Pause after each scroll before measuring the height again.
    time.sleep(1.5)

    new_height = driver.execute_script(PAGE_HEIGHT_JS)

    # An unchanged height ends the loop.
    reached_bottom = new_height == last_height
    last_height = new_height

time.sleep(1.5)

 

팝업 닫기

 

# Best-effort: close the dismissible popup if one is present.
# `find_element(By.CSS_SELECTOR, ...)` replaces the `find_element_by_*`
# helper, which was removed in Selenium 4.3; with the old bare `except`
# the resulting AttributeError was swallowed and the popup never closed.
try:
    driver.find_element(By.CSS_SELECTOR, "#dismiss-button > a").click()
except WebDriverException:
    # No popup on the page (or it could not be clicked) — nothing to close.
    pass

 

대댓글 누르기

 

# Collect every "more replies" link so the reply threads can be expanded.
# `find_elements(By.CSS_SELECTOR, ...)` replaces `find_elements_by_css_selector`,
# which was removed in Selenium 4.3 and raises AttributeError there.
buttons = driver.find_elements(By.CSS_SELECTOR, "#more-replies > a")

time.sleep(1.5)

for button in buttons:
    # NOTE(review): each link is activated twice (ENTER, then a click after a
    # pause). Kept as in the original; confirm the second activation is
    # intended and does not collapse the thread again.
    button.send_keys(Keys.ENTER)
    time.sleep(1.5)
    button.click()

 

데이터 가져오기

 

def _clean_text(raw):
    """Return *raw* with newlines, tabs and four-space runs removed."""
    return raw.replace('\n', '').replace('\t', '').replace('    ', '')


html_source = driver.page_source

# The rendered HTML is captured above, so the browser is no longer needed;
# quitting here avoids leaving Chrome and chromedriver processes behind.
driver.quit()

soup = BeautifulSoup(html_source, 'html.parser')

id_list = soup.select("div#header-author > h3 > #author-text > span")
comment_list = soup.select("yt-formatted-string#content-text")

# The two selectors are matched independently, so their result counts can
# differ. Indexing one list by the other's length would raise IndexError;
# report the mismatch and pair up only as many entries as both lists have.
if len(id_list) != len(comment_list):
    print(f"경고: 작성자 수({len(id_list)})와 댓글 수({len(comment_list)})가 일치하지 않아 짧은 쪽에 맞춰 저장합니다.")

paired_tags = list(zip(id_list, comment_list))

id_final = [_clean_text(author_tag.text) for author_tag, _ in paired_tags]  # comment authors
comment_final = [_clean_text(comment_tag.text) for _, comment_tag in paired_tags]  # comment bodies

 

데이터 저장하기

 

# Build the result table: one column of author IDs, one of comment texts.
pd_data = {"아이디": id_final, "댓글 내용": comment_final}
youtube_pd = pd.DataFrame(data=pd_data)

# Export the same table twice: first as an Excel workbook, then as a CSV file.
for export, filename in (
    (youtube_pd.to_excel, 'result.xlsx'),
    (youtube_pd.to_csv, 'result.csv'),
):
    export(filename)
728x90
반응형
LIST