728x90
◎ 유튜브 댓글 추출하기 쿼리¶
In [1]:
# 라이브러리 불러오기
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import time
import pandas as pd
from selenium.common.exceptions import NoSuchElementException
In [2]:
#구필수는 없다 유튜브 댓글 추출
options = webdriver.ChromeOptions()
#창 최대화
options.add_argument('start-maximized')
options.binary_location= 'C:/Program Files/Google/Chrome/Application/chrome.exe'
wd = webdriver.Chrome('C:/dev_python/chromedriver.exe', chrome_options = options)
url = 'http://www.youtube.com'
wd.get(url)
time.sleep(2)
search = wd.find_element_by_name('search_query') # 검색 창 찾기
key = '구필수는 없다'
search.send_keys(key) # 검색어 입력
search.send_keys(Keys.ENTER) # 검색어 검색
time.sleep(2)
# 필터 클릭
wd.find_element_by_xpath('/html/body/ytd-app/div/ytd-page-manager/ytd-search/div[1]/ytd-two-column-search-results-renderer/div/ytd-section-list-renderer/div[1]/div[2]/ytd-search-sub-menu-renderer/div[1]/div/ytd-toggle-button-renderer/a/tp-yt-paper-button').click()
# 조회수 클릭
wd.find_element_by_xpath('/html/body/ytd-app/div/ytd-page-manager/ytd-search/div[1]/ytd-two-column-search-results-renderer/div/ytd-section-list-renderer/div[1]/div[2]/ytd-search-sub-menu-renderer/div[1]/iron-collapse/div/ytd-search-filter-group-renderer[5]/ytd-search-filter-renderer[3]/a/div/yt-formatted-string').click()
time.sleep(1)
# 1등 동영상 클릭
wd.find_element_by_xpath('/html/body/ytd-app/div[1]/ytd-page-manager/ytd-search/div[1]/ytd-two-column-search-results-renderer/div/ytd-section-list-renderer/div[2]/ytd-item-section-renderer/div[3]/ytd-video-renderer[1]/div[1]/div/div[1]/div/h3/a').click()
# 페이지 내리기를 통해 추출되는 댓글 양 조절
for i in range(10):
wd.find_element_by_tag_name('body').send_keys(Keys.PAGE_DOWN)
time.sleep(0.5)
html0 = wd.page_source
wd.close()
html = BeautifulSoup(html0, 'html.parser')
data_list = []
comments_list = html.findAll('ytd-comment-thread-renderer', {'class':'style-scope ytd-item-section-renderer'})
# print (comments_list)
for j in range(len(comments_list)):
#댓글 추출
comment = comments_list[j].find('yt-formatted-string',{'id':'content-text'}).text
comment = comment.replace('\n', '') #전처리
comment = comment.replace('\t', '')
# 아이디 추출
youtube_id = comments_list[j].find('a', {'id': 'author-text'}).span.text
youtube_id = youtube_id.replace('\n', '')
youtube_id = youtube_id.replace('\t', '')
youtube_id = youtube_id.strip()
#날짜 데이터 추출
raw_date = comments_list[j].find('yt-formatted-string', {'class': 'published-time-text style-scope ytd-comment-renderer'})
date = raw_date.a.text
#좋아요 추출
like_num = comments_list[j].find('span', {'id': 'vote-count-middle', 'class': 'style-scope ytd-comment-action-buttons-renderer'}).text
like_num = like_num.replace('\n', '')
like_num = like_num.replace('\t', '')
like_num = like_num.strip()
data = {'youtube_id': youtube_id, 'comment': comment, 'date': date, 'like_num': like_num}
data_list.append(data)
# 데이터 프레임 형성
result_df = pd.DataFrame(data_list, columns=['youtube_id','comment','date','like_num'])
#result_df.to_excel("C:/Users/90000527/Desktop/업무/data11.xlsx", index = False)
C:\Users\90000527\AppData\Local\Temp/ipykernel_9296/574342302.py:7: DeprecationWarning: executable_path has been deprecated, please pass in a Service object
wd = webdriver.Chrome('C:/dev_python/chromedriver.exe', chrome_options = options)
C:\Users\90000527\AppData\Local\Temp/ipykernel_9296/574342302.py:7: DeprecationWarning: use options instead of chrome_options
wd = webdriver.Chrome('C:/dev_python/chromedriver.exe', chrome_options = options)
C:\Users\90000527\AppData\Local\Temp/ipykernel_9296/574342302.py:13: DeprecationWarning: find_element_by_name is deprecated. Please use find_element(by=By.NAME, value=name) instead
search = wd.find_element_by_name('search_query') # 검색 창 찾기
C:\Users\90000527\AppData\Local\Temp/ipykernel_9296/574342302.py:19: DeprecationWarning: find_element_by_xpath is deprecated. Please use find_element(by=By.XPATH, value=xpath) instead
wd.find_element_by_xpath('/html/body/ytd-app/div/ytd-page-manager/ytd-search/div[1]/ytd-two-column-search-results-renderer/div/ytd-section-list-renderer/div[1]/div[2]/ytd-search-sub-menu-renderer/div[1]/div/ytd-toggle-button-renderer/a/tp-yt-paper-button').click()
C:\Users\90000527\AppData\Local\Temp/ipykernel_9296/574342302.py:21: DeprecationWarning: find_element_by_xpath is deprecated. Please use find_element(by=By.XPATH, value=xpath) instead
wd.find_element_by_xpath('/html/body/ytd-app/div/ytd-page-manager/ytd-search/div[1]/ytd-two-column-search-results-renderer/div/ytd-section-list-renderer/div[1]/div[2]/ytd-search-sub-menu-renderer/div[1]/iron-collapse/div/ytd-search-filter-group-renderer[5]/ytd-search-filter-renderer[3]/a/div/yt-formatted-string').click()
C:\Users\90000527\AppData\Local\Temp/ipykernel_9296/574342302.py:24: DeprecationWarning: find_element_by_xpath is deprecated. Please use find_element(by=By.XPATH, value=xpath) instead
wd.find_element_by_xpath('/html/body/ytd-app/div[1]/ytd-page-manager/ytd-search/div[1]/ytd-two-column-search-results-renderer/div/ytd-section-list-renderer/div[2]/ytd-item-section-renderer/div[3]/ytd-video-renderer[1]/div[1]/div/div[1]/div/h3/a').click()
C:\Users\90000527\AppData\Local\Temp/ipykernel_9296/574342302.py:28: DeprecationWarning: find_element_by_tag_name is deprecated. Please use find_element(by=By.TAG_NAME, value=name) instead
wd.find_element_by_tag_name('body').send_keys(Keys.PAGE_DOWN)
In [3]:
#결과 확인
result_df.head()
Out[3]:
youtube_id | comment | date | like_num | |
---|---|---|---|---|
0 | 지무비 : G Movie | 고몽의 《고필수는없다》 소개 영상 보러 가기\rhttps://youtu.be/tZd... | 1개월 전 | 453 |
1 | 알랑방구 | 몇억 빌려주고 돈 안갚은건데 할머니 개착하네 | 1개월 전 | 349 |
2 | 코코낸네 | 곽도원이 나오면 퀄리티가 달라지네 무슨 영화인줄 알았는데 드라마라니 ㄷㄷ | 1개월 전 | 2.1천 |
3 | robin L | 지무비님! 지무비님은 진짜 저에게 하루중 유일하게 웃음을 주시는 분 입니다! 영상 ... | 1개월 전 | 232 |
4 | snj lae | 곽도원 코믹연기 대박 ㅋㅋ 배꼽잡음. 애드립도 진짜 잘함 | 2주 전 | 11 |
In [4]:
# 댓글 크롤링 버전 2
options = webdriver.ChromeOptions()
#창 최대화
options.add_argument('start-maximized')
#셀레니움 다운 위치
options.binary_location= 'C:/Program Files/Google/Chrome/Application/chrome.exe'
#크롬 다운 위치
wd = webdriver.Chrome('C:/dev_python/chromedriver.exe', chrome_options = options)
#크롬과 셀레니움은 버전이 동일해야지 작동
data_list = [] #데이터 저장 리스트
url = 'http://www.youtube.com'
wd.get(url)
time.sleep(2)
search = wd.find_element_by_name('search_query') # 검색 창 찾기
key = input("원하는 검색어를 입력해 주세요 : ")
#key = '구필수는 없다'
search.send_keys(key) # 검색어 입력
search.send_keys(Keys.ENTER) # 검색어 검색
time.sleep(2)
# 필터 클릭
wd.find_element_by_xpath('/html/body/ytd-app/div/ytd-page-manager/ytd-search/div[1]/ytd-two-column-search-results-renderer/div/ytd-section-list-renderer/div[1]/div[2]/ytd-search-sub-menu-renderer/div[1]/div/ytd-toggle-button-renderer/a/tp-yt-paper-button').click()
# 조회수 클릭
wd.find_element_by_xpath('/html/body/ytd-app/div/ytd-page-manager/ytd-search/div[1]/ytd-two-column-search-results-renderer/div/ytd-section-list-renderer/div[1]/div[2]/ytd-search-sub-menu-renderer/div[1]/iron-collapse/div/ytd-search-filter-group-renderer[5]/ytd-search-filter-renderer[3]/a/div/yt-formatted-string').click()
time.sleep(1)
num1 = int(input("원하는 댓글 추출 동영상 수 : "))
# 동영상 클릭
for i in range(num1):
wd.find_element_by_xpath('/html/body/ytd-app/div[1]/ytd-page-manager/ytd-search/div[1]/ytd-two-column-search-results-renderer/div/ytd-section-list-renderer/div[2]/ytd-item-section-renderer/div[3]/ytd-video-renderer['+str(i+1)+']/div[1]/div/div[1]/div/h3/a').click()
# 페이지 내리기를 통해 추출되는 댓글 양 조절
for i in range(10):
wd.find_element_by_tag_name('body').send_keys(Keys.PAGE_DOWN)
time.sleep(0.5)
html0 = wd.page_source
html = BeautifulSoup(html0, 'html.parser')
comments_list = html.findAll('ytd-comment-thread-renderer', {'class':'style-scope ytd-item-section-renderer'})
# print (comments_list)
for j in range(len(comments_list)):
#댓글 추출
comment = comments_list[j].find('yt-formatted-string',{'id':'content-text'}).text
comment = comment.replace('\n', '') #전처리
comment = comment.replace('\t', '')
# 아이디 추출
youtube_id = comments_list[j].find('a', {'id': 'author-text'}).span.text
youtube_id = youtube_id.replace('\n', '')
youtube_id = youtube_id.replace('\t', '')
youtube_id = youtube_id.strip()
#날짜 데이터 추출
raw_date = comments_list[j].find('yt-formatted-string', {'class': 'published-time-text style-scope ytd-comment-renderer'})
date = raw_date.a.text
#좋아요 추출
like_num = comments_list[j].find('span', {'id': 'vote-count-middle', 'class': 'style-scope ytd-comment-action-buttons-renderer'}).text
like_num = like_num.replace('\n', '')
like_num = like_num.replace('\t', '')
like_num = like_num.strip()
data = {'youtube_id': youtube_id, 'comment': comment, 'date': date, 'like_num': like_num}
data_list.append(data)
wd.back()
# 데이터 프레임 형성
result_df = pd.DataFrame(data_list, columns=['youtube_id','comment','date','like_num'])
#엑셀 파일로 만들기
result_df.to_excel("C:/Users/90000527/Desktop/업무/data11.xlsx", index = False)
C:\Users\90000527\AppData\Local\Temp/ipykernel_9296/4257878128.py:8: DeprecationWarning: executable_path has been deprecated, please pass in a Service object
wd = webdriver.Chrome('C:/dev_python/chromedriver.exe', chrome_options = options)
C:\Users\90000527\AppData\Local\Temp/ipykernel_9296/4257878128.py:8: DeprecationWarning: use options instead of chrome_options
wd = webdriver.Chrome('C:/dev_python/chromedriver.exe', chrome_options = options)
C:\Users\90000527\AppData\Local\Temp/ipykernel_9296/4257878128.py:17: DeprecationWarning: find_element_by_name is deprecated. Please use find_element(by=By.NAME, value=name) instead
search = wd.find_element_by_name('search_query') # 검색 창 찾기
원하는 검색어를 입력해 주세요 : 구필수는 없다
C:\Users\90000527\AppData\Local\Temp/ipykernel_9296/4257878128.py:25: DeprecationWarning: find_element_by_xpath is deprecated. Please use find_element(by=By.XPATH, value=xpath) instead
wd.find_element_by_xpath('/html/body/ytd-app/div/ytd-page-manager/ytd-search/div[1]/ytd-two-column-search-results-renderer/div/ytd-section-list-renderer/div[1]/div[2]/ytd-search-sub-menu-renderer/div[1]/div/ytd-toggle-button-renderer/a/tp-yt-paper-button').click()
C:\Users\90000527\AppData\Local\Temp/ipykernel_9296/4257878128.py:27: DeprecationWarning: find_element_by_xpath is deprecated. Please use find_element(by=By.XPATH, value=xpath) instead
wd.find_element_by_xpath('/html/body/ytd-app/div/ytd-page-manager/ytd-search/div[1]/ytd-two-column-search-results-renderer/div/ytd-section-list-renderer/div[1]/div[2]/ytd-search-sub-menu-renderer/div[1]/iron-collapse/div/ytd-search-filter-group-renderer[5]/ytd-search-filter-renderer[3]/a/div/yt-formatted-string').click()
원하는 댓글 추출 동영상 수 : 2
C:\Users\90000527\AppData\Local\Temp/ipykernel_9296/4257878128.py:33: DeprecationWarning: find_element_by_xpath is deprecated. Please use find_element(by=By.XPATH, value=xpath) instead
wd.find_element_by_xpath('/html/body/ytd-app/div[1]/ytd-page-manager/ytd-search/div[1]/ytd-two-column-search-results-renderer/div/ytd-section-list-renderer/div[2]/ytd-item-section-renderer/div[3]/ytd-video-renderer['+str(i+1)+']/div[1]/div/div[1]/div/h3/a').click()
C:\Users\90000527\AppData\Local\Temp/ipykernel_9296/4257878128.py:37: DeprecationWarning: find_element_by_tag_name is deprecated. Please use find_element(by=By.TAG_NAME, value=name) instead
wd.find_element_by_tag_name('body').send_keys(Keys.PAGE_DOWN)
In [5]:
#결과 확인
result_df.head()
Out[5]:
youtube_id | comment | date | like_num | |
---|---|---|---|---|
0 | 지무비 : G Movie | 고몽의 《고필수는없다》 소개 영상 보러 가기\rhttps://youtu.be/tZd... | 1개월 전 | 453 |
1 | 팡주댕 | 우와.. 영상퀄 뭔가요 ㅎ 끝내주는 편집한고은..와 너무 예쁘고 멋지다 | 1개월 전(수정됨) | 4 |
2 | 노오란셔츠 | 마지막에 노래 너무 귀엽게 찰떡인거 아닙니까 ㅋㅋㅋㅋ | 10일 전 | 0 |
3 | 윤경호 | 이제 곧 예슈아 오십니다 - 예슈아커밍 메세지 -앞으로 이 땅에서 일어날 일들에 대... | 1개월 전 | 2 |
4 | J | 정동원 구필수는없다 구준표 대활약 대반전 이야기 드라마 퍼펙트 화제성을 부르는 듯나... | 1개월 전 | 11 |
In [6]:
from IPython.core.display import display, HTML
display(HTML("<style>.container {width:90% !important;}</style>"))
◎ 엑셀로 저장한 마지막 결과물
끝~
728x90
'Ccode > 크롤링(crawling)' 카테고리의 다른 글
당일 주가 정보 크롤링 (0) | 2022.07.20 |
---|---|
AP_CGV 사이트에서 영화 리뷰 crawling (0) | 2021.08.02 |