Selenium 爬取 Youtube 无限流分页页面
原理:通过执行 js 来获取页面可滚动的高度,抓取该高度范围内的数据后,再滚动到页面底部以触发下一批内容的加载;如此循环,直到可滚动高度不再变化为止。
这里获取可滚动高度用的是 document.documentElement.scrollHeight,而不是 document.body.scrollHeight——用 body 的时候这个返回值一直是 0。
代码如下:
import time

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
# JavaScript run in the page. The scrollable height is read from
# document.documentElement because document.body.scrollHeight stays 0 here.
SCROLL_HEIGHT_JS = "return document.documentElement.scrollHeight"
SCROLL_TO_BOTTOM_JS = "window.scrollTo(0, document.documentElement.scrollHeight);"


def filter_new_links(hrefs, seen):
    """Return the hrefs not encountered before, in their original order.

    Args:
        hrefs: Iterable of href values; ``None`` and empty strings are skipped.
        seen: Set of hrefs already collected. It is updated in place with
            every href returned.

    Returns:
        A list of the hrefs that were not yet in ``seen``.
    """
    fresh = []
    for href in hrefs:
        if href and href not in seen:
            seen.add(href)
            fresh.append(href)
    return fresh


def read_video_links(driver, timeout=5):
    """Return the ``href`` of every element with id ``video-title`` in the page.

    Waits up to ``timeout`` seconds for at least one such element to exist.
    If none appears, the timeout is printed and an empty list is returned.
    The result covers the whole DOM, so it includes elements that earlier
    calls already returned.
    """
    try:
        elements = WebDriverWait(driver, timeout=timeout).until(
            lambda d: d.find_elements(by=By.ID, value='video-title')
        )
    except TimeoutException as e:
        print(e)
        return []
    return [element.get_attribute('href') for element in elements]


def wait_for_height_change(driver, last_height, timeout=5.0, poll_interval=0.5):
    """Poll the page's scroll height until it differs from ``last_height``.

    The first check happens after ``poll_interval`` seconds. Polling stops as
    soon as the height changes or once ``timeout`` seconds have elapsed.

    Returns:
        The most recently read scroll height (equal to ``last_height`` when
        the page did not change within the timeout).
    """
    deadline = time.monotonic() + timeout
    while True:
        time.sleep(poll_interval)
        height = driver.execute_script(SCROLL_HEIGHT_JS)
        if height != last_height or time.monotonic() >= deadline:
            return height


def collect_video_links(driver, max_scrolls=10):
    """Scroll the loaded page to the bottom repeatedly and collect video links.

    Each pass reads the links currently in the page, scrolls to the bottom and
    waits for the scroll height to change. The loop ends when the height stops
    changing or after ``max_scrolls`` passes, whichever comes first.

    Args:
        driver: WebDriver whose current page is the results page to crawl.
        max_scrolls: Maximum number of scroll passes.

    Returns:
        The unique, non-empty hrefs in the order they were first seen.
    """
    links = []
    seen = set()

    last_height = driver.execute_script(SCROLL_HEIGHT_JS)
    print("scrollHeight0=" + str(last_height))

    for scroll_count in range(1, max_scrolls + 1):
        # Every read returns all matching elements in the DOM, so only the
        # hrefs not collected on a previous pass are kept.
        links.extend(filter_new_links(read_video_links(driver), seen))

        driver.execute_script(SCROLL_TO_BOTTOM_JS)
        new_height = wait_for_height_change(driver, last_height)
        print("scrollHeight1=" + str(new_height))
        if new_height == last_height:
            break
        last_height = new_height
        print("finish " + str(scroll_count) + " time")

    # Pick up whatever the final scroll loaded; already-seen hrefs are skipped.
    links.extend(filter_new_links(read_video_links(driver), seen))
    return links


def main():
    """Open the YouTube search results for "python" and print the video links."""
    chrome_options = Options()
    # NOTE(review): Chrome switches are usually spelled like
    # '--disable-infobars'; the argument is kept as originally written.
    # Confirm whether it has any effect.
    chrome_options.add_argument('disable_infobars')
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.set_window_position(0, 0)
        driver.set_window_size(1024, 768)
        driver.get("https://www.youtube.com/results?search_query=python")
        links = collect_video_links(driver)
    finally:
        # quit() ends the whole browser session, even when the crawl raised.
        driver.quit()

    print(len(links))
    print(links)
    return links


if __name__ == "__main__":
    main()