蒙国造博客

Selenium 爬取 Youtube 无限流分页页面

原理:通过执行 JavaScript 获取页面当前的可滚动高度,抓取当前已加载的数据后,再滚动到页面底部以触发下一批内容的加载;重复这一过程,直到可滚动高度不再变化或达到最大滚动次数为止。

这里获取可滚动高度用的是 document.documentElement.scrollHeight,而不是 document.body.scrollHeight,因为用 body 时返回值一直是 0。

代码如下:

import time

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait

# Scrape video links from a YouTube search-results page that loads more
# results as the page is scrolled ("infinite scroll").
#
# Each round: collect the links currently present in the page, scroll to the
# bottom, pause, and compare the document's scroll height with the previous
# one. The loop stops when the height no longer changes or after `n` rounds.

# The Chrome switch is spelled with a hyphen ("disable-infobars"); the
# underscore spelling is not a known switch.
# NOTE(review): recent Chrome releases may ignore this switch -- confirm.
chrome_options = Options()
chrome_options.add_argument('--disable-infobars')
driver = webdriver.Chrome(options=chrome_options)

# Maximum number of scroll rounds.
n = 10
# Collected video URLs in first-seen order; `seen` keeps them unique.
links = []
seen = set()

try:
    driver.set_window_position(0, 0)
    driver.set_window_size(1024, 768)
    driver.get("https://www.youtube.com/results?search_query=python")

    # Get scroll height
    last_height = driver.execute_script("return document.documentElement.scrollHeight")
    print("scrollHeight0=" + str(last_height))

    for j in range(1, n + 1):
        # An empty list is falsy, so the wait keeps polling until at least
        # one element is found or the timeout expires.
        try:
            user_data = WebDriverWait(driver, timeout=5).until(
                lambda d: d.find_elements(by=By.ID, value='video-title'))
        except TimeoutException as e:
            # Nothing found this round; still scroll in case more content loads.
            print(e)
            user_data = []

        # The lookup above searches the whole document, so elements handled
        # in earlier rounds are returned again; skip those, and skip
        # elements that carry no href.
        for i in user_data:
            href = i.get_attribute('href')
            if href and href not in seen:
                seen.add(href)
                links.append(href)

        # Scroll down to bottom
        driver.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);")

        # Wait to load page
        time.sleep(0.5)

        # Calculate new scroll height and compare with last scroll height
        new_height = driver.execute_script("return document.documentElement.scrollHeight")
        print("scrollHeight1=" + str(new_height))
        if new_height == last_height:
            # Height unchanged after scrolling: no further content was added.
            break
        last_height = new_height

        print("finish "+str(j)+" time")

    print(len(links))
    print(links)
finally:
    # quit() ends the whole WebDriver session, whereas close() only closes
    # the current window. Running it in `finally` releases the browser even
    # when an error interrupts the scrape.
    driver.quit()
退出移动版