课程目标
课程内容
编码实现
爬虫部分
import requests
from fake_useragent import UserAgent
import bs4
from tqdm import tqdm
import time
import pandas as pd
def get_detail(data_rid):
    """Fetch the full text of one Douban review.

    Requests ``https://movie.douban.com/j/review/{data_rid}/full``, reads the
    ``html`` field of the JSON response, parses it as HTML and returns its
    text content with the markup stripped.

    Args:
        data_rid: Review id that is interpolated into the request URL.

    Returns:
        The review body as plain text.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within 10 seconds.
    """
    # NOTE(review): the headers are empty, and the visible code imports
    # fake_useragent.UserAgent without using it -- presumably a User-Agent
    # (and possibly cookies) was meant to be supplied here; confirm.
    headers = {}
    url = f"https://movie.douban.com/j/review/{data_rid}/full"
    # Without a timeout, requests waits indefinitely on a stalled connection.
    response = requests.get(url, headers=headers, timeout=10)
    # Fail on an HTTP error here rather than on a confusing JSON/KeyError below.
    response.raise_for_status()
    review_html = response.json()['html']
    return bs4.BeautifulSoup(review_html, 'html.parser').text
def _vote_count(review_div, title):
    """Return the vote count shown on the link titled *title*, or 0.

    The result is 0 when the link is absent or when its counter text is
    blank (calling ``int('')`` would raise ``ValueError``).
    """
    link = review_div.find('a', {'title': title})
    if link is None:
        return 0
    count_text = link.span.text.strip()
    return int(count_text) if count_text else 0


# Crawl the first 10 pages of the "best reviews" listing and collect one
# record per review, then export everything to an Excel workbook.
infos = []
# NOTE(review): the headers are empty, and the visible code imports
# fake_useragent.UserAgent without using it -- presumably a User-Agent
# (and possibly cookies) was meant to be supplied here; confirm.
headers = {}
url = "https://movie.douban.com/review/best"
for page in range(10):
    print(f"正在爬取第{page + 1}页")
    # The listing is paged through a ``start`` offset in steps of 20.
    start = page * 20
    params = {"start": str(start)}
    # Without a timeout, requests waits indefinitely on a stalled connection.
    response = requests.get(url, headers=headers, params=params, timeout=10)
    # Stop on an HTTP error instead of silently parsing an error page.
    response.raise_for_status()
    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    son_divs = soup.find_all('div', class_='main review-item')
    for son_div in tqdm(son_divs):
        # The first link of a review item wraps the movie poster image.
        poster = son_div.a.img
        movie_name = poster['alt']
        movie_name_img_url = poster['src']
        writer_name = son_div.find("a", class_="name").text
        # The listing only carries a short excerpt; the full text needs a
        # separate request keyed by the review id.
        data_rid = son_div.find("div", class_="review-short")["data-rid"]
        comment_text = get_detail(data_rid)
        create_time = son_div.find('span', class_='main-meta').text
        favour = _vote_count(son_div, '有用')
        tread = _vote_count(son_div, '没用')
        info = {
            "电影名称": movie_name,
            "电影图片": movie_name_img_url,
            "影评人": writer_name,
            "评论内容": comment_text,
            "发布时间": create_time,
            "赞": favour,
            "踩": tread,
        }
        infos.append(info)
        # NOTE(review): the original indentation was lost; the pause is
        # applied per review because each one triggers a detail request.
        time.sleep(0.3)

df = pd.DataFrame(infos)
print(df.head())
df.to_excel('豆瓣影评.xlsx', index=False)