2. HTML 页面解析


查找方法:find(): 查找第一个匹配到的节点。find_all(): 查找所有匹配到的节点,并返回一个列表。

遍历方法:contents: 返回当前节点的直接子节点列表。 children: 返回当前节点的直接子节点的迭代器。descendants: 返回当前节点的所有子孙节点的迭代器。

parent: 返回当前节点的父节点。parents: 返回当前节点的所有祖先节点的迭代器。





1 import requests # 模拟浏览器进行网络请求
2 from lxml import etree # 进行数据预处理
3 import csv # 进行写入csv文件


# Download the page; `url` and `headers` are defined elsewhere in the script.
resp = requests.get(url, headers=headers)




# Parse the response text into an element tree that can be queried with XPath.
resp_html = etree.HTML(resp.text)


# Select every <li> under the <ul class="thrui"> element.
# Plain ASCII quotes are required here: typographic quotes are a SyntaxError.
resp_list = resp_html.xpath("//ul[@class='thrui']/li")


 1 for li in resp_list: 2 day_weather_info = {} 3 # 日期4 day_weather_info['date_time'] = li.xpath("./div[1]/text()")[0].split(' ')[0]5 # 最高气温 (包含摄氏度符号)6 high = li.xpath("./div[2]/text()")[0]7 day_weather_info['high'] = high[:high.find('℃')]8 # 最低气温9 low = li.xpath("./div[3]/text()")[0]
10 day_weather_info['low'] = low[:low.find('℃')]
11 # 天气
12 day_weather_info['weather'] = li.xpath("./div[4]/text()")[0]
13 weather_info.append(day_weather_info)
14 return weather_info



# Walk through the months in order; range(1, 13) yields 1..12.
for month in range(1, 13):
    # Zero-pad the month so the page key reads YYYYMM (e.g. 202201).
    weather_time = f'2022{month:02d}'
    print(weather_time)
    url = f'https://lishi.tianqi.com/quanzhou/{weather_time}.html'
    # Scrape this month's weather records.
    weather = getWeather(url)
    # Keep the month's records in the overall list.
    weathers.append(weather)
print(weathers)

# Write all the data in a single pass.
# The file is later read back with encoding='gb18030', so write it with the
# same encoding instead of relying on the platform default.
with open("weather.csv", "w", newline='', encoding='gb18030') as csvfile:
    writer = csv.writer(csvfile)
    # Header row first: date, high, low, weather.
    writer.writerow(["日期", "最高气温", "最低气温", '天气'])
    # writerows() takes one sequence per row; flatten months into days.
    writer.writerows(
        list(day_weather_dict.values())
        for month_weather in weathers
        for day_weather_dict in month_weather
    )
import sqlite3




1 import requests
2 from lxml import etree
3 import csv
4 from wordcloud import WordCloud
5 import matplotlib.pyplot as plt


 1 # 从URL获取天气信息的函数2 def getWeather(url): 3     weather_info = []  # 存储天气信息的列表4     headers = { 5         'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1'6     }7     resp = requests.get(url, headers=headers)  # 发送GET请求到指定的URL8     resp_html = etree.HTML(resp.text)  # 解析响应的HTML9     resp_list = resp_html.xpath("//ul[@class='thrui']/li")  # 使用XPath选择器提取天气信息列表
10     for li in resp_list:
11         day_weather_info = {}  # 存储每天天气信息的字典
12         day_weather_info['date_time'] = li.xpath("./div[1]/text()")[0].split(' ')[0]  # 提取日期时间并存入字典
13         high = li.xpath("./div[2]/text()")[0]  # 提取最高温度
14         day_weather_info['high'] = high[:high.find('℃')]  # 去除温度单位并存入字典
15         low = li.xpath("./div[3]/text()")[0]  # 提取最低温度
16         day_weather_info['low'] = low[:low.find('℃')]  # 去除温度单位并存入字典
17         day_weather_info['weather'] = li.xpath("./div[4]/text()")[0]  # 提取天气情况并存入字典
18         weather_info.append(day_weather_info)  # 将每天天气信息字典添加到天气信息列表中
19     return weather_info
def main():
    """Scrape the twelve 2022 monthly pages and show a word cloud of the weather."""
    weathers = []  # one list of daily records per month
    for month in range(1, 13):
        # Zero-pad the month so the page key reads YYYYMM (e.g. 202201).
        weather_time = f'2022{month:02d}'
        print(weather_time)
        url = f'https://lishi.tianqi.com/quanzhou/{weather_time}.html'
        weathers.append(getWeather(url))
    print(weathers)

    # Build the text in one pass; each condition is followed by a space,
    # exactly as the incremental concatenation produced.
    weather_data = "".join(
        day_weather_dict['weather'] + " "
        for month_weather in weathers
        for day_weather_dict in month_weather
    )

    # Raw string: the Windows path is full of backslashes that would otherwise
    # be parsed as (invalid) escape sequences. The resulting value is unchanged.
    wordcloud = WordCloud(
        font_path=r'C:\Windows\Fonts\微软雅黑\msyh.ttc',
        width=800,
        height=400,
        font_step=1,
        prefer_horizontal=0.9,
    ).generate(weather_data)
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()


if __name__ == '__main__':
    main()


import sqlite3
from contextlib import closing


def create_weather_table():
    """Create the ``weather`` table in ``weather.db`` unless it already exists."""
    # closing() releases the connection even when a statement raises; the
    # inner ``with conn`` commits on success and rolls back on error.
    with closing(sqlite3.connect('weather.db')) as conn:
        with conn:
            conn.execute('''CREATE TABLE IF NOT EXISTS weather (date_time TEXT,high TEXT,low TEXT,weather TEXT)''')


def insert_weather_data(weather_data):
    """Insert every daily record into the ``weather`` table of ``weather.db``.

    Args:
        weather_data: Iterable of months, each an iterable of dicts with the
            keys ``date_time``, ``high``, ``low`` and ``weather``.

    Raises:
        KeyError: If a record lacks one of the four keys.
    """
    # Flatten months into rows first so a malformed record fails before the
    # database is touched.
    rows = [
        (
            day_weather_dict['date_time'],
            day_weather_dict['high'],
            day_weather_dict['low'],
            day_weather_dict['weather'],
        )
        for month_weather in weather_data
        for day_weather_dict in month_weather
    ]
    with closing(sqlite3.connect('weather.db')) as conn:
        with conn:
            # Parameterized bulk insert in a single transaction.
            conn.executemany("INSERT INTO weather VALUES (?, ?, ?, ?)", rows)


def main():
    """Scrape the twelve 2022 monthly pages and store the records in SQLite."""
    create_weather_table()
    weathers = []  # one list of daily records per month
    for month in range(1, 13):
        # Zero-pad the month so the page key reads YYYYMM (e.g. 202201).
        weather_time = f'2022{month:02d}'
        print(weather_time)
        url = f'https://lishi.tianqi.com/quanzhou/{weather_time}.html'
        weather = getWeather(url)
        weathers.append(weather)
    print(weathers)
    insert_weather_data(weathers)


if __name__ == '__main__':
    main()




1 import pandas as pd
2 from pyecharts import options as opts
3 from pyecharts.charts import Pie, Bar, Timeline, Line, Scatter


# Load the exported CSV; it is decoded as gb18030.
# Plain ASCII quotes are required: typographic quotes are a SyntaxError.
df = pd.read_csv('weather.csv', encoding='gb18030')


# Convert the date column from strings to timestamps in one vectorized call.
df['日期'] = pd.to_datetime(df['日期'])
# The aggregation that follows groups on 'month', so derive it from the date.
df['month'] = df['日期'].dt.month

使用GroupBy聚合对象 以及size().reset_index()方法来将每种天气出现的次数等数据进行分组,统计。

# Count how many rows fall into each (month, weather) pair.
df_agg = df.groupby(['month', '天气']).size().reset_index()
print(df_agg)


# Give the three columns short names.
df_agg.columns = ['month', 'tianqi', 'count']


# Month 1 as [weather, count] pairs, largest count first.
print(
    df_agg.loc[df_agg['month'] == 1, ['tianqi', 'count']]
    .sort_values(by='count', ascending=False)
    .values
    .tolist()
)


 1 # 画图2 # 实例化一个时间序列的对象3 timeline = Timeline() 4 # 播放参数:设置时间间隔 1s  单位是:ms(毫秒)5 timeline.add_schema(play_interval=1000)    # 单位是:ms(毫秒)6 7 # 循环遍历df_agg['month']里的唯一值8 for month in df_agg['month'].unique():9     data = (
11         df_agg[df_agg['month']==month][['tianqi','count']]
12         .sort_values(by='count',ascending=True)
13 .values.tolist()
14 )
15     # print(data)
16     # 绘制柱状图
17     bar = Bar()
18     # x轴是天气名称
19     bar.add_xaxis([x[0] for x in data])
20     # y轴是出现次数
21     bar.add_yaxis('',[x[1] for x in data])
23     # 让柱状图横着放
24 bar.reversal_axis()
25     # 将计数标签放置在图形右边
26     bar.set_series_opts(label_opts=opts.LabelOpts(position='right'))
27     # 设置下图表的名称
28     bar.set_global_opts(title_opts=opts.TitleOpts(title='泉州2022年每月天气变化 '))
29     # 将设置好的bar对象放置到时间轮播图当中,并且标签选择月份 格式为: 数字月
30     timeline.add(bar, f'{month}月')
32 # 将设置好的图表保存为'weathers.html'文件
33 timeline.render('weathers1.html')



 1 # 画图2 # 实例化一个时间序列的对象3 timeline = Timeline() 4 # 播放参数:设置时间间隔 1s 单位是:ms(毫秒)5 timeline.add_schema(play_interval=1000)  # 单位是:ms(毫秒)6 7 # 循环遍历df_agg['tianqi']里的唯一值(天气类型)8 for tianqi in df_agg['tianqi'].unique():9     data = (
10         df_agg[df_agg['tianqi'] == tianqi][['month', 'count']]
11         .sort_values(by='month', ascending=True)
12 .values.tolist()
13 )
14     # print(data)
15     # 绘制折线图
16     line = Line()
17     # x轴是月份
18     line.add_xaxis([x[0] for x in data])
19     # y轴是出现次数
20     line.add_yaxis(tianqi, [x[1] for x in data], is_smooth=True)
22     # 设置图线平滑曲线
23 line.set_series_opts(
24         markpoint_opts=opts.MarkPointOpts(
25             data=[opts.MarkPointItem(type_="max", name="最大值")]
26 )
27 )
29     # 设置下图表的名称
30 line.set_global_opts(
31         title_opts=opts.TitleOpts(title='泉州2022年天气趋势'),
32         datazoom_opts=opts.DataZoomOpts(type_="slider", range_start=0, range_end=100),
33 )
35     # 将设置好的line对象放置到时间轮播图中,并且标签选择天气类型
36 timeline.add(line, tianqi)
38 # 将设置好的时间轮播图渲染为HTML文件
39 timeline.render("weather_trend.html")


 1 # 画图2 # 实例化一个散点图对象3 scatter = Scatter() 4 # 播放参数:设置时间间隔 1s 单位是:ms(毫秒)5 timeline.add_schema(play_interval=1000) # 单位是:ms(毫秒)6 7 # 循环遍历df_agg['month']里的唯一值8 for month in df_agg['month'].unique():9     data = (
10         df_agg[df_agg['month']==month][['tianqi','count']]
11         .sort_values(by='count',ascending=True)
12 .values.tolist()
13 )
14     # 绘制散点图
15     scatter = Scatter()
16     # x轴是天气名称
17     scatter.add_xaxis([x[0] for x in data])
18     # y轴是出现次数
19     scatter.add_yaxis('',[x[1] for x in data])
21     # 设置下图表的名称
22     scatter.set_global_opts(title_opts=opts.TitleOpts(title=f'{month}月天气散点图'))
24     # 将设置好的scatter对象放置到时间轮播图当中,并且标签选择月份 格式为: 数字月
25     timeline.add(scatter, f'{month}月')
27 # 将设置好的时间轮播图渲染为html文件
28 timeline.render('scatter_timeline.html')





