需要引入的库

  1. import requests
  2. from lxml import etree
  3. import time
  4. import base64
  5. import os
  6. import csv

代码部分

import requests
from lxml import etree
import time
import base64
import os
import csv

class TqSpider:
    """Weather spider for a local demo site (http://127.0.0.1:5000/).

    Scrapes the city list from the index page, then for every city fetches
    12 monthly pages, parsing both the monthly summary and the per-day
    weather table, and writes the results to CSV files under ``day_data/``
    and ``month_data/``.
    """

    def __init__(self):
        # Index (landing) page of the target site.
        self.index_url = 'http://127.0.0.1:5000/'
        # Request headers: the server validates the Cookie and Referer,
        # so both must accompany every request.
        self.headers = {
            "Cookie": 'salt="\302\210D\303\2609\302\221\007\302\230\302\211f9\303\254J:U\027\303\205V\302\276\302\213\303\257\303\227\303\230\303\223\303\246\302\230*4"',
            # Requests must appear to come from the index page 127.0.0.1:5000
            "Referer": "http://127.0.0.1:5000/",
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"
        }
        # Data endpoint template: city name, page (month 1-12), TOKEN.
        self.base_url = "http://127.0.0.1:5000/city_weather?city={}&page={}&TOKEN={}"
        # Output directory for per-day CSV files.
        self.day_dir = "day_data"
        # Output directory for per-month summary CSV files.
        self.month_dir = "month_data"
        # Create the output directories on first run.
        if not os.path.exists(self.day_dir):
            os.makedirs(self.day_dir)
        if not os.path.exists(self.month_dir):
            os.makedirs(self.month_dir)

    def get_html(self, url):
        """GET *url* with the spider's headers and return the decoded body."""
        resp = requests.get(url=url, headers=self.headers)
        return resp.content.decode()

    def parse_citys(self):
        """Scrape the index page and return the list of city names."""
        index_content = self.get_html(self.index_url)
        # Parse the HTML into an element tree for xpath queries.
        p = etree.HTML(index_content)
        # City names are the button labels inside the right-hand panel.
        city_list = p.xpath("//div[@class='right']/span/button/text()")
        print(city_list)
        return city_list

    def encode_time(self, t):
        """Base64-encode the timestamp string *t*; used as the TOKEN param."""
        byte_t = t.encode('utf-8')
        encode_t = base64.b64encode(byte_t)
        encode_t_s = encode_t.decode('utf-8')
        print(encode_t_s)
        return encode_t_s

    def parse_month_data(self, p, city, page):
        """Parse one month's summary row from parsed page *p*.

        Returns a list: [city, month label, avg high, avg low, extreme high,
        extreme low, avg AQI, best-air AQI, best-air date, worst-air AQI,
        worst-air date].
        """
        # Average high temperature
        avg_high_gaowen = p.xpath("//ul/li/span[@class='high-avg']/span[@class='digit']/text()")[0]
        # Average low temperature
        avg_high_diwen = p.xpath("//ul/li/span[@class='low-avg']/span[@class='digit']/text()")[0]
        # Extreme high temperature
        avg_high_jdgw = p.xpath("//ul/li[2]/span/span[@class='digit']/text()")[0]
        # Extreme low temperature
        avg_high_jddw = p.xpath("//ul/li[3]/span/span[@class='digit']/text()")[0]
        # Average air-quality index
        avg_high_pjkqz = p.xpath("//ul/li[4]/span/span[@class='digit']/text()")[0]
        # Best air quality value and its date, e.g. (01/05) — the slice
        # [5:-1] strips the surrounding label text and parenthesis.
        avg_high_kqzh = p.xpath("//ul/li[5]/span/span[@class='digit']/text()")[0]
        avg_high_kqzh_2 = p.xpath("//ul/li[5]/span/text()")[0].strip()[5:-1]
        # Worst air quality value and its date, e.g. (01/03)
        avg_high_kqzc = p.xpath("//ul/li[6]/span/span[@class='digit']/text()")[0]
        avg_high_kqzc_2 = p.xpath("//ul/li[6]/span/text()")[0].strip()[5:-1]
        data = [city, '%d月' % page, avg_high_gaowen, avg_high_diwen, avg_high_jdgw, avg_high_jddw, avg_high_pjkqz,
                avg_high_kqzh, avg_high_kqzh_2, avg_high_kqzc, avg_high_kqzc_2]
        print(data)
        return data

    def parse_day_data(self, p, city):
        """Parse the per-day weather table from parsed page *p*.

        Returns a list of rows: [city, date, high, low, weather, wind].
        """
        day_datas = []
        # Skip the header row of the weather table.
        tr_list = p.xpath("//table[@class='tianqi']//tr")[1:]
        for tr in tr_list:
            # Date
            riqi = tr.xpath("./td[1]/text()")[0]
            # Highest temperature
            zgqw = tr.xpath("./td[2]/text()")[0]
            # Lowest temperature
            zdqw = tr.xpath("./td[3]/text()")[0]
            # Weather
            tq = tr.xpath("./td[4]/text()")[0]
            # Wind direction
            fx = tr.xpath("./td[5]/text()")[0]
            data = [city, riqi, zgqw, zdqw, tq, fx]
            print(data)
            day_datas.append(data)
        return day_datas

    def parse_city_data(self, city):
        """Fetch all 12 monthly pages for *city* and write two CSV files."""
        day_data_list, month_data_list = [], []
        for page in range(1, 13):
            # Millisecond timestamp, base64-encoded as the request token.
            now = str(int(time.time() * 1000))
            token = self.encode_time(now)
            url = self.base_url.format(city, page, token)
            page_content = self.get_html(url)
            # Throttle requests so we do not hammer the server.
            time.sleep(1.6)
            p = etree.HTML(page_content)
            # Monthly summary row.
            one_month_data = self.parse_month_data(p, city, page)
            month_data_list.append(one_month_data)
            # Every day of this month.
            day_datas = self.parse_day_data(p, city)
            day_data_list.extend(day_datas)
        # Persist this city's data to CSV files.
        day_path = os.path.join(self.day_dir, '%s_day.csv' % city)
        month_path = os.path.join(self.month_dir, '%s_month.csv' % city)
        with open(day_path, 'w', encoding='utf-8', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(['城市', '日期', '最高气温', '最低气温', '天气', '风向'])
            writer.writerows(day_data_list)
        with open(month_path, 'w', encoding='utf-8', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(['城市', '月份', '平均高温', '平均低温', '极端高温', '极端低温', '平均空气质量指数',
                             '空气最好', '空气最好日期', '空气最差', '空气最差日期'])
            writer.writerows(month_data_list)

    def get_all_data(self):
        """Fetch and store the data for every city on the index page."""
        citys = self.parse_citys()
        for city in citys:
            self.parse_city_data(city)

    def run(self):
        """Entry point: crawl everything."""
        self.get_all_data()

# Script entry point: build the spider and crawl all cities.
if __name__ == '__main__':
    spider = TqSpider()
    spider.run()