import base64
import csv
import os
import time

import requests
from lxml import etree
class TqSpider:
    """Scraper for the local weather service at http://127.0.0.1:5000/.

    Collects per-city monthly summaries and daily weather tables and
    writes them as CSV files under ``day_data/`` and ``month_data/``.
    """

    def __init__(self):
        # Landing page; the city list is scraped from it.
        self.index_url = 'http://127.0.0.1:5000/'
        # Headers the server checks: a salt cookie, a Referer simulating
        # navigation from the index page, and a browser User-Agent.
        self.headers = {
            "Cookie": 'salt="\302\210D\303\2609\302\221\007\302\230\302\211f9\303\254J:U\027\303\205V\302\276\302\213\303\257\303\227\303\230\303\223\303\246\302\230*4"',
            "Referer": "http://127.0.0.1:5000/",
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"
        }
        # Data endpoint template: city name, month page (1-12), time token.
        self.base_url = "http://127.0.0.1:5000/city_weather?city={}&page={}&TOKEN={}"
        # Output directories for daily and monthly CSV files.
        self.day_dir = "day_data"
        self.month_dir = "month_data"
        # exist_ok=True avoids the check-then-create race of the original
        # `if not os.path.exists(...): os.makedirs(...)` pattern.
        os.makedirs(self.day_dir, exist_ok=True)
        os.makedirs(self.month_dir, exist_ok=True)
def get_html(self, url): # 发送请求 resp = requests.get(url=url, headers=self.headers) return resp.content.decode()
# 城市(city) def parse_citys(self): # 首页 index_content = self.get_html(self.index_url) # 返回 HTML 标签对象 p = etree.HTML(index_content) # city 列表 city_list = p.xpath("//div[@class='right']/span/button/text()") print(city_list) return city_list
def encode_time(self, t): byte_t = t.encode('utf-8') encode_t = base64.b64encode(byte_t) encode_t_s = encode_t.decode('utf-8') print(encode_t_s) return encode_t_s
def parse_month_data(self, p, city, page): """解析月汇总数据""" # 平均高温 avg_high_gaowen = p.xpath("//ul/li/span[@class='high-avg']/span[@class='digit']/text()")[0] # 平均低温 avg_high_diwen = p.xpath("//ul/li/span[@class='low-avg']/span[@class='digit']/text()")[0] # 极端高温 avg_high_jdgw = p.xpath("//ul/li[2]/span/span[@class='digit']/text()")[0] # 极端低温 avg_high_jddw = p.xpath("//ul/li[3]/span/span[@class='digit']/text()")[0] # 平均空气质量指数 avg_high_pjkqz = p.xpath("//ul/li[4]/span/span[@class='digit']/text()")[0] # 空气最好(01/05) avg_high_kqzh = p.xpath("//ul/li[5]/span/span[@class='digit']/text()")[0] avg_high_kqzh_2 = p.xpath("//ul/li[5]/span/text()")[0].strip()[5:-1] # 空气最差(01/03) avg_high_kqzc = p.xpath("//ul/li[6]/span/span[@class='digit']/text()")[0] avg_high_kqzc_2 = p.xpath("//ul/li[6]/span/text()")[0].strip()[5:-1] data = [city, '%d月' % page, avg_high_gaowen, avg_high_diwen, avg_high_jdgw, avg_high_jddw, avg_high_pjkqz, avg_high_kqzh, avg_high_kqzh_2, avg_high_kqzc, avg_high_kqzc_2] print(data) return data
def parse_day_data(self, p, city): """解析每日数据""" day_datas = [] tr_list = p.xpath("//table[@class='tianqi']//tr")[1:] for tr in tr_list: #日期 riqi = tr.xpath("./td[1]/text()")[0] # 最高气温 zgqw = tr.xpath("./td[2]/text()")[0] # 最低气温 zdqw = tr.xpath("./td[3]/text()")[0] # 天气 tq = tr.xpath("./td[4]/text()")[0] # 风向 fx = tr.xpath("./td[5]/text()")[0] data = [city, riqi, zgqw, zdqw, tq, fx] print(data) # 列表 day_datas.append(data) return day_datas
def parse_city_data(self, city): """传入url,爬取数据""" day_data_list, month_data_list = [], [] for page in range(1, 13): now = str(int(time.time() * 1000)) # 加密 token = self.encode_time(now) # 拼接 url = self.base_url.format(city, page, token) # 发送请求 page_content = self.get_html(url) # 休眠 time.sleep(1.6) p = etree.HTML(page_content) # 解析一个月汇总数据 one_month_data = self.parse_month_data(p, city, page) month_data_list.append(one_month_data) # 解析一个月中每天数据 day_datas = self.parse_day_data(p, city) day_data_list.extend(day_datas) # 将一个城市数据存入csv文件中 day_path = os.path.join(self.day_dir, '%s_day.csv' % city) month_path = os.path.join(self.month_dir, '%s_month.csv' % city) with open(day_path, 'w', encoding='utf-8', newline='') as f: writer = csv.writer(f) writer.writerow(['城市', '日期', '最高气温', '最低气温', '天气', '风向']) writer.writerows(day_data_list) with open(month_path, 'w', encoding='utf-8', newline='') as f: writer = csv.writer(f) writer.writerow(['城市', '月份', '平均高温', '平均低温', '极端高温', '极端低温', '平均空气质量指数', '空气最好', '空气最好日期', '空气最差', '空气最差日期']) writer.writerows(month_data_list)
def get_all_data(self): # huoqu suoyou chengshi citys = self.parse_citys() for city in citys: self.parse_city_data(city)
def run(self): self.get_all_data()
if __name__ == '__main__':
    # Build the spider and start the full crawl.
    TqSpider().run()