这两天学了原生爬虫和多线程~写了自己的第一个爬虫小程序~感觉蛮好的
七夕快到了,好想和女神出去玩哦,残念~
豆瓣TOP250 多线程版
已知 bug:2 号表格存储时会附带单线程模式的数据。
""" 计算第06次作业爬虫运行的时间,然后使用多线程优化爬虫,再次计算多线程爬虫运行完的时间。 完成作业就行,格式、方法、手段不限。 """ import os import requests import re from openpyxl import Workbook import json import csv import time import concurrent.futures class DouBan: def __init__(self): self.base_url = "https://movie.douban.com/top250" self.wb = Workbook() self.sheet = self.wb.create_sheet('DouBan') def get_html(self, url): headers = { "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) ' 'Chrome/75.0.3770.142 Safari/537.36) ' } # Chrome浏览器 response = requests.get(url, headers=headers) html = response.content.decode() # print(html) self.parsel_html(html) def parsel_html(self, html): # print(html) temp = re.findall( '(.*?).*? (.*?) .*? .*?(.*?)', html, re.S) # print(temp) for i in temp: # 依次向下传递 print('获取数据:', i[0], i[1].strip().replace(' ', ''), '评分:{}'.format(i[2]), '“{}”'.format(i[3])) self.save_data(i[0], i[1].strip().replace(' ', ''), '评分:{}'.format(i[2]), '“{}”'.format(i[3])) def save_data(self, name, director, mark, comment): data = [name, mark, director, comment] self.save_csv(data) self.sheet.append(data) self.save_json(data) def save_csv(self, movie_data): with open('results.csv', 'a', newline='', encoding='utf-8') as f: spamwriter = csv.writer(f, delimiter='|') spamwriter.writerow(movie_data) def save_json(self, movie_data): data = { '电影名': movie_data[0], '导演': movie_data[1], '评分': movie_data[2], "评论": movie_data[3] } json_temp = json.dumps(data, ensure_ascii=False) # 加个循环判断 with open('results.json', mode='a', encoding='utf-8') as f: f.write(json_temp) def main(self): for i in range(0, 275, 25): url = self.base_url + '?start={}&filter='.format(i) # print(i) print('单线程模式:正在爬取第{}页。'.format(i / 25 + 1)) self.get_html(url) # if os.path.isfile("results.xlsx"): # self.wb.save("results(1).xlsx") # else: # self.wb.save("results.xlsx") # 保存workbook def main2(self, page): self.wb.close() url = "https://movie.douban.com/top250?start={}&filter=".format(page) 
print('多线程模式:开始爬取第{}页。'.format(page / 25 + 1)) self.get_html(url) # if os.path.isfile("results.xlsx"): # self.wb.save("results(1).xlsx") # else: # self.wb.save("results.xlsx") # 虽然有点多余 但还是记一下os.path这个方法吧 if __name__ == '__main__': douban = DouBan() start_time = time.time() douban.main() print("单线程模式用时:", time.time() - start_time) start_time = time.time() with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor: executor.map(douban.main2, [i for i in range(0, 275, 25)]) print("多线程模式用时:", time.time() - start_time)
退出登录?