这两天学了原生爬虫和多线程~写了自己的第一个爬虫小程序~感觉蛮好的
七夕快到了,好想和女神出去玩哦,残念~
豆瓣TOP250 多线程版
已知bug 2号表格存储会附带单线程数据
"""
计算第06次作业爬虫运行的时间,然后使用多线程优化爬虫,再次计算多线程爬虫运行完的时间。
完成作业就行,格式、方法、手段不限。
"""
import concurrent.futures
import csv
import json
import os
import re
import threading
import time

import requests
from openpyxl import Workbook
class DouBan:
    """Scraper for the Douban Top 250 movie list.

    Each list page is downloaded, parsed with a regular expression, and every
    movie found is appended to ``results.csv``, ``results.json`` (one JSON
    object per line) and an in-memory openpyxl worksheet named ``DouBan``.
    The workbook itself is never written to disk by this class.
    """

    # Top 250 at 25 movies per page -> offsets 0, 25, ..., 225.
    PAGE_SIZE = 25
    TOTAL_MOVIES = 250

    # NOTE(review): the original pattern was destroyed when this file was
    # extracted (its HTML tags were stripped). This one is reconstructed from
    # the Douban Top250 page markup and yields the four groups the code reads:
    # title, director/cast info, rating, one-line quote. Confirm it against
    # the live page. Entries without a quote will pair with the next entry's
    # quote, because the pattern requires all four fields.
    _MOVIE_RE = re.compile(
        r'<span class="title">(.*?)</span>.*?'
        r'<p class="">(.*?)</p>.*?'
        r'<span class="rating_num" property="v:average">(.*?)</span>.*?'
        r'<span class="inq">(.*?)</span>',
        re.S,
    )

    # Shared by all instances: the output files are addressed by fixed names,
    # and main2() is driven from a thread pool, so writes must not interleave.
    _save_lock = threading.Lock()

    def __init__(self):
        self.base_url = "https://movie.douban.com/top250"
        self.wb = Workbook()
        self.sheet = self.wb.create_sheet('DouBan')

    def get_html(self, url):
        """Download ``url`` and hand the decoded page to :meth:`parsel_html`."""
        headers = {
            "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/75.0.3770.142 Safari/537.36) '
        }  # Chrome browser
        # A timeout keeps a stalled connection from hanging a worker forever.
        response = requests.get(url, headers=headers, timeout=10)
        html = response.content.decode()
        self.parsel_html(html)

    @staticmethod
    def _extract_movies(html):
        """Return ``(title, info, rating, quote)`` tuples found in ``html``.

        ``info`` is stripped and has its spaces removed; the other fields are
        returned exactly as captured. An empty list means nothing matched.
        """
        return [
            (title, info.strip().replace(' ', ''), rating, quote)
            for title, info, rating, quote in DouBan._MOVIE_RE.findall(html)
        ]

    def parsel_html(self, html):
        """Parse one list page and persist every movie found on it."""
        for title, info, rating, quote in self._extract_movies(html):
            mark = '评分:{}'.format(rating)
            comment = '“{}”'.format(quote)
            print('获取数据:', title, info, mark, comment)
            self.save_data(title, info, mark, comment)

    def save_data(self, name, director, mark, comment):
        """Append one movie to the CSV file, the worksheet and the JSON file.

        The CSV/worksheet column order is name, mark, director, comment.
        """
        row = [name, mark, director, comment]
        with self._save_lock:
            self.save_csv(row)
            self.sheet.append(row)
            # save_json expects [name, director, mark, comment]; passing
            # ``row`` unchanged would swap the director and rating fields.
            self.save_json([name, director, mark, comment])

    def save_csv(self, movie_data):
        """Append ``movie_data`` as one ``|``-delimited row to results.csv."""
        with open('results.csv', 'a', newline='', encoding='utf-8') as f:
            csv.writer(f, delimiter='|').writerow(movie_data)

    def save_json(self, movie_data):
        """Append one movie to results.json as a single JSON line.

        ``movie_data`` is indexed as [name, director, mark, comment].
        """
        data = {
            '电影名': movie_data[0],
            '导演': movie_data[1],
            '评分': movie_data[2],
            "评论": movie_data[3]
        }
        json_temp = json.dumps(data, ensure_ascii=False)
        with open('results.json', mode='a', encoding='utf-8') as f:
            # One object per line keeps the appended file parseable.
            f.write(json_temp + '\n')

    def main(self):
        """Crawl all list pages sequentially (single-threaded mode)."""
        for start in range(0, self.TOTAL_MOVIES, self.PAGE_SIZE):
            url = self.base_url + '?start={}&filter='.format(start)
            print('单线程模式:正在爬取第{}页。'.format(start // self.PAGE_SIZE + 1))
            self.get_html(url)
        # The workbook is intentionally not saved here; the original save
        # calls were commented out.

    def main2(self, page):
        """Crawl the single list page at offset ``page`` (thread-pool worker).

        ``page`` is the ``start`` query offset (0, 25, 50, ...), not a page
        number.
        """
        url = self.base_url + "?start={}&filter=".format(page)
        print('多线程模式:开始爬取第{}页。'.format(page // self.PAGE_SIZE + 1))
        self.get_html(url)
if __name__ == '__main__':
    # Time a sequential crawl, then a thread-pool crawl of the same pages.
    # Top 250 at 25 movies per page -> start offsets 0, 25, ..., 225.
    page_offsets = range(0, 250, 25)

    douban = DouBan()
    start_time = time.time()
    douban.main()
    print("单线程模式用时:", time.time() - start_time)

    # Use a fresh instance so the threaded run's worksheet does not also hold
    # the rows collected by the single-threaded run. Both runs still append
    # to the same results.csv / results.json files.
    threaded_douban = DouBan()
    start_time = time.time()
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        # Consume the result iterator so an exception raised in a worker is
        # re-raised here instead of being silently discarded.
        list(executor.map(threaded_douban.main2, page_offsets))
    print("多线程模式用时:", time.time() - start_time)
评论 (0)