这两天学了原生爬虫和多线程~写了自己的第一个爬虫小程序~感觉蛮好的

七夕快到了,好想和女神出去玩哦,残念~

豆瓣TOP250 多线程版

已知bug 2号表格存储会附带单线程数据

 

"""

计算第06次作业爬虫运行的时间,然后使用多线程优化爬虫,再次计算多线程爬虫运行完的时间。

完成作业就行,格式、方法、手段不限。

"""
import os
import requests
import re
from openpyxl import Workbook
import json
import csv
import time
import concurrent.futures


class DouBan:
    """Crawler for the Douban Top-250 movie chart.

    Downloads each page of the chart, extracts (title, director line,
    rating, one-line quote) for every movie, and appends each record to
    ``results.csv``, ``results.json`` and an in-memory openpyxl workbook.
    """

    # NOTE(review): the original pattern was corrupted (the HTML tags were
    # stripped out of the source file), so this is a reconstruction of the
    # usual Douban Top-250 markup — confirm against the live page.
    MOVIE_RE = re.compile(
        r'<span class="title">(.*?)</span>.*?'
        r'<p class="">(.*?)</p>.*?'
        r'property="v:average">(.*?)</span>.*?'
        r'<span class="inq">(.*?)</span>',
        re.S)

    def __init__(self):
        self.base_url = "https://movie.douban.com/top250"
        self.wb = Workbook()
        # create_sheet() adds a sheet next to the default one; records go here.
        self.sheet = self.wb.create_sheet('DouBan')

    def get_html(self, url):
        """Download one chart page and hand the decoded HTML to the parser.

        :param url: full page URL including the ``start`` offset.
        """
        headers = {
            # Desktop Chrome UA (the original had a stray ')' at the end).
            "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/75.0.3770.142 Safari/537.36'
        }
        # Timeout added so a stalled connection cannot hang a worker thread.
        response = requests.get(url, headers=headers, timeout=10)
        html = response.content.decode()
        self.parsel_html(html)

    def parsel_html(self, html):
        """Extract every movie record from one page's HTML and persist it.

        :param html: raw page source of one Top-250 page.
        """
        for name, director, rating, quote in self.MOVIE_RE.findall(html):
            # The director block is surrounded by whitespace/newlines in the
            # markup; normalise it to one compact string.
            director = director.strip().replace(' ', '')
            mark = '评分:{}'.format(rating)
            comment = '“{}”'.format(quote)
            print('获取数据:', name, director, mark, comment)
            self.save_data(name, director, mark, comment)

    def save_data(self, name, director, mark, comment):
        """Persist one record to CSV, the worksheet and JSON.

        Bug fix: the original stored ``[name, mark, director, comment]``
        while :meth:`save_json` labelled index 1 as the director, so the
        rating and director columns were swapped in every output.
        """
        data = [name, director, mark, comment]
        self.save_csv(data)
        self.sheet.append(data)
        self.save_json(data)

    def save_csv(self, movie_data):
        """Append one ``|``-delimited row to results.csv."""
        with open('results.csv', 'a', newline='', encoding='utf-8') as f:
            csv.writer(f, delimiter='|').writerow(movie_data)

    def save_json(self, movie_data):
        """Append one record to results.json in JSON-Lines form.

        A trailing newline is written after each object; the original
        concatenated objects back-to-back, which no JSON parser can read.
        """
        record = {
            '电影名': movie_data[0],
            '导演': movie_data[1],
            '评分': movie_data[2],
            "评论": movie_data[3],
        }
        with open('results.json', mode='a', encoding='utf-8') as f:
            f.write(json.dumps(record, ensure_ascii=False) + '\n')

    def main(self):
        """Crawl all ten pages sequentially (single-threaded mode)."""
        # Top-250 = 10 pages of 25; the original range(0, 275, 25) requested
        # an eleventh, empty page.
        for offset in range(0, 250, 25):
            url = self.base_url + '?start={}&filter='.format(offset)
            # Integer division: the original printed "第1.0页" etc.
            print('单线程模式:正在爬取第{}页。'.format(offset // 25 + 1))
            self.get_html(url)

    def main2(self, page):
        """Crawl a single page; used as the thread-pool worker.

        :param page: ``start`` offset of the page (0, 25, ... 225).
        """
        url = "https://movie.douban.com/top250?start={}&filter=".format(page)
        print('多线程模式:开始爬取第{}页。'.format(page // 25 + 1))
        self.get_html(url)


if __name__ == '__main__':
    douban = DouBan()

    # Single-threaded pass over all ten pages (offsets 0..225; the original
    # range(0, 275, 25) requested an eleventh, empty page).
    start_time = time.perf_counter()  # perf_counter: monotonic, best for intervals
    douban.main()
    print("单线程模式用时:", time.perf_counter() - start_time)

    # Multi-threaded pass: one worker per page offset. map() accepts any
    # iterable, so the range is passed directly instead of building a list.
    start_time = time.perf_counter()
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        executor.map(douban.main2, range(0, 250, 25))
    print("多线程模式用时:", time.perf_counter() - start_time)