# time.sleep -- asyncio.sleep
# Coroutine demo: asyncio.sleep is used where a blocking script would call
# time.sleep.
import asyncio
import time

start_time = time.time()


async def get_request(url):
    """Wait two seconds with asyncio.sleep, then report *url* as downloaded."""
    await asyncio.sleep(2)
    print(url, '下载完成!')


urls = [
    'www.1.com',
    'www.2.com',
]

# Create the event loop explicitly before building the tasks: letting
# asyncio.ensure_future()/asyncio.get_event_loop() create one implicitly is
# deprecated and fails on recent Python versions.
loop = asyncio.new_event_loop()  # event loop object

task_lst = []  # list of task objects
for url in urls:
    c = get_request(url)  # coroutine object
    task = loop.create_task(c)  # task object
    # task.add_done_callback(...)  # bind a callback
    task_lst.append(task)

try:
    # Register the tasks with the loop; asyncio.wait suspends until all finish.
    loop.run_until_complete(asyncio.wait(task_lst))
finally:
    loop.close()


# Thread pool + requests module
# Thread pool
import time
from multiprocessing.dummy import Pool

start_time = time.time()
url_list = [
    'www.1.com',
    'www.2.com',
    'www.3.com',
]


def get_request(url):
    """Simulate a blocking two-second download of *url*."""
    print('正在下载...', url)
    time.sleep(2)
    print('下载完成!', url)


# The context manager releases the pool's worker threads once map() returns.
with Pool(3) as pool:
    pool.map(get_request, url_list)

print('总耗时:', time.time() - start_time)


# Two ways to make a crawler more efficient
# Start a Flask server
from flask import Flask
import time

app = Flask(__name__)


@app.route('/bobo')
def index_bobo():
    """Answer /bobo after a two-second delay."""
    time.sleep(2)
    return 'hello bobo!'


@app.route('/jay')
def index_jay():
    """Answer /jay after a two-second delay."""
    time.sleep(2)
    return 'hello jay!'


@app.route('/tom')
def index_tom():
    """Answer /tom after a two-second delay."""
    time.sleep(2)
    return 'hello tom!'


# Start the server only when this snippet is run as a script.
if __name__ == '__main__':
    app.run(threaded=True)


# aiohttp module + single-threaded, multi-task asynchronous coroutines
import asyncio
import aiohttp
import requests
import time

start = time.time()


async def get_page(url):
    """Fetch *url* with aiohttp, print the response body and return it."""
    # requests-based variant, left disabled by the original author:
    # page_text = requests.get(url=url).text
    # print(page_text)
    # return page_text
    async with aiohttp.ClientSession() as s:  # create a session object
        async with s.get(url=url) as response:
            page_text = await response.text()
            print(page_text)
            return page_text


# NOTE(review): the URL strings were blank in the original and cannot be
# requested. They are restored on the assumption that they target the Flask
# demo routes above at Flask's default address (127.0.0.1:5000) -- confirm.
urls = [
    'http://127.0.0.1:5000/bobo',
    'http://127.0.0.1:5000/jay',
    'http://127.0.0.1:5000/tom',
]

# Explicit loop: see the first snippet for why the implicit one is avoided.
loop = asyncio.new_event_loop()

tasks = []
for url in urls:
    c = get_page(url)
    task = loop.create_task(c)
    tasks.append(task)

try:
    loop.run_until_complete(asyncio.wait(tasks))
finally:
    loop.close()

end = time.time()
print(end - start)

# Runs asynchronously! Sample output:
# hello tom!
# hello bobo!
# hello jay!
# 2.0311079025268555

'''
Single-threaded, multi-task asynchronous coroutines with the aiohttp module,
with the data parsed using XPath.
'''
import aiohttp
import asyncio
from lxml import etree
import time

start = time.time()


# Special function: sends the request and captures the response data.
# Note the `async with` and `await` keywords.
async def get_request(url):
    """Fetch *url* with aiohttp and return the page source."""
    async with aiohttp.ClientSession() as s:
        async with s.get(url=url) as response:
            page_text = await response.text()
            return page_text  # page source


# Callback: parses the downloaded data.
def parse(task):
    """Print the text nodes found under /html/body/ul in the task's result."""
    page_text = task.result()
    tree = etree.HTML(page_text)
    msg = tree.xpath('/html/body/ul//text()')
    print(msg)


# NOTE(review): the URL strings were blank in the original and cannot be
# requested. They are restored on the assumption that they target the Flask
# demo routes defined earlier in this file at Flask's default address
# (127.0.0.1:5000). The XPath above expects a <ul> in the page body, so the
# intended targets may have been different pages -- confirm.
urls = [
    'http://127.0.0.1:5000/bobo',
    'http://127.0.0.1:5000/jay',
    'http://127.0.0.1:5000/tom',
]

# Create the event loop explicitly before building the tasks: letting
# asyncio.ensure_future()/asyncio.get_event_loop() create one implicitly is
# deprecated and fails on recent Python versions.
loop = asyncio.new_event_loop()

tasks = []
for url in urls:
    c = get_request(url)
    task = loop.create_task(c)
    task.add_done_callback(parse)  # bind the callback
    tasks.append(task)

try:
    loop.run_until_complete(asyncio.wait(tasks))
finally:
    loop.close()

end = time.time()
print(end - start)


# requests module + thread pool
import time
import requests
from multiprocessing.dummy import Pool

start = time.time()

# NOTE(review): blank in the original; restored to the Flask demo routes
# (the sample output below matches their responses) -- confirm host/port.
urls = [
    'http://127.0.0.1:5000/bobo',
    'http://127.0.0.1:5000/jay',
    'http://127.0.0.1:5000/tom',
]


def get_request(url):
    """Download *url* with requests, print the body and return it."""
    page_text = requests.get(url=url).text
    print(page_text)
    return page_text


# The context manager releases the pool's worker threads once map() returns.
with Pool(3) as pool:
    pool.map(get_request, urls)

end = time.time()
print('总耗时:', end - start)

# Achieves asynchronous requests. Sample output:
# hello jay!
# hello bobo!
# hello tom!
# 总耗时: 2.0467123985290527

# Summary
