Single-threaded: 58 s
import requests
from lxml import etree
from urllib import parse
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor

base_url = "http://blog.jobbole.com"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/71.0.3573.0 Safari/537.36"
}
def get_url():
    """Collect article links from the first 10 list pages."""
    article_urls = []
    page_url = "http://blog.jobbole.com/all-posts/page/{}/"
    for page in range(1, 11):
        response = requests.get(page_url.format(page), headers=headers)
        html = etree.HTML(response.text)
        # the last div under #archive is the pagination bar, so drop it
        divs = html.xpath("//*[@id='archive']/div")[:-1]
        for div in divs:
            try:
                href = div.xpath(".//div[2]/p/a[1]/@href")[0]
            except IndexError:
                continue
            article_urls.append(parse.urljoin(base_url, href))
    return article_urls
def get_title():
    for article_url in get_url():
        response = requests.get(article_url, headers=headers)  # headers must be passed as a keyword argument
        html = etree.HTML(response.text)
        title = html.xpath("//*[@class='entry-header']/h1/text()")
        print(title[0])
def main():
    import time
    start = time.time()
    get_title()
    print("Elapsed:", time.time() - start)

if __name__ == '__main__':
    main()
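Even before adding any concurrency, the single-threaded baseline can usually be tightened by reusing one requests.Session, which keeps the TCP connection to the server alive between requests instead of reconnecting for every article. A minimal sketch (the get_title_with_session name is mine, and I have not timed this variant):

session = requests.Session()
session.headers.update(headers)  # reuse the headers dict defined above

def get_title_with_session():
    for article_url in get_url():
        response = session.get(article_url)  # connection is kept alive across calls
        html = etree.HTML(response.text)
        title = html.xpath("//*[@class='entry-header']/h1/text()")
        print(title[0])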
Multi-threaded: 23.578604221343994 s

# threads communicate through a shared global list
import requests
from lxml import etree
from urllib import parse
from concurrent.futures import ThreadPoolExecutor

urls = []
base_url = "http://blog.jobbole.com"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/71.0.3573.0 Safari/537.36"
}
def get_url(x):
    """Producer: parse one list page and append its article links to the shared list."""
    global urls
    response = requests.get(x, headers=headers)
    html = etree.HTML(response.text)
    divs = html.xpath("//*[@id='archive']/div")[:-1]  # drop the pagination div
    for div in divs:
        try:
            href = div.xpath(".//div[2]/p/a[1]/@href")[0]
        except IndexError:
            continue
        urls.append(parse.urljoin(base_url, href))
def get_title():
    """Consumer: pop links off the shared list until it is exhausted."""
    global urls
    while True:
        try:
            i = urls.pop()
        except IndexError:  # nothing left to crawl, stop this worker
            break
        response = requests.get(i, headers=headers)
        html = etree.HTML(response.text)
        title = html.xpath("//*[@class='entry-header']/h1/text()")
        print(title[0])
if __name__ == '__main__':
    import time
    pool = ThreadPoolExecutor(5)
    start_time = time.time()
    page_url = "http://blog.jobbole.com/all-posts/page/{}/"
    for i in range(1, 11):
        pool.submit(get_url, page_url.format(i))
    for i in range(5):
        pool.submit(get_title)
    pool.shutdown()
    print("Elapsed:", time.time() - start_time)
Multi-process: 32.62045454978943 s

Note: this run reuses the same get_url/get_title functions, but processes do not share memory, so each worker process only sees the links appended to its own copy of the global urls list. That largely explains why it is slower than the threaded version; the reliable channel between processes is a multiprocessing queue or the futures' return values (see the sketch after the code).
if __name__ == '__main__':
    import time
    pool = ProcessPoolExecutor(5)
    start_time = time.time()
    page_url = "http://blog.jobbole.com/all-posts/page/{}/"
    for i in range(1, 11):
        pool.submit(get_url, page_url.format(i))
    for i in range(5):
        pool.submit(get_title)
    pool.shutdown()
    print("Elapsed:", time.time() - start_time)
Coroutines: 21.144105672836304 s

I was new to coroutines when I wrote this, so the code is rough, but it runs.
import aiohttp
import asyncio
from lxml import etree
from urllib import parse

page_url = 'http://blog.jobbole.com/all-posts/page/{}/'
page_urls = [page_url.format(i) for i in range(1, 11)]
base_url = "http://blog.jobbole.com"
sem = asyncio.Semaphore(10)  # semaphore caps the coroutine count so we don't crawl too fast
async def get_url(url):
    """Fetch one list page and return its HTML text."""
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            print(resp.status)
            return await resp.text()
def parse_html(data):
    """Extract article links from one list page."""
    urls = []
    html = etree.HTML(data)
    for div in html.xpath("//*[@id='archive']/div")[:-1]:  # drop the pagination div
        try:
            href = div.xpath(".//div[2]/p/a[1]/@href")[0]
        except IndexError:
            continue
        urls.append(parse.urljoin(base_url, href))
    return urls
async def crawl(page):
    """Fetch one list page, parse it, then fetch every article it links to."""
    data = await get_url(page)
    await request_url(parse_html(data))
async def request_url(urls):
    """Fetch each article and print its title, bounded by the semaphore."""
    async with aiohttp.ClientSession() as session:  # one session for all articles on this page
        while urls:
            i = urls.pop()
            async with sem:  # limit the number of in-flight requests
                try:
                    async with session.get(i) as resp:
                        data = await resp.text()
                        html = etree.HTML(data)
                        title = html.xpath("//*[@class='entry-header']/h1/text()")
                        print(title[0])
                except Exception as e:
                    print(e)
if __name__ == '__main__':
    import time
    start_t = time.time()
    loop = asyncio.get_event_loop()
    tasks = [crawl(page) for page in page_urls]
    loop.run_until_complete(asyncio.wait(tasks))
    loop.close()
    print("Elapsed:", time.time() - start_t)