Web Crawler: Comparing Single-Threaded, Multi-Threaded, Multi-Process, and Coroutine Efficiency

Web Crawler · 2019-02-15

Single-threaded: 58 s
import requests
from lxml import etree
from urllib import parse

# the pool executors are used by the multi-threaded / multi-process versions below
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor


base_url = "http://blog.jobbole.com"  
headers = {  
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "  
                  "Chrome/71.0.3573.0 Safari/537.36"  
}  


def get_url():
    t = []
    url1 = "http://blog.jobbole.com/all-posts/page/{}/"
    for page in range(1, 11):
        x = url1.format(page)
        response = requests.get(x, headers=headers)
        response_text = etree.HTML(response.text)
        lis = response_text.xpath("//*[@id='archive']/div")[:-1]
        for div in lis:
            try:
                href = div.xpath(".//div[2]/p/a[1]/@href")[0]
            except IndexError:
                continue
            a = parse.urljoin(base_url, href)
            t.append(a)
    return t


def get_title():
    x = get_url()
    for i in x:
        # headers must be passed as a keyword argument; positionally it
        # would be taken as the params argument
        response = requests.get(i, headers=headers)
        html = etree.HTML(response.text)
        title = html.xpath("//*[@class='entry-header']/h1/text()")
        print(title[0])


def main():
    import time
    start_ = time.time()
    get_title()
    print("Elapsed:", time.time() - start_)


if __name__ == '__main__':  
    main()  
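
Even before adding concurrency, one cheap serial optimization is connection reuse: every requests.get above opens a fresh TCP connection, while a requests.Session pools connections via HTTP keep-alive. A minimal sketch of that idea (the fetch helper and the trimmed headers dict are illustrative, not part of the original code):

import requests

headers = {"User-Agent": "Mozilla/5.0"}  # stands in for the full dict above

# One Session is shared by all requests, so repeated calls to the same
# host reuse the underlying TCP connection instead of re-handshaking.
session = requests.Session()
session.headers.update(headers)

def fetch(url):
    # drop-in replacement for requests.get(url, headers=headers)
    return session.get(url, timeout=10)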
Multi-threaded: 23.578604221343994 s
# communicate between threads via a shared global list
urls = []  
base_url = "http://blog.jobbole.com"  
headers = {  
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "  
                  "Chrome/71.0.3573.0 Safari/537.36"  
}

def get_url(x):
    response = requests.get(x, headers=headers)
    response_text = etree.HTML(response.text)
    lis = response_text.xpath("//*[@id='archive']/div")[:-1]
    for div in lis:
        try:
            href = div.xpath(".//div[2]/p/a[1]/@href")[0]
        except IndexError:
            continue
        a = parse.urljoin(base_url, href)
        # list.append is atomic under CPython's GIL, so concurrent
        # producers can safely share this list
        urls.append(a)


def get_title():
    while True:
        try:
            i = urls.pop()
        except IndexError:
            # list exhausted; exit this worker (this can still fire too
            # early if producers haven't finished a page yet)
            break
        response = requests.get(i, headers=headers)
        html = etree.HTML(response.text)
        title = html.xpath("//*[@class='entry-header']/h1/text()")
        print(title[0])

if __name__ == '__main__':
    import time
    pool = ThreadPoolExecutor(5)
    start_time = time.time()
    url1 = "http://blog.jobbole.com/all-posts/page/{}/"
    for i in range(1, 11):
        x = url1.format(i)
        pool.submit(get_url, x)

    for i in range(5):
        pool.submit(get_title)
    pool.shutdown()
    print("Elapsed:", time.time() - start_time)
Multi-process: 32.62045454978943 s

Note: the shared-global-list trick from the threaded version does not really carry over to processes. Each worker process has its own copy of urls, so a get_title task only sees the URLs collected by get_url calls that happened to run in that same worker, and coverage depends on how the pool schedules tasks; that, plus process startup overhead, helps explain why this version is slower than the threaded one.
if __name__ == '__main__':
    import time
    pool = ProcessPoolExecutor(5)
    start_time = time.time()
    url1 = "http://blog.jobbole.com/all-posts/page/{}/"
    for i in range(1, 11):
        x = url1.format(i)
        pool.submit(get_url, x)

    for i in range(5):
        pool.submit(get_title)
    pool.shutdown()
    print("Elapsed:", time.time() - start_time)
Coroutines: 21.144105672836304 s

I'm still new to coroutines, so the code is rough, but it runs.

import aiohttp  
import asyncio  
from lxml import etree  
from urllib import parse

url1 = 'http://blog.jobbole.com/all-posts/page/{}/'  
url = [url1.format(i) for i in range(1, 11)]  
base_url = "http://blog.jobbole.com"  
sem = asyncio.Semaphore(10)  # caps concurrent coroutines so we don't crawl too fast


async def get_url(url):
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            print(resp.status)
            data = await resp.text()
            return data


def parse_html(data):
    urls = []
    html = etree.HTML(data)
    lis = html.xpath("//*[@id='archive']/div")[:-1]
    for div in lis:
        try:
            href = div.xpath(".//div[2]/p/a[1]/@href")[0]
        except IndexError:
            continue
        a = parse.urljoin(base_url, href)
        urls.append(a)
    return urls


async def crawl_page(url):
    # renamed from "urls" to avoid shadowing the list it builds
    data = await get_url(url)
    urls = parse_html(data)
    await request_url(urls)

async def request_url(urls):
    # one session for the whole batch; "with (await sem)" is the old
    # pre-async-with style, "async with sem" is the modern equivalent
    async with aiohttp.ClientSession() as session:
        while urls:
            async with sem:
                i = urls.pop()
                try:
                    async with session.get(i) as resp:
                        data = await resp.text()
                        html = etree.HTML(data)
                        title = html.xpath("//*[@class='entry-header']/h1/text()")
                        print(title[0])
                except Exception as e:
                    print(e)


if __name__ == '__main__':
    import time
    start_t = time.time()
    loop = asyncio.get_event_loop()
    tasks = [crawl_page(u) for u in url]
    loop.run_until_complete(asyncio.wait(tasks))
    loop.close()
    print("Elapsed:", time.time() - start_t)

 

Tags: Web Crawler
