Using cookies in a web crawler


1. The urllib library

A CookieJar (from http.cookiejar) keeps cookies in memory; wrapping it in an HTTPCookieProcessor and building an opener from that handler makes every request sent through the opener share the same cookie state.

from urllib import request  
from urllib import parse  
from http.cookiejar import CookieJar  
import re

url = "..."  # the target site's login URL (placeholder)
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3573.0 Safari/537.36",
    "Referer": url,
}
# A CookieJar keeps cookies in memory
cookie = CookieJar()
# Use the CookieJar to create an HTTPCookieProcessor handler
handler = request.HTTPCookieProcessor(cookie)

opener = request.build_opener(handler)
def get_csrf():
    # The CSRF token must be fetched through the opener: that keeps the request
    # in the same cookie session, so the token matches the page we later post to
    response = opener.open(request.Request(url, headers=headers))
    csrf = re.search('name="csrfmiddlewaretoken" value="(.*?)"', response.read().decode('utf-8'))
    if csrf:
        return csrf.group(1)
    else:
        return ''
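
The regex above matches the exact markup of the Django login form, but it is brittle if the attributes are reordered. A more tolerant sketch using the standard library's html.parser (here html_text stands for the already-decoded page body):

from html.parser import HTMLParser

class CsrfParser(HTMLParser):
    """Collects the value of the hidden csrfmiddlewaretoken <input>."""
    def __init__(self):
        super().__init__()
        self.token = ''

    def handle_starttag(self, tag, attrs):
        d = dict(attrs)
        if tag == 'input' and d.get('name') == 'csrfmiddlewaretoken':
            self.token = d.get('value', '')

parser = CsrfParser()
parser.feed(html_text)  # html_text: e.g. response.read().decode('utf-8')
print(parser.token)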
# The fields in data can be copied from the form data of a real login request
# (visible in the browser's developer tools after logging in manually)
data = {
        "csrfmiddlewaretoken": get_csrf(),
        "username": "",
        "password": "",
        "this_is_the_login_form": "1",  # hidden field used by the Django admin login form
        "next": "",
    }

req = request.Request(url, data=parse.urlencode(data).encode('utf-8'), headers=headers, method='POST')
x = opener.open(req)  
print(x.read().decode('utf-8')) 
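
A plain CookieJar only lives in memory, so this login has to be repeated on every run. For persistence on the urllib side, http.cookiejar also provides MozillaCookieJar, which reads and writes a Netscape-format cookie file; a minimal sketch (the filename cookies.txt is an assumption):

from urllib import request
from http.cookiejar import MozillaCookieJar

cookie = MozillaCookieJar('cookies.txt')  # jar backed by a file on disk
try:
    cookie.load(ignore_discard=True, ignore_expires=True)  # reuse cookies from a previous run
except FileNotFoundError:
    pass  # first run: no cookie file yet
opener = request.build_opener(request.HTTPCookieProcessor(cookie))
# ... log in through the opener as above, then persist the session:
cookie.save(ignore_discard=True, ignore_expires=True)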

2. Using requests

A requests.Session tracks cookies automatically; replacing its default jar with an LWPCookieJar lets the cookies be saved to and loaded from disk, so a login can survive across runs.

import re
import requests
from http.cookiejar import LWPCookieJar, LoadError

url = "..."  # the target site's login URL (placeholder)

session = requests.Session()
session.cookies = LWPCookieJar(filename='cookie.txt')
try:
    session.cookies.load(ignore_discard=True)
except (FileNotFoundError, LoadError):
    print('No usable cookie file; a fresh login will be needed')
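
To confirm what was actually restored from cookie.txt, the jar can be iterated like any CookieJar; a quick debugging sketch:

# Print each restored cookie's name, value, domain and expiry timestamp
for c in session.cookies:
    print(c.name, c.value, c.domain, c.expires)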

headers = {  
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3573.0 Safari/537.36",  
    "Referer": url,  
}  

def get_csrf():
    response = session.get(url, headers=headers)
    csrf = re.search('name="csrfmiddlewaretoken" value="(.*?)"', response.text)
    if csrf:
        return csrf.group(1)
    else:
        return ''

# save() takes two relevant flags, ignore_discard and ignore_expires:
#
# ignore_discard: also save cookies that are marked to be discarded at the end of the session.
# ignore_expires: also save cookies that have already expired.
def login():
    data = {
        "csrfmiddlewaretoken": get_csrf(),
        "username": "",
        "password": "",
        "this_is_the_login_form": "1",  # hidden field used by the Django admin login form
        "next": "",
    }
    # First request a page that requires login; if the saved cookies still
    # work, there is no need to log in again
    response = session.get(url='...', headers=headers)  # URL of a page that requires login (placeholder)
    if response.status_code == 200:
        print(response.text)
    else:
        response_text = session.post(url, data=data, headers=headers)
        session.cookies.save(ignore_discard=True, ignore_expires=True)
        print(response_text.text)

if __name__ == '__main__':  
    login()  
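
Note that status_code == 200 alone is a weak test for being logged in, since many sites also return 200 for the login page itself. A stricter sketch, assuming the authenticated page contains some known marker string such as the username (check_url and marker are placeholders):

def is_logged_in(check_url, marker):
    # True only if the protected page loads and contains the expected marker
    response = session.get(check_url, headers=headers)
    return response.status_code == 200 and marker in response.text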

 
