一、验证码识别
1、第三方云码平台的使用
先注册一个账号,在个人中心有剩余积分和token(关注微信公众号可免费获取积分)
在开发文档里选择语言python
我们只需修改token、type、image参数
type是要解析的类型;
token在个人中心处;image是要解析的图片(经过base64编码后的字符串)
import base64
import requests
from lxml import etree
def verify(encoded_image, token="Your Token", type_id="10110", timeout=10):
    """Send a base64-encoded captcha image to the jfbym recognition API.

    Args:
        encoded_image: Base64 text of the captcha image.
        token: Account token sent as the ``token`` field; the default is a
            placeholder that must be replaced with a real token.
        type_id: Recognition type sent as the ``type`` field.
        timeout: Seconds to wait for the API before giving up.

    Returns:
        The value stored at ``['data']['data']`` of the JSON reply.

    Raises:
        requests.RequestException: The request failed or timed out.
        RuntimeError: The reply has no ``data.data`` entry.
    """
    url = "http://api.jfbym.com/api/YmServer/customApi"
    data = {
        # Usually three fields; other type ids may need a different number
        # of fields or different names (ask the platform's support).
        "token": token,
        "type": type_id,
        "image": encoded_image,
    }
    _headers = {
        "Content-Type": "application/json"
    }
    # Without a timeout a stalled connection would block the script forever.
    payload = requests.post(url, headers=_headers, json=data, timeout=timeout).json()
    try:
        return payload['data']['data']
    except (KeyError, TypeError) as err:
        # Show the whole reply so the API's own error message is visible.
        raise RuntimeError(f"unexpected captcha API reply: {payload!r}") from err
url = 'https://www.gushiwen.cn/user/login.aspx?from=http://www.gushiwen.cn/user/collect.aspx'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36'
}
# Download the login page; fail on HTTP errors instead of parsing an error page.
page_response = requests.get(url=url, headers=headers, timeout=10)
page_response.raise_for_status()
page_text = page_response.text
tree = etree.HTML(page_text)
# Locate the captcha image; its src is prefixed with the site origin below.
img_src = tree.xpath('//*[@id="imgCode"]/@src')
if not img_src:
    # An empty match would otherwise surface as an opaque IndexError.
    raise RuntimeError('captcha image (id="imgCode") not found on the login page')
img_url = 'https://www.gushiwen.cn' + img_src[0]
# Fetch the raw captcha bytes.
img_response = requests.get(url=img_url, headers=headers, timeout=10)
img_response.raise_for_status()
img_data = img_response.content
# The recognition API takes the image as base64 text.
encoded_image = base64.b64encode(img_data).decode()
print(verify(encoded_image))
2、ddddocr 库的使用
从pypi安装ddddocr库,使用国内源加快下载速度(python版本问题可能会报错)
# Install ddddocr with pip; -i points pip at the given package index URL.
pip install ddddocr -i https://pypi.douban.com/simple
git安装ddddocr库
# Clone the ddddocr sources and install the package from the checkout.
git clone https://github.com/sml2h3/ddddocr.git
cd ddddocr
# `python setup.py` without a command does nothing useful; let pip build
# and install the project in the current directory instead.
pip install .
安装完成测试一下
import ddddocr  # OCR library being tested

ocr = ddddocr.DdddOcr()  # create the recognizer

# Read the captcha image as raw bytes; classification() is given the bytes.
with open('20241031_113342_code.jpg', 'rb') as f:
    img_bytes = f.read()

res = ocr.classification(img_bytes)  # run recognition
print(res)
import ddddocr
import base64
import requests
from lxml import etree
# Login page that embeds the captcha image.
url = 'https://www.gushiwen.cn/user/login.aspx?from=http://www.gushiwen.cn/user/collect.aspx'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36'
}
login_page = requests.get(url=url, headers=headers)
page_text = login_page.text
tree = etree.HTML(page_text)
# Take the captcha's src attribute and prefix the site origin to build its URL.
captcha_src = tree.xpath('//*[@id="imgCode"]/@src')[0]
img_url = 'https://www.gushiwen.cn' + captcha_src
# Download the raw image bytes.
captcha_response = requests.get(url=img_url, headers=headers)
img_data = captcha_response.content
# Hand the bytes to ddddocr and print the recognized text.
ocr = ddddocr.DdddOcr()
res = ocr.classification(img_data)
print(res)
能识别到 qsbu,但是会出现"欢迎使用ddddocr,本项目专注带动行业内卷***"提示语, 可以加一个参数show_ad=False
# show_ad=False turns off the promotional message ddddocr otherwise prints.
ocr = ddddocr.DdddOcr(show_ad=False)
二、模拟登录
登录抓包
重新登录发现__VIEWSTATE和__VIEWSTATEGENERATOR是动态变化的,在页面元素中可以解析
import requests
import base64
from lxml import etree
def verify(encoded_image, token="Your Token", type_id="10110", timeout=10):
    """Send a base64-encoded captcha image to the jfbym recognition API.

    Args:
        encoded_image: Base64 text of the captcha image.
        token: Account token sent as the ``token`` field; the default is a
            placeholder that must be replaced with a real token.
        type_id: Recognition type sent as the ``type`` field.
        timeout: Seconds to wait for the API before giving up.

    Returns:
        The value stored at ``['data']['data']`` of the JSON reply.

    Raises:
        requests.RequestException: The request failed or timed out.
        RuntimeError: The reply has no ``data.data`` entry.
    """
    url = "http://api.jfbym.com/api/YmServer/customApi"
    data = {
        # Usually three fields; other type ids may need a different number
        # of fields or different names (ask the platform's support).
        "token": token,
        "type": type_id,
        "image": encoded_image,
    }
    _headers = {
        "Content-Type": "application/json"
    }
    # Without a timeout a stalled connection would block the script forever.
    payload = requests.post(url, headers=_headers, json=data, timeout=timeout).json()
    try:
        return payload['data']['data']
    except (KeyError, TypeError) as err:
        # Show the whole reply so the API's own error message is visible.
        raise RuntimeError(f"unexpected captcha API reply: {payload!r}") from err
url = 'https://www.gushiwen.cn/user/login.aspx?from=http://www.gushiwen.cn/user/collect.aspx'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36'
}
page_text = requests.get(url=url, headers=headers).text
tree = etree.HTML(page_text)
# Build the captcha image URL from the <img id="imgCode"> src attribute.
img_url = 'https://www.gushiwen.cn' + tree.xpath('//*[@id="imgCode"]/@src')[0]
# Download the captcha bytes and base64-encode them for the recognition API.
img_data = requests.get(url=img_url, headers=headers).content
encoded_image = base64.b64encode(img_data).decode()
# Recognize once and reuse the answer: every verify() call is a separate
# request to the recognition service and may not return the same text.
captcha_code = verify(encoded_image)
print(captcha_code)
login_url = 'https://www.gushiwen.cn/user/login.aspx?from=http://www.gushiwen.cn/user/collect.aspx'
data = {
    # Hidden form fields are read from the page just fetched; [0] takes the
    # attribute string out of the list that xpath() returns.
    '__VIEWSTATE': tree.xpath('//*[@id="__VIEWSTATE"]/@value')[0],
    '__VIEWSTATEGENERATOR': tree.xpath('//*[@id="__VIEWSTATEGENERATOR"]/@value')[0],
    'from': 'http://www.gushiwen.cn/user/collect.aspx',
    # Placeholder credentials: replace with your own account.
    'email': '123456789@qq.com',
    'pwd': '12345678',
    'code': captcha_code,
    # Field name without the stray trailing space the original key had.
    'denglu': '登录',
}
admin_url = 'https://www.gushiwen.cn/user/collect.aspx'
login_response = requests.post(url=login_url, data=data, headers=headers)
# NOTE: this only checks the HTTP status of the login POST.
if login_response.status_code == 200:
    print('success')
    # This request is sent without any cookie from the login response.
    admin_response = requests.get(url=admin_url, headers=headers)
    with open('./gushiwenwang.html', 'w', encoding='utf-8') as fp:
        fp.write(admin_response.text)
else:
    print('登陆失败,状态码为', login_response.status_code)
爬取后台页面需要携带cookie发起请求:
手工处理cookie:把cookie添加到headers中发起请求,成功爬取到登录后的页面数据
# Cookie pairs copied from a logged-in browser session, one pair per entry.
_cookie_pairs = [
    'Hm_lvt_9007fab6814e892d3020a64454da5a55=1730338566',
    'HMACCOUNT=FD4F3638578FCE59',
    'login=flase',
    'ASP.NET_SessionId=5szzvii3koeubyqxwdp2s1ad',
    'wsEmail=123456789%40qq.com',
    'ticketStr=207674433%7cgQF98DwAAAAAAAAAAS5odHRwOi8vd2VpeGluLnFxLmNvbS9xLzAyQ1BwSlI0bGVkN2kxb2NyYTFEMTAAAgQMDiNnAwQAjScA',
    'gsw2017user=6605987%7c11ADD12B4E53BFAD07AC176F7D79097B%7c2000%2f1%2f1%7c2000%2f1%2f1',
    'wxopenid=defoaltid',
    'gswZhanghao=123456789%40qq.com',
    'gswEmail=123456789%40qq.com',
    'Hm_lpvt_9007fab6814e892d3020a64454da5a55=1730350618',
    'codeyz=5a5e47bd79cbe52e',
]
# Request headers carrying the browser User-Agent plus the joined cookie string.
headers_ = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/130.0.0.0 Safari/537.36'
    ),
    'cookie': '; '.join(_cookie_pairs),
}
但是cookie会变
使用session会话对象自动处理
import requests
import base64
from lxml import etree


def verify(encoded_image):
    """Send a base64-encoded captcha image to the jfbym API and return the
    value stored at ``['data']['data']`` of its JSON reply."""
    url = "http://api.jfbym.com/api/YmServer/customApi"
    data = {
        # Usually three fields; other type ids may need a different number
        # of fields or different names (ask the platform's support).
        "token": "YourToken",
        "type": "10110",
        "image": encoded_image,
    }
    _headers = {
        "Content-Type": "application/json"
    }
    response = requests.request("POST", url, headers=_headers, json=data).json()
    return response['data']['data']


url = 'https://www.gushiwen.cn/user/login.aspx?from=http://www.gushiwen.cn/user/collect.aspx'
login_url = 'https://www.gushiwen.cn/user/login.aspx?from=http://www.gushiwen.cn/user/collect.aspx'
admin_url = 'https://www.gushiwen.cn/user/collect.aspx'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36'
}

# One Session is used for every request, so cookies set while loading the
# login page and the captcha are sent back with the login POST and the
# admin-page GET. The with-block closes the session on exit.
with requests.Session() as session:
    page_text = session.get(url=url, headers=headers).text
    tree = etree.HTML(page_text)
    # Build the captcha image URL from the <img id="imgCode"> src attribute.
    img_url = 'https://www.gushiwen.cn' + tree.xpath('//*[@id="imgCode"]/@src')[0]
    # Download the captcha bytes and base64-encode them for the API.
    img_data = session.get(url=img_url, headers=headers).content
    encoded_image = base64.b64encode(img_data).decode()
    # Recognize once and reuse the answer for the form.
    captcha_code = verify(encoded_image)
    print(captcha_code)

    data = {
        # Hidden form fields change per page load, so read them from the
        # page fetched above; [0] takes the string out of xpath()'s list.
        '__VIEWSTATE': tree.xpath('//*[@id="__VIEWSTATE"]/@value')[0],
        '__VIEWSTATEGENERATOR': tree.xpath('//*[@id="__VIEWSTATEGENERATOR"]/@value')[0],
        'from': 'http://www.gushiwen.cn/user/collect.aspx',
        # Placeholder credentials: replace with your own account and never
        # publish a real password in sample code.
        'email': '123456789@qq.com',
        'pwd': '12345678',
        'code': captcha_code,
        # Field name without the stray trailing space the original key had.
        'denglu': '登录',
    }

    # Send the login request as a form-encoded body (data=), not JSON.
    login_response = session.post(url=login_url, data=data, headers=headers)
    print(login_response.cookies)
    admin_response = session.get(url=admin_url, headers=headers)
    if login_response.status_code == 200:
        print('success')
        with open('./admin.html', 'w', encoding='utf-8') as fp:
            fp.write(admin_response.text)
    else:
        print('登陆失败,状态码为', login_response.status_code)
输出了登录后返回的cookie;session会自动携带该cookie对后台页面发起请求,成功爬取后台页面
4A评测 - 免责申明
本站提供的一切软件、教程和内容信息仅限用于学习和研究目的。
不得将上述内容用于商业或者非法用途,否则一切后果请用户自负。
本站信息来自网络,版权争议与本站无关。您必须在下载后的24个小时之内,从您的电脑或手机中彻底删除上述内容。
如果您喜欢该程序,请支持正版,购买注册,得到更好的正版服务。如有侵权请邮件与我们联系处理。敬请谅解!
程序来源网络,不确保不包含木马病毒等危险内容,请在确保安全的情况下或使用虚拟机使用。
侵权违规投诉邮箱:4ablog168#gmail.com(#换成@)