这里我爬取“智慧校园”中“校园信息”板块里的信息,其中包含“学院新闻”“学生活动”“教务简报”“校内通知”“教学通知”“教务通知”等。
主要用到了requests、execjs、json2html、lxml这几个包。
首先打开开发者工具,输入账号密码点击登录后发现,数据指向如图所示接口
该接口为POST请求,携带了如上图所示的数据,其中rsa字段看上去应该是先用RSA算法对数据进行了加密处理。回到登录界面,搜索rsa后找到对应的js代码并打一个断点,重新执行登录操作,果然停在了此处(见下图)。对其进行分析不难发现,ul值为账号的长度,pl值为密码的长度,lt值是从网页中取到的。
点击strEnc函数进入到对应js文件,发现是DES加密(猜错了),下载此js文件并进行strEnc函数调用,代码如下:
# DES encryption
def strEnc(data, firstKey, secondKey, thirdKey):
    """Encrypt *data* by calling the ``strEnc`` function defined in ``des.js``.

    The script is read from ``des.js`` (relative to the current working
    directory), compiled with ``execjs`` and its ``strEnc`` function is
    invoked with the four arguments unchanged.

    Args:
        data: Plain text to encrypt.
        firstKey: First key forwarded to the JavaScript function.
        secondKey: Second key forwarded to the JavaScript function.
        thirdKey: Third key forwarded to the JavaScript function.

    Returns:
        Whatever the JavaScript ``strEnc`` call returns.
    """
    with open('des.js', 'r', encoding='utf-8') as js_file:
        source = js_file.read()
    # Compile the JavaScript source, then call into it with the arguments.
    context = execjs.compile(source)
    encrypted = context.call('strEnc', data, firstKey, secondKey, thirdKey)
    return encrypted
然后回到首页查找lt,搜索后发现剩余的参数均在此处,用xpath即可解决,图片及代码如下:
# Fetch the login page and collect everything the login POST needs.
# One requests session is used for this request and the later POSTs.
session = requests.session()
url = 'http://approve.qqhru.edu.cn/tpass/login?service=http%3A%2F%2Fapprove.qqhru.edu.cn%2Ftp_up%2F'
headers = {
"Host": "approve.qqhru.edu.cn",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.48"
}
response = session.get(url=url, headers=headers)
html = etree.HTML(response.text)
# lt / execution / _eventId are the values of the 4th, 5th and 6th <input>
# directly under the page's <form>; [0] raises IndexError if one is missing.
lt = html.xpath('/html/body/form/input[4]/@value')[0]
execution = html.xpath('/html/body/form/input[5]/@value')[0]
_eventId = html.xpath('/html/body/form/input[6]/@value')[0]
# print(_eventId)
# The text to encrypt is account + password + lt, concatenated in that order.
data = Username + Password + lt
# DES encryption (the result is named "rsa" because it is sent as the "rsa" form field)
rsa = strEnc(data, '1', '2', '3')
# ul / pl are the lengths of the account and the password.
ul = len(Username)
pl = len(Password)
# print(rsa)
登录代码如下:
# Log in: POST the encrypted credentials together with the values read from
# the login page, using the same session as the page fetch.
url = 'http://approve.qqhru.edu.cn/tpass/login'
data_post = {
"rsa": rsa,
"ul": ul,
"pl": pl,
"lt": lt,
"execution": execution,
"_eventId": _eventId
}
# data= sends the dict form-encoded; the response is not checked here.
response = session.post(url=url, headers=headers, data=data_post)
# print(response.text)
点击对应板块后发现,页面通过POST请求调用如下接口获取数据:
实现代码如下:
# Save a list to a text file
def listTxt(list, path):
    """Write every item of *list* to *path*, one item per line.

    Each item is converted with ``str()`` and a trailing newline is appended
    unless the text already ends with one. The file is opened in write mode
    with UTF-8 encoding, so existing content is replaced. A confirmation
    line is printed once the file has been written.

    Args:
        list: Iterable of items to store. The name shadows the builtin; it is
            kept so existing keyword callers continue to work.
        path: Destination file path.
    """
    # The context manager closes the file even if a write fails.
    with open(path, 'w', encoding="utf-8") as file:
        for item in list:
            line = str(item)  # force conversion to text
            # endswith() also handles an empty string, where line[-1] would
            # raise IndexError.
            if not line.endswith('\n'):
                line += '\n'
            file.write(line)
    print(f"{path}文件存储成功")
# Query the news interface. This is a school website, so play it safe and
# send the complete set of browser request headers.
# Pick the owner id that belongs to the requested board; an unknown board id
# falls back to "220133" with its matching owner id.
if NewsID in ("220146", "220145", "220133"):
    OwnwrID = "1662531519"
elif NewsID in ("126841", "126842", "126840"):
    OwnwrID = "1260695749"
else:
    NewsID = "220133"
    OwnwrID = "1662531519"
url = 'http://approve.qqhru.edu.cn/tp_up/up/subgroup/getOANewsInfo'
headers = {
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'Cache-Control': 'no-cache',
    'Content-Type': 'application/json;charset=UTF-8',
    'Origin': 'http://approve.qqhru.edu.cn',
    'Pragma': 'no-cache',
    'Proxy-Connection': 'keep-alive',
    'Referer': 'http://approve.qqhru.edu.cn/tp_up/view?m=up',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.48',
    'X-Requested-With': 'XMLHttpRequest',
}
json_data = {
    # '1' matches the consolidated script; '0' may also work, no difference
    # was observed in testing.
    'ONETYPE': '1',
    'OWNERID': OwnwrID,
    'NEWSID': NewsID,
    'NEWSCOUNT': NewsCount
}
response = session.post(
    url=url,
    headers=headers,
    json=json_data,
)
response.encoding = 'utf-8'
print(response.text)
# Decode the JSON body once and reuse it for printing and saving.
news_items = response.json()
print(news_items)
# Save the result to jg.txt
listTxt(news_items, './jg.txt')
为了方便起见(后续可以放在云端定时执行),我将结果推送到微信,这里使用pushplus进行推送,代码如下:
# pushplus WeChat push (https://www.pushplus.plus/)
def wxfs(title_wxfs, content_wxfs=None):
    """Send a message through the pushplus ``send`` endpoint.

    The module-level ``token`` is sent with the title and content, using the
    "html" template. The text of the HTTP reply is printed; nothing is
    returned.

    Args:
        title_wxfs: Title of the pushed message.
        content_wxfs: Body of the pushed message. When omitted (``None``), a
            "运行时间" timestamp generated at call time is sent instead.
    """
    if content_wxfs is None:
        # Build the timestamp per call. As a default argument the strftime()
        # expression would be evaluated only once, when the function is
        # defined, and every later call would reuse that stale time.
        content_wxfs = time.strftime('运行时间: \n%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    url_wxfs = 'http://www.pushplus.plus/send'
    data = {
        "token": token,
        "title": title_wxfs,
        "content": content_wxfs,
        "template": "html"
    }
    response_wxfs = requests.post(url=url_wxfs, json=data)
    reply_text = response_wxfs.text
    print("微信推送:", reply_text)
因为手机端屏幕尺寸较小,所以我将不重要的数据隐藏,并将剩下的数据做成表格并进行美化,这里用到了json2html模块、Bootstrap前端样式,代码如下:
# Render the JSON response text as an HTML table carrying the id/class
# attributes given in table_attributes.
sc = json2html.convert(json=response.text, table_attributes='id="tableone" class="table table-hover"')
# Append the execution timestamp after the table.
sc = sc + time.strftime('执行时间: \n%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
# Prepend the Bootstrap 3.4.1 stylesheet link and append a <style> rule that
# hides table columns 1, 2, 4, 6 and 7 (both td and th cells).
sc = '<link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/3.4.1/css/bootstrap.min.css" integrity="sha384-HSMxcRTRxnN+Bdg0JdbxYKrThecOKuH5zCYotlSAcp1+c8xmyTe9GYg1l9a69psu" crossorigin="anonymous">' + sc + '<style>td:nth-child(1),td:nth-child(2),td:nth-child(4),td:nth-child(6),td:nth-child(7),th:nth-child(1),th:nth-child(2),th:nth-child(4),th:nth-child(6),th:nth-child(7){display:none}.jumbotron{background-color:#fff}</style>'
# Push the result to WeChat
wxfs('智慧校园', sc)
效果图如下
综上我对代码整理后如下(敏感信息进行了隐藏处理):
# -*- coding=utf-8 -*-
# @author by 丁靖宇 on 2023/4/21 15:15
# @ClassName 智慧校园
# @Description TODO:(齐齐哈尔大学智慧校园(http://approve.qqhru.edu.cn/)自动登陆爬取相关信息)
import time
import execjs
import requests
from json2html import *
from lxml import etree
# User configuration (sensitive values are masked).
Username = "********" # account
Password = "********" # password
NewsID = "126840" # pick one of the board codes listed below
# College news: "220146"
# Student activities: "220145"
# Campus notices: "220133"
# Teaching notices: "126842"
# Academic affairs bulletins: "126841"
# Academic affairs notices: "126840"
NewsCount = 10 # how many items to fetch
token = "****************" # pushplus WeChat push token; see the official pushplus documentation
# Save a list to a text file
def listTxt(list, path):
    """Write every item of *list* to *path*, one item per line.

    Each item is converted with ``str()`` and a trailing newline is appended
    unless the text already ends with one. The file is opened in write mode
    with UTF-8 encoding, so existing content is replaced. A confirmation
    line is printed once the file has been written.

    Args:
        list: Iterable of items to store. The name shadows the builtin; it is
            kept so existing keyword callers continue to work.
        path: Destination file path.
    """
    # The context manager closes the file even if a write fails.
    with open(path, 'w', encoding="utf-8") as file:
        for item in list:
            line = str(item)  # force conversion to text
            # endswith() also handles an empty string, where line[-1] would
            # raise IndexError.
            if not line.endswith('\n'):
                line += '\n'
            file.write(line)
    print(f"{path}文件存储成功")
# pushplus WeChat push (https://www.pushplus.plus/)
def wxfs(title_wxfs, content_wxfs=None):
    """Send a message through the pushplus ``send`` endpoint.

    The module-level ``token`` is sent with the title and content, using the
    "html" template. The text of the HTTP reply is printed; nothing is
    returned.

    Args:
        title_wxfs: Title of the pushed message.
        content_wxfs: Body of the pushed message. When omitted (``None``), a
            "运行时间" timestamp generated at call time is sent instead.
    """
    if content_wxfs is None:
        # Build the timestamp per call. As a default argument the strftime()
        # expression would be evaluated only once, when the function is
        # defined, and every later call would reuse that stale time.
        content_wxfs = time.strftime('运行时间: \n%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    url_wxfs = 'http://www.pushplus.plus/send'
    data = {
        "token": token,
        "title": title_wxfs,
        "content": content_wxfs,
        "template": "html"
    }
    response_wxfs = requests.post(url=url_wxfs, json=data)
    reply_text = response_wxfs.text
    print("微信推送:", reply_text)
# DES encryption
def strEnc(data, firstKey, secondKey, thirdKey):
    """Encrypt *data* by calling the ``strEnc`` function defined in ``des.js``.

    The script is read from ``des.js`` (relative to the current working
    directory), compiled with ``execjs`` and its ``strEnc`` function is
    invoked with the four arguments unchanged.

    Args:
        data: Plain text to encrypt.
        firstKey: First key forwarded to the JavaScript function.
        secondKey: Second key forwarded to the JavaScript function.
        thirdKey: Third key forwarded to the JavaScript function.

    Returns:
        Whatever the JavaScript ``strEnc`` call returns.
    """
    with open('des.js', 'r', encoding='utf-8') as js_file:
        source = js_file.read()
    # Compile the JavaScript source, then call into it with the arguments.
    context = execjs.compile(source)
    encrypted = context.call('strEnc', data, firstKey, secondKey, thirdKey)
    return encrypted
# Collect the data required for login.
# One requests session is used for this request and the later POSTs.
session = requests.session()
url = 'http://approve.qqhru.edu.cn/tpass/login?service=http%3A%2F%2Fapprove.qqhru.edu.cn%2Ftp_up%2F'
headers = {
"Host": "approve.qqhru.edu.cn",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.48"
}
response = session.get(url=url, headers=headers)
html = etree.HTML(response.text)
# lt / execution / _eventId are the values of the 4th, 5th and 6th <input>
# directly under the page's <form>; [0] raises IndexError if one is missing.
lt = html.xpath('/html/body/form/input[4]/@value')[0]
execution = html.xpath('/html/body/form/input[5]/@value')[0]
_eventId = html.xpath('/html/body/form/input[6]/@value')[0]
# print(_eventId)
# The text to encrypt is account + password + lt, concatenated in that order.
data = Username + Password + lt
# DES encryption (the result is named "rsa" because it is sent as the "rsa" form field)
rsa = strEnc(data, '1', '2', '3')
# ul / pl are the lengths of the account and the password.
ul = len(Username)
pl = len(Password)
# print(rsa)
# Log in: POST the encrypted credentials together with the values read from
# the login page, using the same session as the page fetch.
url = 'http://approve.qqhru.edu.cn/tpass/login'
data_post = {
"rsa": rsa,
"ul": ul,
"pl": pl,
"lt": lt,
"execution": execution,
"_eventId": _eventId
}
# data= sends the dict form-encoded; the response is not checked here.
response = session.post(url=url, headers=headers, data=data_post)
# print(response.text)
# Query the news interface. This is a school website, so play it safe and
# send the complete set of browser request headers.
# Pick the owner id that belongs to the requested board; an unknown board id
# falls back to "220133" with its matching owner id.
if NewsID in ("220146", "220145", "220133"):
    OwnwrID = "1662531519"
elif NewsID in ("126841", "126842", "126840"):
    OwnwrID = "1260695749"
else:
    NewsID = "220133"
    OwnwrID = "1662531519"
url = 'http://approve.qqhru.edu.cn/tp_up/up/subgroup/getOANewsInfo'
headers = {
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'Cache-Control': 'no-cache',
    'Content-Type': 'application/json;charset=UTF-8',
    'Origin': 'http://approve.qqhru.edu.cn',
    'Pragma': 'no-cache',
    'Proxy-Connection': 'keep-alive',
    'Referer': 'http://approve.qqhru.edu.cn/tp_up/view?m=up',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.48',
    'X-Requested-With': 'XMLHttpRequest',
}
json_data = {
    'ONETYPE': '1',  # may also be 0; no difference was observed in testing
    'OWNERID': OwnwrID,
    'NEWSID': NewsID,
    'NEWSCOUNT': NewsCount
}
response = session.post(
    url=url,
    headers=headers,
    json=json_data,
)
response.encoding = 'utf-8'
print(response.text)
# Decode the JSON body once and reuse it for printing and saving.
news_items = response.json()
print(news_items)
# Save the result to jg.txt
listTxt(news_items, './jg.txt')
# Build the HTML message.
# Render the JSON response text as an HTML table carrying the id/class
# attributes given in table_attributes.
sc = json2html.convert(json=response.text, table_attributes='id="tableone" class="table table-hover"')
# Append the execution timestamp after the table.
sc = sc + time.strftime('执行时间: \n%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
# Prepend the Bootstrap 3.4.1 stylesheet link and append a <style> rule that
# hides table columns 1, 2, 4, 6 and 7 (both td and th cells).
sc = '<link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/3.4.1/css/bootstrap.min.css" integrity="sha384-HSMxcRTRxnN+Bdg0JdbxYKrThecOKuH5zCYotlSAcp1+c8xmyTe9GYg1l9a69psu" crossorigin="anonymous">' + sc + '<style>td:nth-child(1),td:nth-child(2),td:nth-child(4),td:nth-child(6),td:nth-child(7),th:nth-child(1),th:nth-child(2),th:nth-child(4),th:nth-child(6),th:nth-child(7){display:none}.jumbotron{background-color:#fff}</style>'
# Push the result to WeChat
wxfs('智慧校园', sc)