MENU

Python爬取“智慧校园”信息

2023 年 04 月 21 日 • 笔记

这里我爬取“智慧校园”中“校园信息”板块里的信息,其中包含“学院新闻”“学生活动”“教务简报”“校内通知”“教学通知”“教务通知”等。
主要用到了 requests、execjs、json2html、lxml 这几个包。

1.jpg

首先打开开发者工具,输入账号密码点击登录后发现,数据指向如图所示接口
图片1.png

图片2.png

该请求为post请求,携带了如上图所示的数据,其中rsa字段看上去是先用RSA算法对数据进行了加密处理。回到登录界面,搜索rsa后找到对应的js代码并打一个断点,重新执行登录操作,果然停在了此处(见下图)。对其进行分析不难发现,ul值为账号的长度,pl值为密码的长度,lt值是从网页中取到的。
图片3.png

点击strEnc函数进入到对应js文件,发现是DES加密(猜错了),下载此js文件并进行strEnc函数调用,代码如下:

# DES encryption, delegated to the JavaScript implementation in des.js
def strEnc(data, firstKey, secondKey, thirdKey):
    """Call the JavaScript ``strEnc`` function from ``des.js`` and return its result.

    ``des.js`` is opened by relative path, read as UTF-8 and compiled with
    ``execjs`` on every call. The four arguments are forwarded unchanged.
    """
    with open('des.js', encoding='utf-8') as script:
        # Compile the script text so its functions can be called from Python.
        des_context = execjs.compile(script.read())
    # Forward the arguments to the JavaScript-side function of the same name.
    return des_context.call('strEnc', data, firstKey, secondKey, thirdKey)

然后回到首页查找lt,搜索后发现剩余的参数均在此处,用xpath即可取出,图片和代码如下:

图片4.png

# Request the login page through a session and read the form values the login POST needs.
headers = {
    "Host": "approve.qqhru.edu.cn",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.48"
}
url = 'http://approve.qqhru.edu.cn/tpass/login?service=http%3A%2F%2Fapprove.qqhru.edu.cn%2Ftp_up%2F'
session = requests.session()
response = session.get(url, headers=headers)

# lt, execution and _eventId are the values of form inputs 4, 5 and 6 (by position).
html = etree.HTML(response.text)
lt, execution, _eventId = (
    html.xpath(field_path)[0]
    for field_path in (
        '/html/body/form/input[4]/@value',
        '/html/body/form/input[5]/@value',
        '/html/body/form/input[6]/@value',
    )
)

# Text passed to strEnc: account + password + lt, with the keys '1', '2', '3'.
data = Username + Password + lt
rsa = strEnc(data, '1', '2', '3')
# ul / pl: lengths of the account and of the password.
ul, pl = len(Username), len(Password)

登录代码如下:

# Log in: POST the form fields through the same session, reusing the earlier headers.
url = 'http://approve.qqhru.edu.cn/tpass/login'
data_post = dict(
    rsa=rsa,
    ul=ul,
    pl=pl,
    lt=lt,
    execution=execution,
    _eventId=_eventId,
)
response = session.post(url, data=data_post, headers=headers)

点击对应板块后发现,页面通过POST请求调用如下接口获取数据:

图片5.png

实现代码如下:

# Save a list to a text file
def listTxt(list, path):
    """Write every item of ``list`` to ``path``, one item per line.

    Each item is converted with ``str()``; a newline is appended unless the
    text already ends with one. The file is opened in write mode (existing
    content is replaced) and encoded as UTF-8. A confirmation line is printed
    once the file has been written.

    Args:
        list: Iterable of items to write. The name shadows the builtin but is
            kept so callers passing it by keyword keep working.
        path: Destination file path.
    """
    # ``with`` closes the file even if a write fails part-way through.
    with open(path, 'w', encoding="utf-8") as file:
        for item in list:
            line = str(item)
            # endswith() also handles an empty string, where indexing the
            # last character would raise IndexError.
            if not line.endswith('\n'):
                line += '\n'
            file.write(line)
    print(f"{path}文件存储成功")
# Query the news API. It is a school site, so err on the side of caution and
# send a full set of request headers.
# An unrecognised NewsID falls back to "220133".
if NewsID not in ("220146", "220145", "220133", "126841", "126842", "126840"):
    NewsID = "220133"
# Each group of three board codes is paired with its own owner ID.
OwnwrID = "1260695749" if NewsID in ("126841", "126842", "126840") else "1662531519"

headers = {
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'Cache-Control': 'no-cache',
    'Content-Type': 'application/json;charset=UTF-8',
    'Origin': 'http://approve.qqhru.edu.cn',
    'Pragma': 'no-cache',
    'Proxy-Connection': 'keep-alive',
    'Referer': 'http://approve.qqhru.edu.cn/tp_up/view?m=up',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.48',
    'X-Requested-With': 'XMLHttpRequest',
}
url = 'http://approve.qqhru.edu.cn/tp_up/up/subgroup/getOANewsInfo'
json_data = dict(
    ONETYPE='0',  # '1' also appears to work; no difference observed in testing so far
    OWNERID=OwnwrID,
    NEWSID=NewsID,
    NEWSCOUNT=NewsCount,
)
response = session.post(url, json=json_data, headers=headers)
response.encoding = 'utf-8'
# Show the raw body first, then the decoded JSON.
print(response.text)
decoded_news = response.json()
print(decoded_news)
# Save the result to jg.txt
listTxt(decoded_news, './jg.txt')

为了方便起见(后续可以放在云端定时执行),这里我将结果推送到微信,这里使用pushplus进行推送,代码如下:

# pushplus WeChat push (https://www.pushplus.plus/)
def wxfs(title_wxfs, content_wxfs=None):
    """Send a message to the pushplus ``send`` endpoint and print the reply.

    Args:
        title_wxfs: Message title.
        content_wxfs: Message body, sent with the "html" template. When
            omitted (or ``None``), a "运行时间" timestamp taken at call time
            is used.
    """
    # A default written in the signature is evaluated once, when the function
    # is defined, which froze the timestamp; build it on each call instead.
    if content_wxfs is None:
        content_wxfs = time.strftime('运行时间: \n%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    url_wxfs = 'http://www.pushplus.plus/send'
    data = {
        "token": token,
        "title": title_wxfs,
        "content": content_wxfs,
        "template": "html"
    }
    response_wxfs = requests.post(url=url_wxfs, json=data)
    # The reply must be printed inside the function: at module level the
    # response text is not defined and the print raised NameError.
    print("微信推送:", response_wxfs.text)

因为手机端屏幕尺寸较小,所以我将不重要的数据隐藏,并将剩下的数据做成表格并进行美化,这里用到了json2html模块、Bootstrap前端样式,代码如下:

# Assemble the HTML message in one pass, then push it to WeChat.
sc = "".join((
    # Bootstrap 3.4.1 stylesheet link.
    '<link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/3.4.1/css/bootstrap.min.css" integrity="sha384-HSMxcRTRxnN+Bdg0JdbxYKrThecOKuH5zCYotlSAcp1+c8xmyTe9GYg1l9a69psu" crossorigin="anonymous">',
    # Table produced by json2html from the raw response text.
    json2html.convert(json=response.text, table_attributes='id="tableone" class="table table-hover"'),
    # Timestamp of this run (current local time).
    time.strftime('执行时间: \n%Y-%m-%d %H:%M:%S', time.localtime()),
    # CSS that hides columns 1, 2, 4, 6 and 7 and gives .jumbotron a white background.
    '<style>td:nth-child(1),td:nth-child(2),td:nth-child(4),td:nth-child(6),td:nth-child(7),th:nth-child(1),th:nth-child(2),th:nth-child(4),th:nth-child(6),th:nth-child(7){display:none}.jumbotron{background-color:#fff}</style>',
))
wxfs('智慧校园', sc)

效果图如下

2023-04-21 213305.png

1682084238089.jpg

QQ截图20230421214110.jpg

综上我对代码整理后如下(敏感信息进行了隐藏处理):

# -*- coding=utf-8 -*-
# @author by 丁靖宇 on 2023/4/21 15:15
# @ClassName 智慧校园
# @Description TODO:(齐齐哈尔大学智慧校园(http://approve.qqhru.edu.cn/)自动登陆爬取相关信息)
import time

import execjs
import requests
from json2html import *
from lxml import etree

Username = "********"  # account
Password = "********"  # password
NewsID = "126840"  # fill in one of the board codes listed below
# College news: "220146"
# Student activities: "220145"
# Campus notices: "220133"
# Teaching notices: "126842"
# Academic affairs bulletin: "126841"
# Academic affairs notices: "126840"
NewsCount = 10  # how many items to fetch
token = "****************"  # pushplus WeChat push token; see the official pushplus documentation




# Save a list to a text file
def listTxt(list, path):
    """Write every item of ``list`` to ``path``, one item per line.

    Each item is converted with ``str()``; a newline is appended unless the
    text already ends with one. The file is opened in write mode (existing
    content is replaced) and encoded as UTF-8. A confirmation line is printed
    once the file has been written.

    Args:
        list: Iterable of items to write. The name shadows the builtin but is
            kept so callers passing it by keyword keep working.
        path: Destination file path.
    """
    # ``with`` closes the file even if a write fails part-way through.
    with open(path, 'w', encoding="utf-8") as file:
        for item in list:
            line = str(item)
            # endswith() also handles an empty string, where indexing the
            # last character would raise IndexError.
            if not line.endswith('\n'):
                line += '\n'
            file.write(line)
    print(f"{path}文件存储成功")


# pushplus WeChat push (https://www.pushplus.plus/)
def wxfs(title_wxfs, content_wxfs=None):
    """Send a message to the pushplus ``send`` endpoint and print the reply.

    Args:
        title_wxfs: Message title.
        content_wxfs: Message body, sent with the "html" template. When
            omitted (or ``None``), a "运行时间" timestamp taken at call time
            is used.
    """
    # A default written in the signature is evaluated once, when the function
    # is defined, which froze the timestamp; build it on each call instead.
    if content_wxfs is None:
        content_wxfs = time.strftime('运行时间: \n%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    url_wxfs = 'http://www.pushplus.plus/send'
    data = {
        "token": token,
        "title": title_wxfs,
        "content": content_wxfs,
        "template": "html"
    }
    response_wxfs = requests.post(url=url_wxfs, json=data)
    # Print the service's reply under its own name rather than rebinding the parameter.
    print("微信推送:", response_wxfs.text)


# DES encryption, delegated to the JavaScript implementation in des.js
def strEnc(data, firstKey, secondKey, thirdKey):
    """Call the JavaScript ``strEnc`` function from ``des.js`` and return its result.

    ``des.js`` is opened by relative path, read as UTF-8 and compiled with
    ``execjs`` on every call. The four arguments are forwarded unchanged.
    """
    with open('des.js', encoding='utf-8') as script:
        # Compile the script text so its functions can be called from Python.
        des_context = execjs.compile(script.read())
    # Forward the arguments to the JavaScript-side function of the same name.
    return des_context.call('strEnc', data, firstKey, secondKey, thirdKey)


# Collect the data needed for logging in: request the login page through a
# session and read the form values the login POST needs.
headers = {
    "Host": "approve.qqhru.edu.cn",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.48"
}
url = 'http://approve.qqhru.edu.cn/tpass/login?service=http%3A%2F%2Fapprove.qqhru.edu.cn%2Ftp_up%2F'
session = requests.session()
response = session.get(url, headers=headers)

# lt, execution and _eventId are the values of form inputs 4, 5 and 6 (by position).
html = etree.HTML(response.text)
lt, execution, _eventId = (
    html.xpath(field_path)[0]
    for field_path in (
        '/html/body/form/input[4]/@value',
        '/html/body/form/input[5]/@value',
        '/html/body/form/input[6]/@value',
    )
)

# Text passed to strEnc: account + password + lt, with the keys '1', '2', '3'.
data = Username + Password + lt
rsa = strEnc(data, '1', '2', '3')
# ul / pl: lengths of the account and of the password.
ul, pl = len(Username), len(Password)

# Log in: POST the form fields through the same session, reusing the earlier headers.
url = 'http://approve.qqhru.edu.cn/tpass/login'
data_post = dict(
    rsa=rsa,
    ul=ul,
    pl=pl,
    lt=lt,
    execution=execution,
    _eventId=_eventId,
)
response = session.post(url, data=data_post, headers=headers)

# Query the news API. It is a school site, so err on the side of caution and
# send a full set of request headers.
# An unrecognised NewsID falls back to "220133".
if NewsID not in ("220146", "220145", "220133", "126841", "126842", "126840"):
    NewsID = "220133"
# Each group of three board codes is paired with its own owner ID.
OwnwrID = "1260695749" if NewsID in ("126841", "126842", "126840") else "1662531519"

headers = {
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'Cache-Control': 'no-cache',
    'Content-Type': 'application/json;charset=UTF-8',
    'Origin': 'http://approve.qqhru.edu.cn',
    'Pragma': 'no-cache',
    'Proxy-Connection': 'keep-alive',
    'Referer': 'http://approve.qqhru.edu.cn/tp_up/view?m=up',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.48',
    'X-Requested-With': 'XMLHttpRequest',
}
url = 'http://approve.qqhru.edu.cn/tp_up/up/subgroup/getOANewsInfo'
json_data = dict(
    ONETYPE='1',  # '0' may also work; no difference observed in testing so far
    OWNERID=OwnwrID,
    NEWSID=NewsID,
    NEWSCOUNT=NewsCount,
)
response = session.post(url, json=json_data, headers=headers)
response.encoding = 'utf-8'
# Show the raw body first, then the decoded JSON.
print(response.text)
decoded_news = response.json()
print(decoded_news)
# Save the result to jg.txt
listTxt(decoded_news, './jg.txt')

#
# Assemble the HTML message in one pass, then push it to WeChat.
sc = "".join((
    # Bootstrap 3.4.1 stylesheet link.
    '<link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/3.4.1/css/bootstrap.min.css" integrity="sha384-HSMxcRTRxnN+Bdg0JdbxYKrThecOKuH5zCYotlSAcp1+c8xmyTe9GYg1l9a69psu" crossorigin="anonymous">',
    # Table produced by json2html from the raw response text.
    json2html.convert(json=response.text, table_attributes='id="tableone" class="table table-hover"'),
    # Timestamp of this run (current local time).
    time.strftime('执行时间: \n%Y-%m-%d %H:%M:%S', time.localtime()),
    # CSS that hides columns 1, 2, 4, 6 and 7 and gives .jumbotron a white background.
    '<style>td:nth-child(1),td:nth-child(2),td:nth-child(4),td:nth-child(6),td:nth-child(7),th:nth-child(1),th:nth-child(2),th:nth-child(4),th:nth-child(6),th:nth-child(7){display:none}.jumbotron{background-color:#fff}</style>',
))
wxfs('智慧校园', sc)