【爬虫】模拟登录西电睿思领金币

爬虫入门

之前对爬虫做了点杂乱的简单学习:

西电教务处爬虫及数据处理

对西电研究生教务处系统进行了相关数据的爬取:

西电睿思爬虫

西电睿思就是西电学子们的一个家。
有个长远的打算：希望年底能对睿思做一次分析报告，了解西电er都在关心什么。

睿思爬虫领金币

第一个就拿领取金币来说吧。OK!
剩下待定,慢慢来。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
# -*- coding: utf-8 -*-
import urllib.request, urllib.error, urllib.parse
import urllib.request, urllib.parse, urllib.error
import http.cookiejar
import re
import sys
import imp
# NOTE(review): this looks like a leftover of the Python 2 idiom
# `reload(sys); sys.setdefaultencoding(...)`. Nothing visible in this file
# depends on `sys` being reloaded -- confirm before removing. The `imp`
# module is deprecated and was removed in Python 3.12.
imp.reload(sys)
def get_hash(url, opener):
    """Fetch *url* through *opener* and return the page's ``formhash`` token.

    Args:
        url: Address of a page that embeds a ``name="formhash"`` form field.
        opener: Object whose ``open(url)`` returns a response usable as a
            context manager and offering ``read()`` (for example an opener
            built with ``urllib.request.build_opener``).

    Returns:
        The text of the ``value`` attribute that follows ``name="formhash"``.

    Raises:
        Exception: If the page contains no ``formhash`` field.
    """
    # Close the response deterministically instead of leaking the connection.
    with opener.open(url) as response:
        page = response.read().decode('utf-8')

    # search() already scans the whole text, so the pattern needs no leading
    # or trailing `.*?`; the captured group is the same.
    match = re.search(r'name="formhash".*?value="(.*?)"', page)
    if not match:
        raise Exception('GET formhash Fail!')
    return match.group(1)
class Spider():
    """Cookie-keeping client for the forum at ``http://rs.xidian.edu.cn/``.

    The instance logs in with the supplied credentials, lists page 1 of the
    board with ``fid=72`` and posts a stock reply to every thread whose title
    contains '金币'.
    """

    def __init__(self, username=None, password=None):
        """Prepare URLs, the login form body, headers and a cookie-aware opener.

        Args:
            username: Forum account name placed in the login form.
            password: Forum password placed in the login form.
        """
        self.front_page_url = 'http://rs.xidian.edu.cn/'
        self.loginurl = 'http://rs.xidian.edu.cn/member.php?mod=logging&action=login&loginsubmit=yes&infloat=yes&lssubmit=yes'
        self.shuiQu_url = 'http://rs.xidian.edu.cn/forum.php?mod=forumdisplay&fid=72&page=1'
        # URL-encoded login form, already converted to bytes for use as POST data.
        self.postdata = urllib.parse.urlencode({
            'username': username,
            'password': password,
            'quickforward': 'yes',
            'handlekey': 'ls',
        }).encode(encoding='UTF8')
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:29.0) Gecko/20100101 Firefox/29.0'
        }
        # Every request goes through this opener so cookies set at login are
        # sent with the later requests.
        self.cookieJar = http.cookiejar.CookieJar()
        self.opener = urllib.request.build_opener(
            urllib.request.HTTPCookieProcessor(self.cookieJar)
        )

    def login(self):
        """POST the login form; print the error, or a success banner."""
        req = urllib.request.Request(
            url=self.loginurl,
            data=self.postdata,
            headers=self.headers,
        )
        try:
            response = self.opener.open(req)
        except Exception as e:
            print(e)
        else:
            # The body is not inspected, so release the connection right away.
            response.close()
            # NOTE(review): this only shows that the request completed; the
            # server's answer is not checked, so wrong credentials presumably
            # still reach this line -- confirm against the site's response.
            print("登录成功!抓取金币开始!")

    def comment(self, tid):
        """Post the stock reply '帮顶 ' to thread *tid*.

        Prints '水了一贴' when the first 500 bytes of the answer contain '成功'.
        A failing POST is printed instead of raised; errors raised by
        ``get_hash`` while fetching the form token propagate to the caller.

        Args:
            tid: Thread id, interpolated into the reply and thread-view URLs.
        """
        url = self.front_page_url + 'forum.php?mod=post&action=reply&fid=72&tid=%s&extra=&replysubmit=yes&infloat=yes&handlekey=fastpost' % str(tid)
        # The reply form must carry the formhash token embedded in the thread page.
        formhash = get_hash(
            self.front_page_url + 'forum.php?mod=viewthread&tid=%s' % tid,
            self.opener,
        )
        data = urllib.parse.urlencode({
            'formhash': formhash,
            'message': '帮顶 ',
            'usesig': '1',
        }).encode(encoding='UTF8')
        req = urllib.request.Request(
            url=url,
            data=data,
            headers=self.headers,
        )
        try:
            response = self.opener.open(req)
        except Exception as e:
            print(e)
        else:
            with response:
                head = response.read(500)
            # A 500-byte cut can land inside a multi-byte UTF-8 character;
            # drop the broken tail instead of raising UnicodeDecodeError.
            if '成功' in head.decode('utf-8', errors='ignore'):
                print('水了一贴')

    def guanShui(self):
        """Scan the board listing and reply to every thread titled with '金币'.

        Prints each matching title before replying, and prints
        '没有散金币主题' once when no such thread was found.
        """
        with self.opener.open(self.shuiQu_url) as response:
            page = response.read().decode('utf-8')
        # Each hit is a (thread href, thread title) pair.
        items = re.findall('.*?<tbody id="no.*?<tr.*?<td.*?<th.*?<a.*?<a href="(.*?)".*?>(.*?)</a>.*?</tbody>', page, re.S)
        found = False
        for topic_href, title in items:
            if '金币' not in title:
                continue
            found = True
            topic_url = topic_href.replace('&amp;', '&')
            print(title)
            tid_match = re.search(r'tid=(\d+)', topic_url)
            if tid_match is None:
                # No thread id in the link: nothing to reply to.
                continue
            self.comment(tid_match.group(1))
        if not found:
            print('没有散金币主题')
# Script entry point: log in, then scan the board and reply to matching threads.
if __name__ == '__main__':
    print("Hello！Rser! 正在登录!···")
    # Fill in your own forum credentials before running.
    username = ''
    password = ''
    my_spider = Spider(username=username, password=password)
    my_spider.login()
    my_spider.guanShui()
谢谢你请我吃糖果!