Campus Recruitment Assistant: Storing Data with PyMySQL

Previously I built the WeChat bot for career-fair notifications; the groups have grown to 7 for Xidian (XDU) plus 5 for XUPT, with roughly 3,000 users in total. The modest goal is for everyone job-hunting at Xidian this year to be able to use it.
A few friends and I hit it off right away: let's build a mini program!

The same old three-step crawler routine:

  • Simulated login
  • Data download
  • Data storage

Today I did the third step, which will serve as the mini program's data source.

Creating the Database

Create the database and the table.
[Screenshot: the hireinfo table definition in MySQL]

Things to watch out for:

  • infoid should be set to auto-increment
  • contents holds a lot of text, so use the TEXT type (see the schema sketch below)
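
A minimal sketch of the DDL, with column names inferred from the INSERT statements later in this post (the real dump is linked at the end, so treat this as an approximation):

import pymysql

connection = pymysql.connect(host='XXXX', user='XXX', password='XXX', charset='utf8')
with connection.cursor() as cur:
    cur.execute("CREATE DATABASE IF NOT EXISTS campushire DEFAULT CHARSET utf8")
    # column list matches the INSERT statements below; the types are guesses
    cur.execute("""
        CREATE TABLE IF NOT EXISTS campushire.hireinfo (
            infoid   INT UNSIGNED NOT NULL AUTO_INCREMENT,  -- auto-increment key
            title    VARCHAR(255),                          -- post title
            links    VARCHAR(255),                          -- shortened URL
            viewnum  INT DEFAULT 0,                         -- view count
            class    VARCHAR(64),                           -- source category, e.g. 'rs'
            contents TEXT,                                  -- full body text, too long for VARCHAR
            PRIMARY KEY (infoid)
        ) DEFAULT CHARSET = utf8
    """)
connection.close()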

Connecting to the Database

In Python, import PyMySQL first; the connection code (credentials redacted):

connection = pymysql.connect(host='XXXX', user='XXX', password='XXX',
                             db='campushire', charset='utf8',
                             cursorclass=pymysql.cursors.DictCursor)
cur = connection.cursor()
cur.execute("USE campushire")

This connects to the database and selects where to insert the data. Note that since db='campushire' is already passed to connect(), the extra USE statement is redundant, though harmless.
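
It is also worth closing the cursor and connection deterministically; a small sketch using pymysql's cursor context manager (same placeholder credentials):

import pymysql

connection = pymysql.connect(host='XXXX', user='XXX', password='XXX',
                             db='campushire', charset='utf8',
                             cursorclass=pymysql.cursors.DictCursor)
try:
    with connection.cursor() as cur:          # the cursor is closed on block exit
        cur.execute("SELECT COUNT(*) AS n FROM hireinfo")
        print(cur.fetchone()['n'])            # DictCursor returns rows as dicts
finally:
    connection.close()                        # always release the connection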

Inserting Data

The insert itself is easy; the tricky part is inserting only the data you want and filtering out the rest. A few simple calls are worth writing down again here; I had used them before and then forgot...

[s.extract() for s in tiao_bsObj.findAll('p', attrs={'class': "windowClose"})]

This expression removes all matching tags (and everything inside them) from the parse tree. Here it is used to strip the editor metadata that precedes the actual post body on Ruisi, the close-window link on the job information site, and similar non-body content.
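
A minimal, self-contained illustration of the pattern (the HTML here is made up for the demo):

from bs4 import BeautifulSoup

html = '<div><p class="windowClose">close window</p><p>the actual body</p></div>'
soup = BeautifulSoup(html, 'lxml')

# extract() detaches each matched tag (with its children) from the tree and
# returns it; the list comprehension is just a compact loop over all matches.
[s.extract() for s in soup.findAll('p', attrs={'class': 'windowClose'})]

print(soup.get_text())  # -> 'the actual body'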

The insert statement:

cur.execute("INSERT INTO hireinfo (title,links,viewnum,class,contents) VALUES (\"%s\",\"%s\",\"%s\",\"%s\",\"%s\")", (P,L,V,C,R))
cur.connection.commit()
time.sleep(3)

Insert, then commit. The delay is still worth keeping; hitting the database too quickly seemed to be what caused the earlier interruptions, though I'm not sure.
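
If those interruptions were dropped connections, pymysql can also re-establish the link explicitly rather than relying on the sleep alone; a sketch, with a hypothetical helper named safe_insert:

import time
import pymysql

def safe_insert(connection, sql, params, retries=3):
    """Insert with a reconnect-and-retry loop (a sketch, not the original code)."""
    for attempt in range(retries):
        try:
            connection.ping(reconnect=True)   # reopen the connection if the server dropped it
            with connection.cursor() as cur:
                cur.execute(sql, params)
            connection.commit()
            return
        except pymysql.err.OperationalError:
            time.sleep(3)                     # back off before retrying
    raise RuntimeError('insert failed after %d attempts' % retries)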

Be careful that the column names match the table exactly, that the number of %s placeholders matches, and that the values tuple lines up one-to-one with the columns. Also leave the %s placeholders unquoted: pymysql quotes and escapes the parameters itself, so wrapping them in quotation marks would store stray quote characters.
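
One way to keep the column list and the values aligned automatically is to build the statement from a dict; a hypothetical helper, not part of the original script:

def insert_row(cur, table, row):
    """Derive the column list and placeholders from one dict, so they cannot drift apart."""
    cols = ', '.join(row)                    # column names, in dict order
    marks = ', '.join(['%s'] * len(row))     # one %s per column
    # table/column names come from our own code here, never from user input
    cur.execute("INSERT INTO %s (%s) VALUES (%s)" % (table, cols, marks),
                tuple(row.values()))

insert_row(cur, 'hireinfo',
           {'title': P, 'links': L, 'viewnum': V, 'class': C, 'contents': R})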

And that's basically it.


Off to eat; I'm starving.

The full code:

# coding: utf-8
import contextlib
import sys, io
import time
import urllib.error
import urllib.request
from urllib import request
from urllib.request import urlopen
from urllib.parse import urlencode

import itchat
import pymysql
from bs4 import BeautifulSoup

# make console output UTF-8 safe
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8')


def getPageContent(url):
    """Fetch a page with a browser User-Agent and parse it with lxml."""
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:29.0) Gecko/20100101 Firefox/29.0'}
    req = urllib.request.Request(url=url, headers=headers)
    try:
        res = urllib.request.urlopen(req)
    except urllib.error.URLError as e:
        return e
    return BeautifulSoup(res.read(), "lxml")


def make_tiny(url):
    """Shorten a URL through the TinyURL API."""
    request_url = 'http://tinyurl.com/api-create.php?' + urlencode({'url': url})
    with contextlib.closing(urlopen(request_url)) as response:
        return response.read().decode('utf-8')


def timer(n):
    # leftover from the WeChat-bot version: log in, then wait n seconds
    itchat.auto_login(hotReload=True)
    time.sleep(n)


def rs():
    """Crawl the Ruisi forum job board and store each post in hireinfo."""
    pageURL = set()
    for i in range(1, 10):
        pages = 'http://rsbbs.xidian.edu.cn/forum.php?mod=forumdisplay&fid=554&page=' + str(i) + '&mobile=2'
        if pages not in pageURL:
            headers = {
                'User-Agent': r'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                              r'Chrome/45.0.2454.85 Safari/537.36 115Browser/6.0.3',
                'Referer': r'http://rsbbs.xidian.edu.cn',
                'Connection': 'keep-alive'}
            req = request.Request(pages, headers=headers)
            html = request.urlopen(req)
            bsObj = BeautifulSoup(html.read(), "lxml")
            # drop the "posted on ..." status tags
            [s.extract() for s in bsObj.findAll('i', attrs={'class': "pstatus"})]
            tiezi = bsObj.findAll("ul")
            for tiaos in tiezi:
                for tiao in tiaos.findAll('a'):
                    for person in tiao.findAll('span', attrs={'class': "by"}):
                        T = person.get_text().strip()  # author (currently unused)
                    # remove the author span so it does not pollute the title
                    [s.extract() for s in tiao.findAll('span', attrs={'class': "by"})]
                    # title, with coin-reward markers stripped
                    P = tiao.get_text().strip().strip('【散金币】').strip('【金币】').strip('(散金币)').strip('(金币)')
                    if 'href' in tiao.attrs:
                        try:
                            tiao_links = "http://rsbbs.xidian.edu.cn/" + tiao.attrs['href']
                            tiao_html = urlopen(tiao_links)
                            L = str(make_tiny(tiao_links))
                            tiao_bsObj = BeautifulSoup(tiao_html.read(), "lxml")
                            [s.extract() for s in tiao_bsObj.findAll('i', attrs={'class': "pstatus"})]
                            content = tiao_bsObj.findAll("div", {"class": "message"})[0]
                            R = content.get_text().strip()
                            V = 0
                            C = 'rs'
                            # insert inside the try block, so incomplete records are skipped
                            cur.execute(
                                "INSERT INTO hireinfo (title,links,contents,viewnum,class) VALUES (%s,%s,%s,%s,%s)",
                                (P, L, R, V, C))
                            cur.connection.commit()
                            time.sleep(3)  # throttle: 3 seconds between inserts
                        except (ValueError, IndexError):
                            pass
            pageURL.add(pages)


def xdjobs():
    """Crawl the university job information site and store each posting."""
    url = 'http://job.xidian.edu.cn/html/zpxx/jobs/'
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:29.0) Gecko/20100101 Firefox/29.0'}
    req = urllib.request.Request(url=url, headers=headers)
    res = urllib.request.urlopen(req)
    page_content = BeautifulSoup(res.read(), "lxml")
    job_content = page_content.find("div", {"class": "content"})
    rows = job_content.findAll("span")
    for row in rows:
        for cell in row.findAll('a'):
            P = cell.get_text().strip()
            tiao_links = "http://job.xidian.edu.cn" + cell.attrs['href']
            L = str(make_tiny(tiao_links))
            tiao_html = urllib.request.Request(url=tiao_links, headers=headers)
            tiao_res = urllib.request.urlopen(tiao_html)
            tiao_bsObj = BeautifulSoup(tiao_res.read(), "lxml")
            # strip everything that is not the article body
            [s.extract() for s in tiao_bsObj.findAll('p', attrs={'class': "windowClose"})]
            [s.extract() for s in tiao_bsObj.findAll('p', attrs={'class': "arcInfo"})]
            [s.extract() for s in tiao_bsObj.findAll('a', attrs={'href': "javascript:window.print()"})]
            [s.extract() for s in tiao_bsObj.findAll('a', attrs={'href': "javascript:window.close()"})]
            [s.extract() for s in tiao_bsObj.findAll('div', attrs={'class': "context"})]
            content = tiao_bsObj.findAll("div", {"class": "content"})[0]
            R = content.get_text().strip()
            V = 0
            C = "就业信息网"  # category: the job information site
            cur.execute(
                "INSERT INTO hireinfo (title,links,viewnum,class,contents) VALUES (%s,%s,%s,%s,%s)",
                (P, L, V, C, R))
            cur.connection.commit()
            time.sleep(3)


connection = pymysql.connect(host='XXX', user='XXX', password='XXX', db='campushire', charset='utf8',
                             cursorclass=pymysql.cursors.DictCursor)
cur = connection.cursor()
cur.execute("USE campushire")
xdjobs()
rs()

SQL file: click to download

Program and Database Updates

  • Added crawling of career-fair (info session) listings
  • Added an insertion timestamp to the database writes (see the schema sketch and updated script below)
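
The updated INSERT statements reference classname and dbtime columns on hireinfo, plus a new seminar table, so the schema presumably changed along these lines (a sketch of the migration, not the actual dump; the key name seminarid is made up):

# sketch of the schema change implied by the updated INSERT statements below
cur.execute("ALTER TABLE hireinfo CHANGE `class` classname VARCHAR(64)")
cur.execute("ALTER TABLE hireinfo ADD COLUMN dbtime DATETIME")
cur.execute("""
    CREATE TABLE IF NOT EXISTS seminar (
        seminarid  INT UNSIGNED NOT NULL AUTO_INCREMENT,  -- hypothetical key name
        university VARCHAR(64),
        company    VARCHAR(255),
        time1      VARCHAR(64),
        position   VARCHAR(255),
        dbtime     DATETIME,
        PRIMARY KEY (seminarid)
    ) DEFAULT CHARSET = utf8
""")

The updated script: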
# coding: utf-8
import contextlib
import datetime
import sys, io
import time
import urllib.error
import urllib.request
from urllib import request
from urllib.request import urlopen
from urllib.parse import urlencode

import itchat
import pymysql
from bs4 import BeautifulSoup

# make console output UTF-8 safe
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8')


def getJobInfo(page_content):
    """Flatten one career-fair table into a list: [university, company, ..., time, position]."""
    job_content = page_content.find("table", {"class": "table cxxt-table"})
    rows = job_content.findAll("tr")[1:]  # skip the header row
    job_info = []
    for row in rows:
        for cell in row.findAll('td')[:-2]:
            # strip badge text and campus abbreviations from the cell
            info = cell.get_text().replace('&nbsp', '').strip("官云宣讲").strip("官").strip("西电").strip("交大").strip("西工大")
            for sp in cell.findAll('span')[1:2]:
                job_info.append(sp.get_text())
            job_info.append(info)
    return job_info[1:]


def getPageContent(url):
    """Fetch a page with a browser User-Agent and parse it with lxml."""
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:29.0) Gecko/20100101 Firefox/29.0'}
    req = urllib.request.Request(url=url, headers=headers)
    try:
        res = urllib.request.urlopen(req)
    except urllib.error.URLError as e:
        return e
    return BeautifulSoup(res.read(), "lxml")


def make_tiny(url):
    """Shorten a URL through the TinyURL API."""
    request_url = 'http://tinyurl.com/api-create.php?' + urlencode({'url': url})
    with contextlib.closing(urlopen(request_url)) as response:
        return response.read().decode('utf-8')


def timer(n):
    # leftover from the WeChat-bot version: log in, then wait n seconds
    itchat.auto_login(hotReload=True)
    time.sleep(n)


def rs():
    """Crawl the Ruisi forum job board and store each post, with an insertion timestamp."""
    pageURL = set()
    for i in range(1, 10):
        pages = 'http://rsbbs.xidian.edu.cn/forum.php?mod=forumdisplay&fid=554&page=' + str(i) + '&mobile=2'
        if pages not in pageURL:
            headers = {
                'User-Agent': r'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                              r'Chrome/45.0.2454.85 Safari/537.36 115Browser/6.0.3',
                'Referer': r'http://rsbbs.xidian.edu.cn',
                'Connection': 'keep-alive'}
            req = request.Request(pages, headers=headers)
            html = request.urlopen(req)
            bsObj = BeautifulSoup(html.read(), "lxml")
            [s.extract() for s in bsObj.findAll('i', attrs={'class': "pstatus"})]
            tiezi = bsObj.findAll("ul")
            for tiaos in tiezi:
                for tiao in tiaos.findAll('a'):
                    for person in tiao.findAll('span', attrs={'class': "by"}):
                        T = person.get_text().strip()  # author (currently unused)
                    [s.extract() for s in tiao.findAll('span', attrs={'class': "by"})]
                    P = tiao.get_text().strip().strip('【散金币】').strip('【金币】').strip('(散金币)').strip('(金币)')
                    if 'href' in tiao.attrs:
                        try:
                            tiao_links = "http://rsbbs.xidian.edu.cn/" + tiao.attrs['href']
                            tiao_html = urlopen(tiao_links)
                            L = str(make_tiny(tiao_links))
                            tiao_bsObj = BeautifulSoup(tiao_html.read(), "lxml")
                            [s.extract() for s in tiao_bsObj.findAll('i', attrs={'class': "pstatus"})]
                            content = tiao_bsObj.findAll("div", {"class": "message"})[0]
                            R = content.get_text().strip()
                            V = 0
                            C = 'rs'
                            DBT = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")  # insertion time
                            cur.execute(
                                "INSERT INTO hireinfo (title,links,contents,viewnum,classname,dbtime) "
                                "VALUES (%s,%s,%s,%s,%s,%s)",
                                (P, L, R, V, C, DBT))
                            cur.connection.commit()
                            time.sleep(3)  # throttle between inserts
                        except (ValueError, IndexError):
                            pass
            pageURL.add(pages)


def xdjobs():
    """Crawl the job information site and store each posting, with an insertion timestamp."""
    url = 'http://job.xidian.edu.cn/html/zpxx/jobs/'
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:29.0) Gecko/20100101 Firefox/29.0'}
    req = urllib.request.Request(url=url, headers=headers)
    res = urllib.request.urlopen(req)
    page_content = BeautifulSoup(res.read(), "lxml")
    job_content = page_content.find("div", {"class": "content"})
    rows = job_content.findAll("span")
    for row in rows:
        for cell in row.findAll('a'):
            P = cell.get_text().strip()
            tiao_links = "http://job.xidian.edu.cn" + cell.attrs['href']
            L = str(make_tiny(tiao_links))
            tiao_html = urllib.request.Request(url=tiao_links, headers=headers)
            tiao_res = urllib.request.urlopen(tiao_html)
            tiao_bsObj = BeautifulSoup(tiao_res.read(), "lxml")
            # strip everything that is not the article body
            [s.extract() for s in tiao_bsObj.findAll('p', attrs={'class': "windowClose"})]
            [s.extract() for s in tiao_bsObj.findAll('p', attrs={'class': "arcInfo"})]
            [s.extract() for s in tiao_bsObj.findAll('a', attrs={'href': "javascript:window.print()"})]
            [s.extract() for s in tiao_bsObj.findAll('a', attrs={'href': "javascript:window.close()"})]
            [s.extract() for s in tiao_bsObj.findAll('div', attrs={'class': "context"})]
            content = tiao_bsObj.findAll("div", {"class": "content"})[0]
            R = content.get_text().strip()
            DBT = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")  # insertion time
            V = 0
            C = "就业信息网"
            cur.execute(
                "INSERT INTO hireinfo (title,links,viewnum,classname,contents,dbtime) "
                "VALUES (%s,%s,%s,%s,%s,%s)",
                (P, L, V, C, R, DBT))
            cur.connection.commit()
            time.sleep(3)


def seminar():
    """Crawl career-fair listings for the three local universities into the seminar table."""
    urls = ['https://xjh.haitou.cc/xa/uni-29', 'https://xjh.haitou.cc/xa/uni-28', 'https://xjh.haitou.cc/xa/uni-27']
    try:
        for u in urls:
            page_content = getPageContent(u)
            job_content = page_content.find("table", {"class": "table cxxt-table"})
            rows = job_content.findAll("tr")[1:]  # skip the header row
            for row in rows:
                job_info = []
                for cell in row.findAll('td')[:-2]:
                    info = cell.get_text().replace('&nbsp', '').strip("官云宣讲").strip("热").strip("官").strip("西电").strip("交大").strip("西工大")
                    for sp in cell.findAll('span')[1:2]:
                        job_info.append(sp.get_text())
                    job_info.append(info)
                jobinfo = job_info[1:]
                DBT = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")  # insertion time
                U = jobinfo[0]  # university
                C = jobinfo[1]  # company
                T = jobinfo[3]  # session time
                P = jobinfo[4]  # venue / position
                cur.execute(
                    "INSERT INTO seminar (university,company,time1,position,dbtime) VALUES (%s,%s,%s,%s,%s)",
                    (U, C, T, P, DBT))
                cur.connection.commit()
    except (AttributeError, IndexError):
        print("【西电今日无招聘会信息哦】")  # no career-fair info for Xidian today


connection = pymysql.connect(host='localhost', user='root', password='root', db='campushire', charset='utf8',
                             cursorclass=pymysql.cursors.DictCursor)
cur = connection.cursor()
cur.execute("USE campushire")

while True:
    time_now = time.strftime('%H%M', time.localtime(time.time()))
    if time_now == '0000':  # fire the daily crawl at midnight
        xdjobs()
        rs()
        seminar()
    time.sleep(60)  # poll once a minute instead of busy-waiting

New SQL file: click to download

Thanks for treating me to candy!