Earlier I built a WeChat bot that pushes job-fair announcements; there are now 7 WeChat groups for Xidian (西电) plus 5 for XUPT (西邮), with roughly 3,000 users in total. The modest goal is for everyone at Xidian who is job hunting this year to be able to use it.
A few friends and I hit it off right away: let's build a WeChat mini program!
It's the same old story; a crawler takes three steps:
- Simulated login
- Data download
- Data storage
Today I did the third step, which will be the data source for the mini program.
Creating the database
Create the database and its tables.
Things to watch out for (a sketch of the DDL follows this list):
- infoid should be set to auto-increment
- contents holds a lot of text, so it should be of type TEXT
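A minimal sketch of the table, assuming the column set implied by the INSERT statements later in the post; every type other than infoid and contents is a guess. cur is the PyMySQL cursor created in the next section:

```python
# Hypothetical schema: column names match the INSERT in the full script,
# but every type except infoid and contents is an assumption.
cur.execute("""
    CREATE TABLE IF NOT EXISTS hireinfo (
        infoid    INT AUTO_INCREMENT PRIMARY KEY,  -- auto-increment id
        title     VARCHAR(255),
        links     VARCHAR(255),                    -- shortened TinyURL link
        contents  TEXT,    -- post bodies are long, so TEXT rather than VARCHAR
        viewnum   INT,
        classname VARCHAR(64),
        dbtime    DATETIME
    ) DEFAULT CHARSET=utf8
""")
```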
Connecting to the database
In Python, import PyMySQL first; the connection call looks like this (credentials redacted):
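Here is the call, with placeholders standing in for the redacted credentials; the host, database name, and cursor class match the full script at the end of the post:

```python
import pymysql

connection = pymysql.connect(host='localhost',
                             user='****',        # redacted
                             password='****',    # redacted
                             db='campushire',
                             charset='utf8',
                             cursorclass=pymysql.cursors.DictCursor)
cur = connection.cursor()
cur.execute("USE campushire")  # select the database the inserts will target
```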
That connects you to the database; from there, pick the table you want to insert into.
Inserting data
Inserting the data itself is easy; the tricky part is inserting only the data you want and filtering out what you don't. A few simple functions are worth mentioning here again; I had used them before but forgot them...
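The workhorse is BeautifulSoup's extract(), which deletes a matching tag from the parse tree. The selectors below are the ones the full script uses:

```python
# Remove the "edited by ..." banner from Ruisi posts
[s.extract() for s in bsObj.findAll('i', attrs={'class': "pstatus"})]
# Remove close-window links and editor info on the career-center pages
[s.extract() for s in bsObj.findAll('p', attrs={'class': "windowClose"})]
[s.extract() for s in bsObj.findAll('p', attrs={'class': "arcInfo"})]
```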
The statement removes all content under the matched tag. Here it is used to strip the editor metadata that precedes the post body on Ruisi (睿思), as well as non-body elements such as the close-window link on the career-center site (就业信息网).
The insert statement:
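As it appears in the full script below, where P, L, R, V, C, and DBT hold the scraped title, shortened link, body text, view count, source name, and timestamp:

```python
cur.execute(
    "INSERT INTO hireinfo (title,links,contents,viewnum,classname,dbtime) "
    "VALUES (%s,%s,%s,%s,%s,%s)",
    (P, L, R, V, C, DBT))
cur.connection.commit()
time.sleep(3)  # throttle; inserting too fast seemed to drop the connection
```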
Execute the insert, then commit. The delay is still necessary; as far as I recall, hitting the database too fast is what caused the connection to drop before.
Note that the column names must be spelled correctly, the number of VALUES placeholders must match, and the values being inserted must line up with the columns one-to-one.
That's basically it.
Off to eat; I'm starving.
Code:
sql: click to download
Program and database updates
- Added crawling of campus info-session (宣讲会) listings
- Added a timestamp (dbtime) on every database insert

```python
# coding: utf-8
# Imports deduplicated from the original; the Python 2 fallbacks were dead code.
import contextlib
import datetime
import io
import sys
import time
import urllib.error
import urllib.request
from urllib import request
from urllib.parse import urlencode
from urllib.request import urlopen

import itchat  # leftover from the WeChat-bot version; only timer() uses it
import pymysql
from bs4 import BeautifulSoup

sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8')


def getJobInfo(page_content):
    """Parse the haitou.cc listing table into a flat list of strings."""
    job_content = page_content.find("table", {"class": "table cxxt-table"})
    rows = job_content.findAll("tr")[1:]  # skip the header row
    job_info = []
    for row in rows:
        for cell in row.findAll('td')[:-2]:
            # Strip decorations (官/云宣讲) and school tags from the cell text
            info = (cell.get_text().replace(' ', '')
                    .strip("官云宣讲").strip("官")
                    .strip("西电").strip("交大").strip("西工大"))
            for span in cell.findAll('span')[1:2]:
                job_info.append(span.get_text())
            job_info.append(info)
    return job_info[1:]


def getPageContent(url):
    """Fetch a URL and return it parsed with BeautifulSoup."""
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:29.0) '
                             'Gecko/20100101 Firefox/29.0'}
    req = urllib.request.Request(url=url, headers=headers)
    try:
        res = urllib.request.urlopen(req)
    except urllib.error.URLError as e:
        return e
    return BeautifulSoup(res.read(), "lxml")


def make_tiny(url):
    """Shorten a link through the TinyURL API."""
    request_url = 'http://tinyurl.com/api-create.php?' + urlencode({'url': url})
    with contextlib.closing(urlopen(request_url)) as response:
        return response.read().decode('utf-8')


def timer(n):
    # Leftover from the WeChat-bot version of this script
    itchat.auto_login(hotReload=True)
    time.sleep(n)


def rs():
    """Crawl the jobs board (fid=554) of the Ruisi campus forum."""
    pageURL = set()
    for i in range(1, 10):
        pages = ('http://rsbbs.xidian.edu.cn/forum.php?mod=forumdisplay'
                 '&fid=554&page=' + str(i) + '&mobile=2')
        if pages in pageURL:
            continue
        headers = {
            'User-Agent': r'Mozilla/5.0 (Windows NT 6.1; WOW64) '
                          r'AppleWebKit/537.36 (KHTML, like Gecko) '
                          r'Chrome/45.0.2454.85 Safari/537.36 115Browser/6.0.3',
            'Referer': r'http://rsbbs.xidian.edu.cn',
            'Connection': 'keep-alive',
        }
        req = request.Request(pages, headers=headers)
        html = request.urlopen(req)
        bsObj = BeautifulSoup(html.read(), "lxml")
        # Drop the "edited by ..." banners before extracting text
        [s.extract() for s in bsObj.findAll('i', attrs={'class': "pstatus"})]
        for tiezi in bsObj.findAll("ul"):
            for tiao in tiezi.findAll('a'):
                for person in tiao.findAll('span', attrs={'class': "by"}):
                    T = person.get_text().strip()  # poster name (unused for now)
                [s.extract() for s in tiao.findAll('span', attrs={'class': "by"})]
                P = (tiao.get_text().strip()
                     .strip('【散金币】').strip('【金币】')
                     .strip('(散金币)').strip('(金币)'))
                if 'href' not in tiao.attrs:
                    continue
                try:
                    tiao_links = "http://rsbbs.xidian.edu.cn/" + tiao.attrs['href']
                    tiao_html = urlopen(tiao_links)
                    L = str(make_tiny(tiao_links))
                    tiao_bsObj = BeautifulSoup(tiao_html.read(), "lxml")
                    [s.extract() for s in
                     tiao_bsObj.findAll('i', attrs={'class': "pstatus"})]
                    content = tiao_bsObj.findAll("div", {"class": "message"})[0]
                    R = content.get_text().strip()
                    V = 0
                    C = 'rs'
                    DBT = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                except (ValueError, IndexError):
                    continue  # was a bare `pass`, which could insert stale values
                pageURL.add(pages)
                cur.execute(
                    "INSERT INTO hireinfo "
                    "(title,links,contents,viewnum,classname,dbtime) "
                    "VALUES (%s,%s,%s,%s,%s,%s)",
                    (P, L, R, V, C, DBT))
                cur.connection.commit()
                time.sleep(3)  # throttle: inserting too fast dropped the connection


def xdjobs():
    """Crawl the Xidian career-center site (就业信息网)."""
    url = 'http://job.xidian.edu.cn/html/zpxx/jobs/'
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:29.0) '
                             'Gecko/20100101 Firefox/29.0'}
    req = urllib.request.Request(url=url, headers=headers)
    res = urllib.request.urlopen(req)
    page_content = BeautifulSoup(res.read(), "lxml")
    job_content = page_content.find("div", {"class": "content"})
    for row in job_content.findAll("span"):
        for cell in row.findAll('a'):
            P = cell.get_text().strip()
            tiao_links = "http://job.xidian.edu.cn" + cell.attrs['href']
            L = str(make_tiny(tiao_links))
            tiao_req = urllib.request.Request(url=tiao_links, headers=headers)
            tiao_res = urllib.request.urlopen(tiao_req)
            tiao_bsObj = BeautifulSoup(tiao_res.read(), "lxml")
            # Strip everything that is not article body
            [s.extract() for s in tiao_bsObj.findAll('p', attrs={'class': "windowClose"})]
            [s.extract() for s in tiao_bsObj.findAll('p', attrs={'class': "arcInfo"})]
            [s.extract() for s in tiao_bsObj.findAll('a', attrs={'href': "javascript:window.print()"})]
            [s.extract() for s in tiao_bsObj.findAll('a', attrs={'href': "javascript:window.close()"})]
            [s.extract() for s in tiao_bsObj.findAll('div', attrs={'class': "context"})]
            content = tiao_bsObj.findAll("div", {"class": "content"})[0]
            R = content.get_text().strip()
            DBT = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            V = 0
            C = "就业信息网"
            cur.execute(
                "INSERT INTO hireinfo "
                "(title,links,viewnum,classname,contents,dbtime) "
                "VALUES (%s,%s,%s,%s,%s,%s)",
                (P, L, V, C, R, DBT))
            cur.connection.commit()
            time.sleep(3)


def seminar():
    """Crawl info-session listings for XDU / XJTU / NWPU from haitou.cc."""
    urls = ['https://xjh.haitou.cc/xa/uni-29',
            'https://xjh.haitou.cc/xa/uni-28',
            'https://xjh.haitou.cc/xa/uni-27']
    try:
        for u in urls:
            page_content = getPageContent(u)
            job_content = page_content.find("table", {"class": "table cxxt-table"})
            rows = job_content.findAll("tr")[1:]
            job_info = []
            for row in rows:
                for cell in row.findAll('td')[:-2]:
                    info = (cell.get_text().replace(' ', '')
                            .strip("官云宣讲").strip("热").strip("官")
                            .strip("西电").strip("交大").strip("西工大"))
                    for span in cell.findAll('span')[1:2]:
                        job_info.append(span.get_text())
                    job_info.append(info)
                jobinfo = job_info[1:]
                DBT = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                U = jobinfo[0]  # university
                C = jobinfo[1]  # company
                T = jobinfo[3]  # session time
                P = jobinfo[4]  # venue / position
                cur.execute(
                    "INSERT INTO seminar "
                    "(university,company,time1,position,dbtime) "
                    "VALUES (%s,%s,%s,%s,%s)",
                    (U, C, T, P, DBT))
                cur.connection.commit()
                job_info = []
    except Exception:
        print("【西电今日无招聘会信息哦】")  # no seminar info for XDU today


connection = pymysql.connect(host='localhost', user='root', password='root',
                             db='campushire', charset='utf8',
                             cursorclass=pymysql.cursors.DictCursor)
cur = connection.cursor()
cur.execute("USE campushire")

while True:
    time_now = time.strftime('%H%M', time.localtime(time.time()))
    if int(time_now) == 0:  # 00:00 -- run the crawlers once a day at midnight
        xdjobs()
        rs()
        seminar()
    time.sleep(60)  # the original busy-waited; sleeping keeps it to one run per minute
```