Python爬虫之selenium

内容概述 : chromedriver翻页操作, PhantomJS翻页+MySQL操作, selenium

1. chromedriver翻页操作

基于chromedriver和selenium, 抓取QQ音乐歌手列表中歌手的名字和主页地址, 并保存到本地txt文件中.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
from selenium import webdriver
import time
from bs4 import BeautifulSoup
import codecs

# 使用之前安装以下两个工具 : selenium和chromedriver, 并且配置环境变量

def getSingerList(html):
    """Parse a rendered singer-list page and return a list of {title: href} dicts.

    Side effects: increments the module-level ``count`` and prints each
    singer found (used for progress logging in the main loop).
    """
    global count
    soup = BeautifulSoup(html, 'lxml')
    li_list = soup.find_all('li', class_="singer_list_txt__item")
    singerList = list()
    for li in li_list:
        count += 1
        title = li.a.get('title')
        href = li.a.get('href')
        singer = {title: href}
        print('NO.', count, ' : title=', title, ', href=', href)
        singerList.append(singer)
    return singerList

# Page-turning routine for the QQ Music singer list, driven through Chrome.
# If PhantomJS is used instead, the rendered result may differ, so the
# selectors would need to be adapted.
def nextPage(browser, file):
    """Scrape the current page, append results to *file*, then click "next page".

    :param browser: a selenium WebDriver already on the singer-list page
    :param file: an open writable file object; results are written as str()
    """
    time.sleep(3)
    # Scroll to the very bottom so lazily rendered content appears.
    js = "var q=document.documentElement.scrollTop=20000"
    browser.execute_script(js)
    time.sleep(1)

    # Grab the fully rendered HTML of the page.
    html = browser.page_source
    singers = getSingerList(html)
    file.write(str(singers))

    # Locate the "next page" button with XPath.
    # To obtain an element's XPath in Chrome: F12 -> Developer Tools ->
    # Elements -> find the element -> right-click -> Copy -> Copy XPath,
    # then tweak the expression as needed.
    it = browser.find_element_by_xpath('//*[@id="mod-singerlist"]/div[@class="mod_page_nav js_pager"]/a[@title="下一页"]/span')
    # Click "next page".
    it.click()




if __name__ == '__main__':
    browser = webdriver.Chrome()

    # Maximize the browser window so the full list renders.
    browser.maximize_window()
    browser.get('https://y.qq.com/portal/singer_list.html')

    # Running total of singers seen; mutated by getSingerList via `global`.
    count = 0

    # Explicit encoding: the singer titles are Chinese, and the platform
    # default encoding is not guaranteed to handle them.
    file = codecs.open('QQ歌手列表.txt', 'w', encoding='utf-8')

    # Simulate paging, from page 1 through page 10.
    for i in range(10):
        print("\n\nPage No = ", i)
        nextPage(browser, file)
        time.sleep(3)
    file.close()

运行结果:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
python3 TestSelenium_Chrome.py 

Page No = 0
NO. 1 : title= G.E.M. 邓紫棋 , href= https://y.qq.com/n/yqq/singer/001fNHEf1SFEFN.html#stat=y_new.singerlist.singername
NO. 2 : title= 张杰 , href= https://y.qq.com/n/yqq/singer/002azErJ0UcDN6.html#stat=y_new.singerlist.singername
NO. 3 : title= 鹿晗 , href= https://y.qq.com/n/yqq/singer/001SqkF53OEhdO.html#stat=y_new.singerlist.singername
NO. 4 : title= 庄心妍 , href= https://y.qq.com/n/yqq/singer/003Cn3Yh16q1MO.html#stat=y_new.singerlist.singername
NO. 5 : title= 杨宗纬 , href= https://y.qq.com/n/yqq/singer/003tMm0y0TuewY.html#stat=y_new.singerlist.singername
NO. 6 : title= Justin Bieber (贾斯汀·比伯) , href= https://y.qq.com/n/yqq/singer/002DYpxl3hW3EP.html#stat=y_new.singerlist.singername
NO. 7 : title= 张碧晨 , href= https://y.qq.com/n/yqq/singer/0003ZpE43ypssl.html#stat=y_new.singerlist.singername
NO. 8 : title= 华晨宇 , href= https://y.qq.com/n/yqq/singer/002Vcz8F2hpBQj.html#stat=y_new.singerlist.singername
NO. 9 : title= 张学友 , href= https://y.qq.com/n/yqq/singer/004Be55m1SJaLk.html#stat=y_new.singerlist.singername
NO. 10 : title= 田馥甄 , href= https://y.qq.com/n/yqq/singer/001ByAsv3XCdgm.html#stat=y_new.singerlist.singername
NO. 11 : title= RADWIMPS (ラッドウィンプス) , href= https://y.qq.com/n/yqq/singer/000f1b6W1wzyRN.html#stat=y_new.singerlist.singername
NO. 12 : title= 萧敬腾 , href= https://y.qq.com/n/yqq/singer/004bsIDK0awMOv.html#stat=y_new.singerlist.singername
NO. 13 : title= 那英 , href= https://y.qq.com/n/yqq/singer/003LCFXH0eodXv.html#stat=y_new.singerlist.singername
NO. 14 : title= 王力宏 , href= https://y.qq.com/n/yqq/singer/001JDzPT3JdvqK.html#stat=y_new.singerlist.singername
NO. 15 : title= 张靓颖 , href= https://y.qq.com/n/yqq/singer/000aw4WC2EQYTv.html#stat=y_new.singerlist.singername
NO. 16 : title= 吴亦凡 , href= https://y.qq.com/n/yqq/singer/002yeznU3VAVEV.html#stat=y_new.singerlist.singername
NO. 17 : title= BEYOND , href= https://y.qq.com/n/yqq/singer/002pUZT93gF4Cu.html#stat=y_new.singerlist.singername
NO. 18 : title= 林宥嘉 , href= https://y.qq.com/n/yqq/singer/001f0VyZ1hmWZ1.html#stat=y_new.singerlist.singername
NO. 19 : title= 南征北战 , href= https://y.qq.com/n/yqq/singer/003ZQQb64D5317.html#stat=y_new.singerlist.singername
NO. 20 : title= 汪苏泷 , href= https://y.qq.com/n/yqq/singer/001z2JmX09LLgL.html#stat=y_new.singerlist.singername
....


2. PhantomJS翻页爬虫+MySQL操作

程序结构与chrome浏览器相同

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
from selenium import webdriver
from bs4 import BeautifulSoup
import time
import pymysql

def nextPage(browser):
    """Scrape the currently rendered page into MySQL, then click "next page".

    :param browser: a selenium WebDriver already on the singer-list page
    """
    time.sleep(3)
    # Scroll to the very bottom so the page finishes rendering.
    js = "var q=document.documentElement.scrollTop=20000"
    browser.execute_script(js)
    time.sleep(1)

    singers = getSingerList(browser)
    insertSingerList(singers)

    # Locate the "next page" button with XPath.
    it = browser.find_element_by_xpath('//*[@id="mod-singerlist"]/div[@class="mod_page_nav js_pager"]/a[@title="下一页"]/span')
    # Click "next page".
    it.click()


def getSingerList(browser):
    """Collect {name: href} for every singer on the currently rendered page.

    Two kinds of entries are read from the PhantomJS-rendered markup:
    the featured singers shown with an avatar (h3.singer_list__title)
    and the plain text list (a.singer_list_txt__link).
    """
    html = browser.page_source
    soup = BeautifulSoup(html, 'lxml')

    # Featured singers (rendered with an avatar image).
    avatar = soup.find_all('h3', class_='singer_list__title')
    print('*'*40, 'Avatar singer', '*'*30)

    singers = dict()
    for j in avatar:
        i = j.contents[0]
        name = i.get('title')
        href = i.get('href')
        print('*'*20, 'singer = ', name, '--->>>', href)
        singers[name] = href

    # Plain text-only singer entries.
    print('\n\n', '*'*40, 'singer', '*'*30)
    txt = soup.find_all('a', class_='singer_list_txt__link js_singer')
    for i in txt:
        name = i.get('title')
        href = i.get('href')
        print('*' * 20, 'singer = ', name, '--->>>', href)
        singers[name] = href
    time.sleep(3)
    return singers



#######################################################################################
# The following code performs the insertion:
# open the database connection, insert all singers, then close it.
def insertSingerList(singers):
    """Bulk-insert {name: href} pairs into the ``singer`` table.

    Errors during the insert are printed and swallowed (best-effort),
    but the connection is always closed.

    NOTE(review): credentials are hard-coded; move them to configuration
    for anything beyond a throwaway script.
    """
    db = pymysql.connect(host='localhost', user='root', passwd='zhangjie', db='blog', charset='utf8')
    try:
        # Get an operation cursor.
        cursor = db.cursor()

        # Rows to insert. With pymysql, %s is the placeholder for every
        # column type, regardless of the underlying SQL type.
        vals = [(name, href) for name, href in singers.items()]
        sql = "INSERT INTO singer(name,href) VALUES (%s, %s)"
        try:
            # executemany accepts the rows as a list of tuples.
            cursor.executemany(sql, vals)
            db.commit()
        except Exception as e:
            print(e)
            print("Error:unable to query the database!")
    finally:
        # Close the connection even if building the rows raised.
        db.close()



if __name__ == '__main__':
    # NOTE(review): PhantomJS support is deprecated in recent Selenium
    # releases; headless Chrome/Firefox is the modern replacement.
    browser = webdriver.PhantomJS()
    browser.maximize_window()
    browser.get('https://y.qq.com/portal/singer_list.html')
    # Touch the session cookies once so the site treats us as a browser.
    browser.get_cookies()

    # Page through the first 100 pages of the singer list.
    for i in range(100):
        nextPage(browser)