Python爬虫之金庸全集

内容概要: 利用Python+BeautifulSoup构建爬虫, 获取金庸全集,保存到本地txt文件中, 以备后续数据分析使用.

1. 代码:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
#!/usr/bin/python3.5
from bs4 import BeautifulSoup
import requests
import codecs
import bs4.element


# 获取每一章的文本
def getPage(url):
    """Fetch one chapter page and return its title plus full body text.

    Parameters
    ----------
    url : str
        URL of a single chapter page on jinyong.zuopinj.com.

    Returns
    -------
    str
        The chapter title prefixed with '>>>>>>>>>>>>' and a newline,
        followed by the concatenated text of every tag inside the
        content div, with surrounding whitespace stripped.

    Raises
    ------
    requests.HTTPError
        If the server answers with a non-2xx status.
    ValueError
        If the page does not contain the expected content/title divs.
    """
    response = requests.get(url)
    # Fail fast on HTTP errors instead of parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    soup_txt = soup.find('div', id='htmlContent', class_="contentbox")
    title_tag = soup.find('div', class_="h1title")
    if soup_txt is None or title_tag is None or title_tag.h1 is None:
        # Explicit error beats the opaque AttributeError the old code raised.
        raise ValueError("unexpected page layout: " + url)

    titles = title_tag.h1.string or ""
    print("titles = " + titles)

    # Collect pieces and join once — avoids quadratic string +=.
    parts = [">>>>>>>>>>>>" + titles + "\n"]
    for child in soup_txt.children:
        # Only real tags carry chapter text; NavigableString nodes between
        # them are whitespace/ads separators and are skipped.
        if isinstance(child, bs4.element.Tag):
            parts.append(child.get_text())

    return "".join(parts).strip()



# 获取某一本书的所有章节url
def getBook(url):
    """Fetch a book's index page and collect every chapter URL.

    Parameters
    ----------
    url : str
        URL of one book's table-of-contents page.

    Returns
    -------
    tuple[list[str], str]
        ``(linkList, index)`` — the chapter hrefs in page order, and a
        human-readable index string with one chapter title per line.

    Raises
    ------
    requests.HTTPError
        If the server answers with a non-2xx status.
    """
    response = requests.get(url)
    # Fail fast on HTTP errors instead of parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    soup_txt = soup.find('div', class_="book_list")

    print("*" * 90)

    linkList = []
    titleLines = []
    for link in soup_txt.ul.children:
        # .children yields the '\n' text nodes between <li> tags; skip them.
        if link == '\n':
            continue
        hrf = link.a.get('href')
        title = link.a.string

        print("title=" + title, end="\t")
        print("link=" + hrf)
        titleLines.append(title + "\n")
        linkList.append(hrf)

    # Join once instead of quadratic string += inside the loop.
    return linkList, "".join(titleLines)


# 获取金庸全集书的url : {书名:url}
def getList(url):
    """Fetch the site front page and map every book title to its URL.

    Parameters
    ----------
    url : str
        The collection's front-page URL (e.g. ``http://jinyong.zuopinj.com/``).

    Returns
    -------
    dict[str, str]
        Mapping of book title -> book index-page URL, taken from the
        ``title`` and ``href`` attributes of each book card's link.

    Raises
    ------
    requests.HTTPError
        If the server answers with a non-2xx status.
    """
    response = requests.get(url)
    # Fail fast on HTTP errors instead of parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Each book is rendered as a <div class="bk"> card.
    soup_list = soup.find_all('div', class_="bk")
    print(soup_list)

    bookList = dict()
    for card in soup_list:
        print("---" * 30)
        print(card.h3.a)
        href = card.h3.a.get('href')
        title = card.h3.a.get('title')
        bookList[title] = href
    print("---" * 30)
    print(bookList)

    return bookList



# 运行主程序, 先获取金庸全集的书名:url, 然后逐一去爬取数据,保存到对应的txt文件中
if __name__ == "__main__":
# get a list for bookName:url
bookList = dict(getList('http://jinyong.zuopinj.com/'))

for title,url in bookList.items():
# get all url
print(">>>>>>>>>>>>>>"+title)
print(">>>>>>>>>>>>>>"+url)

lis,index = getBook(url)

f = codecs.open(title+".txt",'w','utf-8')
f.write(index)
f.write("**"*50)
f.write("\n\n")
for uri in lis:
print("--"*40)
cont = getPage(uri)
f.write(uri+"\n")
f.write(cont+'\n\n')
f.write("**"*50)
f.write("\n\n")
f.close()

2. 结果示例

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
$ ll -h
总用量 16M
drwxrwxr-x 2 jay jay 4.0K 927 13:35 ./
drwxrwxr-x 4 jay jay 4.0K 925 23:43 ../
-rw-rw-r-- 1 jay jay 0 925 23:43 __init__.py
-rw-rw-r-- 1 jay jay 2.4K 927 13:20 jinyong27.py
-rw-rw-r-- 1 jay jay 2.5K 927 13:31 Jinyong35.py
-rw-rw-r-- 1 jay jay 1.6K 926 11:35 Miyuezhuan.py
-rw-rw-r-- 1 jay jay 415 927 02:28 TestBeautifulsoup.py
-rw-rw-r-- 1 jay jay 1.8K 926 13:50 tianlong.py
-rw-rw-r-- 1 jay jay 220K 927 13:32 白马啸西风.txt
-rw-rw-r-- 1 jay jay 1.4M 927 13:35 碧血剑.txt
-rw-rw-r-- 1 jay jay 1.3M 927 13:33 飞狐外传.txt
-rw-rw-r-- 1 jay jay 682K 927 13:32 连城诀.txt
-rw-rw-r-- 1 jay jay 14K 927 13:33 鹿鼎记.txt
-rw-rw-r-- 1 jay jay 164K 927 13:35 射雕英雄传.txt
-rw-rw-r-- 1 jay jay 8.2K 927 13:35 神雕侠侣.txt
-rw-rw-r-- 1 jay jay 1.5M 927 13:36 书剑恩仇录.txt
-rw-rw-r-- 1 jay jay 3.5M 927 13:33 天龙八部.txt
-rw-rw-r-- 1 jay jay 1.1M 927 13:35 侠客行.txt
-rw-rw-r-- 1 jay jay 2.9M 927 13:34 笑傲江湖.txt
-rw-rw-r-- 1 jay jay 394K 927 13:33 雪山飞狐.txt
-rw-rw-r-- 1 jay jay 2.8M 927 13:34 倚天屠龙记.txt
-rw-rw-r-- 1 jay jay 94K 927 13:35 鸳鸯刀.txt
-rw-rw-r-- 1 jay jay 49K 927 13:33 越女剑.txt