1. Basic crawler architecture and workflow

A basic crawler framework consists of five modules: the crawler scheduler, the URL manager, the HTML downloader, the HTML parser, and the data store.

Their responsibilities are as follows:

The crawler scheduler coordinates the work of the other four modules.

The URL manager manages URL links: it maintains the set of URLs that have already been crawled and the set that have not yet been crawled, and provides an interface for fetching a new URL.

The HTML downloader takes an un-crawled URL from the URL manager and downloads the corresponding HTML page.

The HTML parser takes the downloaded pages from the HTML downloader, extracts new URL links and hands them to the URL manager, and extracts the useful data and hands it to the data store.

The data store writes the data extracted by the HTML parser to a file or a database. The interaction of the five modules is sketched below.
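As an overview, the control flow is a single loop driven by the scheduler. This is only a minimal sketch; the class and method names (Urlmanage, Htmldownload, htmlparser, dataoutput and their methods) are the ones implemented in the sections that follow.

# Minimal sketch of the crawl loop; the classes are defined in sections 3-6.
manager = Urlmanage()
downloader = Htmldownload()
parser = htmlparser()
output = dataoutput()

manager.add_new_url(root_url)                  # seed the un-crawled set
while manager.has_new_url():
    url = manager.get_new_url()                # 1) take one un-crawled URL
    html = downloader.download(url)            # 2) download the page
    new_urls, data = parser.parser(url, html)  # 3) extract links and data
    manager.add_new_urls(new_urls)             # 4) feed links back to the manager
    output.store_data(data)                    # 5) buffer the parsed record
output.output_html()                           # finally persist everything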

2. A first crawler example

The example crawls the data of one randomly chosen Baidu Baike entry.

3. URL manager

Its interface is defined as follows:

has_new_url()      # check whether there are un-crawled URLs left
add_new_url(url)   # add a single new URL to the un-crawled set
add_new_urls(urls) # add a batch of new URLs to the un-crawled set
get_new_url()      # fetch one un-crawled URL (and mark it as crawled)
new_url_size()     # size of the un-crawled URL set
old_url_size()     # size of the crawled URL set

The code is as follows:

# coding: utf-8
# author: FullSky
class Urlmanage():
    """URL manager: keeps un-crawled and crawled URLs in two sets."""
    def __init__(self):
        self.new_urls = set()   # un-crawled URLs
        self.old_urls = set()   # crawled URLs

    def has_new_url(self):
        return self.new_url_size() != 0

    def get_new_url(self):
        # take one un-crawled URL and move it into the crawled set
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url

    def add_new_url(self, url):
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)

    def new_url_size(self):
        return len(self.new_urls)

    def old_url_size(self):
        return len(self.old_urls)
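
A quick way to check the manager's behaviour from an interactive session (the URLs below are made-up examples):

# Exercise the Urlmanage interface with two made-up entry URLs
manager = Urlmanage()
manager.add_new_url('http://baike.baidu.com/view/21087.htm')
manager.add_new_urls(['http://baike.baidu.com/view/32571.htm',
                      'http://baike.baidu.com/view/21087.htm'])  # duplicate, ignored
print(manager.new_url_size())  # 2
url = manager.get_new_url()    # moves one URL into the crawled set
print(manager.old_url_size())  # 1
print(manager.has_new_url())   # True, one URL is still waiting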

4. HTML downloader

The code is as follows:

# coding: utf-8
# author: FullSky
import requests


class Htmldownload():
    """HTML downloader: fetches a page and returns its text, or None on failure."""
    def download(self, url):
        if url is None:
            return None
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        headers = {'User-Agent': user_agent}
        r = requests.get(url, headers=headers)
        if r.status_code == 200:
            r.encoding = 'utf-8'
            return r.text
        return None
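
One practical caveat: requests.get blocks indefinitely by default, so a single stalled connection can hang the whole crawl. A hedged variant of the download step with a timeout and basic error handling (the timeout value is an arbitrary choice and not part of the original):

# Variant of the download step with a timeout and error handling (an
# addition for robustness, not part of the original implementation)
import requests

def download(url, timeout=10):
    if url is None:
        return None
    headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
    try:
        r = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:   # DNS failure, refused connection, timeout, ...
        return None
    if r.status_code == 200:
        r.encoding = 'utf-8'
        return r.text
    return None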

5. HTML parser

The code is as follows:

# coding: utf-8
# author: FullSky
import re
from urllib import parse
from bs4 import BeautifulSoup


class htmlparser():
    """HTML parser: extracts new entry links and the entry's title/summary."""
    def parser(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            return
        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data

    def _get_new_urls(self, page_url, soup):
        # collect links of the form /view/<digits>.htm and make them absolute
        new_urls = set()
        links = soup.find_all('a', href=re.compile(r'/view/\d+\.htm'))
        for link in links:
            new_url = link['href']
            new_full_url = parse.urljoin(page_url, new_url)
            new_urls.add(new_full_url)
        return new_urls

    def _get_new_data(self, page_url, soup):
        # pull out the entry title and summary
        data = {}
        data['url'] = page_url
        title = soup.find('dd', class_='lemmaWgt-lemmaTitle-title').find('h1')
        data['title'] = title.get_text()
        summary = soup.find('div', class_='lemma-summary')
        data['summary'] = summary.get_text()
        return data
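
The parser can be sanity-checked against a small hand-written snippet that mimics the Baike markup assumed above (the snippet is fabricated for illustration; the real page structure may well differ today):

# Self-test of htmlparser against a fabricated snippet of Baike-like markup
snippet = '''
<dd class="lemmaWgt-lemmaTitle-title"><h1>Python</h1></dd>
<div class="lemma-summary">Python is a programming language.</div>
<a href="/view/12345.htm">related entry</a>
'''
p = htmlparser()
urls, data = p.parser('http://baike.baidu.com/view/21087.htm', snippet)
print(urls)             # {'http://baike.baidu.com/view/12345.htm'}
print(data['title'])    # Python
print(data['summary'])  # Python is a programming language.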

6. Data store

The code is as follows:

# coding: utf-8
# author: FullSky
import codecs


class dataoutput():
    """Data store: buffers parsed records and writes them to an HTML table."""
    def __init__(self):
        self.datas = []

    def store_data(self, data):
        if data is None:
            return
        self.datas.append(data)

    def output_html(self):
        fout = codecs.open('baile.html', 'w', encoding='utf-8')
        fout.write('<html>')
        fout.write('<body>')
        fout.write('<table>')
        for data in self.datas:
            fout.write('<tr>')
            fout.write('<td>%s</td>' % data['url'])
            fout.write('<td>%s</td>' % data['title'])
            fout.write('<td>%s</td>' % data['summary'])
            fout.write('</tr>')
        # clear the buffer after writing (removing items inside the loop
        # would skip every other record)
        self.datas = []
        fout.write('</table>')
        fout.write('</body>')
        fout.write('</html>')
        fout.close()
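
A quick check of the storage step, using one fabricated record to show the expected dict shape:

# Store one fabricated record and write it out as an HTML table
output = dataoutput()
output.store_data({'url': 'http://baike.baidu.com/view/21087.htm',
                   'title': 'Python',
                   'summary': 'Python is a programming language.'})
output.output_html()   # writes baile.html with columns: url, title, summary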

7. Crawler scheduler

# coding: utf-8
# author: FullSky
import random

from spride.database import dataoutput
from spride.htmldown import Htmldownload
from spride.htmlsoup import htmlparser
from spride.urlmanage import Urlmanage


class Spriderman():
    """Crawler scheduler: drives the other four modules."""
    def __init__(self):
        self.manager = Urlmanage()
        self.downloder = Htmldownload()
        self.parse = htmlparser()
        self.output = dataoutput()

    def crawl(self, root_url):
        self.manager.add_new_url(root_url)
        # keep going until no URLs are left or 100 pages have been crawled
        while self.manager.has_new_url() and self.manager.old_url_size() < 100:
            try:
                new_url = self.manager.get_new_url()
                html = self.downloder.download(new_url)
                new_urls, data = self.parse.parser(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print('has crawled %s links' % self.manager.old_url_size())
            except Exception as e:
                print('crawl failed', e)
        self.output.output_html()


if __name__ == '__main__':
    def viod(num):
        # build a random string of num digits to pick an entry id
        ran = ''
        for i in range(num):
            add = random.randrange(10)
            ran += str(add)
        return ran

    spriderman = Spriderman()
    spriderman.crawl('http://baike.baidu.com/view/' + viod(7) + '.htm')