1. Basic crawler architecture and workflow
A basic crawler framework consists of five main modules: the crawler scheduler, the URL manager, the HTML downloader, the HTML parser, and the data store.
Their responsibilities are as follows (a short sketch of how they fit together appears after the list):
The crawler scheduler coordinates the work of the other four modules.
The URL manager manages URL links: it maintains the set of URLs that have already been crawled and the set that have not yet been crawled, and provides an interface for obtaining new URLs.
The HTML downloader takes un-crawled URLs from the URL manager and downloads the corresponding HTML pages.
The HTML parser takes the downloaded pages from the HTML downloader, extracts new URL links and hands them to the URL manager, and extracts the useful data and hands it to the data store.
The data store saves the data extracted by the HTML parser to a file or a database.
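To make the workflow concrete, the sketch below walks through a single crawl iteration using the classes defined in the later sections. The package paths follow the imports shown in section 7, and the seed entry id is a placeholder; the real scheduler in section 7 wraps this flow in a loop with error handling and a page limit.

# A sketch of one crawl iteration (package paths follow section 7's imports,
# and the seed entry id is a placeholder)
from spride.urlmanage import Urlmanage
from spride.htmldown import Htmldownload
from spride.htmlsoup import htmlparser
from spride.database import dataoutput

manager = Urlmanage()        # URL manager: pending vs. crawled URL sets
downloader = Htmldownload()  # HTML downloader
parser = htmlparser()        # HTML parser
output = dataoutput()        # data store

manager.add_new_url('http://baike.baidu.com/view/1234567.htm')  # placeholder seed
if manager.has_new_url():
    url = manager.get_new_url()                # take one un-crawled URL
    html = downloader.download(url)            # fetch the page
    new_urls, data = parser.parser(url, html)  # extract links and data
    manager.add_new_urls(new_urls)             # feed the new links back
    output.store_data(data)                    # buffer the parsed data
output.output_html()                           # write the report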
2. A first crawler example
The goal is to crawl the data of one randomly chosen Baidu Baike (百度百科) entry.
3. URL manager
Its interface is defined as follows:
has_new_url()
add_new_url(url)
add_new_urls(urls)
get_new_url()
new_url_size()
old_url_size()
The code is as follows:
class Urlmanage():
    def __init__(self):
        self.new_urls = set()  # URLs waiting to be crawled
        self.old_urls = set()  # URLs that have already been crawled

    def has_new_url(self):
        return self.new_url_size() != 0

    def get_new_url(self):
        # Take one URL out of the pending set and record it as crawled
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url

    def add_new_url(self, url):
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)

    def new_url_size(self):
        return len(self.new_urls)

    def old_url_size(self):
        return len(self.old_urls)
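A quick usage example (the URLs below are placeholders):

# Example usage of Urlmanage with placeholder URLs
manager = Urlmanage()
manager.add_new_url('http://baike.baidu.com/view/1.htm')
manager.add_new_urls(['http://baike.baidu.com/view/2.htm',
                      'http://baike.baidu.com/view/1.htm'])  # the duplicate is ignored
print(manager.new_url_size())  # 2
url = manager.get_new_url()    # moves one URL from new_urls to old_urls
print(manager.old_url_size())  # 1
print(manager.has_new_url())   # True: one URL is still pending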
4. HTML downloader
The code is as follows:
import requests


class Htmldownload():
    def download(self, url):
        if url is None:
            return None
        # Send a desktop User-Agent so the request looks like a normal browser
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        headers = {'User-Agent': user_agent}
        r = requests.get(url, headers=headers)
        if r.status_code == 200:
            r.encoding = 'utf-8'
            return r.text
        return None
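For example, downloading a single page (the entry id below is a placeholder and may not correspond to a real entry):

# Example usage of Htmldownload with a placeholder entry id
downloader = Htmldownload()
html = downloader.download('http://baike.baidu.com/view/1234567.htm')
if html is None:
    print('download failed or non-200 response')
else:
    print(html[:200])  # first 200 characters of the page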
5. HTML parser
The code is as follows:
import re
from urllib import parse

from bs4 import BeautifulSoup


class htmlparser():
    def parser(self, page_url, html_cont):
        if page_url is None or html_cont is None:
            return
        soup = BeautifulSoup(html_cont, 'html.parser')
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data

    def _get_new_urls(self, page_url, soup):
        # Collect links that point to other Baike entries, e.g. /view/123.htm
        new_urls = set()
        links = soup.find_all('a', href=re.compile(r'/view/\d+\.htm'))
        for link in links:
            new_url = link['href']
            new_full_url = parse.urljoin(page_url, new_url)
            new_urls.add(new_full_url)
        return new_urls

    def _get_new_data(self, page_url, soup):
        # Extract the entry title and summary
        data = {}
        data['url'] = page_url
        title = soup.find('dd', class_='lemmaWgt-lemmaTitle-title').find('h1')
        data['title'] = title.get_text()
        summary = soup.find('div', class_='lemma-summary')
        data['summary'] = summary.get_text()
        return data
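The parser can be tried on a small hand-written snippet that imitates the Baike page structure (the HTML below is made up for illustration, not taken from a real page):

# Example usage of htmlparser on a made-up snippet that mimics the page structure
sample = '''
<dd class="lemmaWgt-lemmaTitle-title"><h1>Python</h1></dd>
<div class="lemma-summary">Python is a programming language.</div>
<a href="/view/1234.htm">another entry</a>
'''
parser = htmlparser()
new_urls, data = parser.parser('http://baike.baidu.com/view/1.htm', sample)
print(new_urls)         # {'http://baike.baidu.com/view/1234.htm'}
print(data['title'])    # Python
print(data['summary'])  # Python is a programming language.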
6. Data store
The code is as follows:
import codecs


class dataoutput():
    def __init__(self):
        self.datas = []

    def store_data(self, data):
        if data is None:
            return
        self.datas.append(data)

    def output_html(self):
        fout = codecs.open('baile.html', 'w', encoding='utf-8')
        fout.write('<html>')
        fout.write('<body>')
        fout.write('<table>')
        for data in self.datas:
            fout.write('<tr>')
            fout.write('<td>%s</td>' % data['url'])
            fout.write('<td>%s</td>' % data['title'])
            fout.write('<td>%s</td>' % data['summary'])
            fout.write('</tr>')
        # Clear the buffer after the loop instead of removing items while
        # iterating, which would skip every other record
        self.datas = []
        fout.write('</table>')
        fout.write('</body>')
        fout.write('</html>')
        fout.close()
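A short usage example (the record below is made up):

# Example usage of dataoutput with a made-up record
output = dataoutput()
output.store_data({
    'url': 'http://baike.baidu.com/view/1.htm',
    'title': 'Python',
    'summary': 'Python is a programming language.',
})
output.output_html()  # writes the buffered rows into baile.html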
7. Crawler scheduler
The code is as follows:
import random

from spride.database import dataoutput
from spride.htmldown import Htmldownload
from spride.htmlsoup import htmlparser
from spride.urlmanage import Urlmanage


class Spriderman():
    def __init__(self):
        self.manager = Urlmanage()
        self.downloder = Htmldownload()
        self.parse = htmlparser()
        self.output = dataoutput()

    def crawl(self, root_url):
        self.manager.add_new_url(root_url)
        # Keep crawling until there are no pending URLs or 100 pages are done
        while self.manager.has_new_url() and self.manager.old_url_size() < 100:
            try:
                new_url = self.manager.get_new_url()
                html = self.downloder.download(new_url)
                new_urls, data = self.parse.parser(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print('has touched %s links' % self.manager.old_url_size())
            except Exception as e:
                print('crawl failed', e)
        self.output.output_html()


if __name__ == '__main__':
    def viod(num):
        # Build a random numeric string of the given length to use as an entry id
        ran = ''
        for i in range(num):
            add = random.randrange(10)
            ran += str(add)
        return ran

    spriderman = Spriderman()
    spriderman.crawl('http://baike.baidu.com/view/' + viod(7) + '.htm')
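If the randomly generated id does not correspond to an existing entry, the first download or parse fails and the crawl ends right away; the crawler can just as well be started from a known entry URL, for example:

# Starting the crawl from a fixed seed instead of a random id
# (the entry id below is a placeholder)
spriderman = Spriderman()
spriderman.crawl('http://baike.baidu.com/view/1234567.htm')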