CWYAlpha

Just another WordPress.com site

Thought this was cool: Python 爬虫框架

leave a comment »


学Python的应该都写过爬虫吧,如果希望提高爬虫的效率就要用到并发,可以选择的用多线程、多进程,还有最近很火的Gevent,据说是基于一种新的概念,协程,不管什么程,总之好用就行了。写一个爬虫有一系列的东西需要处理,如果有一个好用的框架就会事半功倍。
在42qu.com源码里面,有个教主写的爬虫框架,很好用,是基于Gevent的,处理url采用和web开发类似的映射方法,教主还写了一个简单的html处理库,extract,虽没有Beautifulsoup那样强大,但是简单好用,基本能满足常见的要求。HTTP请求使用了requests,requests是个处理HTTP的库,用官方的话说“Requests is an ISC Licensed HTTP library, written in Python, for human beings.”,for human beings,好吧,废话不说,上代码:
这里我将会把所需代码都贴出来,并贴一个示例。由于windows平台貌似没有Gevent支持,我还改写了一个多线程版本,一并放出。
首先是 extract(首先给出 extract 模块的代码):

#coding:utf-8
import re
 
def extract(begin, end, html):
    """Return the stripped text between the first `begin` marker and the
    following `end` marker in `html`.

    If `end` is None, everything after `begin` is returned.  Returns ''
    for empty/falsy input, and None when a marker cannot be found.
    """
    if not html:
        return ''
    pos = html.find(begin)
    if pos < 0:
        return None
    pos += len(begin)
    if end is None:
        # No closing marker requested: take the rest of the document.
        return html[pos:].strip()
    stop = html.find(end, pos)
    if stop < 0:
        return None
    return html[pos:stop].strip()
 
def extract_all(begin, end, html):
    """Return the stripped text of every begin/end fragment in `html`.

    Uses a list comprehension instead of the original
    ``map(str.strip, ...)``: that returned a one-shot iterator on
    Python 3 and raised TypeError for unicode fragments on Python 2,
    while this always returns a real list (backward compatible with the
    py2 list-returning ``map``).
    """
    return [fragment.strip() for fragment in _extract_all(begin, end, html)]
 
def _extract_all(begin, end, html):
if not html:
return ”
result = []
from_pos = 0
while True:
start = html.find(begin, from_pos)
if start >= 0:
start += len(begin)
endpos = html.find(end, start)
if endpos >= 0:
result.append(html[start:endpos])
from_pos = endpos+len(end)
continue
break
return result
 
def line_strip(txt):
    """Collapse `txt` to its non-empty lines, each stripped of
    surrounding whitespace.

    NOTE(review): the published source is mangled on the replace() line
    (both arguments rendered as a plain space).  Replacing the
    non-breaking space u'\\xa0' (what HTML &nbsp; decodes to) with a
    regular space is the most plausible original intent -- confirm
    against the upstream 42qu source.
    """
    if not txt:
        return ''
    lines = txt.replace(u'\xa0', u' ').split('\n')
    return '\n'.join(line for line in (l.strip() for l in lines) if line)
 
def extract_strip(begin, end, html):
    """extract() the fragment between `begin` and `end`, then collapse
    its lines with line_strip().

    Fixes a NameError in the original, which called ``strip_line(t)``;
    the helper defined in this module is ``line_strip``.  Returns ''
    for empty input and None when extract() finds nothing.
    """
    if not html:
        return ''
    t = extract(begin, end, html)
    if t:
        return line_strip(t)
 
 
def extract_map(begin, end, html, func):
    """Rebuild `html`, passing every begin...end fragment (markers
    included) through `func`.  Fragments for which `func` returns a
    falsy value are dropped; text outside the markers is copied through
    unchanged.

    Fixes two defects in the original:
      * the `end` parameter was overwritten with an integer position,
        so any input with a second match raised TypeError on
        ``html.find(<int>)``;
      * an unused ``txt`` list was allocated.
    """
    result = []
    search_from = 0   # where to look for the next begin marker
    copy_from = 0     # first character not yet copied to the output
    len_begin = len(begin)
    len_end = len(end)
    while True:
        pos = html.find(begin, search_from)
        # Only look for the closing marker once an opening one exists.
        stop = html.find(end, pos) if pos >= 0 else -1
        if pos < 0 or stop < 0:
            # No further complete fragment: flush the tail and finish.
            result.append(html[copy_from:])
            break
        stop += len_end
        result.append(html[copy_from:pos])
        mapped = func(html[pos:stop])
        if mapped:
            result.append(mapped)
        search_from = pos + len_begin
        copy_from = stop
    return ''.join(result)
 
 
if __name__ == '__main__':
    pass

然后是爬虫框架,下面是原版(Gevent 版):

#coding:utf-8
import _env
from gevent.queue import Empty, Queue
import gevent
import gevent.monkey
import requests
from urlparse import urlparse, parse_qs
import re
gevent.monkey.patch_all()
 
 
class Bot(object):
    """Gevent-based crawler: worker greenlets pull URLs from a shared
    queue, fetch them, dispatch the response to the handler matched by
    `route`, and feed any URLs the handler yields back into the queue.
    """
    cookie = None
    headers = {}

    def __init__(self, route):
        self.queue = Queue()
        self.route = route

    def _fetch(self):
        """Worker loop; returns when the queue stays empty past the
        grace period."""
        queue = self.queue
        timeout = self.timeout
        route = self.route
        while True:
            try:
                # Wait a bit longer than the request timeout so this
                # worker does not quit while siblings may still be
                # producing URLs.
                url = queue.get(timeout=timeout + 10)
            except Empty:
                return

            # Copy so per-request mutation does not leak into the
            # class-level headers dict shared by every instance.
            headers = dict(self.headers)
            if self.cookie:
                headers['Cookie'] = self.cookie
            req = requests.get(url, timeout=timeout, headers=headers,
                               proxies=self.proxies)
            p = urlparse(req.url)

            cls, args = route.match(p.path)
            if cls:
                handler = cls(req)
                r = handler.get(*args)
                if r:
                    for link in r:
                        if link:
                            queue.put(link)

    def run(self, num=10, timeout=60, proxies={}, cookie=None):
        """Spawn `num` worker greenlets and block until all finish.

        The original joined each greenlet immediately after spawning
        it, which ran the workers one at a time and defeated the
        concurrency; spawn them all first, then joinall.
        """
        self.proxies = proxies
        self.timeout = timeout
        self.cookie = cookie
        workers = [gevent.spawn(self._fetch) for _ in xrange(num)]
        gevent.joinall(workers)

    def put(self, url):
        """Seed the crawl queue with a URL."""
        self.queue.put(url)
 
class Route(object):
    """Maps URL paths to handler classes via regular expressions,
    web-framework style."""

    def __init__(self):
        self.map = []

    def match(self, url):
        """Return (handler, captured_groups) for the first registered
        pattern matching `url`, or (None, None) when nothing matches."""
        for pattern, handler in self.map:
            found = pattern.match(url)
            if found:
                return handler, found.groups()
        return None, None

    def __call__(self, path):
        """Decorator: ``@route('/photo/(\\d+)')`` registers the
        decorated handler for that path."""
        # Anchor the pattern so '/foo' does not also match '/foobar'.
        anchored = path if path.endswith('$') else path + '$'
        compiled = re.compile(anchored)

        def register(handler):
            self.map.append((compiled, handler))
            return handler
        return register
 
 
# Module-level singletons: handler classes register themselves on
# `route` via the decorator, and callers seed/run the shared `bot`.
route = Route()
bot = Bot(route)

from extract import extract, extract_all
 
class Page(object):
    """Base class for URL handlers: wraps a fetched response, parses
    its query string, and offers shortcuts into the extract helpers."""

    def __init__(self, req):
        parsed = urlparse(req.url)
        # keep_blank_values=1 so 'a=' still shows up as a key.
        req.arguments = parse_qs(parsed.query, 1)
        self.req = req
        self.html = req.content

    def get_argument(self, name, default=None):
        """Return the first value of query argument `name`, UTF-8
        encoded, or `default` when the argument is absent."""
        values = self.req.arguments.get(name, None)
        if values is None:
            return default
        return values[0].encode('utf-8', 'ignore')

    def extract(self, begin, end):
        """extract() against this page's HTML."""
        return extract(begin, end, self.html)

    def extract_all(self, begin, end):
        """extract_all() against this page's HTML."""
        return extract_all(begin, end, self.html)
然后是线程版的,适用于 Windows 平台或者未安装 Gevent 的情况:

#coding:utf-8
import _env

import re
import time
from Queue import Empty, Queue
from threading import Thread
from urlparse import urlparse, parse_qs

import requests
 
ua = ‘Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.83 Safari/537.1’
 
class Bot(object):
    """Thread-based crawler (Windows / no-gevent fallback): worker
    threads pull URLs from a shared queue, fetch them, dispatch to the
    handler matched by `route`, and enqueue any URLs it yields."""
    cookie = None
    headers = {}

    def __init__(self, route):
        self.queue = Queue()
        self.route = route

    def _fetch(self):
        """Worker loop; returns when the queue stays empty past the
        grace period."""
        queue = self.queue
        timeout = self.timeout
        route = self.route
        while True:
            try:
                # Wait longer than the request timeout so this worker
                # does not quit while siblings may still produce URLs.
                url = queue.get(timeout=timeout + 10)
            except Empty:
                # The original used a bare `except:`, which also
                # swallowed KeyboardInterrupt and real bugs.
                return

            # Copy so per-request mutation does not leak into the
            # class-level headers dict shared by every instance.
            headers = dict(self.headers)
            if self.cookie:
                headers['Cookie'] = self.cookie
            headers['User-Agent'] = ua
            req = requests.get(url, timeout=timeout, headers=headers,
                               proxies=self.proxies)
            # Throttle: be polite to the remote site between fetches.
            time.sleep(3)
            p = urlparse(req.url)

            cls, args = route.match(p.path)
            if cls:
                handler = cls(req)
                r = handler.get(*args)
                if r:
                    for link in r:
                        if link:
                            queue.put(link)

    def run(self, num=6, timeout=10, proxies={}, cookie=None):
        """Start `num` worker threads and return immediately; the
        non-daemon workers keep the process alive until the queue has
        been empty past the grace period.

        The original also spawned a target-less daemon Thread() that
        did nothing; it has been removed.
        """
        self.timeout = timeout
        self.proxies = proxies
        self.cookie = cookie
        for _ in xrange(num):
            worker = Thread(target=self._fetch)
            worker.start()

    def put(self, url):
        """Seed the crawl queue with a URL."""
        self.queue.put(url)
 
class Route(object):
    """Regex-based dispatcher from URL paths to handler classes."""

    def __init__(self):
        self.map = []

    def match(self, url):
        """Return (handler, groups) for the first pattern that matches
        `url`; (None, None) when no registered pattern matches."""
        for compiled, handler in self.map:
            hit = compiled.match(url)
            if hit is not None:
                return handler, hit.groups()
        return None, None

    def __call__(self, path):
        """Decorator registering a handler for `path`.  The pattern is
        anchored with '$' so prefixes do not match longer paths."""
        if not path.endswith('$'):
            path = path + '$'
        compiled = re.compile(path)

        def decorator(handler):
            self.map.append((compiled, handler))
            return handler
        return decorator
 
 
# Module-level singletons: handler classes register themselves on
# `route` via the decorator, and callers seed/run the shared `bot`.
route = Route()
bot = Bot(route)

from extract import extract, extract_all
 
class Page(object):
    """Base class for URL handlers (thread version): wraps a fetched
    response and parses its query string once."""

    def __init__(self, req):
        query = urlparse(req.url).query
        # keep_blank_values=1 so 'a=' still shows up as a key.
        req.arguments = parse_qs(query, 1)
        self.req = req
        self.html = req.content

    def get_argument(self, name, default=None):
        """First value of query argument `name`, UTF-8 encoded;
        `default` when absent."""
        values = self.req.arguments.get(name, None)
        if values is None:
            return default
        return values[0].encode('utf-8', 'ignore')

    def extract(self, begin, end):
        """extract() against this page's HTML."""
        return extract(begin, end, self.html)

    def extract_all(self, begin, end):
        """extract_all() against this page's HTML."""
        return extract_all(begin, end, self.html)
线程版和 Gevent 版用法完全一样,下面是一个抓取豆瓣相册的例子:

#!/usr/bin/python
# -*- coding: utf-8 -*-
import _env
import _bot
import os
import urllib
from os.path import join, exists
from _bot import bot, route, Page
from extract import extract, extract_all

# Template for the raw (full-size) photo URL, keyed by photo id.
# (String literals below repair the &#8217;/smart-quote mangling in the
# published source.)
url_base = 'http://img1.douban.com/view/photo/raw/public/p%s.jpg'
# Template for the album listing pages, keyed by the `start` offset.
url_tmp = 'http://movie.douban.com/subject/3395373/photos?type=S&start=%s&sortby=vote&size=a&subtype=a'
# Local directory the photos are saved into.
f_base = '/home/lerry/movie'
#f_base = u'E:\Download\photo\蝙蝠侠'
# Per-photo file path template inside f_base.
f_tmp = join(f_base, 'p%s.jpg')
 
 
@route('/subject/3395373/photos')
class url_list(Page):
    """Handler for an album listing page: from page 0 it seeds the
    queue with the remaining listing pages, then yields the raw photo
    URLs of any photos not already saved locally."""

    def get(self):
        page = int(self.get_argument('start'))
        if page == 0:
            # Seed all other listing pages exactly once (40 photos per
            # page, 40 pages -- TODO confirm the album really has 41).
            for i in xrange(1, 41):
                yield url_tmp % (i * 40)

        # NOTE(review): the extraction markers were destroyed by the
        # blog formatting; '</ul>' and '<li>'/'</li>' are the most
        # plausible originals for the poster list -- verify against the
        # actual page markup before relying on this.
        li = self.extract('class="poster-col4 clearfix"', '</ul>')
        for item in extract_all('<li>', '</li>', li):
            path = extract('a href="', '/">', item)
            if not path:
                continue
            # Fixed offset past 'http://movie.douban.com/photos/photo/'
            # leaves just the numeric photo id -- TODO confirm offset.
            photo_id = path[37:]
            if exists(f_tmp % photo_id):
                # Already downloaded: skip.
                continue
            yield url_base % photo_id
     
    @route(‘/view/photo/raw/public/(.+)’)
    class single(Page):
    def get(self, arg):
    save_pic(self.html, arg)
     
    def save_pic(content, fname):
    fpath = join(f_base, fname)
    f = open(fpath, ‘wb’)
    f.write(content)
    f.close()
    print fname, ‘saved’
     
    if __name__ == ‘__main__’:
    bot.put(‘http://movie.douban.com/subject/3395373/photos?type=S&start=0&sortby=vote&size=a&subtype=a&#8217;)
    bot.run()
From Lerry's Blog: http://lerry.me/post/2012/09/15/python-spider

    Written by cwyalpha

    十一月 24, 2012 在 3:26 下午

    发表在 Uncategorized

    发表评论

    Fill in your details below or click an icon to log in:

    WordPress.com Logo

    You are commenting using your WordPress.com account. Log Out / 更改 )

    Twitter picture

    You are commenting using your Twitter account. Log Out / 更改 )

    Facebook photo

    You are commenting using your Facebook account. Log Out / 更改 )

    Google+ photo

    You are commenting using your Google+ account. Log Out / 更改 )

    Connecting to %s

    %d 博主赞过: