
Python: Crawling All Images from the 163.com Site with Multithreading, Depth Control, Relative-Path Handling, Request-Interval Throttling, and Regex Matching

2019-11-10 19:55:47
The complete script follows. It is written for Python 2 (urllib2, urlparse, print statements): pages are crawled up to max_depth levels deep, requests to the same domain are spaced SLEEP_TIME seconds apart, and matched image URLs are downloaded to D:/grab by a pool of worker threads.

# coding:utf-8
import urllib
import urllib2
import re
import time
import threading
import socket
import urlparse
import datetime

root_domain = '163.com'
beginurl = 'http://www.163.com/'
# maximum crawl depth
max_depth = 2
socket.setdefaulttimeout(10)
SLEEP_TIME = 1

linkpool = [beginurl]        # pages waiting to be crawled
seedlink = {beginurl: 0}     # url -> depth at which it was discovered
imgpool = []                 # image URLs waiting to be downloaded
dueimgpool = []              # image URLs already processed
num_retries = 0
lock = threading.Lock()

# src/original/src2 attributes of <img> tags
pimg1 = re.compile(r'<img[^<>]+(?:src|original|src2)=["\']{1}([^"\']+)["\']{1}', re.IGNORECASE)
# "image":"..." fields embedded in inline JSON
pimg2 = re.compile(r'"image":"([^"\']+)"', re.IGNORECASE)
# href attributes of <a> tags
plink = re.compile(r'<a[^<>]+href=["\']{1}([^"\']+)["\']{1}', re.IGNORECASE)
# strip non-word characters and underscores to build a flat file name
pfilename = re.compile(r'\W|_')

headers = {'User-agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:22.0) Gecko/20100101 Firefox/22.0'}


class Throttle:
    """Enforce a minimum delay between requests to the same domain."""

    def __init__(self, delay):
        self.delay = delay
        self.domains = {}  # domain -> time of last access

    def wait(self, url):
        domain = urlparse.urlparse(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (datetime.datetime.now() - last_accessed).seconds
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.domains[domain] = datetime.datetime.now()


throttle = Throttle(SLEEP_TIME)


def download(url, headers, proxy=None, num_retries=0, data=None):
    print 'Downloading:', url
    request = urllib2.Request(url, data, headers)
    opener = urllib2.build_opener()
    if proxy:
        proxy_params = {urlparse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib2.ProxyHandler(proxy_params))
    try:
        response = opener.open(request)
        html = response.read()
        code = response.code
    except urllib2.URLError as e:
        print 'Download error:', e.reason
        html = ''
        if hasattr(e, 'code'):
            code = e.code
            # retry transient 5xx server errors
            if num_retries > 0 and 500 <= code < 600:
                return download(url, headers, proxy, num_retries - 1, data)
        else:
            code = None
    return html


def same_root_domain(url):
    # stay inside 163.com and its subdomains
    domain = urlparse.urlparse(url).netloc
    return root_domain in domain


def grab(url):
    depth = seedlink[url]
    if depth != max_depth:
        throttle.wait(url)
        html = download(url, headers=headers, num_retries=num_retries)
        imglist = pimg1.findall(html)
        imglist.extend(pimg2.findall(html))
        linklist = plink.findall(html)
        for lnk in linklist:
            # resolve relative links against the current page URL
            lnk = urlparse.urljoin(url, lnk)
            if lnk not in seedlink:
                seedlink[lnk] = depth + 1
                if same_root_domain(lnk):
                    linkpool.append(lnk)
        for img in imglist:
            img = urlparse.urljoin(url, img)
            if img in imgpool or img in dueimgpool:
                continue
            imgpool.append(img)


def process_img_queue():
    while True:
        try:
            imgurl = imgpool.pop()
            dueimgpool.append(imgurl)
            tail = "jpg"  # every file is saved with a .jpg extension
            filename = pfilename.sub('', imgurl)
        except IndexError:
            break
        else:
            try:
                throttle.wait(imgurl)
                # the target directory D:/grab must already exist
                urllib.urlretrieve(imgurl, "D:/grab/%s.%s" % (filename, tail))
            except Exception as e:
                print str(e)


def process_link_queue():
    while True:
        try:
            link = linkpool.pop()
        except IndexError:
            break
        else:
            try:
                grab(link)
            except Exception as e:
                print str(e)


img_threads = []
link_threads = []
while link_threads or linkpool:
    if imgpool:
        # reap finished image threads (iterate over a copy so removal is safe),
        # then top the pool back up to 20 workers
        for thread in img_threads[:]:
            if not thread.is_alive():
                img_threads.remove(thread)
        while len(img_threads) < 20 and imgpool:
            thread = threading.Thread(target=process_img_queue)
            thread.setDaemon(True)
            thread.start()
            img_threads.append(thread)
    # reap finished link threads, then top the pool back up to 10 workers
    for thread in link_threads[:]:
        if not thread.is_alive():
            link_threads.remove(thread)
    while len(link_threads) < 10 and linkpool:
        thread = threading.Thread(target=process_link_queue)
        thread.setDaemon(True)
        thread.start()
        link_threads.append(thread)
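
As a quick sanity check, the snippet below shows what the pimg1 pattern captures from a tag and how urlparse.urljoin resolves a relative path the way grab() does. The sample tag and page URL are made up for illustration.

# What pimg1 captures, and how relative paths are resolved as in grab().
# The sample tag and page URL below are invented for this demo.
import re
import urlparse  # Python 2; on Python 3: from urllib.parse import urljoin

pimg1 = re.compile(r'<img[^<>]+(?:src|original|src2)=["\']{1}([^"\']+)["\']{1}', re.IGNORECASE)

sample = '<img class="lazy" original="/img/2019/pic1.jpg" alt="demo">'
for img in pimg1.findall(sample):
    print(urlparse.urljoin('http://www.163.com/news/', img))
    # prints: http://www.163.com/img/2019/pic1.jpg

The original and src2 alternatives in the pattern matter, presumably because lazy-loading pages keep the real image URL in those attributes rather than in src.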
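
Note that the script as a whole is Python 2 only: it relies on urllib2, the urlparse module, and print statements. For readers on Python 3, here is a minimal sketch of the download() helper (proxy support omitted); the name fetch and its exact structure are my own, not part of the original.

# coding: utf-8
# Minimal Python 3 sketch of the download() helper above; illustrative only.
import urllib.request
import urllib.error

def fetch(url, headers, num_retries=0):
    request = urllib.request.Request(url, headers=headers)
    try:
        with urllib.request.urlopen(request, timeout=10) as response:
            return response.read().decode('utf-8', errors='replace')
    except urllib.error.URLError as e:
        print('Download error:', getattr(e, 'reason', e))
        code = getattr(e, 'code', None)  # HTTPError carries an HTTP status
        if num_retries > 0 and code is not None and 500 <= code < 600:
            return fetch(url, headers, num_retries - 1)  # retry 5xx errors
        return ''

The Throttle class ports with only two small changes: import urlparse from urllib.parse, and prefer timedelta.total_seconds() over .seconds, which is an integer component that drops fractional seconds and can make the throttle under-sleep.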