# coding:utf-8
import urllib
import urllib2
import re
import time
import threading
import socket
import urlparse
import datetime

root_domain = '163.com'
beginurl = 'http://www.163.com/'
# maximum crawl depth
max_depth = 2
socket.setdefaulttimeout(10)
SLEEP_TIME = 1

linkpool = [beginurl]        # URLs waiting to be crawled
seedlink = {beginurl: 0}     # URL -> depth at which it was discovered
imgpool = []                 # image URLs waiting to be downloaded
dueimgpool = []              # image URLs already handled
num_retries = 0
lock = threading.Lock()

# match src/original/src2 attributes of <img> tags
pimg1 = re.compile(r'<img[^<>]+(?:src|original|src2)=["\']{1}([^"\']+)["\']{1}', re.IGNORECASE)
# match "image":"..." fields embedded in JSON
pimg2 = re.compile(r'"image":"([^"\']+)"', re.IGNORECASE)
# match href attributes of <a> tags
plink = re.compile(r'<a[^<>]+href=["\']{1}([^"\']+)["\']{1}', re.IGNORECASE)
# characters stripped when building a local file name
pfilename = re.compile(r'\W|_')

headers = {'User-agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:22.0) Gecko/20100101 Firefox/22.0'}


class Throttle:
    """Delay consecutive requests to the same domain."""
    def __init__(self, delay):
        self.delay = delay
        self.domains = {}

    def wait(self, url):
        domain = urlparse.urlparse(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (datetime.datetime.now() - last_accessed).seconds
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.domains[domain] = datetime.datetime.now()


throttle = Throttle(SLEEP_TIME)


def download(url, headers, proxy=None, num_retries=0, data=None):
    print 'Downloading:', url
    request = urllib2.Request(url, data, headers)
    opener = urllib2.build_opener()
    if proxy:
        proxy_params = {urlparse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib2.ProxyHandler(proxy_params))
    try:
        response = opener.open(request)
        html = response.read()
        code = response.code
    except urllib2.URLError as e:
        print 'Download error:', e.reason
        html = ''
        if hasattr(e, 'code'):
            code = e.code
            if num_retries > 0 and 500 <= code < 600:
                # retry on 5xx server errors
                return download(url, headers, proxy, num_retries - 1, data)
        else:
            code = None
    return html


def same_root_domain(url):
    domain = urlparse.urlparse(url).netloc
    return root_domain in domain


def grab(url):
    depth = seedlink[url]
    if depth != max_depth:
        throttle.wait(url)
        html = download(url, headers=headers, num_retries=num_retries)
        imglist = pimg1.findall(html)
        imglist.extend(pimg2.findall(html))
        linklist = plink.findall(html)
        for lnk in linklist:
            lnk = urlparse.urljoin(url, lnk)
            if lnk not in seedlink:
                seedlink[lnk] = depth + 1
                if same_root_domain(lnk):
                    linkpool.append(lnk)
        for img in imglist:
            img = urlparse.urljoin(url, img)
            if img in imgpool or img in dueimgpool:
                continue
            imgpool.append(img)


def process_img_queue():
    while True:
        try:
            imgurl = imgpool.pop()
            dueimgpool.append(imgurl)
            tail = "jpg"
            filename = pfilename.sub('', imgurl)
        except IndexError:
            break
        else:
            try:
                throttle.wait(imgurl)
                urllib.urlretrieve(imgurl, "D:/grab/%s.%s" % (filename, tail))
            except Exception as e:
                print str(e)


def process_link_queue():
    while True:
        try:
            link = linkpool.pop()
        except IndexError:
            break
        else:
            try:
                grab(link)
            except Exception as e:
                print str(e)


# coordinator loop: keep both thread pools topped up until crawling finishes
img_threads = []
link_threads = []
while link_threads or linkpool:
    if imgpool:
        # drop finished image threads (iterate over a copy while mutating the list)
        for thread in img_threads[:]:
            if not thread.is_alive():
                img_threads.remove(thread)
        # spawn up to 20 image-download workers
        while len(img_threads) < 20 and imgpool:
            thread = threading.Thread(target=process_img_queue)
            thread.setDaemon(True)
            thread.start()
            img_threads.append(thread)
    # drop finished link threads and spawn up to 10 crawler workers
    for thread in link_threads[:]:
        if not thread.is_alive():
            link_threads.remove(thread)
    while len(link_threads) < 10 and linkpool:
        thread = threading.Thread(target=process_link_queue)
        thread.setDaemon(True)
        thread.start()
        link_threads.append(thread)
    # avoid busy-waiting in the coordinator loop
    time.sleep(SLEEP_TIME)
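One practical note: urllib.urlretrieve raises an IOError if the hard-coded output directory D:/grab does not already exist. A minimal sketch (assuming the same path used in process_img_queue above) that creates it before the crawl starts:

# create the download directory if it is missing (hypothetical setup step,
# not part of the original script)
import os
save_dir = "D:/grab"
if not os.path.isdir(save_dir):
    os.makedirs(save_dir)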