# coding:utf-8
import urllib
import urllib2
import re
import time
import threading
import socket
import urlparse
import datetime

root_domain = '163.com'
beginurl = 'http://www.163.com/'
# maximum crawl depth
max_depth = 2
socket.setdefaulttimeout(10)
SLEEP_TIME = 1

linkpool = [beginurl]        # URLs waiting to be crawled
seedlink = {beginurl: 0}     # URL -> depth at which it was discovered
imgpool = []                 # image URLs waiting to be downloaded
dueimgpool = []              # image URLs already handled
num_retries = 0
lock = threading.Lock()

# match src/original/src2 attributes of <img> tags
pimg1 = re.compile(r'<img[^<>]+(?:src|original|src2)=["\']{1}([^"\']+)["\']{1}', re.IGNORECASE)
# match "image":"..." fields embedded in JSON
pimg2 = re.compile(r'"image":"([^"\']+)"', re.IGNORECASE)
# match href attributes of <a> tags
plink = re.compile(r'<a[^<>]+href=["\']{1}([^"\']+)["\']{1}', re.IGNORECASE)
# characters stripped when building a local file name
pfilename = re.compile(r'\W|_')

headers = {'User-agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64; rv:22.0) Gecko/20100101 Firefox/22.0'}


class Throttle:
    """Delay consecutive requests to the same domain."""
    def __init__(self, delay):
        self.delay = delay
        self.domains = {}

    def wait(self, url):
        domain = urlparse.urlparse(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (datetime.datetime.now() - last_accessed).seconds
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.domains[domain] = datetime.datetime.now()


throttle = Throttle(SLEEP_TIME)


def download(url, headers, proxy=None, num_retries=0, data=None):
    print 'Downloading:', url
    request = urllib2.Request(url, data, headers)
    opener = urllib2.build_opener()
    if proxy:
        proxy_params = {urlparse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib2.ProxyHandler(proxy_params))
    try:
        response = opener.open(request)
        html = response.read()
        code = response.code
    except urllib2.URLError as e:
        print 'Download error:', e.reason
        html = ''
        if hasattr(e, 'code'):
            code = e.code
            if num_retries > 0 and 500 <= code < 600:
                # retry on 5xx server errors
                return download(url, headers, proxy, num_retries - 1, data)
        else:
            code = None
    return html


def same_root_domain(url):
    domain = urlparse.urlparse(url).netloc
    return root_domain in domain


def grab(url):
    depth = seedlink[url]
    if depth != max_depth:
        throttle.wait(url)
        html = download(url, headers=headers, num_retries=num_retries)
        imglist = pimg1.findall(html)
        imglist.extend(pimg2.findall(html))
        linklist = plink.findall(html)
        for lnk in linklist:
            lnk = urlparse.urljoin(url, lnk)
            if lnk not in seedlink:
                seedlink[lnk] = depth + 1
                if same_root_domain(lnk):
                    linkpool.append(lnk)
        for img in imglist:
            img = urlparse.urljoin(url, img)
            if img in imgpool or img in dueimgpool:
                continue
            imgpool.append(img)


def process_img_queue():
    while True:
        try:
            imgurl = imgpool.pop()
            dueimgpool.append(imgurl)
            tail = "jpg"
            filename = pfilename.sub('', imgurl)
        except IndexError:
            break
        else:
            try:
                throttle.wait(imgurl)
                urllib.urlretrieve(imgurl, "D:/grab/%s.%s" % (filename, tail))
            except Exception as e:
                print str(e)


def process_link_queue():
    while True:
        try:
            link = linkpool.pop()
        except IndexError:
            break
        else:
            try:
                grab(link)
            except Exception as e:
                print str(e)


# coordinator loop: keep both thread pools topped up until crawling finishes
img_threads = []
link_threads = []
while link_threads or linkpool:
    if imgpool:
        # drop finished image threads (iterate over a copy while mutating the list)
        for thread in img_threads[:]:
            if not thread.is_alive():
                img_threads.remove(thread)
        # spawn up to 20 image-download workers
        while len(img_threads) < 20 and imgpool:
            thread = threading.Thread(target=process_img_queue)
            thread.setDaemon(True)
            thread.start()
            img_threads.append(thread)
    # drop finished link threads and spawn up to 10 crawler workers
    for thread in link_threads[:]:
        if not thread.is_alive():
            link_threads.remove(thread)
    while len(link_threads) < 10 and linkpool:
        thread = threading.Thread(target=process_link_queue)
        thread.setDaemon(True)
        thread.start()
        link_threads.append(thread)
    # avoid busy-waiting in the coordinator loop
    time.sleep(SLEEP_TIME)
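One practical note: urllib.urlretrieve raises an IOError if the hard-coded output directory D:/grab does not already exist. A minimal sketch (assuming the same path used in process_img_queue above) that creates it before the crawl starts:

# create the download directory if it is missing (hypothetical setup step,
# not part of the original script)
import os
save_dir = "D:/grab"
if not os.path.isdir(save_dir):
    os.makedirs(save_dir)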