Python爬取视频(其实是一篇福利)过程解析

2019-11-25 12:08:39

字体：大中小

来源：转载

供稿：网友

窗外下着小雨，作为单身程序员的我逛着逛着发现一篇好东西，来自知乎“你都用 Python 来做什么？”的第一个高亮答案。

到上面去看了看，地址都是明文的，得，赶紧开始吧。

下载流式文件，requests库中请求的stream设为True就可以啦，文档在此。

先找一个视频地址试验一下：

# -*- coding: utf-8 -*-
import requests


def download_file(url, path):
    """Stream the resource at ``url`` into the local file ``path``.

    The request is made with ``stream=True`` and the body is written to
    disk in ``chunk_size``-byte pieces.

    :param url: address of the file to download.
    :param path: destination file; opened in binary write mode.
    """
    # NOTE: this first attempt uses the Response object directly as a context
    # manager. With the requests release used in the article that fails with
    # ``AttributeError: __exit__``, which is what the next step works around.
    with requests.get(url, stream=True) as r:
        chunk_size = 1024
        # Size announced by the server; not used yet in this version.
        content_size = int(r.headers['content-length'])
        print('下载开始')
        with open(path, "wb") as f:
            for chunk in r.iter_content(chunk_size=chunk_size):
                f.write(chunk)


if __name__ == '__main__':
    # Placeholders: fill in a real video address and a destination path.
    url = '就在原帖...'
    path = '想存哪都行'
    download_file(url, path)

遭遇当头一棒：

AttributeError: __exit__

这文档也会骗人的么！

看样子是没有实现上下文需要的__exit__方法。既然只是为了保证要让r最后close以释放连接池，那就使用contextlib的closing特性好了：

# -*- coding: utf-8 -*-
import requests
from contextlib import closing


def download_file(url, path):
    """Stream the resource at ``url`` into the local file ``path``.

    :param url: address of the file to download.
    :param path: destination file; opened in binary write mode.
    """
    # closing() calls r.close() on exit, so the connection is released even
    # though the Response itself is not usable as a context manager here.
    with closing(requests.get(url, stream=True)) as r:
        chunk_size = 1024
        # Size announced by the server; not used yet in this version.
        content_size = int(r.headers['content-length'])
        print('下载开始')
        with open(path, "wb") as f:
            for chunk in r.iter_content(chunk_size=chunk_size):
                f.write(chunk)

程序正常运行了，不过我盯着这文件，怎么大小不见变啊，到底是完成了多少了呢？还是要让下好的内容及时存进硬盘，还能省点内存是不是：

# -*- coding: utf-8 -*-
import requests
from contextlib import closing
import os


def download_file(url, path):
    """Stream ``url`` into ``path``, forcing every chunk onto disk at once.

    :param url: address of the file to download.
    :param path: destination file; opened in binary write mode.
    """
    with closing(requests.get(url, stream=True)) as r:
        chunk_size = 1024
        # Size announced by the server; not used yet in this version.
        content_size = int(r.headers['content-length'])
        print('下载开始')
        with open(path, "wb") as f:
            for chunk in r.iter_content(chunk_size=chunk_size):
                f.write(chunk)
                # Push the chunk out of Python's buffer and then out of the
                # OS cache, so the file size on disk grows as data arrives.
                f.flush()
                os.fsync(f.fileno())

文件以肉眼可见的速度在增大，真心疼我的硬盘，还是等到最后再一次性写入硬盘吧，程序中记个数就好了：

def download_file(url, path):
    """Stream ``url`` into ``path`` and print the percentage downloaded.

    :param url: address of the file to download.
    :param path: destination file; opened in binary write mode.
    """
    with closing(requests.get(url, stream=True)) as r:
        chunk_size = 1024
        content_size = int(r.headers['content-length'])
        print('下载开始')
        with open(path, "wb") as f:
            received = 0
            for chunk in r.iter_content(chunk_size=chunk_size):
                f.write(chunk)
                # Count the bytes actually received rather than assuming a
                # full 1024-byte chunk each time; the last chunk is usually
                # shorter. Cap at 100% in case more arrives than announced.
                received += len(chunk)
                loaded = min(float(received) / content_size, 1.0)
                print('已下载{0:%}'.format(loaded))

结果就很直观了：

已下载2.579129%
已下载2.581255%
已下载2.583382%
已下载2.585508%

心怀远大理想的我怎么会只满足于这一个呢，写个类一起使用吧：

# -*- coding: utf-8 -*-
import requests
from contextlib import closing
import time


def download_file(url, path):
    """Stream ``url`` into ``path``, reporting progress after each chunk.

    :param url: address of the file to download.
    :param path: destination file; opened in binary write mode.
    """
    with closing(requests.get(url, stream=True)) as r:
        chunk_size = 1024 * 10
        content_size = int(r.headers['content-length'])
        print('下载开始')
        with open(path, "wb") as f:
            p = ProgressData(size=content_size, unit='Kb', block=chunk_size)
            for chunk in r.iter_content(chunk_size=chunk_size):
                f.write(chunk)
                p.output()


class ProgressData(object):
    """Print download progress, speed and a text bar, one call per chunk."""

    def __init__(self, block, size, unit, file_name='', ):
        """
        :param block: chunk size in bytes (stored divided by 1000).
        :param size: total size in bytes (stored divided by 1000).
        :param unit: unit label appended to the printed figures.
        :param file_name: optional prefix for every printed line.
        """
        self.file_name = file_name
        self.block = block / 1000.0
        self.size = size / 1000.0
        self.unit = unit
        self.count = 0
        self.start = time.time()

    def output(self):
        """Record one more received chunk and print the current status."""
        self.end = time.time()
        self.count += 1
        # Speed of the chunk just received; 0 when the clock did not advance.
        elapsed = self.end - self.start
        speed = self.block / elapsed if elapsed > 0 else 0
        self.start = time.time()
        # Every chunk is assumed to be a full block.
        loaded = self.count * self.block
        progress = round(loaded / self.size, 4)
        if loaded >= self.size:
            print(u'%s下载完成\r\n' % self.file_name)
        else:
            # Percentage first, then the speed, matching the sample output.
            print(u'{0}下载进度{1:.2f}{2}/{3:.2f}{4} {5:.2%} 下载速度{6:.2f}{7}/s'.format(
                self.file_name, loaded, self.unit,
                self.size, self.unit, progress, speed, self.unit))
            # Right-aligned bar of the remaining share, 50 columns wide.
            print('%50s' % ('/' * int((1 - progress) * 50)))

运行：

下载开始
下载进度10.24Kb/120174.05Kb 0.01% 下载速度4.75Kb/s
 /////////////////////////////////////////////////
下载进度20.48Kb/120174.05Kb 0.02% 下载速度32.93Kb/s
 /////////////////////////////////////////////////

看上去舒服多了。

下面要做的就是多线程同时下载了，主线程生产url放入队列，下载线程获取url：

# -*- coding: utf-8 -*-
import requests
from contextlib import closing
import time
try:
    import Queue
except ImportError:  # Python 3 renamed the module to ``queue``
    import queue as Queue
import hashlib
import threading
import os


def download_file(url, path):
    """Stream ``url`` into ``path`` unless a complete copy already exists.

    :param url: address of the file to download.
    :param path: destination file; opened in binary write mode.
    """
    with closing(requests.get(url, stream=True)) as r:
        chunk_size = 1024 * 10
        content_size = int(r.headers['content-length'])
        # Skip files already on disk that are at least the announced size.
        if os.path.exists(path) and os.path.getsize(path) >= content_size:
            print('已下载')
            return
        print('下载开始')
        with open(path, "wb") as f:
            p = ProgressData(size=content_size, unit='Kb', block=chunk_size,
                             file_name=path)
            for chunk in r.iter_content(chunk_size=chunk_size):
                f.write(chunk)
                p.output()


class ProgressData(object):
    """Print download progress, speed and a text bar, one call per chunk."""

    def __init__(self, block, size, unit, file_name='', ):
        """
        :param block: chunk size in bytes (stored divided by 1000).
        :param size: total size in bytes (stored divided by 1000).
        :param unit: unit label appended to the printed figures.
        :param file_name: optional prefix for every printed line.
        """
        self.file_name = file_name
        self.block = block / 1000.0
        self.size = size / 1000.0
        self.unit = unit
        self.count = 0
        self.start = time.time()

    def output(self):
        """Record one more received chunk and print the current status."""
        self.end = time.time()
        self.count += 1
        # Speed of the chunk just received; 0 when the clock did not advance.
        elapsed = self.end - self.start
        speed = self.block / elapsed if elapsed > 0 else 0
        self.start = time.time()
        # Every chunk is assumed to be a full block.
        loaded = self.count * self.block
        progress = round(loaded / self.size, 4)
        if loaded >= self.size:
            print(u'%s下载完成\r\n' % self.file_name)
        else:
            print(u'{0}下载进度{1:.2f}{2}/{3:.2f}{4} {5:.2%} 下载速度{6:.2f}{7}/s'.format(
                self.file_name, loaded, self.unit,
                self.size, self.unit, progress, speed, self.unit))
            # Right-aligned bar of the remaining share, 50 columns wide.
            print('%50s' % ('/' * int((1 - progress) * 50)))


queue = Queue.Queue()


def run():
    """Worker loop: download queued urls until the ``None`` sentinel arrives."""
    while True:
        url = queue.get(timeout=100)
        if url is None:
            # Hand the sentinel on so every other worker also stops.
            queue.put(None)
            break
        # hashlib needs bytes; encode text urls before hashing.
        key = url if isinstance(url, bytes) else url.encode('utf-8')
        h = hashlib.md5()
        h.update(key)
        name = h.hexdigest()
        path = 'e:/download/' + name + '.mp4'
        download_file(url, path)


def get_url():
    """Producer placeholder: queue the urls to download, then the sentinel."""
    queue.put(None)


if __name__ == '__main__':
    get_url()
    workers = []
    for i in range(4):
        t = threading.Thread(target=run)
        t.daemon = True
        t.start()
        workers.append(t)
    # Daemon threads die with the main thread, so wait for them to finish.
    for t in workers:
        t.join()
    print(u'全下完啦')

加了重复下载的判断，至于怎么源源不断的生产url，诸位摸索吧，保重身体！

以上就是本文的全部内容，希望对大家的学习有所帮助，也希望大家多多支持武林网。

上一篇：用Cython加速Python到“起飞”(推荐)

下一篇：flask框架jinja2模板与模板继承实例分析