```python
# -*- coding: utf-8 -*-
import urllib as url_lib
import urllib2 as url_lib2
import bs4 as BS4
import re
import os
import sys

reload(sys)
sys.setdefaultencoding('utf8')


class get_joke(object):
    def __init__(self, url_str):
        self.url_str = url_str   # start URL of the Qiushibaike text jokes
        self.page_href = set()   # links to the other Qiushibaike pages
        self.next_page = ""      # link to the next page
        # Qiushibaike blocks plain crawler requests, so pretend to be a normal browser.
        self.user_agent = 'Mozilla/5.0 (X11; linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.89 Safari/537.36'
        self.headers = {"User-Agent": self.user_agent}

    # Fetch the page content.
    def getwebpage(self, url_str=""):
        if self.url_str == "":
            if url_str == "":
                return None
            self.url_str = url_str
        try:
            request = url_lib2.Request(self.url_str, headers=self.headers)
            webpage = url_lib2.urlopen(request).read().decode('utf-8')
        except url_lib2.URLError as ex:
            if hasattr(ex, "code"):
                print(ex.code)
            if hasattr(ex, "reason"):
                print(ex.reason)
            return None
        return webpage

    # Extract the text jokes from the page: author, content, vote count and comment count.
    def getjoke_text(self, webpage):
        if webpage is None:
            return None
        re_str = '<div.*?>.*?<div class="author.*?>.*?<img.*?alt="(.*?)"/>.*?<div class="content"> .*?<span>(.*?)</span>.*?<div class="stats">.*?<i class="number">(.*?)</i>.*?<span class="dash"> .*?<i class="number">(.*?)</i>.*?</div>'
        re_obj = re.compile(re_str, re.S)  # re.S lets '.' also match newlines
        jokes = re.findall(re_obj, webpage)
        print("Number of jokes found: %d" % len(jokes))
        joke_list = []
        for joke in jokes:
            temp_str = re.sub("(<br/>)", " ", joke[1])  # strip <br/> tags from the content
            print("Author: %s\nContent: %s\nVotes: %s\nComments: %s\n"
                  % (joke[0], temp_str, joke[2], joke[3]))
            joke_list.append([joke[0], temp_str, joke[2], joke[3]])
        return joke_list

    # Find the link to the next page.
    def getnext_page(self, webpage):
        if webpage is None:
            return None
        print("Looking for the next-page link...")
        re_str = '<li>.*?<a href="(.*?/page/.*?)".*?>.*?<!--<.*?>-->.*?<span class="next">.*?</span>.*?</a>.*?</li>'
        re_obj = re.compile(re_str, re.S)
        next_href = re.findall(re_obj, webpage)
        print("Number of next-page links found: %d" % len(next_href))
        if len(next_href) > 0:
            # Drop the leading "/text/" so joining onto the start URL does not duplicate it.
            nexthref = self.get_absoluteurl(next_href[0][6:])
            print("Next-page link: %s" % nexthref)
            return nexthref
        return None

    # Turn a (possibly relative) link into an absolute URL.
    def get_absoluteurl(self, url_str):
        if not url_str.startswith("http://"):
            if url_str.startswith("www."):
                url_str = "http://" + url_str
            else:
                url_str = self.url_str + "/" + url_str
        if self.url_str not in url_str:  # drop off-site (hotlinked) URLs
            return None
        return url_str

    # Save the joke list to jokes/joke.txt under the current directory.
    def write2file(self, joke_list):
        if joke_list is None:
            return None
        dir_str = os.path.join(os.path.abspath('.'), "jokes")
        print("Saving to: %s" % dir_str)
        if not os.path.exists(dir_str):
            os.makedirs(dir_str)
        file_str = os.path.join(dir_str, "joke.txt")
        try:
            with open(file_str, 'w') as my_file:
                for item in joke_list:
                    my_file.write(item[0] + "\t" + item[1] + "\t" + item[2] + "\t" + item[3] + "\n")
        except IOError as ex:
            print(ex)
            return None


# Main script
url_str = "http://www.qiushibaike.com/text"
print("Page to scrape: " + url_str)
my_obj = get_joke(url_str)
webpage = my_obj.getwebpage()
my_obj.getnext_page(webpage)
joke_list = my_obj.getjoke_text(webpage)
my_obj.write2file(joke_list)
```

3. Results
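Running the script should print each joke to the console and write them all to `jokes/joke.txt` under the current working directory, one joke per line with the author, content, vote count and comment count separated by tabs.

As a possible refinement (not part of the original script), the regex in `getjoke_text` could be replaced with BeautifulSoup, which the script already imports as `BS4` but never uses. The sketch below is only illustrative: it assumes each joke sits in a hypothetical `<div class="article">` container and reuses the `author`, `content` and `number` class names that the regex targets; the real page markup may differ.

```python
from bs4 import BeautifulSoup

def parse_jokes_bs4(webpage):
    """Extract [author, content, votes, comments] from one page with BeautifulSoup."""
    if webpage is None:
        return None
    joke_list = []
    soup = BeautifulSoup(webpage, "html.parser")
    # "article" is an assumed container class; adjust it to the actual markup.
    for block in soup.find_all("div", class_="article"):
        author_div = block.find("div", class_="author")
        content_div = block.find("div", class_="content")
        if author_div is None or content_div is None:
            continue
        img = author_div.find("img")
        author = img.get("alt", "") if img else ""
        content = content_div.get_text(strip=True)
        numbers = [i.get_text(strip=True) for i in block.find_all("i", class_="number")]
        votes = numbers[0] if numbers else "0"
        comments = numbers[1] if len(numbers) > 1 else "0"
        joke_list.append([author, content, votes, comments])
    return joke_list
```

The returned list has the same shape as the one produced by `getjoke_text`, so it could be passed straight to `write2file`.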