"""Novel-site crawler ("web crawler v2" from the original article).

Collects chapter links from a biquge.com index page and appends each
chapter's title and text to ./res/<name>.

Why the original was slow (the article's open question):
  * get_all_url re-sliced the remaining page for every link found (O(n^2)
    over the index page) and did an O(n) list membership test per link;
  * write_to_txt popped links from the front of a list (O(n) each);
  * chapters are downloaded sequentially over blocking HTTP (dominant cost
    in practice — a ThreadPoolExecutor would help, not added here to keep
    behaviour simple).

The scraped source also lost every backslash: "\n" became "/n" and "\d"
became "/d"; those are restored below.
"""
import re
import urllib.request

# Chapter links look like "/0_176/851887.html".  The original pattern was
# "//d+./d+.+/.html" — "/" where "\" was intended — so it matched nothing
# and the crawler collected zero links.  Compiled once, not per call.
_CHAPTER_LINK = re.compile(r"/\d+.\d+.+\.html")


def get_html(urls):
    """Download *urls* and return the decoded page text, or "" on failure.

    utf-8-sig strips a leading BOM if the server emits one.
    """
    try:
        with urllib.request.urlopen(urls) as page:  # close the socket promptly
            return page.read().decode('utf-8-sig')
    except OSError:  # network/HTTP errors only — a bare except hid real bugs
        return ""


def get_url(html_page, start=0):
    """Return (first href value at/after *start*, index of its closing quote).

    Returns (None, 0) when no anchor is found.  *start* is new (default 0,
    backward compatible) so callers can scan a large page without slicing it.
    """
    start_position = html_page.find('a href="', start)
    if start_position == -1:
        return None, 0
    start_position += 8  # len('a href="')
    end_position = html_page.find('"', start_position)
    return html_page[start_position:end_position], end_position


def get_all_url(seed):
    """Collect every distinct chapter link from the index page at *seed*, sorted."""
    html_page = get_html(seed)
    useful_links = []
    seen = set()  # O(1) duplicate check (was an O(n) scan of the list per link)
    pos = 0       # advance an index instead of re-slicing the page (was O(n^2))
    while True:
        a_url, end = get_url(html_page, pos)
        if not a_url:
            break
        if a_url not in seen and is_useful(a_url):
            seen.add(a_url)
            useful_links.append(a_url)
        pos = end + 1
    useful_links.sort()
    return useful_links


def is_useful(a_url):
    """True if *a_url* looks like a chapter page, e.g. "/0_176/851887.html"."""
    return bool(_CHAPTER_LINK.match(a_url))


def get_content(html_page):
    """Extract the chapter body between the readx()/read3() script markers.

    Returns the failure message "获取章节失败" when the markers are missing.
    """
    start_flag = html_page.find("readx()")
    # PRint(start_flag)
    if start_flag < 0:
        return "获取章节失败"
    end_flag = html_page.find("read3()")
    content_page = html_page[start_flag:end_flag]
    start_flag = content_page.find("</script>")
    if start_flag < 0:
        return "获取章节失败"
    # "</script>" is 9 characters; the original "+= 8" left a stray ">".
    start_flag += len("</script>")
    end_flag = content_page.find("</div>")
    return deal_content(content_page[start_flag:end_flag])


def deal_content(content_page):
    """Normalise raw chapter HTML into plain text with a trailing newline.

    Fixes two original bugs: the nbsp-cleanup result was discarded (dead
    store to first_deal), and "<br/>" was replaced with the literal "/n"
    instead of a newline.  The nbsp may appear as an entity or as U+00A0
    in the page source — presumably the latter; both are handled.
    """
    text = content_page.replace("&nbsp;", " ").replace("\xa0", " ")
    return text.replace("<br/>", "\n") + "\n"


def get_title(html_page):
    """Return the chapter title inside <h1>…</h1> after the "bookname" marker,
    or "获取章节失败" if the markers are absent."""
    start_flag = html_page.find("bookname")
    if start_flag < 0:
        return "获取章节失败"
    start_flag = html_page.find("<h1>", start_flag)
    if start_flag < 0:  # original sliced from index 3 (-1 + 4) on a miss
        return "获取章节失败"
    start_flag += 4  # len("<h1>")
    end_flag = html_page.find("</h1>", start_flag)
    return html_page[start_flag:end_flag]


def write_to_txt(name, seed):
    """Download every chapter reachable from *seed* and append it to ./res/<name>."""
    all_useful_links = get_all_url(seed)
    # print(all_useful_links)
    # Open once with an explicit encoding (the platform default may not be
    # UTF-8); "with" guarantees the file is closed even on error.
    with open("./res/" + name, "a", encoding="utf-8") as new_file:
        # Plain iteration — the original pop(0) shifted the whole list each time.
        for link in all_useful_links:
            a_link = seed[:-7] + link  # strip the "/0_176/" suffix, keep the host
            print(a_link)
            html_page = get_html(a_link)
            # print(html_page)
            new_file.write(get_title(html_page) + "\n" + get_content(html_page))
    print("缓存结束")


if __name__ == "__main__":  # guard: the original hit the network on import
    seed = "http://www.biquge.com/0_176/"
    write_to_txt("大主宰.txt", seed)
求解
新闻热点
疑难解答