
Python novel scraper, part 2

2019-11-11 01:07:02
Web crawler, version 2:
import re
import urllib.request as ur

def get_html(urls):
    # Fetch a page; return "" on any network or decoding error.
    try:
        page = ur.urlopen(urls)
        return page.read().decode('utf-8-sig')
    except Exception:
        return ""

def get_url(html_page):
    # Return the next href value and the position just past it.
    start_position = html_page.find('a href="')
    if start_position == -1:
        return None, 0
    start_position += 8  # len('a href="')
    end_position = html_page.find('"', start_position)
    return html_page[start_position:end_position], end_position

def is_useful(a_url):
    # Chapter links look like /0_176/134598.html.  The backslashes in the
    # original regex were mangled to forward slashes (\d -> /d, \. -> /.).
    return bool(re.match(r"/\d+.\d+.+\.html", a_url))

def get_all_url(seed):
    html_page = get_html(seed)
    useful_links = []
    while True:
        a_url, end = get_url(html_page)
        if a_url:
            if a_url not in useful_links and is_useful(a_url):
                useful_links.append(a_url)
            html_page = html_page[end + 1:]
        else:
            break
    useful_links.sort()  # sort once after the loop, not on every iteration
    return useful_links

def deal_content(content_page):
    # Turn the HTML body into plain text.  The no-op replace in the mangled
    # original was presumably "&nbsp;" -> " " (the site indents with &nbsp;),
    # "/n" was a typo for "\n", and the second replace dropped the result of
    # the first.
    first_deal = content_page.replace("&nbsp;", " ")
    second_deal = first_deal.replace("<br/>", "\n")
    return second_deal + "\n"

def get_content(html_page):
    # The chapter text sits between the readx() and read3() ad scripts.
    start_flag = html_page.find("readx()")
    if start_flag < 0:
        return "获取章节失败"  # "failed to fetch the chapter"
    end_flag = html_page.find("read3()")
    content_page = html_page[start_flag:end_flag]
    start_flag = content_page.find("</script>")
    if start_flag < 0:
        return "获取章节失败"
    start_flag += 9  # len("</script>"); the original added 8 and kept a stray '>'
    end_flag = content_page.find("</div>")
    return deal_content(content_page[start_flag:end_flag])

def get_title(html_page):
    # The chapter title is the <h1> inside the "bookname" block.
    start_flag = html_page.find("bookname")
    if start_flag < 0:
        return "获取章节失败"
    start_flag = html_page.find("<h1>", start_flag) + 4
    end_flag = html_page.find("</h1>", start_flag)
    return html_page[start_flag:end_flag]

def write_to_txt(name, seed):
    new_file = open("./res/" + name, "a", encoding="utf-8")
    all_useful_links = get_all_url(seed)
    while all_useful_links:
        # seed[:-7] strips the trailing "/0_176/" so the relative chapter
        # path can be appended to the bare domain.
        a_link = seed[:-7] + all_useful_links.pop(0)
        print(a_link)
        html_page = get_html(a_link)
        content_title = get_title(html_page)
        content = get_content(html_page)
        new_file.write(content_title + "\n" + content)
    if not new_file.closed:
        new_file.close()
    print("缓存结束")  # "caching finished"

seed = "http://www.biquge.com/0_176/"
write_to_txt("大主宰.txt", seed)
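A quick sanity check of the link filter, using a made-up chapter href in the /<book>_<id>/<chapter>.html shape the site appears to use (the chapter number below is hypothetical):

import re

pattern = r"/\d+.\d+.+\.html"
print(bool(re.match(pattern, "/0_176/134598.html")))  # True: a chapter link
print(bool(re.match(pattern, "/style.css")))          # False: filtered out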

Compared with part 1, the readability and portability of the code are much better, but for some reason it runs very slowly.

Any ideas?
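One likely culprit is get_all_url: every iteration re-slices the remaining page (html_page[end + 1:] copies the whole string, making the scan quadratic in page size), checks membership in a list (linear per lookup), and in the original even re-sorted the list on every pass. Below is a minimal sketch of an equivalent one-pass extractor; it reuses get_html and is_useful from above, and get_all_url_fast is just an illustrative name:

import re

HREF_RE = re.compile(r'a href="([^"]+)"')

def get_all_url_fast(seed):
    # Single pass over the page: no repeated slicing, set-based dedup.
    html_page = get_html(seed)
    seen = set()
    useful_links = []
    for a_url in HREF_RE.findall(html_page):
        if a_url not in seen and is_useful(a_url):
            seen.add(a_url)
            useful_links.append(a_url)
    useful_links.sort()  # sort once at the end
    return useful_links

Once the parsing cost is fixed, the remaining time is mostly network latency, since each chapter is still fetched serially through urlopen.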

