# 欢迎光临UUpython
# 最大、最新、最全的Python代码收集站
#
# 爬取读万卷小说网

from selenium import webdriver
import requests, re, os, time, shutil, threading, queue
from lxml import etree
import pandas as pd
import random

def get_user_agent():
    """Return the pool of User-Agent strings available for outgoing requests.

    Returns:
        list[str]: A new list on every call. The entries keep their original
        order, and the two strings that appear twice are kept twice.
    """
    return [
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
        "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
        "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
        "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
        "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
        "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1",
        "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre",
        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
        "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
    ]
# Disabled proxy support: the text below is a bare triple-quoted string, so
# Python evaluates it as a string literal and discards it; get_proxy() is
# never actually defined.
'''def get_proxy():
    proxy = [
        'http://182.140.244.163:8118',
        'http://113.124.86.180:9999',
       'http://117.64.237.42:9999',
        'http://182.34.102.48:9999',
        'http://183.236.123.242:8060',
        'http://27.192.203.80:9000',
        'http://114.231.8.242:8888',
        'http://36.134.91.82:8888',
        'http://222.132.57.105:9000',
        'http://61.216.156.222:60808',
        'http://182.34.20.110:9999',
        'http://60.205.132.71:80',
    ]
    return proxy
/'''
# Request headers passed to the requests.get() calls below. The User-Agent is
# drawn at random once, when this statement runs at module load, and then
# stays the same for the rest of the run.
headers = {
    'user-agent': random.choice(get_user_agent()),
}
# Disabled as well (another discarded string literal): the proxy mapping that
# would have been built from get_proxy().
'''proxy = {
   'http': random.choice(get_proxy()),}
/'''

def extract_link_suffix(url):
    """Return the part of ``url`` that follows its last ``/``.

    When ``url`` contains no slash at all, the whole string is returned.
    """
    # rpartition yields ('', '', url) when the separator is missing, so the
    # third element covers both the "slash found" and "no slash" cases.
    _, _, tail = url.rpartition('/')
    return tail

# Search for a novel and let the user choose which result to download.
def search_novel():
    """Search duwanjuan.info interactively and return the chosen novel's link.

    Prompts for a title or author, loads the site's search page in headless
    Chrome, prints up to ten results as a table and asks the user to pick one
    by its 1-based number. Both prompts repeat until the search yields at
    least one result and the entered number is valid.

    Returns:
        str: The ``href`` of the selected search result, exactly as scraped.
    """
    # Local import so this function does not rely on a new file-level import.
    from urllib.parse import quote

    result_limit = 10
    while True:
        chrome_options = webdriver.ChromeOptions()
        # Run Chrome in the background without a visible window.
        chrome_options.add_argument('--headless')
        browser = webdriver.Chrome(options=chrome_options)
        print('浏览器已打开')
        try:
            name_input = input('输入小说名或作者:')
            # Percent-encode the keyword so characters such as '&', '#' or
            # spaces cannot break the query string.
            search_url = (
                'http://www.duwanjuan.info/modules/article/search.php'
                f'?q={quote(name_input)}'
            )
            browser.get(search_url)
            # Fixed wait before the page source is read.
            time.sleep(6)
            page_source = browser.page_source
        finally:
            # quit() ends the whole WebDriver session (close() only closes the
            # current window); the finally clause runs it on errors as well.
            browser.quit()

        tree = etree.HTML(page_source)
        name = tree.xpath("//div[@id='jieqi_page_contents']/div[@class='c_row']/div/div/span[@class='c_subject']/a/text()")[:result_limit]
        chapter = tree.xpath("//div[@class='c_tag']/span[@class='c_value']/a/text()")[:result_limit]
        link = tree.xpath("//div[@id='jieqi_page_contents']/div[@class='c_row']/div/div/a/@href")[:result_limit]
        author = tree.xpath("//div[@class='c_tag']/span[contains(text(), '作者:')]/following-sibling::span[1]/text()")[:result_limit]

        # Only rows that have both a title and a link can be offered.
        count = min(len(name), len(link))
        if count == 0:
            print('搜索数据为空,请重新搜索')
            continue  # ask for a new keyword instead of returning None

        def _fit(values):
            """Trim or pad a scraped column to exactly ``count`` entries."""
            values = list(values[:count])
            return values + [''] * (count - len(values))

        # Columns of unequal length would make the DataFrame constructor
        # raise, so every column is normalised to ``count`` rows first.
        data = {
            '序号': list(range(1, count + 1)),
            '小说': _fit(name),
            '作者': _fit(author),
            '最新章节': _fit(chapter),
            # The table shows only the last path segment of each link.
            '链接': [extract_link_suffix(href) for href in link[:count]],
        }
        print(pd.DataFrame(data))

        while True:
            choice = input('请输入序号选择下载的小说:')
            try:
                index = int(choice)
            except ValueError:
                print('序号无效,请重新输入')
                continue
            # Reject 0 and negative numbers explicitly: link[-1] would
            # silently select a different novel than the one shown.
            if 1 <= index <= count:
                return link[index - 1]
            print('序号无效,请重新输入')

# Collect the URL and title of every chapter listed on a novel's index page.
def get_chapter_urls(url, visited_urls, value):
    """Fetch a novel's chapter index and list the chapters not seen before.

    Side effects: sets the module globals ``tot_title`` (all chapter link
    texts found on the page) and ``book_name`` (the title taken from the
    page's ``<h1>``, cleaned so it is usable as a file/folder name), and adds
    every returned chapter URL to ``visited_urls``.

    Args:
        url: Address of the chapter index page.
        visited_urls: Set of chapter URLs already collected; updated in place.
        value: Number the sequence continues from; the first new chapter is
            numbered ``value + 1``.

    Returns:
        list[tuple[str, str, int]]: ``(chapter_name, chapter_url, number)``
        for each newly seen chapter, in page order.
    """
    # Local import so this function does not rely on a new file-level import.
    from urllib.parse import urljoin

    global tot_title
    global book_name
    # A timeout keeps a stalled connection from hanging the program.
    response = requests.get(url, headers=headers, timeout=15)
    response.encoding = response.apparent_encoding
    html = etree.HTML(response.text)
    chapter_elements = html.xpath("//div[@class='index']//li[@class='chapter']/a")
    # The 11th link is discarded as before, but only when it exists, so an
    # index with fewer entries no longer raises IndexError.
    # NOTE(review): the reason this entry is dropped is not documented here;
    # confirm against the site's markup.
    if len(chapter_elements) > 10:
        chapter_elements.pop(10)
    tot_title = html.xpath("//div[@class='index']//li[@class='chapter']/a/text()")
    bk = html.xpath("//div[@class='main']/div[@class='headlink cf']/h1/text()[1]")
    # First heading text node, or an empty string when the heading is missing.
    text = bk[0] if bk else ""

    # If the heading contains the repr of a one-item list ("['Title']"),
    # keep only the inner title; otherwise use the heading text itself.
    match = re.search(r"\['(.*?)'\]", text)
    raw_name = match.group(1) if match else text
    # The name is used later as a folder and file name: trim surrounding
    # whitespace, replace characters that are illegal in paths, and never
    # leave it empty (an empty name would make the paths point at "/").
    book_name = re.sub(r'[\\/:*?"<>|\r\n\t]', '_', raw_name.strip()) or 'novel'

    chapter_urls = []
    for element in chapter_elements:
        href = element.get('href')
        if not href:
            continue  # anchor without a target: nothing to download
        # Resolve relative links against the index page so the result can be
        # requested directly; absolute links resolve to themselves.
        chapter_url = urljoin(url, href)
        if chapter_url in visited_urls:
            continue
        value += 1
        # element.text is None when the anchor has no leading text node.
        chapter_urls.append((element.text or '', chapter_url, value))
        visited_urls.add(chapter_url)
    return chapter_urls


# Download one chapter page and return its cleaned text.
def get_chapter_content(url):
    """Fetch a chapter page and return its text fragments.

    Every text node directly inside ``<div id="acontent">`` has all
    whitespace, ASCII parentheses and the site's watermark string removed.

    Args:
        url: Address of the chapter page.

    Returns:
        list[str]: The cleaned text nodes in document order, or an empty list
        when the request fails or lxml yields no document.
    """
    # ``\s`` already removes the "\r\n + spaces + NBSP" indent that used to be
    # listed as its own alternative; the watermark's dots are escaped so they
    # match literal dots only.
    pattern = r'\s|\(|\)|读万卷 www\.duwanjuan\.info'
    try:
        # NOTE(review): verify=False turns off TLS certificate checking; it is
        # kept from the original. Confirm it is really needed for this site.
        response = requests.get(url, headers=headers, verify=False, timeout=15)
    except requests.RequestException as e:
        print(f"Error occurred while fetching content from {url}: {e}")
        return []
    response.encoding = response.apparent_encoding
    html = etree.HTML(response.text)
    if html is None:
        return []
    content_element = html.xpath("//div[@id='acontent']/text()")
    return [re.sub(pattern, '', sub_text) for sub_text in content_element]



# Worker: download queued chapters and save each one to its own text file.
def process_chapter(chapter_queue):
    """Drain ``chapter_queue``, saving each chapter to ``<book_name>/<n>.txt``.

    Meant to be run by several threads sharing one queue. Each call records
    its start time in the module global ``time_start``, and reads the module
    global ``book_name`` for the output folder. A chapter whose download fails
    is still written, with its title only.

    Args:
        chapter_queue: ``queue.Queue`` holding
            ``(chapter_name, chapter_url, value)`` tuples; ``value`` becomes
            the file name of the saved chapter.
    """
    global time_start
    time_start = time.time()
    while True:
        try:
            # get_nowait() replaces the empty()-then-get() pair: another
            # worker could take the last item between those two calls and
            # leave this thread blocked in get() forever.
            chapter_name, chapter_url, value = chapter_queue.get_nowait()
        except queue.Empty:
            break
        try:
            print("正在爬取章节:", chapter_name)
            try:
                content = get_chapter_content(chapter_url)
            except Exception as e:
                # Deliberate best effort: one bad chapter must not stop the run.
                print(f"获取章节内容失败:{e}")
                content = []
            folder_path = f'{book_name}'
            try:
                # exist_ok avoids the check-then-create race between workers.
                os.makedirs(folder_path, exist_ok=True)
                with open(f'{book_name}/{value}.txt', 'w', encoding='utf-8') as f:
                    f.write('\n' + (chapter_name or '') + '\n')
                    for data in content:
                        f.write(data + '\n')
                    f.write('\n\n')
            except OSError as e:
                # Report and carry on so the remaining chapters are still saved.
                print(f"保存章节失败:{e}")
        finally:
            # Always acknowledge the item; otherwise Queue.join() in the
            # caller would wait forever after a failure.
            chapter_queue.task_done()
        # Pause between requests (presumably to limit load on the site).
        time.sleep(6)

# Concatenate the per-chapter TXT files into a single book file.
def merge_txt_files(folder_path, output_file):
    """Join every ``.txt`` file found in ``folder_path`` into ``output_file``.

    Files are appended in ascending order of the integer formed by the file
    name without its ``.txt`` suffix. ``output_file`` is overwritten and
    written as UTF-8.
    """
    def chapter_number(file_name):
        # Drop the four-character ".txt" suffix and read the rest as a number.
        return int(file_name[:-4])

    ordered_parts = sorted(
        (entry for entry in os.listdir(folder_path) if entry.endswith('.txt')),
        key=chapter_number,
    )

    with open(output_file, 'w', encoding='utf-8') as merged:
        for file_name in ordered_parts:
            part_path = os.path.join(folder_path, file_name)
            with open(part_path, 'r', encoding='utf-8') as part:
                merged.write(part.read())


def search_continue():
    """Ask whether to download another novel and run ``main`` again on ``y``.

    Any other answer ends the function without doing anything.
    """
    answer = input('请输入y/n选择是否继续下载小说:')
    if answer != 'y':
        return
    main()

def main():
    """Run one complete download: search, crawl all chapters, merge, clean up.

    Steps: let the user pick a novel, collect its chapter list, download the
    chapters with a user-chosen number of worker threads, merge the
    per-chapter files into ``<book_name>.txt``, delete the temporary chapter
    folder, then offer to download another novel.
    """
    directory_url = search_novel()
    # Collect the chapter URLs and titles from the novel's index page.
    visited_urls = set()
    value = 0
    chapter_urls = get_chapter_urls(directory_url, visited_urls, value)
    if not chapter_urls:
        # Without this guard the merge step below would fail on a folder that
        # was never created.
        print('未获取到任何章节,无法下载')
        search_continue()
        return

    # Queue of chapters waiting to be downloaded.
    chapter_queue = queue.Queue()
    for chapter in chapter_urls:
        chapter_queue.put(chapter)

    print('=' * 64)
    print('线程数建议在10-30之间,避免对目标服务器造成过大压力')
    # Keep asking until a positive integer is entered: with zero workers
    # chapter_queue.join() below would wait forever.
    while True:
        try:
            num_threads = int(input('输入线程数:'))
        except ValueError:
            num_threads = 0
        if num_threads >= 1:
            break
        print('线程数必须是大于0的整数,请重新输入')

    # Time the crawl locally instead of reading a global set by the workers.
    crawl_start = time.time()
    threads = []
    for _ in range(num_threads):
        thread = threading.Thread(target=process_chapter, args=(chapter_queue,))
        thread.daemon = False
        thread.start()
        threads.append(thread)
    # Wait until every queued chapter has been acknowledged...
    chapter_queue.join()
    # ...and until every worker thread has exited.
    for thread in threads:
        thread.join()
    print("所有章节爬取完成!")
    print('章节爬取花费时间:', time.time() - crawl_start)
    print('=' * 64)
    print('开始合并所有TXT文件')
    folder_path_1 = f'{book_name}/'  # folder holding the per-chapter files
    output_file = f'{book_name}.txt'  # merged output file
    merge_txt_files(folder_path_1, output_file)
    print('合并所有TXT文件成功')
    print(f'{book_name}下载成功')
    # The per-chapter files are no longer needed once merged.
    shutil.rmtree(book_name)
    print('=' * 64)
    search_continue()

# Script entry point: start a download session only when run directly.
if __name__ == "__main__":
    main()
# 赞(0) 打赏
# 未经允许不得转载:UUpython » 爬取读万卷小说网
# 分享到: 更多 (0)
#
# 评论 抢沙发
#
# 评论前必须登录!