Wrote a Python crawler in my spare time to scrape the Qidian novel 《江峰柳月》
Author: LittleRob | Category: Tech | Views: 7361 | Date: 2021-03-01

Two weeks ago, while organizing my notes, I came across some philosophical quotes and was reminded of a novel I read back in 2017. I wanted to reread it, but the pages were plastered with ads and that wasted too much time. So I spent a few days writing a crawler to pull the chapters down; it is much more pleasant to read now.
The code is below. Please use it reasonably and responsibly, and do not use it for anything illegal.
import re
import time
import random

import requests

# Pool of User-Agent strings, rotated so requests look less uniform
user_agent = [
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
    "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
    "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)",
    "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
    "Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999",
]


# Fetch one chapter page and return its HTML as a string
def get_one_page(url):
    # Pick a random User-Agent for each request
    headers = {'User-Agent': random.choice(user_agent)}
    # Log the current time so progress is visible in the console
    print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
    response = requests.get(url, headers=headers)
    print(response.status_code)
    if response.status_code == 200:
        # Some pages contain bytes that are not valid UTF-8, so ignore them
        # return response.content.decode('GBK', 'ignore')  # earlier attempt for GBK mirrors
        return response.content.decode('utf-8', 'ignore')
    return None


# Extract the chapter body
# (on Windows, run `chcp 65001` first to switch the console to UTF-8)
def parse_one_page(html):
    # The chapter text sits inside <div id="content" name="content">...</div>;
    # an earlier mirror used <div id="book_text">...</div> instead
    pattern = re.compile('<div id="content" name="content">(.*?)</div>', re.S)
    items = re.findall(pattern, html)
    for item in items:
        # Strip newlines, HTML line breaks, and padding spaces
        item = item.replace('\n', '')
        item = item.replace('<br />', '')
        item = item.replace('&nbsp;', '')
        item = item.replace(' ', '')
        yield item


# Extract the chapter title
def parse_one_page_head(html):
    pattern = re.compile('<h1>(.*)</h1>', re.S)
    items = re.findall(pattern, html)
    for item in items:
        yield item


# Append a block of text to the output file
def write_to_file(content):
    with open('liuyue(142).txt', 'a', encoding='utf-8') as f:
        f.write(content + '\n\n')


def main(offset):
    # Earlier mirrors of the same novel:
    # url = 'http://www.bequgew.com/99647/' + str(offset) + '.html'
    # url = 'https://www.rzlib.net/b/25/25339/' + str(offset) + '.html'
    url = 'http://www.dingdianxs.la/29/29228/%s.html' % offset
    html = get_one_page(url)
    # First write the chapter title...
    for item in parse_one_page_head(html):
        print(item)
        write_to_file(item)
    # ...then the chapter paragraphs
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)


if __name__ == '__main__':
    # Numeric page IDs from the first chapter to the last
    # (notes from an earlier mirror: 12639965; 12640047 = ch. 83; 12640167 = ch. 202)
    for i in range(27661700, 27661730):
        print(i)
        try:
            main(i)
        except Exception as e:
            print('Error detail:', e.__class__.__name__, e)
            continue
        # Wait 3-7 seconds between requests
        time_wait = random.randint(3, 7)
        time.sleep(time_wait)
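The __main__ loop above walks a hardcoded range of numeric page IDs, which means guessing where the chapters start and end. On sites like this, the chapter URLs can usually be harvested from the novel's index page instead. A minimal sketch of that idea; the index URL and the exact href pattern here are illustrative guesses, not taken from the actual site:

import re
import requests

def get_chapter_urls(index_url):
    """Collect chapter page URLs from a novel's index page.

    Hypothetical sketch: assumes chapter links look like
    <a href="/29/29228/27661700.html">Chapter title</a>.
    """
    html = requests.get(index_url).content.decode('utf-8', 'ignore')
    # Match relative links that end in a numeric page ID
    pattern = re.compile(r'<a href="(/29/29228/\d+\.html)">')
    base = 'http://www.dingdianxs.la'
    return [base + path for path in re.findall(pattern, html)]

# Usage (illustrative index URL):
# for url in get_chapter_urls('http://www.dingdianxs.la/29/29228/'):
#     print(url)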
Be careful not to crawl too frequently, or you may disrupt the server's normal operation, or even get yourself blocked.
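On top of the random 3-7 second pause, it also helps to back off and retry when a request fails, instead of immediately hitting the next URL. A minimal sketch of that idea; the retry count, timeout, and sleep ranges here are arbitrary choices of mine, not from the original script:

import random
import time
import requests

def polite_get(url, headers, retries=3):
    """Fetch a URL with random pauses and simple backoff on failure.

    Sketch only: retry count and sleep ranges are arbitrary.
    """
    for attempt in range(retries):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            if response.status_code == 200:
                return response
        except requests.exceptions.RequestException as e:
            print('Attempt', attempt + 1, 'failed:', e)
        # Back off longer after each failed attempt
        time.sleep(random.randint(3, 7) * (attempt + 1))
    return None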
I ran into many problems while writing this; many of the commented-out statements are earlier approaches that stopped working in some situations, so I switched to other methods.
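For example, several of the commented-out lines in get_one_page are encoding experiments: one mirror served UTF-8 while another served GBK. One way to handle both without editing the script each time is to attempt UTF-8 first and fall back to GBK; a minimal sketch:

def decode_page(raw_bytes):
    """Try UTF-8 first, then GBK; as a last resort, ignore bad bytes.

    Sketch of the fallback idea behind the commented-out decode() variants.
    """
    for encoding in ('utf-8', 'GBK'):
        try:
            return raw_bytes.decode(encoding)
        except UnicodeDecodeError:
            continue
    return raw_bytes.decode('utf-8', 'ignore')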
If anything is unclear, you can find me on the Weiyoun WeChat official account (id: WeiyounMimi) to discuss.