A Slacker's Python Learning Diary: Scraping Novels
2020-12-01
Part 1 of this slacker's Python journey: I put together a simple novel parser and downloader in a bit over two hundred lines of code that can fetch any novel on this site. I can't help marveling at how powerful Python's packages are; they almost feel purpose-built for scraping websites. The environment is VS Code + Anaconda with Python 3.6. Without further ado, here's the code!
import requests
import os
import json
import re
from bs4 import BeautifulSoup
from urllib.parse import urlparse

txtPath = "txt"  # output folder for downloaded novels and the run log
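# Browser-like request headers: GET requests send a Referer and User-Agent;
# POST requests additionally send the Host and a form-encoded Content-Type.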
getHeaders = {
    "Referer": "http://www.47uc.com/",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
}

postHeaders = {
    "Host": "www.47uc.com",
    "Content-Type": "application/x-www-form-urlencoded",
    "Referer": "http://www.47uc.com/",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
}
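# Fetch a page and return its HTML decoded as UTF-8; log the error and return an empty string on failure.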
def findHtml(url):
    try:
        # getHeaders must be passed as the headers= keyword (the second positional argument of requests.get is params)
        return requests.get(url, headers=getHeaders, timeout=10).content.decode('utf-8', 'ignore')
    except Exception as e:
        log("Exception while fetching page " + url + ": " + str(e))
        return ""


# Send a form-encoded POST to one of the site's APIs and return the raw response text.
def findApi(api, postData, postHeaders):
    try:
        return requests.post(api, data=postData, headers=postHeaders).text
    except Exception as e:
        log("Exception while calling API " + api + ": " + str(e) + " ; postData: " + str(postData))
        return ""
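# Download one chapter. If the static page still shows a "加载" (loading) placeholder,
# fall back to the site's ajaxchapter API; retry up to four times before giving up.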
def downloadChapter(url, retry=0):
    html = findHtml(url)
    if html == "":
        return ""
    soup = BeautifulSoup(html, 'html.parser')
    content = soup.find("div", id="content")
    if content is None:
        return ""
    content_text = content.text.replace(" ", "\n ").replace("o", "。")
    # Ad removal (not implemented yet)
    # if retry != 0:
    #     print(url + " ----- retry #" + str(retry))
    if ("加载" in content_text) and retry < 4:
        # Pull the hash out of the page, and the novel id / chapter id out of the URL
        h = re.findall(r"var hash = \"([0-9a-z]+)\";", html)
        r = re.findall(r"http://www\.47uc\.com/[0-9]+_([0-9]+)/([0-9]+)", url)
        if len(h) < 1 or len(r) < 1:
            return content_text
        # Ask the chapter API for the real content
        apiResponse = findApi(
            "http://www.47uc.com/home/index/ajaxchapter",
            {"id": r[0][0], "eKey": h[0], "cid": r[0][1], "basecid": 1}, postHeaders)
        if apiResponse == "":
            return ""
        try:
            j = json.loads(apiResponse)
            if ("info" in j.keys()) and ("content" in j["info"].keys()):
                return "\n\n" + j["info"]["content"].replace("<br>", "\n").replace("<br/>", "\n").replace("o", "。").replace("\xa0", " ")
            else:
                return downloadChapter(url, retry + 1)
        except Exception as e:
            log("Exception while decoding JSON: " + str(e) + " ; JSON: " + str(apiResponse))
            return downloadChapter(url, retry + 1)
    return content_text
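# Append a chapter to <txtPath>/<title>.txt, creating the output folder on first use.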
def saveChapter(title, content):
    # Create the output folder if it does not exist
    if not os.path.exists(txtPath):
        os.makedirs(txtPath)
    # Append the chapter to the novel's text file
    append(title, content)


def append(title, content):
    with open(os.path.join(txtPath, title + ".txt"), "ab+") as file:
        file.write(content.encode("utf-8"))


def log(content):
    # The log may be written before any chapter is saved, so make sure the folder exists
    if not os.path.exists(txtPath):
        os.makedirs(txtPath)
    with open(os.path.join(txtPath, "run.log"), "ab+") as file:
        file.write((content + "\r\n").encode("utf-8"))
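# Lookup tables for converting Chinese chapter numerals (e.g. "一百二十三") to integers.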
# Digit characters
number_map = {
    "零": 0,
    "一": 1,
    "二": 2,
    "三": 3,
    "四": 4,
    "五": 5,
    "六": 6,
    "七": 7,
    "八": 8,
    "九": 9,
}

# Unit multipliers
unit_map = {
    "十": 10,
    "百": 100,
    "千": 1000,
    "万": 10000,
    "亿": 100000000,
}
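# Convert a string of Chinese numerals to an int; returns -1 if an unrecognized character is found.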
def convertToNumber(inputs):
    output = 0
    unit = 1
    num = 0
    for index, cn_num in enumerate(inputs):
        if cn_num in number_map:
            # Digit character
            num = number_map[cn_num]
            # A trailing digit is the ones place
            if index == len(inputs) - 1:
                output = output + num
        elif cn_num in unit_map:
            # Unit character
            unit = unit_map[cn_num]
            if (unit == 10) and (num == 0):
                # A bare "十" means ten, e.g. "第十章"
                num = 1
            # Accumulate digit * unit
            output = output + num * unit
            num = 0
        else:
            output = -1
            break
    return output
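# Extract the chapter number from a heading such as "第十章 生意人" or "第10章";
# "序章" and "楔子" count as chapter 0, and -1 means the heading could not be parsed.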
def findChapterNumber(title):
    # Chinese-numeral headings, e.g. "第一百二十三章"
    result = re.findall(r"第([零一二三四五六七八九十百千万亿]+)章", title)
    if len(result) > 0:
        return convertToNumber(result[0])
    # Arabic-numeral headings, e.g. "第123章"
    result = re.findall(r"第([0-9]+)章", title)
    if len(result) > 0:
        return int(result[0])
    if "序章" in title:
        return 0
    if "楔子" in title:
        return 0
    return -1


def sortSecond(item):
    return item[0]
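# Download every chapter in chapterUrlList in order and append them to a single text file
# named after the novel; failures are collected in missingChapterList.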
def downloadTxt(url, name, chapterUrlList, missingChapterList, startNum=0, endNum=0):
    if startNum < 0:
        print("❌ Invalid start chapter, aborting")
        return
    print("💯 Table of contents fetched, analyzing....")
    if len(chapterUrlList) < 1:
        print("❌ This site has no table of contents for this novel")
        return
    print("⚡ Parsed " + str(len(chapterUrlList)) + " chapters, " +
          str(len(missingChapterList)) + " failed to parse; starting download...")
    # Sort chapters by chapter number
    chapterUrlList.sort(key=sortSecond)
    # Download and convert each chapter
    count = len(chapterUrlList)
    index = 0
    for _chapter in chapterUrlList:
        index += 1
        # Strip the chapter heading out of the downloaded text
        heading = _chapter[2].replace(" 上", "(上)").replace(" 下", "(下)").replace("章 ", "章")
        content = downloadChapter(_chapter[1]).replace(heading, "")
        if content == "":
            missingChapterList.append(_chapter[2] + " - fetch failed")
            print("{:.2%} ".format(index / count) +
                  "🚀 Fetching [" + str(_chapter[0]) + " - " + _chapter[2] + " - " + _chapter[1] + "] ❌ no response from server")
            continue
        saveChapter(name, "\r\n\r\n" + _chapter[2] + content)
        print("{:.2%} ".format(index / count) +
              "🚀 Fetching [" + str(_chapter[0]) + " - " + _chapter[2] + " - " + _chapter[1] + "] ✔")
    # Report chapters that failed
    if len(missingChapterList) > 0:
        print("Some chapters failed:\r\n" + "\r\n".join(missingChapterList))
    if (startNum == 0) and (endNum == 0):
        print("✔ Finished downloading chapters 0-" + str(count))
    else:
        print("✔ Finished downloading chapters " + str(startNum) + "-" + str(endNum))
def findChapter(url):
    _u = urlparse(url)
    domain = _u.scheme + "://" + _u.netloc
    html = findHtml(url)
    # Parse the table of contents with BeautifulSoup
    soup = BeautifulSoup(html, 'html.parser')
    chapterUrlList = []
    missingChapterList = []
    # select() takes no id= keyword; assume the chapter listing lives in the <div id="list"> element
    for _a in soup.select("div#list > dl > dd > a"):
        if _a is not None:
            _chapterNumber = findChapterNumber(_a.text)
            if _chapterNumber == -1:
                missingChapterList.append(_a.text + " - parse failed")
                continue
            _chapter = (_chapterNumber, domain + _a.attrs["href"], _a.text)
            if _chapter not in chapterUrlList:
                chapterUrlList.append(_chapter)
    return (chapterUrlList, missingChapterList)
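# Interactive entry point: search the site by title, list the matches with their chapter counts,
# and download the selected novel.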
def searchTxt():
    exit = False
    while not exit:
        title = input("🔸 Enter the name of the novel to download: ")
        searchResponse = findApi(
            "http://www.47uc.com/home/search", {"action": "search", "q": title}, postHeaders)
        if searchResponse == "":
            _input = input("❌ No results; try another keyword, or enter e to quit: ")
            if _input == "e":
                exit = True
            continue
        try:
            soup = BeautifulSoup(searchResponse, 'html.parser')
            index = 0
            searchList = []
            for _item in soup.select("#hotcontent > div > ul > li > a"):
                if _item is not None:
                    index += 1
                    _url = "http://www.47uc.com" + _item.attrs["href"]
                    _c = findChapter(_url)
                    searchList.append((_url, _item.attrs["title"], _c))
                    print("🍀 " + str(index) + ". " + _item.attrs["title"] + " - " +
                          str(len(_c[0])) + " chapters / " + str(len(_c[1])) + " bad chapters")
            count = len(searchList)
            if count < 1:
                _input = input("❌ No results; try another keyword, or enter e to quit: ")
                if _input == "e":
                    exit = True
                continue
            x = input("🔸 Enter the number of the novel to download => ")
            while (not x.isdigit()) or (int(x) > count) or (int(x) < 1):
                x = input("❌ Please enter a valid number => ")
            _x = int(x)
            downloadTxt(searchList[_x - 1][0], "《" + searchList[_x - 1][1] + "》",
                        searchList[_x - 1][2][0], searchList[_x - 1][2][1])
            exit = True
        except Exception as e:
            log("Exception while searching: " + str(e) + " ; response: " + str(searchResponse))
            _input = input("❌ Search failed; try another keyword, or enter e to quit: ")
            if _input == "e":
                exit = True
            continue


searchTxt()
This novel site is a bit unusual: its pages are not purely static. The first time someone opens a chapter, the page calls a chapter API to fetch that chapter's content dynamically, and the server then renders a static copy of the page in the background. Evidently this site is itself scraping its novels from other websites. The attachment is a sample of a downloaded novel.
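For reference, here is a minimal sketch of that fallback request in isolation, reusing the ajaxchapter endpoint and form fields from downloadChapter() above; the novel id, chapter id, and hash below are placeholder values, not real ones.

import requests

# Minimal sketch of the dynamic-chapter fallback described above.
# The endpoint and form fields mirror downloadChapter(); the ids and hash are placeholders.
resp = requests.post(
    "http://www.47uc.com/home/index/ajaxchapter",
    data={"id": "1234", "eKey": "0123456789abcdef", "cid": "5678", "basecid": 1},
    headers={
        "Content-Type": "application/x-www-form-urlencoded",
        "Referer": "http://www.47uc.com/",
        "User-Agent": "Mozilla/5.0",
    },
    timeout=10,
)
info = resp.json().get("info", {})
# The API returns the chapter body with <br> tags, which the script converts to newlines.
print(info.get("content", "").replace("<br/>", "\n")[:200])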