马上嗨小说网

笔趣阁小说网 「练习」爬取 新笔趣阁 小说

发布时间:2019-11-05 14:00:22 来源:CM無可厚非 关键词:笔趣阁小说网
笔趣阁小说网
原文标题:「练习」爬取 新笔趣阁 小说
原文发布时间:2017-04-15 13:36:36
原文作者:CM無可厚非。
如果您喜欢本文,请关注头条号【CM無可厚非】阅读更多相关文章。
如果您是本文作者,不希望我们转载此文,请联系我们删除。
笔趣阁小说网

#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Scraper for the "New Biquge" (xxbiquge) novel site.
' xxbiquge'

__author__ = 'Wang930607'

import pprint,csv,urllib,urllib2,re,datetime
import traceback,time,json

#=========================
import socket

# Unify text encoding (Python 2 only): reload(sys) makes
# sys.setdefaultencoding() callable again, and the call below makes UTF-8 the
# implicit codec for str <-> unicode conversions in the whole process.
import sys
reload(sys)
sys.setdefaultencoding('utf8')

# File-system encoding reported by the interpreter.
sysCharType = sys.getfilesystemencoding()

#=========================
# HTML parsing (bs4)
from bs4 import BeautifulSoup

#=========================
# Disabled helper for permutations/combinations of a collection:
# from itertools import product

#=========================
# Read .xls files
import xlrd
# Write .xls files
import xlwt

#=========================
# NOTE: this rebinds the name `datetime` from the module imported above to the
# `datetime.datetime` class.
from datetime import date,datetime

#=========================
# "Press any key to continue" support, per the original author's note.
import os

#=========================
# Persistent data storage: use cPickle when it can be imported, otherwise the
# plain pickle module.
try:
    import cPickle as pickle
except :
    import pickle

# 自定义程序 start

#=========================

#导出到文本文件

def save_file(file, data, hrefs):
    """Export the novel to ``<file>.txt``: the intro followed by each chapter.

    Args:
        file: Output path without extension; ``'.txt'`` is appended.
        data: Mapping that holds the intro text under ``'intro'`` and, for
            every chapter link, a sequence ``(ok, title, text)``.
        hrefs: Chapter links in the order the chapters are written.

    Chapters whose entry is missing, ``None`` or has a falsy first element
    (download failed / page was not a chapter) are skipped instead of aborting
    the export. A missing intro is written as an empty string.

    Raises:
        IOError / OSError: if the file cannot be opened or written.
    """
    separator = '\n' + '=================================================================================' + '\n'
    file_name = file + '.txt'

    # ``with`` closes the file on success as well as on error; the previous
    # version closed it only inside its ``except`` branch.
    with open(file_name, 'w') as fp:
        fp.write(data.get('intro') or '')
        for href in hrefs:
            entry = data.get(href)
            if not entry or not entry[0]:
                continue
            fp.write(separator)
            # Chapter title, then the chapter body on the next line.
            fp.write(entry[1])
            fp.write('\n')
            fp.write(entry[2])

#素质

def quality(delay=0.5, timeout=15):
    """Throttle the crawler and bound how long network calls may block.

    Args:
        delay: Seconds to sleep before returning (default 0.5).
        timeout: Value passed to ``socket.setdefaulttimeout`` in seconds
            (default 15), i.e. the default timeout for sockets created
            afterwards.
    """
    # Pause between requests so the site is not hammered.
    time.sleep(delay)
    # Cap the time a download may take.
    socket.setdefaulttimeout(timeout)

#存储数据

def dump_pkl(file, data):
    """Pickle ``data`` into ``<file>.pkl``, replacing any existing file.

    Args:
        file: Target path without extension; ``'.pkl'`` is appended.
        data: Any picklable object.

    Raises:
        IOError / OSError: if the file cannot be opened or written.
        pickle.PicklingError: if ``data`` cannot be pickled.
    """
    file_name = file + '.pkl'
    # ``with`` guarantees the handle is closed even when pickling fails.
    with open(file_name, 'wb') as output:
        pickle.dump(data, output)

#读取数据

def load_pkl(file):
    """Load and return the object pickled in ``<file>.pkl``.

    Args:
        file: Source path without extension; ``'.pkl'`` is appended.

    Returns:
        The unpickled object.

    Raises:
        IOError / OSError: if the file does not exist or cannot be read.

    NOTE: unpickling can execute arbitrary code, so only load files this
    program wrote itself.
    """
    file_name = file + '.pkl'
    # ``with`` guarantees the handle is closed even when unpickling fails.
    with open(file_name, 'rb') as pkl_file:
        return pickle.load(pkl_file)

#修剪内容

def __replace__(str):

content = str.replace("readx(); ()", '')

# 去除空格,换行符,制表符

content = content.strip()

content = content.replace("ff37;w039;30fb;;off4d;ff55;247b;50f;8bf4;66f4;65b0;6700;5feb;50f;8bf4;9605;8bfb;7f51;", '')

content = content.replace("★★★可将您看到的最新章节或 ,方便下次接着看★★★ ---------", '')

content = content.replace("==<!-br/->ww.uos.<!-->由网友上传==", '')

content = content.replace("跟-我-读wen文-xue学-lou楼 记住哦!", '')

content = content.replace("~~www.shushuw.n-更新首发~~", '')

content = content.replace("自从学会了投票,妈妈再也不用担心我闹书荒了", '')

content = content.replace("(未完待续。精彩小说【网】记住我们的网址:", '')

content = content.replace("无节操裸奔求收藏,求推荐,求点击,求抚摸", '')

content = content.replace("新书上传,求收藏,求推荐!卖身求乳啊!", '')

content = content.replace("(/无,弹.窗,小,说.网)(..)", '')

content = content.replace("精彩小说【网】记住我们的网址:", '')

content = content.replace("【w.w.m 1我|】", '')

content = content.replace("wenhangshuyuan", '')

content = content.replace("*************", '')

content = content.replace("(搜读窝.souduwo)", '')

content = content.replace("无弹窗小说网www.RT", '')

content = content.replace("***********", '')

content = content.replace("手机用户同步阅读请访问", '')

content = content.replace("U看书(ww..om)", '')

content = content.replace("<!--over-->", '')

content = content.replace("(第三次发布此章节)", '')

content = content.replace("<!--go-->", '')

content = content.replace("*********", '')

content = content.replace("<!-br/->.", '')

content = content.replace("早起求个票~~~~", '')

content = content.replace("看书要投票啊~~ ", '')

content = content.replace("readx();", '')

content = content.replace("readx();", '')

content = content.replace(" 早起求票", '')

content = content.replace("*******", '')

content = content.replace("ww.x.om", '')

content = content.replace("ww.x.om", '')

content = content.replace(" ()", '')

content = content.replace("早起求几张票", '')

content = content.replace("(未完待续)", '')

content = content.replace("(未完待续。", '')

content = content.replace("*****", '')

content = content.replace("票~~~~", '')

content = content.replace("(网网)w", '')

content = content.replace("wxs.o", '')

content = content.replace("wxs.o", '')

content = content.replace("…………", '')

content = content.replace(" ", '')

content = content.replace("  ", '')

content = content.replace("****", '')

content = content.replace("<!>.", '')

content = content.replace("未完待续", '')

content = content.replace("求推荐票", '')

content = content.replace("求推荐票", '')

content = content.replace("c!!!", '')

content = content.replace("………", '')

content = content.replace("***", '')

content = content.replace("~~~", '')

content = content.replace("()w", '')

content = content.replace("<br />", '')

content = content.replace("……", '')

content = content.replace("**", '')

content = content.replace(";", '')

#========================= u'

return ' '+content

# 自定义程序 end

#==================================================

class Xxbiquge(object):
    """Reads one novel from www.xxbiquge.com: index-page data and chapters.

    Instance attributes (set in ``__init__``):
        headers: HTTP headers sent with every chapter request.
        soup: Parsed index page of the novel; the ``get_title``, ``get_cover``,
            ``get_intro`` and ``get_urls`` methods read from it.
    """

    # Site root that the relative chapter links are appended to.
    BASE_URL = 'http://www.xxbiquge.com'

    # Relative chapter links look like "/0_347/1007300.html".
    CHAPTER_HREF = re.compile(r'/\d+_\d+/\d+\.html')

    def __init__(self, headers=None, soup=None):
        """Bind the request headers and the parsed index page.

        Args:
            headers: Request headers. When omitted, the module-level variable
                ``headers`` is used, as the class always did.
            soup: Parsed index page. When omitted, the module-level variable
                ``soup`` is used, as the class always did.

        Raises:
            NameError: if an argument is omitted and the module-level
                variable of the same name does not exist.
        """
        self.headers = self._module_value('headers') if headers is None else headers
        self.soup = self._module_value('soup') if soup is None else soup

    @staticmethod
    def _module_value(name):
        """Return the module-level variable ``name`` or raise NameError."""
        try:
            return globals()[name]
        except KeyError:
            raise NameError("global name '%s' is not defined" % name)

    #=========================
    def get_title(self):
        """Return the novel title taken from the index page's ``<h1>``."""
        # e.g. <h1>武炼巅峰</h1>
        # The str()/decode() round trip relies on the UTF-8 default encoding
        # configured at the top of the file (Python 2).
        return str(self.soup.h1.string).decode("utf8")

    #=========================
    def get_cover(self, name='Cover'):
        """Download the cover image to ``<name>.jpg`` (best effort).

        The image address is read from the ``og:image`` meta tag. Any failure
        is reported on stdout and otherwise ignored, so a missing cover never
        stops the crawl.
        """
        # e.g. <meta property="og:image" content="http://www.xxbiquge.com/cover/0/347/347s.jpg"/>
        try:
            img_url = self.soup.find('meta', property="og:image").get('content')
            urllib.urlretrieve(img_url, name + '.jpg')
        except Exception:
            print('下载封面失败')
            print("-----------")

    #=========================
    def get_intro(self):
        """Return the intro: the text of ``div#info``, a separator line, then
        the ``og:description`` meta content.

        Raises:
            AttributeError: if either element is absent from the page.
        """
        info_text = self.soup.find('div', id="info").get_text()
        description = self.soup.find('meta', property="og:description").get('content')
        return info_text+'\n'+'================================================================================='+'\n'+' '+description

    #=========================
    def get_urls(self):
        """Return the relative chapter links found on the index page, in
        page order.

        Returns:
            A list of strings such as ``'/0_347/1007300.html'``; empty when
            the page has no chapter links.
        """
        hrefs = []
        for link in self.soup.find_all('a'):
            href = link.get('href')
            # <a> tags without an href yield None, which re.match() rejects
            # with a TypeError, so filter them out first.
            if href and self.CHAPTER_HREF.match(href):
                hrefs.append(href)

        # Per the original author's note, the first match is a redundant link
        # that is listed again further down the page.
        if hrefs:
            del hrefs[0]
        return hrefs

    #=========================
    def get_content(self, url='/0_347/1007300.html', plan='0/0'):
        """Download one chapter page and extract its title and cleaned text.

        Args:
            url: Chapter link relative to the site root.
            plan: Progress label (e.g. ``'3/120'``) shown in the log line.

        Returns:
            ``(True, title, text)`` when the page holds a chapter;
            ``(False, message)`` when the page was fetched but its text is too
            short to be a chapter;
            ``(None, message)`` when fetching or parsing failed. The flag stays
            falsy, so callers that only test ``result[0]`` skip the entry, while
            callers that care can tell "retry later" from "not a chapter".
        """
        url = self.BASE_URL + url
        print(u'开始下载 '+str(plan)+': '+url)
        print("-----------")

        req = urllib2.Request(url, headers=self.headers)
        try:
            page = urllib2.urlopen(req).read()
            # NOTE(review): no parser is named, so bs4 picks one itself;
            # consider pinning a parser for reproducible results.
            page = BeautifulSoup(page)
            title = str(page.h1.string).decode("utf8")
            content = page.find('div', id="content").get_text()
            content = __replace__(content)
        except Exception:
            print(u'连接失败')
            return None, url+' 连接失败'

        # Real chapters are long; anything short is treated as a non-chapter page.
        if len(content) > 1000:
            return True, title, content

        message = url+" 非小说正文 "+str(len(content))
        print(message)
        print("-----------")
        return False, message

#=========================

if __name__ == '__main__':
    # Script entry point: fetch the index page of one novel, download every
    # chapter that is not cached yet, and export the whole book to a text file.
    #
    # `headers` and `soup` deliberately stay module-level names: Xxbiquge()
    # reads them when it is created without arguments.

    #=========================
    # User setting: the book id as it appears in the index address,
    # e.g. 'http://www.xxbiquge.com/0_347/'. Change `No` to pick another book.
    #   '0_347'     Wu Lian Dian Feng
    #   '0_681'     Da Sheng Zhuan
    #   '74_74821'  Sheng Xu
    #   '75_75151'  Tian Dao Tu Shu Guan
    No = '2_2306'   # Shen Mu
    #=========================

    url_home = 'http://www.xxbiquge.com/'+No+'/'

    # Request headers
    headers = {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}

    # Fetch and parse the index page.
    try:
        req = urllib2.Request(url_home, headers=headers)
        soup = urllib2.urlopen(req).read()
        # Strip known junk from the raw page before parsing it.
        soup = __replace__(soup)
        soup = BeautifulSoup(soup)
    except Exception:
        print(u'连接失败')
        # Nothing below can work without the index page; carrying on would
        # only fail later with a NameError on `soup`.
        sys.exit(1)

    site = Xxbiquge()
    title = site.get_title()

    #=========================
    # Work inside a folder named after the book, creating it on first use.
    if not os.path.isdir(title):
        os.mkdir(title)
    os.chdir(title)

    #=========================
    # Chapter cache: link -> result of get_content(), plus 'title' and 'intro'.
    dict_ = {}
    try:
        dict_ = load_pkl(title)
    except Exception:
        # First run or unreadable cache: start from an empty one.
        print(u'文件不存在,载入失败!')
        print("-----------")

    # Set these on every run, not only when a cache was loaded; otherwise the
    # first run exports a book without its intro.
    dict_['title'] = title
    dict_['intro'] = site.get_intro()

    # Download the cover (best effort).
    site.get_cover(title)

    #=========================
    # Collect the chapter links, then pause before the first chapter request.
    hrefs = site.get_urls()
    quality()

    #=========================
    total = len(hrefs)
    for position, href in enumerate(hrefs, 1):
        # Progress label, e.g. '3/120'.
        plan = str(position)+'/'+str(total)

        cached = dict_.get(href)
        # An entry that is None, or whose flag is None, records a failed
        # download; those are fetched again instead of being reported as done.
        if cached is not None and cached[0] is not None:
            print('http://www.xxbiquge.com'+href+' 已下载!')
            print("-----------")
            continue

        list_ = site.get_content(href, plan)
        if list_ is not None:
            dict_[href] = list_
            # Persist after every chapter so an interrupted run can resume.
            dump_pkl(title, dict_)

        # Pause between requests.
        quality()

    #=========================
    # Export everything to the text file.
    save_file(title, dict_, hrefs)


正文完,原文标题:「练习」爬取 新笔趣阁 小说
原文发布时间:2017-04-15 13:36:36
原文作者:CM無可厚非。

笔趣阁小说网 笔趣阁小说网