A scraper for counting author publication counts on the Duzhe (读者网) site

Approach 1:

Construct each issue's URL by hand, following the site's URL pattern.

Crawl twice: the first pass deduplicates the scraped author names with a set and initializes the result set; the second pass looks each author up in that set and counts occurrences (a condensed version of this dedup-and-count step is sketched after this list).

Convert the result set into a list of dicts (the data records) and sort it by the times key.

Write the results to a txt file.
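
As a point of comparison (not part of the original post), the dedup-and-count work of the two passes can be collapsed into one step with collections.Counter from the standard library. A minimal sketch, assuming the author strings have already been scraped into a plain list:

    from collections import Counter

    # Hypothetical input: author strings scraped from every issue, duplicates included.
    scraped_authors = ["author_a", "author_b", "author_a", None, "author_a"]

    # Counter deduplicates and counts in a single pass, replacing the set
    # from pass one and the lookup loop from pass two.
    counts = Counter(a for a in scraped_authors if a is not None)

    # most_common() returns (author, times) pairs already sorted descending.
    for author, times in counts.most_common():
        print(author, times)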

from bs4 import BeautifulSoup as Be
import requests as req
import os

BaseUrl = "http://www.52duzhe.com/"

def Do_soup(url):
    """Fetch a page and return its parsed soup, or None on failure."""
    try:
        r = req.get(url, headers={'user-agent': 'Mozilla/5.0'})
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        html = r.text
        soup = Be(html, 'lxml')
        return soup
    except Exception:
        print("Failed to fetch " + url)


def Search_each_Per(tag):
    aut_set = set()
    a_list = []
    for i in range(2010, 2018):
        for j in range(1, 25):
            # Issue numbers are zero-padded to two digits in the URL.
            if j < 10:
                ExtraUrl = str(i) + '_0' + str(j)
            else:
                ExtraUrl = str(i) + '_' + str(j)
            # The site changed its URL scheme partway through 2012.
            if i in [2010, 2011, 2012]:
                if i == 2012 and j >= 14:
                    url = BaseUrl + ExtraUrl + "/index.html"
                else:
                    url = BaseUrl + '1_' + ExtraUrl + ".html"
            else:
                url = BaseUrl + ExtraUrl + "/index.html"
            soup = Do_soup(url)  # reuse the fetch helper
            if soup is None:
                continue
            per_aut_list = soup.find_all('td', class_="author")
            if tag == 1:
                for k in per_aut_list:
                    aut_set.add(k.string)
                print("Authors of issue {}-{} stored".format(i, j))
            else:
                for k in per_aut_list:
                    a_list.append(k.string)
    if tag == 1:
        return list(aut_set)   # deduplicated author list
    else:
        return a_list          # list with duplicates, used for counting

def main():
    author_list0 = Search_each_Per(1)  # tag 1: receive the deduplicated list
    print("Collecting the list with duplicates, please wait...")
    a_list = Search_each_Per(0)        # receive the list with duplicates
    result = {}                        # dict holding the counts
    for i in author_list0:
        result[str(i)] = 0             # initialize this author's counter
        for j in a_list:
            if i == j:
                result[str(i)] += 1
    # Sort the results by publication count, descending.
    print("Sorting the results by publication count, descending...")
    att = []  # container for the sortable records
    for key, value in result.items():
        j = {}
        j["author"] = key
        j["times"] = value
        att.append(j)
    att.sort(key=lambda x: x["times"], reverse=True)
    # Write the results to a text file.
    print("Writing the results to a text file, please wait...")
    path = os.getcwd()
    filename = os.path.join(path, "读者作者结果1.txt")
    # The scraped text may contain characters the local codec cannot
    # encode ("illegal multibyte sequence"), so ignore encoding errors.
    new = open(filename, "w", errors='ignore')
    for i in att:
        author = i["author"]
        times = i["times"]
        print(author)
        print(times)
        if author is None:
            # Guard against None, otherwise:
            # unsupported operand type(s) for +: 'NoneType' and 'str'
            new.write("None" + "\t" + str(times) + "\n")
        else:
            new.write(author + "\t" + str(times) + "\n")
    new.close()
    print("Counting finished")

main()

Approach 2:

Construct each issue's URL by hand, following the site's URL pattern.

Crawl once, keeping a list (the container) of dicts (the data records); a tag flag records whether the author is already in the list: if not, append a new record, and if so, increment its times (a dict-keyed variant of this lookup is sketched after this list).

Return the list directly and sort the dicts by the times key.

Write the results to a txt file.
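
For contrast (again, not the post's code), the "seen it? increment, else append" scan below rescans obj_list for every table cell, which is quadratic; keying a dict on the author name yields the same records with an O(1) membership test. A minimal sketch, where per_aut_list stands in for the scraped author strings:

    # Hypothetical stand-in for the scraped author strings of one issue.
    per_aut_list = ["author_a", "author_b", "author_a"]

    records = {}  # author name -> record dict; membership test is O(1)
    for name in per_aut_list:
        if name in records:
            records[name]["times"] += 1  # seen before: bump the counter
        else:
            records[name] = {"author": name, "times": 1}  # first sighting

    # Same final shape as obj_list below: dicts sortable by "times".
    obj_list = sorted(records.values(), key=lambda x: x["times"], reverse=True)
    print(obj_list)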

from bs4 import BeautifulSoup as Be
import requests as req
import os

BaseUrl = "http://www.52duzhe.com/"

def Do_soup(url):
    """Fetch a page and return its parsed soup, or None on failure."""
    try:
        r = req.get(url, headers={'user-agent': 'Mozilla/5.0'})
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        html = r.text
        soup = Be(html, 'lxml')
        return soup
    except Exception:
        print("Failed to fetch " + url)


def Search_each_Per():
    obj_list = []
    for i in range(2010, 2018):
        for j in range(1, 25):
            # Issue numbers are zero-padded to two digits in the URL.
            if j < 10:
                ExtraUrl = str(i) + '_0' + str(j)
            else:
                ExtraUrl = str(i) + '_' + str(j)
            # The site changed its URL scheme partway through 2012.
            if i in [2010, 2011, 2012]:
                if i == 2012 and j >= 14:
                    url = BaseUrl + ExtraUrl + "/index.html"
                else:
                    url = BaseUrl + '1_' + ExtraUrl + ".html"
            else:
                url = BaseUrl + ExtraUrl + "/index.html"
            soup = Do_soup(url)  # reuse the fetch helper
            if soup is None:
                continue
            per_aut_list = soup.find_all('td', class_="author")
            for it in per_aut_list:  # don't reflexively name this i: it would shadow the year loop
                tag = 0
                for jk in obj_list:
                    if jk["author"] == it.string:
                        jk["times"] += 1   # seen before: bump the counter
                        tag = 1
                        break
                if tag == 0:
                    obj = {"author": it.string, "times": 1}  # first sighting: new record
                    obj_list.append(obj)
    return obj_list

def main():
    print("Building the result list, please wait...")
    obj_list = Search_each_Per()  # receive the result list
    # Sort the results by publication count, descending.
    print("Sorting the results by publication count, descending...")
    obj_list.sort(key=lambda x: x["times"], reverse=True)
    # Write the results to a text file.
    print("Writing the results to a text file, please wait...")
    path = os.getcwd()
    filename = os.path.join(path, "读者作者结果3.txt")
    # Ignore characters the local codec cannot encode ("illegal multibyte sequence").
    new = open(filename, "w", errors='ignore')
    for i in obj_list:
        author = i["author"]
        times = i["times"]
        print(author)
        print(times)
        if author is None:
            # None would crash the string concatenation below.
            new.write("None" + "\t" + str(times) + "\n")
        else:
            new.write(author + "\t" + str(times) + "\n")
    new.close()
    print("Counting finished")

main()

Approach 3:

Use a class to create objects (the data records).

Scrape the per-issue links straight from the homepage into a list, then iterate over that list to collect each issue's authors (a standalone sketch of this link harvesting follows the list).

Crawl once, keeping a list (the container) of objects (the data records); a tag flag records whether an object for this author already exists: if not, instantiate one and append it, and if so, increment the object's times.

Return the list directly and sort the objects by their times attribute.

Write the results to a txt file.
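
The new ingredient in this approach is harvesting the issue links instead of constructing them. A minimal standalone sketch of that step, using a hypothetical homepage fragment shaped like the .booklist markup the selector expects:

    from bs4 import BeautifulSoup as Be

    # Hypothetical fragment of the homepage; the real page supplies the
    # actual .booklist table with one link per issue.
    html = '''
    <table class="booklist">
      <tr><td><a href="2017_01/index.html">2017-01</a></td></tr>
      <tr><td><a href="2017_02/index.html">2017-02</a></td></tr>
    </table>
    '''

    BaseUrl = "http://www.52duzhe.com/"
    soup = Be(html, 'lxml')

    # select(".booklist a") returns the <a> tags; each href is relative,
    # so it is joined onto BaseUrl to form a crawlable issue URL.
    url_list = [BaseUrl + a["href"] for a in soup.select(".booklist a")]
    print(url_list)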

from bs4 import BeautifulSoup as Be
import requests as req
import os

class Author(object):
    """One data record: an author name and how often it appeared."""
    def __init__(self, name):
        self.name = name
        self.times = 1

BaseUrl = "http://www.52duzhe.com/"

def Do_soup(url):
    """Fetch a page and return its parsed soup, or None on failure."""
    try:
        r = req.get(url)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        html = r.text
        soup = Be(html, 'lxml')
        return soup
    except Exception:
        print("Failed to fetch " + url)


def Search_each_Per():
    url_list = []
    obj_list = []
    soup = Do_soup(BaseUrl)
    link = soup.select(".booklist a")  # harvest the issue links from the homepage
    for item in link:
        url = BaseUrl + item["href"]   # hrefs are relative, so join onto BaseUrl
        url_list.append(url)
    for url in url_list:
        soup = Do_soup(url)  # reuse the fetch helper
        if soup is None:
            continue
        per_aut_list = soup.find_all('td', class_="author")
        for i in per_aut_list:
            tag = 0
            for j in obj_list:
                if j.name == i.string:
                    j.times += 1   # existing object: bump its counter
                    tag = 1
                    break
            if tag == 0:
                obj = Author(i.string)  # first sighting: instantiate a record
                obj_list.append(obj)
    return obj_list

def main():
    print("Building the object list, please wait...")
    obj_list = Search_each_Per()
    # Sort the results by publication count, descending.
    print("Sorting the results by publication count, descending...")
    obj_list.sort(key=lambda obj: obj.times, reverse=True)
    # Write the results to a text file.
    print("Writing the results to a text file, please wait...")
    path = os.getcwd()
    filename = os.path.join(path, "读者作者结果2.txt")
    # Ignore characters the local codec cannot encode ("illegal multibyte sequence").
    new = open(filename, "w", errors="ignore")
    for i in obj_list:
        author = i.name
        times = i.times
        print(author)
        print(times)
        if author is None:
            new.write("None" + "\t" + str(times) + "\n")
        else:
            new.write(author + "\t" + str(times) + "\n")
    new.close()
    print("Counting finished")

main()
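
One small addition worth considering (not in the original): giving Author a __repr__ makes obj_list readable when printed during debugging, instead of showing bare object addresses. A minimal sketch:

    class Author(object):
        def __init__(self, name):
            self.name = name
            self.times = 1

        def __repr__(self):
            # Used whenever the object, or a list of them, is printed.
            return "Author(name={!r}, times={})".format(self.name, self.times)

    authors = [Author("author_a"), Author("author_b")]
    authors[0].times += 1
    print(authors)  # [Author(name='author_a', times=2), Author(name='author_b', times=1)]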

A scraper for counting author publication counts on the Duzhe (读者网) site
https://blog.wangxk.cc/2019/04/05/读者网作者发布次数统计爬虫/
Author: Mike · Published April 5, 2019