
Commit

🐛 fixed: Moments Beta crawl time issue
CCKNBC committed Dec 5, 2021
1 parent d1d86a9 commit cb0166c
Showing 3 changed files with 43 additions and 35 deletions.
11 changes: 10 additions & 1 deletion hexo_circle_of_friends/pipelines.py
@@ -12,6 +12,9 @@ class HexoCircleOfFriendsPipeline:
def __init__(self):
self.userdata = []
self.nonerror_data = set() # friends whose posts can be fetched from their friend link
self.total_post_num = 0
self.total_friend_num = 0
self.err_friend_num = 0
def open_spider(self, spider):
if settings.DEBUG:
leancloud.init(settings.LC_APPID, settings.LC_APPKEY)
@@ -70,6 +73,10 @@ def close_spider(self,spider):

self.outdate_clean(settings.OUTDATE_CLEAN)
print("----------------------")
print("友链总数 : %d" %self.total_friend_num)
print("失联友链数 : %d" % self.err_friend_num)
print("共 %d 篇文章"%self.total_post_num)

print("done!")

def query_friendspoor(self):
Expand Down Expand Up @@ -120,9 +127,11 @@ def friendlist_push(self):
# print("未失联的用户")
friendlist.set('error', "false")
else:
self.err_friend_num+=1
print("请求失败,请检查链接: %s"%item[1])
friendlist.set('error', "true")
friendlist.save()
self.total_friend_num+=1

def friendpoor_push(self,item):
friendpoor = self.Friendspoor()
@@ -137,7 +146,7 @@ def friendpoor_push(self,item):
print("----------------------")
print(item["name"])
print("《{}》\n文章发布时间:{}\t\t采取的爬虫规则为:{}".format(item["title"], item["time"], item["rule"]))

self.total_post_num +=1

class DuplicatesPipeline:
def __init__(self):
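Note: a minimal, runnable sketch of the counter pattern added above. The class and method bodies are trimmed placeholders, not the real pipeline; only the counter names and the increment points follow the diff.

# Sketch only: how the three new counters are intended to behave.
class CounterSketch:
    def __init__(self):
        self.total_post_num = 0    # incremented once per pushed post
        self.total_friend_num = 0  # incremented once per processed friend
        self.err_friend_num = 0    # incremented when a friend's feed fails

    def friendlist_push(self, reachable):
        if not reachable:
            self.err_friend_num += 1
        self.total_friend_num += 1

    def friendpoor_push(self):
        self.total_post_num += 1

    def close_spider(self):
        print("total friends : %d" % self.total_friend_num)
        print("lost friends  : %d" % self.err_friend_num)
        print("total posts   : %d" % self.total_post_num)

if __name__ == "__main__":
    p = CounterSketch()
    p.friendlist_push(reachable=True)
    p.friendpoor_push()
    p.close_spider()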
3 changes: 2 additions & 1 deletion hexo_circle_of_friends/settings.py
@@ -92,6 +92,7 @@
}


# get links from gitee
# get links from gitee
GITEE_FRIENDS_LINKS={
"enable": True, # True 开启gitee issue兼容
@@ -113,7 +114,7 @@

# retry allowed
# whether to retry when crawling a url fails
RETRY_ENABLED=True
RETRY_ENABLED=False

# block site list
# add sites to block
64 changes: 31 additions & 33 deletions hexo_circle_of_friends/spiders/hexo_circle_of_friends.py
@@ -5,6 +5,7 @@
import time
import re
import scrapy
import queue
from scrapy.http.request import Request
from hexo_circle_of_friends import settings
from bs4 import BeautifulSoup
@@ -19,7 +20,8 @@ class FriendpageLinkSpider(scrapy.Spider):
start_urls = []

def __init__(self, name=None, **kwargs):
self.friend_poor = []
self.friend_poor = queue.Queue()
self.friend_list = queue.Queue()

super().__init__(name, **kwargs)

@@ -31,7 +33,7 @@ def start_requests(self):
# print('friend name %r' % user_info[0])
# print('avatar link %r' % user_info[2])
# print('homepage link %r' % user_info[1])
self.friend_poor.append(user_info)
self.friend_poor.put(user_info)
if settings.GITEE_FRIENDS_LINKS['enable'] and settings.GITEE_FRIENDS_LINKS['type'] == 'normal':
for number in range(1, 100):
domain = 'https://gitee.com'
@@ -63,44 +65,37 @@ def friend_poor_parse(self, response):
# print("friend_poor_parse---------->" + response.url)

if "gitee" in response.meta.keys():
soup = BeautifulSoup(response.text, 'lxml')
main_content = soup.find_all(id='git-issues')
linklist = main_content[0].find_all('a', {'class': 'title'})

for item in linklist:
issueslink = response.meta["gitee"]["domain"] + item['href']
yield Request(issueslink, self.friend_poor_parse, meta={"gitee-issues": None},dont_filter=True)
main_content = response.css("#git-issues a.title::attr(href)").extract()
if main_content:
for item in main_content:
issueslink = response.meta["gitee"]["domain"] + item
yield Request(issueslink, self.friend_poor_parse, meta={"gitee-issues": None},dont_filter=True)
if "gitee-issues" in response.meta.keys():
issues_soup = BeautifulSoup(response.text, 'html.parser')
try:
issues_linklist = issues_soup.find_all('code')
source = issues_linklist[0].text
content = ''.join(response.css("code *::text").extract())
user_info = []
info_list = ['name', 'link', 'avatar']
reg(info_list, user_info, source)
# print(user_info)
reg(info_list, user_info, content)
if user_info[1] != '你的链接':
self.friend_poor.append(user_info)
self.friend_poor.put(user_info)
except:
pass

if "github" in response.meta.keys():
soup = BeautifulSoup(response.text, 'lxml')
main_content = soup.find_all('div', {'aria-label': 'Issues'})
linklist = main_content[0].find_all('a', {'class': 'Link--primary'})
for item in linklist:
issueslink = response.meta["github"]["domain"] + item['href']
yield Request(issueslink, self.friend_poor_parse, meta={"github-issues": None},dont_filter=True)
main_content = response.css("div[aria-label=Issues] a.Link--primary::attr(href)").extract()
if main_content:
for item in main_content:
issueslink = response.meta["github"]["domain"] + item
yield Request(issueslink, self.friend_poor_parse, meta={"github-issues": None},dont_filter=True)
if "github-issues" in response.meta.keys():
issues_soup = BeautifulSoup(response.text, 'html.parser')
try:
issues_linklist = issues_soup.find_all('pre')
source = issues_linklist[0].text
user_info = []
info_list = ['name', 'link', 'avatar']
reg(info_list, user_info, source)
if user_info[1] != '你的链接':
self.friend_poor.append(user_info)
content = ''.join(response.css("pre *::text").extract())
if content!='':
user_info = []
info_list = ['name', 'link', 'avatar']
reg(info_list, user_info, content)
if user_info[1] != '你的链接':
self.friend_poor.put(user_info)
except:
pass

@@ -120,7 +115,7 @@ def friend_poor_parse(self, response):
user_info.append(name[i])
user_info.append(link[i])
user_info.append(avatar[i])
self.friend_poor.append(user_info)
self.friend_poor.put(user_info)
user_info = []
# print("""------------------------\n
# name:%s
Expand All @@ -132,7 +127,9 @@ def friend_poor_parse(self, response):
# print(self.friend_poor)

# to add theme extensions, add a request here
for friend in self.friend_poor:
while not self.friend_poor.empty():
friend = self.friend_poor.get()
self.friend_list.put(friend)
friend[1] += "/" if not friend[1].endswith("/") else ""
yield Request(friend[1] + "atom.xml", callback=self.post_atom_parse, meta={"friend": friend},
dont_filter=True, errback=self.errback_handler)
@@ -164,7 +161,8 @@ def friend_poor_parse(self, response):
# friend = ['小冰博客', 'https://zfe.space/', 'https://zfe.space/images/headimage.png']
# [[1,1,1],[2,3,2]]
# pass the collected friend list to the pipeline
for friend in self.friend_poor:
while not self.friend_list.empty():
friend = self.friend_list.get()
userdata = {}
userdata["name"] = friend[0]
userdata["link"] = friend[1]
@@ -204,7 +202,7 @@ def post_atom_parse(self, response):
def post_rss2_parse(self, response):
# print("post_rss2_parse---------->" + response.url)
friend = response.meta.get("friend")
soup = BeautifulSoup(response.text, 'lxml')
soup = BeautifulSoup(response.text, "lxml")
items = soup.find_all("item")
if not items:
items = soup.find_all("entry")
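Note: the gitee and github issue pages are now parsed with Scrapy's CSS selectors instead of BeautifulSoup. A standalone sketch of the same extraction using scrapy.Selector follows; the sample HTML is invented for illustration, and only the selector expressions mirror the diff.

# Sketch: reproduce the selector-based extraction on made-up HTML.
from scrapy.selector import Selector

html = '''
<div id="git-issues">
  <a class="title" href="/issues/1">friend link issue</a>
</div>
<code><span>name: example, link: https://example.com/</span></code>
'''

sel = Selector(text=html)
# same selector as response.css("#git-issues a.title::attr(href)").extract()
hrefs = sel.css("#git-issues a.title::attr(href)").extract()
# same selector as response.css("code *::text").extract(), joined into one string
content = ''.join(sel.css("code *::text").extract())
print(hrefs)    # ['/issues/1']
print(content)  # name: example, link: https://example.com/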
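Note: friend_poor changed from a list to queue.Queue, and each entry is copied into friend_list while the feed requests are issued, because a Queue is emptied as it is consumed. A minimal sketch of that hand-off, reusing the sample friend from the commented example in the diff (trailing slash dropped to show the append):

# Sketch: drain friend_poor while keeping a copy in friend_list.
import queue

friend_poor = queue.Queue()
friend_list = queue.Queue()
friend_poor.put(['小冰博客', 'https://zfe.space', 'https://zfe.space/images/headimage.png'])

while not friend_poor.empty():
    friend = friend_poor.get()
    friend_list.put(friend)  # keep a copy for the later pipeline push
    friend[1] += "/" if not friend[1].endswith("/") else ""
    print("would request:", friend[1] + "atom.xml")

while not friend_list.empty():
    print("push to pipeline:", friend_list.get())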
