✅ Blogroll Moments Beta Ver.4.0.0
CCKNBC committed Dec 2, 2021
1 parent ac23316 commit d1d86a9
Showing 14 changed files with 1,291 additions and 6 deletions.
46 changes: 46 additions & 0 deletions .github/workflows/moments - beta.yml
@@ -0,0 +1,46 @@
# update blogroll posts
name: moments beta

on:
workflow_dispatch:
schedule:
- cron: "0 0,4,14,18,21 * * *"
# push:
# branches: main

jobs:
build:
runs-on: ubuntu-latest
env:
TZ: Asia/Shanghai
APP_ID: ${{ secrets.APPID }}
APP_KEY: ${{ secrets.APPKEY }}
LINK: ${{ secrets.LINK }}
steps:
- name: Checkout
uses: actions/checkout@main
- name: Set up Python #install Python
uses: actions/setup-python@main
with:
python-version: 3.8
- name: Install requirements #install dependencies
working-directory: ./hexo_circle_of_friends
run: |
pip install -r requirement.txt
- name: Update Moments #update
working-directory: ./hexo_circle_of_friends
run: |
python run.py ${{ secrets.APPID }} ${{ secrets.APPKEY }} ${{ secrets.LINK }}
- name: Delete Workflow Runs
uses: Mattraks/delete-workflow-runs@main
with:
retain_days: 1
keep_minimum_runs: 1
# - name: Telegram Notification
# if: cancelled() == false
# uses: xinthink/[email protected]
# with:
# botToken: ${{ secrets.TG_BOT_TOKEN }}
# chatId: ${{ secrets.TG_CHAT_ID }}
# jobStatus: ${{ job.status }}
# skipSuccess: false
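The workflow hands APPID, APPKEY and LINK to run.py as positional arguments. run.py itself is not part of this commit, but pipelines.py later reads sys.argv[1] and sys.argv[2] for the LeanCloud credentials, so the entry point presumably just starts the crawl in the same process. A minimal sketch of such an entry point, purely illustrative (the spider name and the use of CrawlerProcess are assumptions):

# run.py sketch (hypothetical; the real run.py is not shown in this diff)
import sys
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

if __name__ == "__main__":
    if len(sys.argv) < 3:
        sys.exit("usage: python run.py <APPID> <APPKEY> [LINK]")
    # sys.argv stays visible to the pipelines, which call
    # leancloud.init(sys.argv[1], sys.argv[2]) when DEBUG is off
    process = CrawlerProcess(get_project_settings())
    process.crawl("hexo_circle_of_friends")  # spider name is an assumption
    process.start()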
12 changes: 6 additions & 6 deletions .github/workflows/moments.yml
@@ -1,19 +1,19 @@
# update blogroll posts
name: moments

on:
workflow_dispatch:
schedule:
- cron: "0 16 * * *"
# schedule:
# - cron: "0 16 * * *"
# push:
# branches: main

jobs:
build:
runs-on: ubuntu-latest
env:
TZ: Asia/Shanghai
APP_ID: ${{ secrets.APPID }}
APP_KEY: ${{ secrets.APPKEY }}
LINK: ${{ secrets.LINK }}
steps:
@@ -27,7 +27,7 @@ jobs:
run: |
pip install -r requirements.txt
- name: Update Moments #update
run: |
python run.py ${{ secrets.APPID }} ${{ secrets.APPKEY }} ${{ secrets.LINK }}
- name: Delete Workflow Runs
uses: Mattraks/delete-workflow-runs@main
Empty file.
12 changes: 12 additions & 0 deletions hexo_circle_of_friends/items.py
@@ -0,0 +1,12 @@
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class HexoCircleOfFriendsItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
pass
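The generated item class is left as a stub, and the spiders apparently pass plain dicts instead. Judging from the keys the pipelines read (name, link, img, title, time, updated, rule, plus the userdata marker), a filled-in declaration might look like the sketch below; it is not part of this commit.

import scrapy


class FriendPostItem(scrapy.Item):
    # hypothetical item declaring the fields the pipelines actually access;
    # the real project may keep passing plain dicts instead
    name = scrapy.Field()      # friend's name
    link = scrapy.Field()      # post URL (or blog URL for friend entries)
    img = scrapy.Field()       # friend's avatar
    title = scrapy.Field()     # post title
    time = scrapy.Field()      # publish date, expected as xxxx-xx-xx
    updated = scrapy.Field()   # last-updated date
    rule = scrapy.Field()      # crawler rule that matched the post
    userdata = scrapy.Field()  # marker key for friend-list entries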
125 changes: 125 additions & 0 deletions hexo_circle_of_friends/middlewares.py
@@ -0,0 +1,125 @@
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals
from settings import USER_AGENT_LIST
import random
import settings
import re
from scrapy.exceptions import IgnoreRequest
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter


class RandomUserAgentMiddleware:
    # attach a random User-Agent from settings.USER_AGENT_LIST to each request
def process_request(self, request, spider):
UA = random.choice(USER_AGENT_LIST)
if UA:
request.headers.setdefault('User-Agent',UA)
return None

class BlockSiteMiddleware:
def process_request(self, request, spider):

try:
    for url in settings.BLOCK_SITE:
        if re.match(url, request.url):
            print("block----------------->%s" % url)
            raise IgnoreRequest("url block")
except IgnoreRequest:
    # re-raise so the blocked request is actually dropped
    raise
except Exception:
    # a malformed BLOCK_SITE pattern should not break the crawl
    pass
return None

class HexoCircleOfFriendsSpiderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the spider middleware does not modify the
# passed objects.

@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s

def process_spider_input(self, response, spider):
# Called for each response that goes through the spider
# middleware and into the spider.

# Should return None or raise an exception.
return None

def process_spider_output(self, response, result, spider):
# Called with the results returned from the Spider, after
# it has processed the response.

# Must return an iterable of Request, or item objects.
for i in result:
yield i

def process_spider_exception(self, response, exception, spider):
# Called when a spider or process_spider_input() method
# (from other spider middleware) raises an exception.

# Should return either None or an iterable of Request or item objects.
pass

def process_start_requests(self, start_requests, spider):
# Called with the start requests of the spider, and works
# similarly to the process_spider_output() method, except
# that it doesn’t have a response associated.

# Must return only requests (not items).
for r in start_requests:
yield r

def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)


class HexoCircleOfFriendsDownloaderMiddleware:
# Not all methods need to be defined. If a method is not defined,
# scrapy acts as if the downloader middleware does not modify the
# passed objects.

@classmethod
def from_crawler(cls, crawler):
# This method is used by Scrapy to create your spiders.
s = cls()
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
return s

def process_request(self, request, spider):
# Called for each request that goes through the downloader
# middleware.

# Must either:
# - return None: continue processing this request
# - or return a Response object
# - or return a Request object
# - or raise IgnoreRequest: process_exception() methods of
# installed downloader middleware will be called
return None

def process_response(self, request, response, spider):
# Called with the response returned from the downloader.

# Must either;
# - return a Response object
# - return a Request object
# - or raise IgnoreRequest
return response

def process_exception(self, request, exception, spider):
# Called when a download handler or a process_request()
# (from other downloader middleware) raises an exception.

# Must either:
# - return None: continue processing this exception
# - return a Response object: stops process_exception() chain
# - return a Request object: stops process_exception() chain
pass

def spider_opened(self, spider):
spider.logger.info('Spider opened: %s' % spider.name)
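RandomUserAgentMiddleware and BlockSiteMiddleware expect USER_AGENT_LIST and BLOCK_SITE to exist in settings.py and only take effect once enabled in DOWNLOADER_MIDDLEWARES. settings.py is not included in this diff, so the fragment below is only a sketch with example values; the dotted paths depend on how the project is actually laid out.

# Hypothetical settings.py fragment (not part of this commit); the constant
# names come from middlewares.py, the values and priorities are examples only.
USER_AGENT_LIST = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.0 Safari/605.1.15",
]

# regex patterns matched against request.url by BlockSiteMiddleware
BLOCK_SITE = [
    r"^https?://example\.com/",
]

DOWNLOADER_MIDDLEWARES = {
    "middlewares.RandomUserAgentMiddleware": 400,
    "middlewares.BlockSiteMiddleware": 401,
}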
171 changes: 171 additions & 0 deletions hexo_circle_of_friends/pipelines.py
@@ -0,0 +1,171 @@
# -*- coding:utf-8 -*-

import leancloud
import datetime
import settings
import sys
import re
from scrapy.exceptions import DropItem


class HexoCircleOfFriendsPipeline:
def __init__(self):
self.userdata = []
self.nonerror_data = set()  # friends whose posts could be fetched via their blogroll link
def open_spider(self, spider):
if settings.DEBUG:
leancloud.init(settings.LC_APPID, settings.LC_APPKEY)
else:
leancloud.init(sys.argv[1], sys.argv[2])
self.Friendslist = leancloud.Object.extend('friend_list')
self.Friendspoor = leancloud.Object.extend('friend_poor')
self.query_friendslist()

# wipe the existing friend_list so it can be rebuilt from this crawl
for query_j in self.query_friend_list:
    delete = self.Friendslist.create_without_data(query_j.get('objectId'))
    delete.destroy()
self.query_friendslist()
self.query_friendspoor()

# print(self.query_post_list)
# print(self.query_friend_list)

print("Initialization complete")
def process_item(self, item, spider):
if "userdata" in item.keys():
# collect (name, link, avatar) for friendlist_push()
self.userdata.append([item["name"], item["link"], item["img"]])
# print(item)
return item

if "title" in item.keys():
if item["name"] in self.nonerror_data:
pass
else:
# 未失联的人
self.nonerror_data.add(item["name"])

# print(item)
# if this post already exists in friend_poor, keep the earlier publish time
# and delete the stale record so the refreshed one replaces it
for query_item in self.query_post_list:
    try:
        if query_item.get("link") == item["link"]:
            item["time"] = min(item['time'], query_item.get('time'))
            delete = self.Friendspoor.create_without_data(query_item.get('objectId'))
            delete.destroy()
            # print("----deleted %s ----" % item["title"])
    except Exception:
        pass

self.friendpoor_push(item)

return item
def close_spider(self,spider):
# print(self.nonerror_data)
# print(self.userdata)

self.friendlist_push()

self.outdate_clean(settings.OUTDATE_CLEAN)
print("----------------------")
print("done!")

def query_friendspoor(self):
try:
query = self.Friendspoor.query
query.select("title",'time', 'link', 'updated')
query.limit(1000)
self.query_post_list = query.find()
# print(self.query_post_list)
except Exception:
    self.query_post_list = []
def query_friendslist(self):
try:
query = self.Friendslist.query
query.select('frindname', 'friendlink', 'firendimg', 'error')
query.limit(1000)
self.query_friend_list = query.find()
except Exception:
    self.query_friend_list = []

def outdate_clean(self,time_limit):
out_date_post = 0
for query_i in self.query_post_list:

time = query_i.get('time')
try:
query_time = datetime.datetime.strptime(time, "%Y-%m-%d")
if (datetime.datetime.today() - query_time).days > time_limit:
delete = self.Friendspoor.create_without_data(query_i.get('objectId'))
out_date_post += 1
delete.destroy()
except Exception:
    # malformed time string: treat the record as outdated and delete it
    delete = self.Friendspoor.create_without_data(query_i.get('objectId'))
    delete.destroy()
    out_date_post += 1
# print('\n')
# print('Deleted %s outdated posts in total' % out_date_post)
# print('\n')
# print('------- end of outdated-post cleanup ----------')

def friendlist_push(self):
for index, item in enumerate(self.userdata):
friendlist = self.Friendslist()
friendlist.set('frindname', item[0])
friendlist.set('friendlink', item[1])
friendlist.set('firendimg', item[2])
if item[0] in self.nonerror_data:
    # this friend is reachable
    friendlist.set('error', "false")
else:
    print("Request failed, please check the link: %s" % item[1])
    friendlist.set('error', "true")
friendlist.save()

def friendpoor_push(self,item):
friendpoor = self.Friendspoor()
friendpoor.set('title', item['title'])
friendpoor.set('time', item['time'])
friendpoor.set('updated', item['updated'])
friendpoor.set('link', item['link'])
friendpoor.set('author', item['name'])
friendpoor.set('headimg', item['img'])
friendpoor.set('rule', item['rule'])
friendpoor.save()
print("----------------------")
print(item["name"])
print("《{}》\n文章发布时间:{}\t\t采取的爬虫规则为:{}".format(item["title"], item["time"], item["rule"]))


class DuplicatesPipeline:
def __init__(self):
self.data_set = set()  # post filter set, used to deduplicate post data
self.user_set = set()  # userdata filter set, used to deduplicate the friend list
def process_item(self, item, spider):
if "userdata" in item.keys():
# userdata filter
link = item["link"]
if link in self.user_set:
raise DropItem("Duplicate found:%s" % link)
self.user_set.add(link)
return item

title = item['title']

if title in self.data_set or title == "":
    # drop duplicate or empty titles
    raise DropItem("Duplicate found: %s" % title)
if not item["link"]:
    raise DropItem("missing field: 'link'")
elif not re.match(r"^https?://", item["link"]):
    # the link must be absolute and start with http:// or https://
    raise DropItem("invalid link")

if not re.match(r"^\d+", item["time"]):
    # the time should start with a digit (expected xxxx-xx-xx format); drop otherwise
    raise DropItem("invalid time")
self.data_set.add(title)

return item
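The pipelines likewise depend on settings that this commit does not show (DEBUG, LC_APPID, LC_APPKEY, OUTDATE_CLEAN) and have to be registered in ITEM_PIPELINES. A possible settings.py fragment, with placeholder values and priorities chosen so deduplication runs before the LeanCloud push:

# Hypothetical settings.py fragment (not part of this commit)
DEBUG = False          # True: use LC_APPID/LC_APPKEY; False: read credentials from sys.argv
LC_APPID = "your-leancloud-app-id"
LC_APPKEY = "your-leancloud-app-key"
OUTDATE_CLEAN = 60     # outdate_clean() removes posts older than this many days

ITEM_PIPELINES = {
    "pipelines.DuplicatesPipeline": 100,          # drop duplicates first
    "pipelines.HexoCircleOfFriendsPipeline": 300,
}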