forked from jackhawks/rectg
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtelegram_spider.py
123 lines (101 loc) · 4.21 KB
/
telegram_spider.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import codecs
import posixpath
import re
from itertools import chain
from urllib.parse import urljoin
import random
import requests
from jinja2 import Template
from lxml import etree
class CreateMarkdown:
    """Create GitHub Markdown.

    Collects ``t.me`` links from the repository README and from its issue
    pages, fetches each Telegram preview page for its public metadata and
    renders the collected records into ``README.md`` through a Jinja2
    template.
    """

    # Seconds to wait for any single HTTP response; without a timeout a
    # stalled server would block the whole run indefinitely.
    request_timeout = 30

    def __init__(self):
        self.url = 'https://github.com/jackhawks/rectg'
        self.template_file = '_template.md'

    def _fetch_html(self, url):
        """Download *url* and return the response body parsed by lxml."""
        response = requests.get(url, timeout=self.request_timeout)
        return etree.HTML(response.text)

    @staticmethod
    def _first(nodes):
        """Return the first item of an XPath result list, or None if empty."""
        return nodes[0] if nodes else None

    @staticmethod
    def _classify(extra):
        """Derive ``(category, audience)`` from the page's "extra" line.

        Args:
            extra: Text of the ``tgme_page_extra`` element, or None when the
                page has no such element.

        Returns:
            A ``(category, audience)`` tuple. ``category`` is '机器人' when
            the text contains '@', '频道' when it contains 'subscribers',
            '群组' when it contains 'members', otherwise None. ``audience``
            is the leading digit run of the text with spaces removed (as a
            string), or None when there is none or the entry is a bot.
        """
        if not extra:
            return None, None
        if '@' in extra:
            return '机器人', None
        if 'subscribers' in extra:
            category = '频道'
        elif 'members' in extra:
            category = '群组'
        else:
            return None, None
        # Counts are rendered with spaces as thousands separators
        # ("12 345 subscribers"), so drop them before matching the digits.
        found = re.match(r'\d+', extra.replace(' ', ''))
        return category, (found.group() if found else None)

    def readme_handler(self):
        """Yield every href containing ``t.me`` found on the README page."""
        readme_url = posixpath.join(self.url, "blob/main/README.md")
        html = self._fetch_html(readme_url)
        for href in html.xpath('//*[contains(@href,"t.me")]/@href'):
            # Strip stray backslash-quote sequences from the scraped href.
            yield href.replace('\\"', '')

    def issues_handler(self):
        """Yield the first ``t.me`` link of each issue on the issues page.

        Issues that contain no ``t.me`` link are skipped instead of aborting
        the whole generator with an IndexError.
        """
        issues_url = posixpath.join(self.url, "issues")
        html = self._fetch_html(issues_url)
        issue_hrefs = html.xpath(
            "//div[contains(@role,'group')]//a[contains(@id,'issue_')]/@href")
        for href in issue_hrefs:
            issue_html = self._fetch_html(urljoin(self.url, href))
            link = self._first(
                issue_html.xpath("//a[contains(@href,'t.me')]/@href"))
            if link is None:
                continue
            yield link

    def url_join(self, *args):
        """Chain any number of iterables of URLs into one iterator."""
        return chain(*args)

    def get_info(self, urls):
        """Fetch each URL and yield a metadata dict per usable page.

        Args:
            urls: Iterable of Telegram page URLs.

        Yields:
            Dicts with the keys ``tg_me_page_url``, ``tg_me_page_title``,
            ``tg_me_audience``, ``tg_me_page_description`` and
            ``tg_me_category``. Pages without a title, and pages whose
            description contains 'If you have', are skipped.
        """
        title_xpath = ("//div[contains(@class,'tgme_page')]"
                       "//div[contains(@class,'tgme_page_title')]//span/text()")
        extra_xpath = ("//div[contains(@class,'tgme_page')]"
                       "//div[contains(@class,'tgme_page_extra')]/text()")
        description_xpath = (
            "//div[contains(@class,'tgme_page')]"
            "//div[contains(@class,'tgme_page_description')]/text()")

        for idx, url in enumerate(urls):
            print(idx, ' ---> ', url)
            html = self._fetch_html(url)
            if html is None:
                # Nothing parseable came back; treat like a missing title.
                continue

            title_raw = self._first(html.xpath(title_xpath))
            if title_raw is None:
                continue
            # '|' would break the Markdown table cell the value ends up in
            # (presumably; the template is not visible here).
            title = title_raw.replace('|', '')

            description_raw = self._first(html.xpath(description_xpath))
            if description_raw is None:
                description = None
            elif 'If you have' in description_raw:
                # NOTE(review): looks like Telegram's placeholder text for
                # links that do not resolve to a real chat - confirm.
                continue
            else:
                description = description_raw.replace('|', '')

            # Data processing
            category, audience = self._classify(
                self._first(html.xpath(extra_xpath)))

            yield {
                'tg_me_page_url': url,
                'tg_me_page_title': title,
                'tg_me_audience': audience,
                'tg_me_page_description': description,
                'tg_me_category': category,
            }

    def create_md(self, repo):
        """Render the template with *repo* and write the result to README.md.

        Args:
            repo: Iterable of record dicts, passed to the template as ``repo``.
        """
        with open(self.template_file, 'r', encoding='utf-8') as file:
            template = Template(file.read(), trim_blocks=True)
        rendered_file = template.render(repo=repo)
        # The context manager closes the handle even if the write fails.
        with codecs.open("README.md", "w", "utf-8") as output_file:
            output_file.write(rendered_file)

    def shuffle(self, generator):
        """Return an iterator over the unique items of *generator*, shuffled."""
        unique = list(set(generator))
        random.shuffle(unique)
        return (item for item in unique)

    def start(self):
        """Run the whole pipeline: collect links, scrape them, write README."""
        issues = self.issues_handler()
        readme = self.readme_handler()
        urls = self.url_join(issues, readme)
        suf = self.shuffle(urls)
        info = self.get_info(suf)
        self.create_md(info)
# Script entry point: build the spider and regenerate README.md.
if __name__ == '__main__':
    CreateMarkdown().start()