-
Notifications
You must be signed in to change notification settings - Fork 0
/
categorize.py
209 lines (178 loc) · 7.48 KB
/
categorize.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
This bot finds the English Wikipedia counterpart of a non-English Wikipedia
page and fetches its categories. If any of those categories has a counterpart
in the origin Wikipedia, the bot then adds the page to those categories.
"""
#
# (C) User:Huji, 2016
#
# Distributed under the terms of the MIT license.
#
from __future__ import absolute_import, unicode_literals
#
import pywikibot
from pywikibot import pagegenerators
from pywikibot.bot import (
SingleSiteBot, ExistingPageBot, NoRedirectPageBot, AutomaticTWSummaryBot)
from pywikibot.tools import issue_deprecation_warning
# This is required for the text that is shown when you run this script
# with the parameter -help: the '&params;' placeholder is mapped to the
# shared page-generator help text.
# NOTE(review): the key previously read '¶ms;', which looks like '&params;'
# after '&para' was decoded as an HTML entity -- confirm against upstream.
docuReplacements = {
    '&params;': pagegenerators.parameterHelp,
}
class CategorizeBot(
    # Refer to pywikibot.bot for generic bot classes
    SingleSiteBot,  # A bot only working on one site
    # CurrentPageBot,  # Sets 'current_page'. Process it in treat_page method.
    #                  # Not needed here because we have subclasses
    ExistingPageBot,  # CurrentPageBot which only treats existing pages
    NoRedirectPageBot,  # CurrentPageBot which only treats non-redirects
    AutomaticTWSummaryBot,  # Automatically defines summary; needs summary_key
):

    """
    Bot that copies categories from a page's English Wikipedia counterpart.

    For every page it treats, the bot looks up the page's English Wikipedia
    interlanguage link, fetches the non-hidden categories of that English
    page and, for each of them that has an interlanguage link back to the
    origin wiki, appends the matching local category to the page text.
    """

    def __init__(self, generator, **kwargs):
        """
        Constructor.

        @param generator: the page generator that determines on which pages
            to work
        @type generator: generator
        """
        # call constructor of the super class
        super(CategorizeBot, self).__init__(site=True, **kwargs)
        # assign the generator to the bot
        self.generator = generator
        # define the edit summary
        self.summary = u'زیادکردنی پۆل (بۆت)'
        # allowed namespaces
        self.allowednamespaces = [0, 4, 6, 10, 12, 14, 16]

    @staticmethod
    def _query_first_page(site, params):
        """
        Submit an API query and return the entry of the first page in it.

        @param site: the site the request is sent to
        @param params: the API request parameters
        @type params: dict
        @return: the value stored under the first key of
            result['query']['pages']
        @rtype: dict
        @raise KeyError: if the result has no 'query' or 'pages' entry
        @raise StopIteration: if the 'pages' mapping is empty
        """
        result = pywikibot.data.api.Request(site=site, **params).submit()
        pages = result[u'query'][u'pages']
        # next(iter(...)) instead of .keys()[0]: dict views cannot be
        # indexed on Python 3.
        return pages[next(iter(pages))]

    def treat_page(self):
        """
        Add the current page to the local counterparts of its EN WP categories.

        @return: False when the page is skipped or a lookup fails; None after
            the (possibly unchanged) text has been handed to put_current
        """
        page = self.current_page
        lang = page.site.code

        # check if the bot should even run (this bot cannot be run in EN WP)
        if lang == 'en':
            pywikibot.output(u'\03{lightred}Cannot accept EN WP page as input!\03{default}')
            return False

        if page.namespace() not in self.allowednamespaces:
            pywikibot.output(u'\03{lightred}Namespace not allowed!\03{default}')
            return False

        text = page.text
        remote_site = pywikibot.Site('en')

        # don't run the bot if it contains the template that bans this bot
        # TODO: make this step FA WP agnostic
        if u'{{پۆل همسنگ نه}}' in text or u'{{پۆلهمسنگ نه}}' in text:
            pywikibot.output(u'\03{lightred}Skipped!\03{default}')
            return False

        # fetch the list of categories the page is currently in
        try:
            local_data = self._query_first_page(pywikibot.Site(lang), {
                'action': 'query',
                'prop': 'categories',
                'titles': page.title(),
                'redirects': 1,
                'cllimit': 500,
            })
        except Exception as e:
            pywikibot.output(u'\03{lightred}Unable to fetch local categories!:\03{default}')
            pywikibot.output(e)
            return False
        # A page that is in no category at all has no 'categories' key in
        # the reply; that is not an error, it simply has nothing yet.
        current_categories = set(
            cat[u'title'] for cat in local_data.get(u'categories', []))

        # find the EN WP counterpart page, if one exists
        try:
            link_data = self._query_first_page(pywikibot.Site(lang), {
                'action': 'query',
                'prop': 'langlinks',
                'titles': page.title(),
                'redirects': 1,
                'lllimit': 500,
            })
        except Exception as e:
            pywikibot.output(u'\03{lightred}Unable to fetch interwiki links!\03{default}')
            pywikibot.output(e)
            return False
        remote_title = ''
        for ll in link_data.get(u'langlinks', []):
            if ll[u'lang'] == u'en':
                remote_title = ll[u'*']
        if not remote_title:
            # Without a counterpart there is nothing to copy; stop here
            # rather than sending a query with an empty title.
            pywikibot.output(u'\03{lightred}No EN WP counterpart found!\03{default}')
            return False

        # fetch the list of categories its EN WP counterpart is in
        try:
            remote_data = self._query_first_page(remote_site, {
                'action': 'query',
                'prop': 'categories',
                'titles': remote_title,
                'redirects': 1,
                'cllimit': 500,
                'clshow': '!hidden',
            })
        except Exception as e:
            pywikibot.output(u'\03{lightred}Unable to fetch remote categories!\03{default}')
            pywikibot.output(e)
            return False
        remote_categories = remote_data.get(u'categories', [])

        # if the category is new, add the page to that category
        for rc in remote_categories:
            remote_category = pywikibot.Page(remote_site, rc[u'title'])
            # find the matching local category
            for ll in remote_category.langlinks():
                if ll.site.code != lang:
                    continue
                # TODO: Get the local namespace name
                local_title = u'پۆل:' + ll.title
                if local_title in current_categories:
                    continue
                # don't add stub categories
                # TODO: export this into a function that is more generic
                if u' خرد ' in ll.title:
                    continue
                text += u'\n[[پۆل:' + ll.title + ']]'
                # remember the addition so the same category is not
                # appended a second time
                current_categories.add(local_title)

        # save the page
        self.put_current(text, summary=self.summary)
def main(*args):
    """
    Process command line arguments and invoke bot.

    If args is an empty list, sys.argv is used.

    @param args: command line arguments
    @type args: list of unicode
    @return: True if a page generator was supplied and the bot was run,
        False otherwise
    @rtype: bool
    """
    options = {}
    # Process global arguments to determine desired site
    local_args = pywikibot.handle_args(args)

    # This factory is responsible for processing command line arguments
    # that are also used by other scripts and that determine on which pages
    # to work on.
    genFactory = pagegenerators.GeneratorFactory()

    # Parse command line arguments
    for arg in local_args:
        # Catch the pagegenerators options
        if genFactory.handleArg(arg):
            continue  # nothing to do here

        # Now pick up your own options
        arg, _, value = arg.partition(':')
        option = arg[1:]  # drop the leading '-'
        if option in ('summary', 'text'):
            if not value:
                # Keep what the user types; the prompt's result used to be
                # thrown away, leaving the option empty.
                value = pywikibot.input('Please enter a value for ' + arg)
            options[option] = value
        # take the remaining options as booleans.
        # You will get a hint if they aren't pre-defined in your bot class
        else:
            options[option] = True

    gen = genFactory.getCombinedGenerator()
    if gen:
        # The preloading generator is responsible for downloading multiple
        # pages from the wiki simultaneously.
        gen = pagegenerators.PreloadingGenerator(gen)
        # pass generator and private options to the bot
        bot = CategorizeBot(gen, **options)
        bot.run()
        return True
    else:
        pywikibot.bot.suggest_help(missing_generator=True)
        return False
# Script entry point: run the bot only when this file is executed directly.
if "__main__" == __name__:
    main()