-
Notifications
You must be signed in to change notification settings - Fork 654
/
Copy pathupdate_wikidata_stats.rb
177 lines (158 loc) · 6.73 KB
/
update_wikidata_stats.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
# frozen_string_literal: true
require_dependency "#{Rails.root}/lib/wikidata_summary_parser"
require_dependency "#{Rails.root}/lib/importers/wikidata_summary_importer"
# require the installed wikidata-diff-analyzer gem
require 'wikidata-diff-analyzer'
# Builds the Wikidata statistics for a course.
#
# Instantiating the class does all the work: revisions that have no summary
# yet get the wikidata-diff-analyzer output stored (as JSON) in their
# `summary` field, and then the per-course totals are computed and written
# to the course's CourseStat record under the Wikidata domain key.
class UpdateWikidataStats
  # Maps the keys of the wikidata-diff-analyzer output hash to the labels
  # used in the UI and in the CourseStat hash.
  STATS_CLASSIFICATION = {
    # UI section: General
    'merge_to' => 'merged to',
    'added_sitelinks' => 'interwiki links added',
    # UI section: Claims
    'added_claims' => 'claims created',
    'removed_claims' => 'claims removed',
    'changed_claims' => 'claims changed',
    # UI section: Items
    'clear_item' => 'items cleared',
    'create_item' => 'items created',
    # UI section: Labels
    'added_labels' => 'labels added',
    'removed_labels' => 'labels removed',
    'changed_labels' => 'labels changed',
    # UI section: Descriptions
    'added_descriptions' => 'descriptions added',
    'removed_descriptions' => 'descriptions removed',
    'changed_descriptions' => 'descriptions changed',
    # UI section: Aliases
    'added_aliases' => 'aliases added',
    'removed_aliases' => 'aliases removed',
    'changed_aliases' => 'aliases changed',
    # UI section: Others
    'added_references' => 'references added',
    'added_qualifiers' => 'qualifiers added',
    'redirect' => 'redirects created',
    'undo' => 'reverts performed',
    'restore' => 'restorations performed',
    # UI section: Not added yet
    'removed_references' => 'references removed',
    'changed_references' => 'references changed',
    'removed_qualifiers' => 'qualifiers removed',
    'changed_qualifiers' => 'qualifiers changed',
    'removed_sitelinks' => 'interwiki links removed',
    'changed_sitelinks' => 'interwiki links updated',
    'merge_from' => 'merged from',
    'added_lemmas' => 'lemmas added',
    'removed_lemmas' => 'lemmas removed',
    'changed_lemmas' => 'lemmas changed',
    'added_forms' => 'forms added',
    'removed_forms' => 'forms removed',
    'changed_forms' => 'forms changed',
    'added_senses' => 'senses added',
    'removed_senses' => 'senses removed',
    'changed_senses' => 'senses changed',
    'create_property' => 'properties created',
    'create_lexeme' => 'lexeme items created',
    'added_representations' => 'representations added',
    'removed_representations' => 'representations removed',
    'changed_representations' => 'representations changed',
    'added_glosses' => 'glosses added',
    'removed_glosses' => 'glosses removed',
    'changed_glosses' => 'glosses changed',
    'added_formclaims' => 'form claims added',
    'removed_formclaims' => 'form claims removed',
    'changed_formclaims' => 'form claims changed',
    'added_senseclaims' => 'sense claims added',
    'removed_senseclaims' => 'sense claims removed',
    'changed_senseclaims' => 'sense claims changed'
  }.freeze

  # @param course the course whose Wikidata revisions are processed; it must
  #   respond to `id` and `revisions`.
  def initialize(course)
    @course = course
    update_summary_with_stats
    update_wikidata_statistics
  end

  private

  # For every Wikidata revision of the course whose `summary` is nil, run
  # wikidata-diff-analyzer and store the per-revision result, serialized as
  # JSON, in the `summary` field.
  #
  # As a consequence, when the course totals are computed afterwards,
  # revisions that already carried an edit summary are handled by
  # WikidataSummaryParser and the ones filled in here are handled through
  # their serialized stats.
  def update_summary_with_stats
    # Load the revisions once so the ids sent to the analyzer and the records
    # updated below are guaranteed to be the same set.
    revisions = wikidata_revisions_without_summaries.to_a
    return if revisions.empty?

    analyzed_revisions = WikidataDiffAnalyzer.analyze(revisions.map(&:mw_rev_id))[:diffs]
    Revision.transaction do
      revisions.each do |revision|
        # NOTE(review): a revision missing from the analyzer output is
        # serialized as "null", so it will not be picked up again by
        # wikidata_revisions_without_summaries — confirm this is intended.
        revision.summary = analyzed_revisions[revision.mw_rev_id].to_json
        revision.save!
      end
    end
  end

  # Compute the course-wide Wikidata stats and store them in the course's
  # CourseStat record (created if it does not exist yet) under the Wikidata
  # domain key. Does nothing when the course has no Wikidata revisions.
  def update_wikidata_statistics
    revisions = course_revisions.to_a
    return if revisions.empty?

    crs_stat = CourseStat.find_or_create_by(course_id: @course.id)
    # Revisions with a truthy `edit_summary` go to the summary parser; all
    # others are expected to carry serialized diff stats instead.
    revisions_with_summary, revisions_with_serialized_stats =
      revisions.partition(&:edit_summary)

    summary_stats = WikidataSummaryParser.analyze_revisions(revisions_with_summary)
    serialized_stats = get_stats_from_serialized_stats(revisions_with_serialized_stats)

    crs_stat.stats_hash[wikidata.domain] = merge_stats(summary_stats, serialized_stats)
    crs_stat.save
  end

  # Add two stats hashes together key by key.
  #
  # @param summary_stats [Hash] stats derived from edit summaries
  # @param serialized_stats [Hash] stats derived from serialized diff stats
  # @return [Hash] every key present in either input, mapped to the sum of
  #   both values; a missing or nil value counts as 0 (via `to_i`).
  def merge_stats(summary_stats, serialized_stats)
    all_keys = summary_stats.keys | serialized_stats.keys
    all_keys.each_with_object({}) do |key, merged_stats|
      merged_stats[key] = summary_stats[key].to_i + serialized_stats[key].to_i
    end
  end

  # Sum the deserialized diff stats of the given revisions.
  #
  # @param revisions_with_serialized_stats [Enumerable] objects responding to
  #   `diff_stats`
  # @return [Hash] one entry per STATS_CLASSIFICATION label (0 when nothing
  #   was counted) plus 'total revisions' with the number of revisions given.
  def get_stats_from_serialized_stats(revisions_with_serialized_stats)
    stats = STATS_CLASSIFICATION.values.each_with_object({}) do |label, totals|
      totals[label] = 0
    end

    revisions_with_serialized_stats.each do |revision|
      deserialized_stat = revision.diff_stats
      # A revision without usable stats still counts towards the total below,
      # but contributes nothing to the individual counters.
      next unless deserialized_stat.respond_to?(:each_pair)

      deserialized_stat.each_pair do |key, value|
        label = STATS_CLASSIFICATION[key]
        # Ignore keys that have no label here instead of failing on nil + value.
        next unless label

        stats[label] += value
      end
    end

    stats['total revisions'] = revisions_with_serialized_stats.count
    stats
  end

  # Wikidata revisions of the course that have no summary stored yet.
  def wikidata_revisions_without_summaries
    course_revisions.where(summary: nil)
  end

  # Non-deleted revisions of the course made on the Wikidata wiki.
  def course_revisions
    @course.revisions.where(wiki: wikidata, deleted: false)
  end

  # The Wiki record for Wikidata, looked up (or created) once per instance.
  def wikidata
    @wikidata ||= Wiki.get_or_create(language: nil, project: 'wikidata')
  end
end