Skip to content

Commit

Permalink
Merge pull request yegor256#378 from lueFlake/more-than-1000-repos
Browse files Browse the repository at this point in the history
yegor256#186 Workaround to find more than 1000 repositories
  • Loading branch information
yegor256 authored Oct 11, 2024
2 parents a194620 + 4b4d9cb commit 9331a8a
Show file tree
Hide file tree
Showing 2 changed files with 98 additions and 52 deletions.
148 changes: 97 additions & 51 deletions steps/discover-repos.rb
Original file line number Diff line number Diff line change
Expand Up @@ -28,18 +28,18 @@
require 'fileutils'
require 'slop'
require 'octokit'

max = 1000
require 'date'

opts = Slop.parse do |o|
o.string '--token', 'GitHub access token', default: ''
o.boolean '--dry', 'Make no round-trips to GitHub API (for testing)', default: false
o.integer '--total', 'Total number of repos to take from GitHub', required: true
o.integer '--pause', 'How many seconds to sleep between API calls', default: 10
o.integer '--page-size', 'Number of repos to fetch in one API call', default: 100
o.integer '--min-stars', 'Minimum GitHub stars in each repo', default: max
o.integer '--min-stars', 'Minimum GitHub stars in each repo', default: 1000
o.integer '--max-stars', 'Maximum GitHub stars in each repo', default: 100_000
o.integer '--min-size', 'Minimum size of GitHub repo, in Kb', default: 100
o.integer '--start-year', 'The starting year for querying repositories', default: Date.today.year
o.string '--csv', 'The file name to save the list to', required: true
o.string '--tex', 'The file name to save LaTeX summary of the operation', required: true
o.on '--help' do
Expand All @@ -48,8 +48,6 @@
end
end

raise 'Can only retrieve up to 1000 repos' if opts[:total] > max

puts "Trying to find #{opts[:total]} repos in GitHub"
size = [opts[:page_size], opts[:total]].min
puts "Taking up to #{size} repos per one GitHub API request"
Expand All @@ -65,19 +63,6 @@
puts 'Accessing GitHub with personal access token!'
end
found = {}
page = 0
query = [
"stars:#{opts['min-stars']}..#{opts['max-stars']}",
"size:>=#{opts['min-size']}",
'language:java',
'is:public',
'mirror:false',
'archived:false',
'template:false',
'NOT',
'android'
].join(' ')

def mock_array(size, licenses)
Array.new(size) do
{
Expand All @@ -97,44 +82,105 @@ def mock_reps(page, size, licenses)
}
end

def cooldown(opts, found)
puts "Let's sleep for #{opts[:pause]} seconds to cool off GitHub API \
(already found #{found.count} repos, need #{opts[:total]})..."
sleep opts[:pause]
# Runs the search for a single creation year and reports progress.
#
# year    - the year whose repositories are searched
# github  - the API client, handed over to loop_through_pages untouched
# context - Hash with at least :opts (passed to build_query) and :found
#           (its size is printed once the year is done)
def process_year(year, github, context)
  search_terms = build_query(year, context[:opts])
  puts "Querying for repositories created in #{year}..."
  loop_through_pages(search_terms, github, context)
  total_so_far = context[:found].count
  puts "Completed querying for year #{year}. Found #{total_so_far} repositories so far."
end

puts 'Not searching GitHub API, using mock repos' if opts[:dry]
loop do
break if page * size > max
count = 0
json = if opts[:dry]
mock_reps(page, size, licenses)
# Assembles the GitHub search query for repositories created in one year.
#
# year - the creation year, used for both ends of the "created:" range
# opts - options readable by the string keys 'min-stars', 'max-stars'
#        and 'min-size'
#
# Returns the search terms joined by single spaces.
def build_query(year, opts)
  stars = "stars:#{opts['min-stars']}..#{opts['max-stars']}"
  size = "size:>=#{opts['min-size']}"
  created = "created:#{year}-01-01..#{year}-12-31"
  fixed = %w[is:public mirror:false archived:false template:false NOT android]
  [stars, size, 'language:java', created, *fixed].join(' ')
end

# Fetches result pages for one query until enough repositories are found,
# a page comes back empty, or the search result cap is reached.
#
# query   - the search string, passed to fetch_repositories
# github  - the API client, passed to fetch_repositories
# context - Hash with :found (Hash of collected repos), :opts (needs :total)
#           and :size (repos per request)
def loop_through_pages(query, github, context)
  # GitHub Search serves only the first 1000 matches of any single query;
  # asking for a page beyond that window fails instead of returning an
  # empty list, so the "empty page" check below would never stop the loop.
  search_cap = 1000
  page = 0
  loop do
    break if context[:found].count >= context[:opts][:total]
    # page counts from zero, so this request would reach at most
    # (page + 1) * size results; the check is deliberately conservative.
    if (page + 1) * context[:size] > search_cap
      puts "Reached the limit of #{search_cap} search results for this query, moving on"
      break
    end
    json = fetch_repositories(query, github, page, context)
    break if json[:items].empty?
    process_repositories(json[:items], context)
    page += 1
    cooldown(context)
  end
end

# Returns one page of search results, either mocked or from GitHub.
#
# query   - the search string sent to GitHub
# github  - client object responding to search_repositories
# page    - zero-based page counter kept by the caller
# context - Hash with :opts (needs :dry), :size (repos per request) and
#           :licenses (only used by the mock)
def fetch_repositories(query, github, page, context)
  return mock_reps(page, context[:size], context[:licenses]) if context[:opts][:dry]
  # GitHub numbers result pages starting at 1, while the caller counts from 0;
  # sending the counter as is would fetch the first page twice.
  github.search_repositories(query, per_page: context[:size], page: page + 1)
end

# Feeds every repository of one result page into process_repo.
#
# repositories - the items of one search response
# context      - Hash with :found (collected repos) and :licenses
#
# The per-page summary line that used to follow the loop is removed: it
# read "count", "page" and "json", none of which exist in this method.
def process_repositories(repositories, context)
  found = context[:found]
  licenses = context[:licenses]
  repositories.each { |repo_data| process_repo(repo_data, found, licenses) }
end

# Records one repository unless it is already known or its license is
# rejected, then prints a line about it.
#
# repo_data - one item of a search response
# found     - Hash of repositories collected so far
# licenses  - list of license keys handed to license_invalid?
def process_repo(repo_data, found, licenses)
  # the license check only runs for repositories not seen before
  return if repo_already_processed?(repo_data, found) || license_invalid?(repo_data, licenses)
  add_repo_to_found(repo_data, found)
  print_repo_info(repo_data, found)
end

# Tells whether the repository's full name is already a key of found.
def repo_already_processed?(repo_data, found)
  name = repo_data[:full_name]
  found.key?(name)
end

# Tells whether the repository lacks a license or has one whose key is not
# in licenses. Prints a "Skipping" line whenever the answer is true.
def license_invalid?(repo_data, licenses)
  license = repo_data[:license]
  return false if !license.nil? && licenses.include?(license[:key])
  puts "Repo #{repo_data[:full_name]} doesn't contain required license. Skipping"
  true
end

# Stores a summary of the repository in found, keyed by its full name.
#
# The description is wrapped in double quotes, the topics are joined with
# spaces (a missing list becomes an empty string), and created_at is
# converted with iso8601. The order of the keys is kept as it was.
def add_repo_to_found(repo_data, found)
  name = repo_data[:full_name]
  summary = {
    full_name: name,
    default_branch: repo_data[:default_branch],
    created_at: repo_data[:created_at].iso8601,
    open_issues_count: repo_data[:open_issues_count],
    description: "\"#{repo_data[:description]}\"",
    topics: Array(repo_data[:topics]).join(' '),
    stars: repo_data[:stargazers_count],
    forks: repo_data[:forks_count],
    size: repo_data[:size]
  }
  found[name] = summary
end

# Prints one line about a repository: its name, its ordinal number (the
# current size of found), fork and star counts, and its license key.
def print_repo_info(repo, found)
  name = repo[:full_name].inspect
  stats = "(#{repo[:forks_count]} forks, #{repo[:stargazers_count]} stars)"
  puts "Found #{name} GitHub repo ##{found.count} #{stats} with license: #{repo[:license][:key]}"
end

# Announces a pause and then sleeps for context[:opts][:pause] seconds.
# The message also reports how many repos are in context[:found] and how
# many are wanted (context[:opts][:total]).
def cooldown(context)
  opts = context[:opts]
  collected = context[:found].count
  puts "Let's sleep for #{opts[:pause]} seconds to cool off GitHub API " \
       "(already found #{collected} repos, need #{opts[:total]})..."
  sleep opts[:pause]
end

# Search year by year, newest first, until enough repositories are collected.
current_year = opts[:start_year]
years = (2008..current_year).to_a.reverse
final_query = ''
# The same collections are shared by every year, so build the context once.
context = {
  found: found,
  opts: opts,
  licenses: licenses,
  size: size
}

puts 'Not searching GitHub API, using mock repos' if opts[:dry]
years.each do |year|
  break if found.count >= opts[:total]
  # Keep the query of the year searched last: final_query is printed later,
  # and it used to stay empty because nothing ever assigned it.
  final_query = build_query(year, opts)
  process_year(year, github, context)
end
puts "Found #{found.count} total repositories in GitHub"

Expand All @@ -158,7 +204,7 @@ def cooldown(opts, found)
' GitHub API\footnote{\url{https://docs.github.com/en/rest}}',
' was the following:',
'\begin{ffcode}',
query.gsub(' ', "\n"),
final_query.gsub(' ', "\n"),
'\end{ffcode}'
].join("\n")
)
Expand Down
2 changes: 1 addition & 1 deletion tests/steps/test-discover-repos.sh
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ tex=${TARGET}/foo.tex
rm -f "${csv}"
msg=$("${LOCAL}/steps/discover-repos.rb" --dry --pause=0 --total=3 --page-size=1 --min-stars=100 --max-stars=1000 "--csv=${csv}" "--tex=${tex}")
echo "${msg}"
echo "${msg}" | grep "Found 1 good repositories in page #0"
echo "${msg}" | grep "Completed querying for year 2024. Found 3 repositories so far."
echo "${msg}" | grep "Found 3 total repositories in GitHub"
test -e "${csv}"
test -s "${tex}"
Expand Down

0 comments on commit 9331a8a

Please sign in to comment.