Skip to content

Commit

Permalink
Add a mechanism to filter out GitHub forks (#204)
Browse files Browse the repository at this point in the history
The `scan` and `github repos list` commands offer a new `--github-repo-type={all,source,fork}` option to select a subset of repositories. The default value is `source`, which causes all forked repos on GitHub to be ignored.
  • Loading branch information
bradlarsen authored Jul 15, 2024
1 parent 224d0aa commit bc2cc08
Show file tree
Hide file tree
Showing 13 changed files with 152 additions and 23 deletions.
12 changes: 12 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,18 @@ All notable changes to the project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project aspires to use [Semantic Versioning](https://semver.org/spec/v2.0.0.html).


## Unreleased

### Additions

- The `scan` and `github repos list` commands offer a new `--github-repo-type={all,source,fork}` option to select a subset of repositories ([#204](https://github.com/praetorian-inc/noseyparker/pull/204)).

### Changes

- The `scan` and `github repos list` commands now only consider non-forked repositories by default ([#204](https://github.com/praetorian-inc/noseyparker/pull/204)).
This behavior can be reverted to the previous behavior using the `--github-repo-type=all` option.


## [v0.18.1](https://github.com/praetorian-inc/noseyparker/releases/v0.18.1) (2024-07-12)

### Fixes
Expand Down
42 changes: 42 additions & 0 deletions crates/noseyparker-cli/src/args.rs
Original file line number Diff line number Diff line change
Expand Up @@ -462,6 +462,15 @@ pub struct GitHubRepoSpecifiers {
visible_alias = "all-github-orgs"
)]
pub all_organizations: bool,

/// Select only GitHub repos of the given type
#[arg(
long,
visible_alias = "github-repo-type",
value_name="TYPE",
default_value_t = GitHubRepoType::Source,
)]
pub repo_type: GitHubRepoType,
}

impl GitHubRepoSpecifiers {
Expand Down Expand Up @@ -697,6 +706,31 @@ pub enum GitCloneMode {
Mirror,
}

/// Which GitHub repositories to select
#[derive(Copy, Clone, Debug, Display, PartialEq, Eq, PartialOrd, Ord, ValueEnum)]
#[strum(serialize_all = "kebab-case")]
pub enum GitHubRepoType {
/// Select both source repositories and fork repositories
All,

/// Only source repositories, i.e., ones that are not forks
Source,

/// Only fork repositories
#[value(alias = "forks")]
Fork,
}

impl From<GitHubRepoType> for noseyparker::github::RepoType {
fn from(val: GitHubRepoType) -> Self {
match val {
GitHubRepoType::All => noseyparker::github::RepoType::All,
GitHubRepoType::Source => noseyparker::github::RepoType::Source,
GitHubRepoType::Fork => noseyparker::github::RepoType::Fork,
}
}
}

/// The method of handling history in discovered Git repositories
#[derive(Copy, Clone, Debug, Display, PartialEq, Eq, PartialOrd, Ord, ValueEnum)]
#[strum(serialize_all = "kebab-case")]
Expand Down Expand Up @@ -829,6 +863,14 @@ pub struct InputSpecifierArgs {
)]
pub github_api_url: Url,

/// Clone and scan GitHub repos only of the given type
#[arg(
long,
value_name = "TYPE",
default_value_t = GitHubRepoType::Source,
)]
pub github_repo_type: GitHubRepoType,

/// Use the specified method for cloning Git repositories
#[arg(long, value_name = "MODE", display_order = 40, default_value_t=GitCloneMode::Bare, alias="git-clone-mode")]
pub git_clone: GitCloneMode,
Expand Down
1 change: 1 addition & 0 deletions crates/noseyparker-cli/src/cmd_github.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ fn list_repos(global_args: &GlobalArgs, args: &GitHubReposListArgs, api_url: Url
user: args.repo_specifiers.user.clone(),
organization: args.repo_specifiers.organization.clone(),
all_organizations: args.repo_specifiers.all_organizations,
repo_filter: args.repo_specifiers.repo_type.into(),
},
api_url,
global_args.ignore_certs,
Expand Down
1 change: 1 addition & 0 deletions crates/noseyparker-cli/src/cmd_scan.rs
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ pub fn run(global_args: &args::GlobalArgs, args: &args::ScanArgs) -> Result<()>
user: args.input_specifier_args.github_user.clone(),
organization: args.input_specifier_args.github_organization.clone(),
all_organizations: args.input_specifier_args.all_github_organizations,
repo_filter: args.input_specifier_args.github_repo_type.into(),
};
let mut repo_urls = args.input_specifier_args.git_url.clone();
if !repo_specifiers.is_empty() {
Expand Down
9 changes: 5 additions & 4 deletions crates/noseyparker-cli/tests/common/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@ pub use assert_cmd::prelude::*;
pub use assert_fs::prelude::*;
pub use assert_fs::{fixture::ChildPath, TempDir};
pub use insta::{assert_json_snapshot, assert_snapshot, internals::Redaction, with_settings};
pub use predicates::str::{is_empty, RegexPredicate};
pub use predicates::prelude::*;
pub use predicates::str::RegexPredicate;
pub use std::path::Path;
pub use std::process::Command;

Expand Down Expand Up @@ -108,7 +109,7 @@ pub fn noseyparker_cmd() -> Command {

/// Create a `RegexPredicate` from the given pattern.
pub fn is_match(pat: &str) -> RegexPredicate {
predicates::str::is_match(pat).expect("pattern should compile")
predicate::str::is_match(pat).expect("pattern should compile")
}

/// Create a `RegexPredicate` for matching a scan stats output message from Nosey Parker.
Expand Down Expand Up @@ -282,8 +283,8 @@ pub fn create_empty_git_repo(destination: &Path) {
.arg(destination)
.assert()
.success()
.stdout(is_empty())
.stderr(is_empty());
.stdout(predicate::str::is_empty())
.stderr(predicate::str::is_empty());
}

pub fn get_report_stdout_filters() -> Vec<(&'static str, &'static str)> {
Expand Down
2 changes: 1 addition & 1 deletion crates/noseyparker-cli/tests/datastore/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ fn export_empty() {
// export it
let tgz = scan_env.root.child("export.tgz");
noseyparker_success!("datastore", "export", "-d", scan_env.dspath(), "-o", tgz.path());
tgz.assert(predicates::path::is_file());
tgz.assert(predicate::path::is_file());

// extract the archive
let extract_dir = scan_env.root.child("export.np");
Expand Down
50 changes: 42 additions & 8 deletions crates/noseyparker-cli/tests/github/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ fn github_repos_list_user_badtoken() {
}

// XXX Note: `octocat` is not a user under our control; it's a kind of test account owned by GitHub.
// We are assuming that the `octocat` user's list of repositories will always include `Spoon-Knife`.
// We are making some assumptions about the `octocat` user's list of repositories that may change.

// XXX Note: the following test cases make actual GitHub requests and may fail due to rate limiting
// issues when not using a token.
Expand All @@ -44,24 +44,58 @@ fn handle_github_token(cmd: &mut Command) {
}
}

// XXX this assumes that Spoon-Knife will be in the octocat user's repo list
#[test]
fn github_repos_list_user_human_format() {
let mut cmd = noseyparker!("github", "repos", "list", "--user", "octocat");
handle_github_token(&mut cmd);
cmd.assert()
.success()
.stdout(predicates::str::contains("https://github.com/octocat/Spoon-Knife.git"))
.stderr(predicates::str::is_empty());
.stdout(predicate::str::contains("https://github.com/octocat/Spoon-Knife.git"))
.stderr(predicate::str::is_empty());
}

// XXX this assumes that Spoon-Knife will be in the octocat user's repo list
#[test]
fn github_repos_list_user_jsonl_format() {
let mut cmd = noseyparker!("github", "repos", "list", "--user", "octocat", "--format", "jsonl");
handle_github_token(&mut cmd);
cmd.assert()
.success()
.stdout(predicates::str::contains("\"https://github.com/octocat/Spoon-Knife.git\"\n"))
.stderr(predicates::str::is_empty());
.stdout(predicate::str::contains("\"https://github.com/octocat/Spoon-Knife.git\"\n"))
.stderr(predicate::str::is_empty());
}

// XXX this assumes that Spoon-Knife will be in the octocat user's non-fork repo list, and linguist will be in its fork repo list
#[test]
fn github_repos_list_user_repo_filter() {
let mut cmd = noseyparker!(
"github",
"repos",
"list",
"--user=octocat",
"--format=jsonl",
"--repo-type=fork"
);
handle_github_token(&mut cmd);
cmd.assert()
.success()
.stdout(predicate::str::contains("\"https://github.com/octocat/linguist.git\"\n"))
.stderr(predicate::str::is_empty());

let mut cmd = noseyparker!(
"github",
"repos",
"list",
"--user=octocat",
"--format=jsonl",
"--repo-type=source"
);
handle_github_token(&mut cmd);
cmd.assert()
.success()
.stdout(predicate::str::contains("\"https://github.com/octocat/linguist.git\"\n").not())
.stderr(predicate::str::is_empty());
}

#[test]
Expand All @@ -73,8 +107,8 @@ fn github_repos_list_multiple_user_dedupe_jsonl_format() {
let cmd = cmd
.assert()
.success()
.stdout(predicates::str::contains("\"https://github.com/octocat/Spoon-Knife.git\"\n"))
.stderr(predicates::str::is_empty());
.stdout(predicate::str::contains("\"https://github.com/octocat/Spoon-Knife.git\"\n"))
.stderr(predicate::str::is_empty());

// Ensure that output is sorted and there are no dupes
let stdout = String::from_utf8(cmd.get_output().stdout.clone())
Expand All @@ -90,7 +124,7 @@ fn github_repos_list_multiple_user_dedupe_jsonl_format() {
fn github_repos_list_user_json_format() {
let mut cmd = noseyparker!("github", "repos", "list", "--user", "octocat", "--format", "json");
handle_github_token(&mut cmd);
let cmd = cmd.assert().success().stderr(predicates::str::is_empty());
let cmd = cmd.assert().success().stderr(predicate::str::is_empty());

let output = &cmd.get_output().stdout;
let json_parsed: Vec<String> =
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,16 @@ Input Specifier Options:

This option can be repeated.

--github-repo-type <TYPE>
Clone and scan GitHub repos only of the given type

[default: source]

Possible values:
- all: Select both source repositories and fork repositories
- source: Only source repositories, i.e., ones that are not forks
- fork: Only fork repositories

--github-organization <NAME>
Clone and scan accessible repositories belonging to the specified GitHub organization

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ Rule Selection Options:
Input Specifier Options:
[INPUT]... Scan the specified file, directory, or local Git repository
--git-url <URL> Clone and scan the Git repository at the specified URL
--github-repo-type <TYPE> Clone and scan GitHub repos only of the given type [default:
source] [possible values: all, source, fork]
--github-organization <NAME> Clone and scan accessible repositories belonging to the
specified GitHub organization [aliases: github-org]
--github-user <NAME> Clone and scan accessible repositories belonging to the
Expand Down
4 changes: 2 additions & 2 deletions crates/noseyparker-cli/tests/report/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,14 @@ pub use pretty_assertions::{assert_eq, assert_ne};
fn report_nonexistent_default_datastore() {
let scan_env = ScanEnv::new();
let ds = scan_env.root.child("datastore.np");
ds.assert(predicates::path::missing());
ds.assert(predicate::path::missing());

assert_cmd_snapshot!(noseyparker!("report")
.current_dir(scan_env.root.path())
.assert()
.failure());

ds.assert(predicates::path::missing());
ds.assert(predicate::path::missing());
}

/// Test that the `report` command's `--max-matches` can be given a negative value (which means "no
Expand Down
10 changes: 5 additions & 5 deletions crates/noseyparker-cli/tests/scan/basic/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ fn scan_default_datastore() {
let input = scan_env.input_file("input.txt");

let ds = scan_env.root.child("datastore.np");
ds.assert(predicates::path::missing());
ds.assert(predicate::path::missing());

// first scan with the default datastore
noseyparker!("scan", input.path())
Expand All @@ -166,8 +166,8 @@ fn scan_default_datastore() {
.success()
.stdout(match_scan_stats("0 B", 1, 0, 0));

ds.assert(predicates::path::is_dir());
input.assert(predicates::path::is_file());
ds.assert(predicate::path::is_dir());
input.assert(predicate::path::is_file());

// Make sure that summarization and reporting works without an explicit datastore
let cmd = noseyparker!("report", "--format=json")
Expand Down Expand Up @@ -202,12 +202,12 @@ fn scan_default_datastore() {
fn summarize_nonexistent_default_datastore() {
let scan_env = ScanEnv::new();
let ds = scan_env.root.child("datastore.np");
ds.assert(predicates::path::missing());
ds.assert(predicate::path::missing());

assert_cmd_snapshot!(noseyparker!("summarize")
.current_dir(scan_env.root.path())
.assert()
.failure());

ds.assert(predicates::path::missing());
ds.assert(predicate::path::missing());
}
2 changes: 1 addition & 1 deletion crates/noseyparker/src/github.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ pub use auth::Auth;
pub use client::Client;
pub use client_builder::ClientBuilder;
pub use error::Error;
pub use repo_enumerator::{RepoEnumerator, RepoSpecifiers};
pub use repo_enumerator::{RepoEnumerator, RepoSpecifiers, RepoType};
pub use result::Result;

use progress::Progress;
Expand Down
30 changes: 28 additions & 2 deletions crates/noseyparker/src/github/repo_enumerator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,8 @@ impl<'c> RepoEnumerator<'c> {
let mut repo_urls = Vec::new();

for username in &repo_specifiers.user {
let to_add = self.enumerate_user_repos(username).await?;
let mut to_add = self.enumerate_user_repos(username).await?;
to_add.retain(|r| repo_specifiers.repo_filter.filter(r));
if let Some(progress) = progress.as_mut() {
progress.inc(to_add.len() as u64);
}
Expand All @@ -67,7 +68,8 @@ impl<'c> RepoEnumerator<'c> {
.collect();

for orgname in orgs {
let to_add = self.enumerate_org_repos(orgname).await?;
let mut to_add = self.enumerate_org_repos(orgname).await?;
to_add.retain(|r| repo_specifiers.repo_filter.filter(r));
if let Some(progress) = progress.as_mut() {
progress.inc(to_add.len() as u64);
}
Expand All @@ -81,12 +83,36 @@ impl<'c> RepoEnumerator<'c> {
}
}

/// Specifies which GitHub repositories to select.
#[derive(Debug)]
pub enum RepoType {
/// Select both source repositories and fork repositories
All,

/// Only source repositories, i.e., ones that are forks
Source,

/// Only fork repositories
Fork,
}

impl RepoType {
fn filter(&self, repo: &Repository) -> bool {
match self {
RepoType::All => true,
RepoType::Source => !repo.fork,
RepoType::Fork => repo.fork,
}
}
}

/// Specifies a set of GitHub usernames and/or organization names.
#[derive(Debug)]
pub struct RepoSpecifiers {
pub user: Vec<String>,
pub organization: Vec<String>,
pub all_organizations: bool,
pub repo_filter: RepoType,
}

impl RepoSpecifiers {
Expand Down

0 comments on commit bc2cc08

Please sign in to comment.