diff --git a/codeowners/src/codeowners.rs b/codeowners/src/codeowners.rs
index 2eda2af2..a6c1c531 100644
--- a/codeowners/src/codeowners.rs
+++ b/codeowners/src/codeowners.rs
@@ -1,6 +1,9 @@
 use std::{
+    collections::HashMap,
     fs::File,
     path::{Path, PathBuf},
+    sync::Arc,
+    thread,
 };
 
 use constants::CODEOWNERS_LOCATIONS;
@@ -17,7 +20,14 @@ use wasm_bindgen::prelude::*;
 
 #[cfg(feature = "pyo3")]
 use crate::{github::BindingsGitHubOwners, gitlab::BindingsGitLabOwners};
-use crate::{github::GitHubOwners, gitlab::GitLabOwners, traits::FromReader};
+use crate::{
+    github::GitHubOwners,
+    gitlab::GitLabOwners,
+    traits::{FromReader, OwnersOfPath},
+};
+
+/// A bundle upload ID paired with the raw bytes of that bundle's CODEOWNERS file, if any.
+pub type BundleUploadIDAndCodeOwnersBytes = (String, Option<Vec<u8>>);
 
 // TODO(TRUNK-13628): Implement serializing and deserializing for CodeOwners
 #[derive(Default, Debug, Serialize, Deserialize, Clone, PartialEq, Eq)]
@@ -86,6 +96,48 @@ impl CodeOwners {
             owners: owners_result.ok(),
         }
     }
+
+    /// Parses many CODEOWNERS files in parallel.
+    ///
+    /// `to_parse` is split into at most `num_threads` contiguous chunks, and each chunk is
+    /// parsed on its own thread. A `num_threads` of `0` is treated as `1`.
+    ///
+    /// Returns a map from bundle upload ID to the parsed [`CodeOwners`], or to `None` for
+    /// entries that carried no CODEOWNERS bytes. If a bundle upload ID occurs more than once,
+    /// the entry that appears last in `to_parse` wins.
+    ///
+    /// # Panics
+    ///
+    /// Panics if a worker thread panics.
+    pub fn parse_many_multithreaded(
+        to_parse: Vec<BundleUploadIDAndCodeOwnersBytes>,
+        num_threads: usize,
+    ) -> HashMap<String, Option<Self>> {
+        let num_to_parse = to_parse.len();
+        // Spawn every worker before joining any of them so that the chunks run concurrently.
+        let handles: Vec<_> = split_into_chunks(to_parse, num_threads)
+            .into_iter()
+            .map(|chunk| {
+                thread::spawn(move || {
+                    chunk
+                        .into_iter()
+                        .map(|(bundle_upload_id, codeowners_bytes)| {
+                            (bundle_upload_id, codeowners_bytes.map(Self::parse))
+                        })
+                        .collect::<Vec<_>>()
+                })
+            })
+            .collect();
+
+        let mut results = HashMap::with_capacity(num_to_parse);
+        for handle in handles {
+            let parsed = handle
+                .join()
+                .expect("CODEOWNERS parsing worker thread panicked");
+            results.extend(parsed);
+        }
+        results
+    }
 }
 
 #[derive(Debug, Clone, PartialEq, Eq)]
@@ -123,6 +175,96 @@ impl BindingsOwners {
     }
 }
 
+/// Splits `items` into at most `max_chunks` contiguous chunks.
+///
+/// Every chunk has the same size except possibly the last, the order of `items` is
+/// preserved, and no chunk is empty. A `max_chunks` of `0` is treated as `1`.
+fn split_into_chunks<T>(items: Vec<T>, max_chunks: usize) -> Vec<Vec<T>> {
+    if items.is_empty() {
+        return Vec::new();
+    }
+
+    // Clamping keeps the ceiling division below free of division by zero and overflow.
+    let num_chunks = max_chunks.clamp(1, items.len());
+    let chunk_size = (items.len() + num_chunks - 1) / num_chunks;
+
+    let mut remaining = items.into_iter();
+    let mut chunks = Vec::with_capacity(num_chunks);
+    loop {
+        let chunk: Vec<T> = remaining.by_ref().take(chunk_size).collect();
+        if chunk.is_empty() {
+            break;
+        }
+        chunks.push(chunk);
+    }
+    chunks
+}
+
+/// Looks up the code owners of many files in parallel.
+///
+/// Each entry of `to_associate` pairs a bundle upload ID with an optional file path. The ID
+/// selects a matcher from `codeowners_matchers`, which is asked for the owners of the file.
+/// `to_associate` is split into at most `num_threads` contiguous chunks, one per worker
+/// thread. A `num_threads` of `0` is treated as `1`.
+///
+/// Returns one list of owners per entry of `to_associate`, in the same order. A list is
+/// empty when the ID has no matcher, the entry has no file, or the matcher reports no
+/// owners for the file.
+///
+/// # Panics
+///
+/// Panics if a worker thread panics.
+pub fn associate_codeowners_multithreaded(
+    codeowners_matchers: HashMap<String, Option<Owners>>,
+    to_associate: Vec<(String, Option<String>)>,
+    num_threads: usize,
+) -> Vec<Vec<String>> {
+    // The matchers are only read, so workers can share them without a lock.
+    let codeowners_matchers = Arc::new(codeowners_matchers);
+
+    // Spawn every worker before joining any of them so that the chunks run concurrently.
+    let handles: Vec<_> = split_into_chunks(to_associate, num_threads)
+        .into_iter()
+        .map(|chunk| {
+            let codeowners_matchers = Arc::clone(&codeowners_matchers);
+            thread::spawn(move || {
+                chunk
+                    .into_iter()
+                    .map(|(bundle_upload_id, file)| -> Vec<String> {
+                        match (codeowners_matchers.get(&bundle_upload_id), &file) {
+                            (Some(Some(owners)), Some(file)) => match owners {
+                                Owners::GitHubOwners(gho) => gho
+                                    .of(file)
+                                    .unwrap_or_default()
+                                    .iter()
+                                    .map(ToString::to_string)
+                                    .collect(),
+                                Owners::GitLabOwners(glo) => glo
+                                    .of(file)
+                                    .unwrap_or_default()
+                                    .iter()
+                                    .map(ToString::to_string)
+                                    .collect(),
+                            },
+                            _ => Vec::new(),
+                        }
+                    })
+                    .collect::<Vec<_>>()
+            })
+        })
+        .collect();
+
+    // Chunks are contiguous and joined in spawn order, so results line up with the input.
+    handles
+        .into_iter()
+        .flat_map(|handle| {
+            handle
+                .join()
+                .expect("CODEOWNERS association worker thread panicked")
+        })
+        .collect()
+}
+
 const CODEOWNERS: &str = "CODEOWNERS";
 
 fn locate_codeowners<T, U>(repo_root: T, location: U) -> Option<PathBuf>
@@ -137,3 +279,78 @@ where
         None
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn make_codeowners_bytes(i: usize) -> Vec<u8> {
+        format!("{i}.txt @user{i}").into_bytes()
+    }
+
+    #[test]
+    pub fn test_multithreaded_parsing_and_association() {
+        let num_codeowners_files = 100;
+        let num_files_to_associate_owners = 1000;
+        let num_threads = 4;
+
+        let codeowners_files: Vec<BundleUploadIDAndCodeOwnersBytes> = (0..num_codeowners_files)
+            .map(|i| (i.to_string(), Some(make_codeowners_bytes(i))))
+            .collect();
+        let to_associate: Vec<(String, Option<String>)> = (0..num_files_to_associate_owners)
+            .map(|i| {
+                let mut file = "foo".to_string();
+                if i % 2 == 0 {
+                    let file_prefix = i % num_codeowners_files;
+                    file = format!("{file_prefix}.txt");
+                }
+                ((i % num_codeowners_files).to_string(), Some(file))
+            })
+            .collect();
+
+        let codeowners_matchers =
+            CodeOwners::parse_many_multithreaded(codeowners_files, num_threads)
+                .into_iter()
+                .map(|(bundle_upload_id, codeowners)| {
+                    (
+                        bundle_upload_id,
+                        codeowners.and_then(|codeowners| codeowners.owners),
+                    )
+                })
+                .collect();
+        let owners = crate::associate_codeowners_multithreaded(
+            codeowners_matchers,
+            to_associate,
+            num_threads,
+        );
+
+        assert_eq!(owners.len(), num_files_to_associate_owners);
+        for (i, owners) in owners.iter().enumerate() {
+            if i % 2 == 0 {
+                assert_eq!(owners.len(), 1);
+                let user_id = i % num_codeowners_files;
+                assert_eq!(owners[0], format!("@user{user_id}"));
+            } else {
+                assert_eq!(owners.len(), 0);
+            }
+        }
+    }
+
+    #[test]
+    pub fn test_multithreaded_handles_empty_input_and_zero_threads() {
+        assert!(CodeOwners::parse_many_multithreaded(Vec::new(), 4).is_empty());
+        let owners = crate::associate_codeowners_multithreaded(HashMap::new(), Vec::new(), 4);
+        assert!(owners.is_empty());
+
+        let parsed = CodeOwners::parse_many_multithreaded(vec![("0".to_string(), None)], 0);
+        assert_eq!(parsed.len(), 1);
+        assert!(parsed["0"].is_none());
+
+        let owners = crate::associate_codeowners_multithreaded(
+            HashMap::new(),
+            vec![("0".to_string(), Some("foo.txt".to_string()))],
+            0,
+        );
+        assert_eq!(owners, vec![Vec::<String>::new()]);
+    }
+}
diff --git a/codeowners/src/lib.rs b/codeowners/src/lib.rs
index a313c783..f57f6654 100644
--- a/codeowners/src/lib.rs
+++ b/codeowners/src/lib.rs
@@ -3,7 +3,10 @@ mod github;
 mod gitlab;
 mod traits;
 
-pub use codeowners::{BindingsOwners, CodeOwners, Owners};
+pub use codeowners::{
+    associate_codeowners_multithreaded, BindingsOwners, BundleUploadIDAndCodeOwnersBytes,
+    CodeOwners, Owners,
+};
 pub use github::{BindingsGitHubOwners, GitHubOwner, GitHubOwners};
 pub use gitlab::{BindingsGitLabOwners, GitLabOwner, GitLabOwners};
 pub use traits::{FromPath, FromReader, OwnersOfPath};
diff --git a/context-py/src/lib.rs b/context-py/src/lib.rs
index 578b1fd1..1b42bfa8 100644
--- a/context-py/src/lib.rs
+++ b/context-py/src/lib.rs
@@ -1,11 +1,13 @@
-use std::sync::{Arc, RwLock};
-use std::{collections::HashMap, io::BufReader, thread};
+use std::{collections::HashMap, io::BufReader};
 
 use bundle::{
     parse_meta as parse_meta_impl, parse_meta_from_tarball as parse_meta_from_tarball_impl,
     BindingsVersionedBundle,
 };
-use codeowners::{BindingsOwners, CodeOwners};
+use codeowners::{
+    associate_codeowners_multithreaded as associate_codeowners, BindingsOwners,
+    BundleUploadIDAndCodeOwnersBytes, CodeOwners,
+};
 use context::{env, junit, meta, repo};
 use prost::Message;
 use pyo3::{exceptions::PyTypeError, prelude::*};
@@ -186,51 +188,47 @@ fn codeowners_parse(codeowners_bytes: Vec<u8>) -> PyResult<BindingsOwners> {
     }
 }
 
+/// Parses many CODEOWNERS files in parallel using up to `num_threads` threads.
+///
+/// Returns a map from bundle upload ID to the parsed owners, or to `None` when the entry had
+/// no CODEOWNERS bytes or no owners could be parsed from them.
+#[gen_stub_pyfunction]
+#[pyfunction]
+fn parse_many_codeowners_multithreaded(
+    to_parse: Vec<BundleUploadIDAndCodeOwnersBytes>,
+    num_threads: usize,
+) -> HashMap<String, Option<BindingsOwners>> {
+    CodeOwners::parse_many_multithreaded(to_parse, num_threads)
+        .into_iter()
+        .map(|(bundle_upload_id, codeowners)| {
+            (
+                bundle_upload_id,
+                codeowners.and_then(|codeowners| codeowners.owners.map(BindingsOwners)),
+            )
+        })
+        .collect()
+}
+
+/// Looks up the code owners of many files in parallel using up to `num_threads` threads.
+///
+/// Returns one list of owners per entry of `to_associate`, in the same order.
 #[gen_stub_pyfunction]
 #[pyfunction]
 fn associate_codeowners_multithreaded(
     codeowners_matchers: HashMap<String, Option<BindingsOwners>>,
-    to_match: Vec<(String, Option<String>)>,
+    to_associate: Vec<(String, Option<String>)>,
     num_threads: usize,
 ) -> Vec<Vec<String>> {
-    let chunk_size = (to_match.len() + num_threads - 1) / num_threads;
-    let mut handles = Vec::with_capacity(num_threads);
-    let shared_map = Arc::new(RwLock::new(codeowners_matchers));
-
-    for chunk in to_match.chunks(chunk_size) {
-        let chunk = chunk.to_vec();
-        let shared_map = Arc::clone(&shared_map);
-        let handle = thread::spawn(move || {
-            let map = shared_map.read().unwrap();
-            chunk
-                .into_iter()
-                .map(|bundle_upload_id_and_file| -> Vec<String> {
-                    let matcher = map.get(&bundle_upload_id_and_file.0);
-                    match (matcher, &bundle_upload_id_and_file.1) {
-                        (Some(Some(bo)), Some(file)) => {
-                            if let Some(gho) = bo.get_github_owners() {
-                                gho.of(file.to_string()).unwrap_or_default()
-                            } else if let Some(glo) = bo.get_gitlab_owners() {
-                                glo.of(file.to_string()).unwrap_or_default()
-                            } else {
-                                Vec::new()
-                            }
-                        }
-                        _ => Vec::new(),
-                    }
-                })
-                .collect::<Vec<Vec<String>>>()
-        });
-        handles.push(handle);
-    }
-
-    let mut result = Vec::new();
-    for handle in handles {
-        let chunk_result = handle.join().unwrap();
-        result.extend(chunk_result);
-    }
-
-    result
+    associate_codeowners(
+        codeowners_matchers
+            .into_iter()
+            .map(|(bundle_upload_id, codeowners)| {
+                (bundle_upload_id, codeowners.map(|codeowners| codeowners.0))
+            })
+            .collect(),
+        to_associate,
+        num_threads,
+    )
 }
 
 #[pymodule]
@@ -278,6 +276,7 @@ fn context_py(m: &Bound<'_, PyModule>) -> PyResult<()> {
     m.add_class::<codeowners::BindingsOwners>()?;
     m.add_function(wrap_pyfunction!(codeowners_parse, m)?)?;
     m.add_function(wrap_pyfunction!(associate_codeowners_multithreaded, m)?)?;
+    m.add_function(wrap_pyfunction!(parse_many_codeowners_multithreaded, m)?)?;
 
     Ok(())
 }
diff --git a/context-py/tests/test_parse_codeowners.py b/context-py/tests/test_parse_codeowners.py
index b2e8c470..e651e93a 100644
--- a/context-py/tests/test_parse_codeowners.py
+++ b/context-py/tests/test_parse_codeowners.py
@@ -150,80 +150,46 @@ def test_parse_codeowners_from_bytes_gitlab_sections():
     ]
 
 
-def test_associate_multithreaded():
-    from context_py import associate_codeowners_multithreaded, codeowners_parse
+def test_parse_and_associate_multithreaded():
+    from context_py import (
+        associate_codeowners_multithreaded,
+        parse_many_codeowners_multithreaded,
+    )
 
-    codeowners_text = b"""
-    # This is an example of a CODEOWNERS file.
-    # Lines that start with `#` are ignored.
-
-    # app/ @commented-rule
-
-    # Specify a default Code Owner by using a wildcard:
-    * @default-codeowner
-
-    # Specify multiple Code Owners by using a tab or space:
-    * @multiple @code @owners
-
-    # Rules defined later in the file take precedence over the rules
-    # defined before.
-    # For example, for all files with a filename ending in `.rb`:
-    *.rb @ruby-owner
-
-    # Specify multiple Code Owners separated by spaces or tabs.
-    # In the following case the CODEOWNERS file from the root of the repo
-    # has 3 Code Owners (@multiple @code @owners):
-    CODEOWNERS @multiple @code @owners
-
-    # You can use both usernames or email addresses to match
-    # users. Everything else is ignored. For example, this code
-    # specifies the `@legal` and a user with email `janedoe@gitlab.com` as the
-    # owner for the LICENSE file:
-    LICENSE @legal this_does_not_match janedoe@gitlab.com
-
-    # Use group names to match groups, and nested groups to specify
-    # them as owners for a file:
-    README @group @group/with-nested/subgroup
-
-    # End a path in a `/` to specify the Code Owners for every file
-    # nested in that directory, on any level:
-    /docs/ @all-docs
-
-    # End a path in `/*` to specify Code Owners for every file in
-    # a directory, but not nested deeper. This code matches
-    # `docs/index.md` but not `docs/projects/index.md`:
-    /docs/* @root-docs
-
-    # Include `/**` to specify Code Owners for all subdirectories
-    # in a directory. This rule matches `docs/projects/index.md` or
-    # `docs/development/index.md`
-    /docs/**/*.md @root-docs
-
-    # This code makes matches a `lib` directory nested anywhere in the repository:
-    lib/ @lib-owner
-
-    # This code match only a `config` directory in the root of the repository:
-    /config/ @config-owner
+    def make_codeowners_bytes(i: int) -> bytes:
+        return f"{i}.txt @user{i}".encode()
 
-    # Code Owners section:
-    [Documentation]
-    ee/docs @docs
-    docs @docs
+    num_codeowners_files = 100
+    num_files_to_associate_owners = 1000
+    num_threads = 4
 
-    # Use of default owners for a section. In this case, all files (*) are owned by the dev team except the README.md and data-models which are owned by other teams.
-    [Development] @dev-team
-    *
-    README.md @docs-team
-    data-models/ @data-science-team
-
-    # This section is combined with the previously defined [Documentation] section:
-    [DOCUMENTATION]
-    README.md @docs
-    """
-    codeowners = codeowners_parse(codeowners_text)
-    matchers = {"id_1": codeowners}
-    files = [("id_1", "foo.rb"), ("id_2", "bar.js"), ("id_3", None)]
+    codeowners_files = [
+        (f"{i}", make_codeowners_bytes(i)) for i in range(0, num_codeowners_files)
+    ]
+    to_associate = [
+        (
+            f"{i % num_codeowners_files}",
+            f"{i % num_codeowners_files if i % 2 == 0 else 'foo'}.txt",
+        )
+        for i in range(0, num_files_to_associate_owners)
+    ]
 
-    results = associate_codeowners_multithreaded(matchers, files, 4)
-    print(results)
-    assert len(results) == 3
+    codeowners_matchers = parse_many_codeowners_multithreaded(
+        codeowners_files, num_threads
+    )
+    owners = associate_codeowners_multithreaded(
+        codeowners_matchers, to_associate, num_threads
+    )
+
+    assert len(owners) == num_files_to_associate_owners
+
+    for i in range(0, num_files_to_associate_owners):
+        if i % 2 == 0:
+            assert len(owners[i]) == 1
+            assert owners[i][0] == f"@user{i % num_codeowners_files}"
+        else:
+            assert len(owners[i]) == 0
+
+    # Empty inputs must yield empty results rather than raising.
+    assert parse_many_codeowners_multithreaded([], num_threads) == {}
+    assert associate_codeowners_multithreaded({}, [], num_threads) == []