From 68b5b429712aba614b68be4e574cadf3d56bcdcf Mon Sep 17 00:00:00 2001 From: "Worker Pants (Pantsbuild GitHub Automation Bot)" <133242086+WorkerPants@users.noreply.github.com> Date: Wed, 30 Aug 2023 17:47:24 -0500 Subject: [PATCH] Remove memoization of the `Paths` intrinsic to reduce memory usage (Cherry-pick of #19689) (#19716) Remove memoization of the `Paths` intrinsic, since it is very memory intensive, and relatively cheap to recompute from its memoized inputs. Improves performance for `pants dependencies ::` in the Pants repo by 6%, and eliminates the `paths` Node line item mentioned on #19053, since it will no longer be held in memory by memoization. Fixes #19053. Co-authored-by: Stu Hood --- src/rust/engine/src/intrinsics.rs | 48 ++++++++++++-- src/rust/engine/src/nodes.rs | 103 ++---------------------------- 2 files changed, 46 insertions(+), 105 deletions(-) diff --git a/src/rust/engine/src/intrinsics.rs b/src/rust/engine/src/intrinsics.rs index c00448cfde3..dcdbdcb09ad 100644 --- a/src/rust/engine/src/intrinsics.rs +++ b/src/rust/engine/src/intrinsics.rs @@ -12,8 +12,8 @@ use std::time::Duration; use crate::context::Context; use crate::externs::fs::{PyAddPrefix, PyFileDigest, PyMergeDigests, PyRemovePrefix}; use crate::nodes::{ - lift_directory_digest, task_side_effected, DownloadedFile, ExecuteProcess, NodeResult, Paths, - RunId, SessionValues, Snapshot, + lift_directory_digest, task_side_effected, unmatched_globs_additional_context, DownloadedFile, + ExecuteProcess, NodeResult, RunId, SessionValues, Snapshot, }; use crate::python::{throw, Key, Value}; use crate::tasks::Intrinsic; @@ -36,7 +36,10 @@ use pyo3::{IntoPy, PyAny, PyRef, Python, ToPyObject}; use tokio::process; use docker::docker::{ImagePullPolicy, ImagePullScope, DOCKER, IMAGE_PULL_CACHE}; -use fs::{DigestTrie, DirectoryDigest, Entry, RelativePath, SymlinkBehavior, TypedPath}; +use fs::{ + DigestTrie, DirectoryDigest, Entry, GlobMatching, PathStat, RelativePath, SymlinkBehavior, + TypedPath, +}; use hashing::{Digest, EMPTY_DIGEST}; use process_execution::local::{ apply_chroot, create_sandbox, prepare_workdir, setup_run_sh_script, KeepSandboxes, @@ -411,10 +414,41 @@ fn path_globs_to_paths( Snapshot::lift_path_globs(py_path_globs) }) .map_err(|e| throw(format!("Failed to parse PathGlobs: {e}")))?; - let paths = context.get(Paths::from_path_globs(path_globs)).await?; - Ok(Python::with_gil(|py| { - Paths::store_paths(py, &core, &paths) - })?) + + let path_globs = path_globs.parse().map_err(throw)?; + let path_stats = context + .expand_globs( + path_globs, + SymlinkBehavior::Oblivious, + unmatched_globs_additional_context(), + ) + .await?; + + Python::with_gil(|py| { + let mut files = Vec::new(); + let mut dirs = Vec::new(); + for ps in path_stats.iter() { + match ps { + PathStat::File { path, .. } => { + files.push(Snapshot::store_path(py, path)?); + } + PathStat::Link { path, .. } => { + panic!("Paths shouldn't be symlink-aware {path:?}"); + } + PathStat::Dir { path, .. } => { + dirs.push(Snapshot::store_path(py, path)?); + } + } + } + Ok(externs::unsafe_call( + py, + core.types.paths, + &[ + externs::store_tuple(py, files), + externs::store_tuple(py, dirs), + ], + )) + }) } .boxed() } diff --git a/src/rust/engine/src/nodes.rs b/src/rust/engine/src/nodes.rs index c4f220d271c..3ad2ac79808 100644 --- a/src/rust/engine/src/nodes.rs +++ b/src/rust/engine/src/nodes.rs @@ -28,8 +28,8 @@ use crate::python::{display_sorted_in_parens, throw, Failure, Key, Params, TypeI use crate::tasks::{self, Rule}; use fs::{ self, DigestEntry, Dir, DirectoryDigest, DirectoryListing, File, FileContent, FileEntry, - GlobExpansionConjunction, GlobMatching, Link, PathGlobs, PathStat, PreparedPathGlobs, - RelativePath, StrictGlobMatching, SymlinkBehavior, SymlinkEntry, Vfs, + GlobExpansionConjunction, GlobMatching, Link, PathGlobs, PreparedPathGlobs, RelativePath, + StrictGlobMatching, SymlinkBehavior, SymlinkEntry, Vfs, }; use process_execution::{ self, CacheName, InputDigests, Process, ProcessCacheScope, ProcessExecutionStrategy, @@ -639,7 +639,7 @@ impl From for NodeKey { } } -fn unmatched_globs_additional_context() -> Option { +pub fn unmatched_globs_additional_context() -> Option { let url = Python::with_gil(|py| { externs::doc_url( py, @@ -653,75 +653,6 @@ fn unmatched_globs_additional_context() -> Option { )) } -/// -/// A node that captures Vec for resolved files/dirs from PathGlobs. -/// -/// This is similar to the Snapshot node, but avoids digesting the files and writing to LMDB store -/// as a performance optimization. -/// -#[derive(Clone, Debug, DeepSizeOf, Eq, Hash, PartialEq)] -pub struct Paths { - path_globs: PathGlobs, -} - -impl Paths { - pub fn from_path_globs(path_globs: PathGlobs) -> Paths { - Paths { path_globs } - } - - async fn create(context: Context, path_globs: PreparedPathGlobs) -> NodeResult> { - context - .expand_globs( - path_globs, - SymlinkBehavior::Oblivious, - unmatched_globs_additional_context(), - ) - .await - } - - pub fn store_paths(py: Python, core: &Arc, item: &[PathStat]) -> Result { - let mut files = Vec::new(); - let mut dirs = Vec::new(); - for ps in item.iter() { - match ps { - PathStat::File { path, .. } => { - files.push(Snapshot::store_path(py, path)?); - } - PathStat::Link { path, .. } => { - panic!("Paths shouldn't be symlink-aware {path:?}"); - } - PathStat::Dir { path, .. } => { - dirs.push(Snapshot::store_path(py, path)?); - } - } - } - Ok(externs::unsafe_call( - py, - core.types.paths, - &[ - externs::store_tuple(py, files), - externs::store_tuple(py, dirs), - ], - )) - } - - async fn run_node(self, context: Context) -> NodeResult>> { - let path_globs = self.path_globs.parse().map_err(throw)?; - let path_stats = Self::create(context, path_globs).await?; - Ok(Arc::new(path_stats)) - } -} - -impl CompoundNode for Paths { - type Item = Arc>; -} - -impl From for NodeKey { - fn from(n: Paths) -> Self { - NodeKey::Paths(n) - } -} - #[derive(Clone, Debug, DeepSizeOf, Eq, Hash, PartialEq)] pub struct SessionValues; @@ -828,7 +759,7 @@ impl Snapshot { Ok(Value::new(py_snapshot.into_py(py))) } - fn store_path(py: Python, item: &Path) -> Result { + pub fn store_path(py: Python, item: &Path) -> Result { if let Some(p) = item.as_os_str().to_str() { Ok(externs::store_utf8(py, p)) } else { @@ -1301,7 +1232,6 @@ pub enum NodeKey { Scandir(Scandir), Select(Box