Skip to content

Commit

Permalink
Add some missing attributes and one API for server support
Browse files Browse the repository at this point in the history
There are a few attributes servers need in order to support
singleton operations as well as user-specified time limits
on job and spawn operations. Current server module functions
do not include the ability for the server library to pass
a local status from a collective operation (i.e., when the
server library aggregates all local participation before
calling the host)

Add a few job-related error constants for clear reporting
to the user.

Add a useful variant of a data management function.

Signed-off-by: Ralph Castain <[email protected]>
  • Loading branch information
rhc54 authored and jjhursey committed Mar 7, 2022
1 parent 7f436bd commit f4752ac
Show file tree
Hide file tree
Showing 5 changed files with 102 additions and 1 deletion.
42 changes: 41 additions & 1 deletion Chap_API_Data_Mgmt.tex
Original file line number Diff line number Diff line change
Expand Up @@ -601,7 +601,7 @@ \subsection{\code{PMIx_Data_decompress}}
\begin{codepar}
bool
PMIx_Data_decompress(const uint8_t *inbytes, size_t size,
uint8_t **outbytes, size_t *nbytes,);
uint8_t **outbytes, size_t *nbytes);
\end{codepar}
\cspecificend

Expand Down Expand Up @@ -632,3 +632,43 @@ \subsection{\code{PMIx_Data_decompress}}
unexpected and potentially catastrophic results.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\subsection{\code{PMIx_Data_embed}}
\declareapi{PMIx_Data_embed}

%%%%
\summary

Embed a data payload into a buffer

%%%%
\format

\versionMarker{4.2}
\cspecificstart
\begin{codepar}
pmix_status_t
PMIx_Data_embed(pmix_data_buffer_t *buffer,
const pmix_byte_object_t *payload);
\end{codepar}
\cspecificend

\begin{arglist}
\argout{buffer}{Address of the buffer where the payload is to be embedded (handle)}
\argin{payload}{Address of the \refstruct{pmix_byte_object_t} structure containing the data to be embedded into the buffer (handle)}
\end{arglist}

Returns one of the following:
\begin{constantdesc}
\item \refconst{PMIX_SUCCESS} The data has been embedded as requested
\item \refconst{PMIX_ERR_BAD_PARAM} The destination and/or source pointer is \code{NULL}
\item \refconst{PMIX_ERR_NOT_SUPPORTED} The \ac{PMIx} implementation does not support this function.
\end{constantdesc}

%%%%
\descr

The embed function is identical in operation to \refapi{PMIx_Data_load}
except that it does \emph{not} clear the payload object upon completion.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
34 changes: 34 additions & 0 deletions Chap_API_Proc_Mgmt.tex
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,9 @@ \subsection{\code{PMIx_Spawn}}
\pasteAttributeItem{PMIX_COSPAWN_APP}
\pasteAttributeItem{PMIX_SPAWN_TOOL}
\pasteAttributeItem{PMIX_EVENT_SILENT_TERMINATION}
\pasteAttributeItem{PMIX_ENVARS_HARVESTED}
\pasteAttributeItem{PMIX_JOB_TIMEOUT}
\pasteAttributeItem{PMIX_SPAWN_TIMEOUT}

\optattrend

Expand Down Expand Up @@ -294,6 +297,9 @@ \subsection{\code{PMIx_Spawn_nb}}
\pasteAttributeItem{PMIX_COSPAWN_APP}
\pasteAttributeItem{PMIX_SPAWN_TOOL}
\pasteAttributeItem{PMIX_EVENT_SILENT_TERMINATION}
\pasteAttributeItem{PMIX_ENVARS_HARVESTED}
\pasteAttributeItem{PMIX_JOB_TIMEOUT}
\pasteAttributeItem{PMIX_SPAWN_TIMEOUT}

\optattrend

Expand Down Expand Up @@ -329,6 +335,18 @@ \subsection{Spawn-specific constants}
\declareconstitemNEW{PMIX_ERR_JOB_FAILED_TO_LAUNCH}
One or more processes in the job request failed to launch
%
\declareconstitemNEW{PMIX_ERR_JOB_EXE_NOT_FOUND}
Specified executable not found
%
\declareconstitemNEW{PMIX_ERR_JOB_INSUFFICIENT_RESOURCES}
Insufficient resources to spawn job
%
\declareconstitemNEW{PMIX_ERR_JOB_SYS_OP_FAILED}
System library operation failed
%
\declareconstitemNEW{PMIX_ERR_JOB_WDIR_NOT_FOUND}
Specified working directory not found
%
\end{constantdesc}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Expand Down Expand Up @@ -536,6 +554,22 @@ \subsection{Spawn attributes}
\declareAttribute{PMIX_EVENT_SILENT_TERMINATION}{"pmix.evsilentterm"}{bool}{
Do not generate an event when this job normally terminates.
}
%
\declareAttributeNEW{PMIX_ENVARS_HARVESTED}{"pmix.evar.hvstd"}{bool}{
Environment variables have already been harvested by the spawn requestor, so the server
does not need to harvest them.
}
%
\declareAttributeNEW{PMIX_JOB_TIMEOUT}{"pmix.job.time"}{int}{
Time in seconds before the spawned job should time out and be terminated (0 => infinite).
}
%
\declareAttributeNEW{PMIX_SPAWN_TIMEOUT}{"pmix.sp.time"}{int}{
Time in seconds before the spawn operation should time out (0 => infinite).
Although logically equivalent to passing the \refattr{PMIX_TIMEOUT} attribute to the
\refapi{PMIx_Spawn} \ac{API}, it is provided as a separate attribute to distinguish
it from the \refattr{PMIX_JOB_TIMEOUT} attribute.
}
\vspace{\baselineskip}
Attributes used to adjust remote environment variables prior to spawning the specified application processes.
Expand Down
4 changes: 4 additions & 0 deletions Chap_API_Reserved_Keys.tex
Original file line number Diff line number Diff line change
Expand Up @@ -485,6 +485,10 @@ \subsection{Node realm keys}
\declareAttribute{PMIX_LOCAL_SIZE}{"pmix.local.size"}{uint32_t}{
Number of processes in the specified job or application realm on the caller's node. Defaults to job realm unless the \refattr{PMIX_APP_INFO} and the \refattr{PMIX_APPNUM} qualifiers are given.
}
%
\declareAttributeNEW{PMIX_NODE_OVERSUBSCRIBED}{"pmix.ndosub"}{bool}{
True if the number of processes from this job on this node exceeds the number of slots allocated to it.
}

\vspace{\baselineskip}

Expand Down
19 changes: 19 additions & 0 deletions Chap_API_Server.tex
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ \subsection{\code{PMIx_server_init}}
\pasteAttributeItemEnd{}
\pasteAttributeItem{PMIX_SERVER_ENABLE_MONITORING}
\pasteAttributeItem{PMIX_HOMOGENEOUS_SYSTEM}
\pasteAttributeItem{PMIX_SINGLETON}

\optattrend

Expand Down Expand Up @@ -233,6 +234,10 @@ \subsection{Server Initialization Attributes}
\declareAttributeNEW{PMIX_HOMOGENEOUS_SYSTEM}{"pmix.homo"}{bool}{
The nodes comprising the session are homogeneous - i.e., they each contain the same number of identical packages, fabric interfaces, \acp{GPU}, and other devices.
}
%
\declareAttributeNEW{PMIX_SINGLETON}{"pmix.singleton"}{char*}{
String representation (nspace.rank) of the process ID of the singleton that the server was started to support.
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Expand Down Expand Up @@ -473,6 +478,7 @@ \subsection{\code{PMIx_server_register_nspace}}
\item \pasteAttributeItem{PMIX_NODE_SIZE}
\item \pasteAttributeItem{PMIX_LOCALLDR}
\item \pasteAttributeItem{PMIX_LOCAL_PEERS}
\item \pasteAttributeItem{PMIX_NODE_OVERSUBSCRIBED}
\end{itemize}

plus the following information for the server's own node:
Expand Down Expand Up @@ -2388,6 +2394,7 @@ \subsection{\code{pmix_server_fencenb_fn_t}}
The following attributes are required to be supported by all host environments:

\pasteAttributeItem{PMIX_COLLECT_DATA}
\pasteAttributeItem{PMIX_LOCAL_COLLECTIVE_STATUS}

\reqattrend

Expand Down Expand Up @@ -2886,6 +2893,8 @@ \subsection{\code{pmix_server_spawn_fn_t}}
\pasteAttributeItem{PMIX_JOB_CONTINUOUS}
\pasteAttributeItem{PMIX_MAX_RESTARTS}
\pasteAttributeItem{PMIX_TIMEOUT}
\pasteAttributeItem{PMIX_JOB_TIMEOUT}
\pasteAttributeItem{PMIX_SPAWN_TIMEOUT}

\optattrend

Expand Down Expand Up @@ -2957,6 +2966,8 @@ \subsection{\code{pmix_server_connect_fn_t}}
\end{itemize}

\reqattrstart
\pasteAttributeItem{PMIX_LOCAL_COLLECTIVE_STATUS}

\ac{PMIx} libraries are required to pass any provided attributes to the host environment for processing.
\reqattrend

Expand Down Expand Up @@ -3025,6 +3036,8 @@ \subsection{\code{pmix_server_disconnect_fn_t}}
\end{itemize}

\reqattrstart
\pasteAttributeItem{PMIX_LOCAL_COLLECTIVE_STATUS}

\ac{PMIx} libraries are required to pass any provided attributes to the host environment for processing.
\reqattrend

Expand Down Expand Up @@ -4310,6 +4323,12 @@ \subsection{\code{pmix_server_grp_fn_t}}
\item a PMIx error constant indicating either an error in the input or that the request was immediately processed and failed - the \refarg{cbfunc} will not be called
\end{itemize}

\reqattrstart
The following attributes are required to be supported by a host environment.

\pasteAttributeItem{PMIX_LOCAL_COLLECTIVE_STATUS}
\reqattrend

\optattrstart
The following attributes may be supported by a host environment.

Expand Down
4 changes: 4 additions & 0 deletions Chap_API_Sync_Access.tex
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,10 @@ \subsection{Fence-related attributes}
available to each participant at the end of the operation. By default, this will include all job-level information that was locally generated by \ac{PMIx} servers unless excluded using the \refattr{PMIX_COLLECT_GENERATED_JOB_INFO} attribute.
}
%
\declareAttributeNEW{PMIX_LOCAL_COLLECTIVE_STATUS}{"pmix.loc.col.st"}{pmix_status_t}{
Status code for the local collective operation being reported to the host by the server library.
}
%
\declareAttributeNEW{PMIX_COLLECT_GENERATED_JOB_INFO}{"pmix.collect.gen"}{bool}{
Collect all job-level information (i.e., reserved keys) that was locally generated by \ac{PMIx} servers. Some job-level information (e.g., distance between processes and fabric devices) is best determined on a distributed basis as it primarily pertains to local processes. Should remote processes need to access the information, it can either be obtained collectively using the \refapi{PMIx_Fence} operation with this directive, or can be retrieved one peer at a time using \refapi{PMIx_Get} without first having performed the job-wide collection.
}
Expand Down

0 comments on commit f4752ac

Please sign in to comment.