diff --git a/activitysim/abm/models/joint_tour_frequency_composition.py b/activitysim/abm/models/joint_tour_frequency_composition.py index 44e57d604..bf6031c2e 100644 --- a/activitysim/abm/models/joint_tour_frequency_composition.py +++ b/activitysim/abm/models/joint_tour_frequency_composition.py @@ -132,10 +132,10 @@ def joint_tour_frequency_composition( alternatives=alt_tdd, spec=model_spec, locals_d=constants, - chunk_size=state.settings.chunk_size, trace_label=trace_label, trace_choice_name=trace_label, estimator=estimator, + explicit_chunk_size=0, ) if estimator: diff --git a/activitysim/abm/models/non_mandatory_tour_frequency.py b/activitysim/abm/models/non_mandatory_tour_frequency.py index b300b0e88..af3871bd5 100644 --- a/activitysim/abm/models/non_mandatory_tour_frequency.py +++ b/activitysim/abm/models/non_mandatory_tour_frequency.py @@ -178,6 +178,9 @@ class NonMandatoryTourFrequencySettings(LogitComponentSettings): annotate_tours: PreprocessorSettings | None = None """Preprocessor settings to annotate tours""" + explicit_chunk: int = 0 + """Number of rows to process in each chunk when explicit chunking is enabled""" + @workflow.step def non_mandatory_tour_frequency( @@ -305,10 +308,10 @@ def non_mandatory_tour_frequency( spec=segment_spec, log_alt_losers=log_alt_losers, locals_d=constants, - chunk_size=state.settings.chunk_size, trace_label="non_mandatory_tour_frequency.%s" % segment_name, trace_choice_name="non_mandatory_tour_frequency", estimator=estimator, + explicit_chunk_size=model_settings.explicit_chunk, ) if estimator: diff --git a/activitysim/abm/models/school_escorting.py b/activitysim/abm/models/school_escorting.py index ed4e15bb3..d7311798b 100644 --- a/activitysim/abm/models/school_escorting.py +++ b/activitysim/abm/models/school_escorting.py @@ -408,6 +408,9 @@ class SchoolEscortSettings(BaseLogitComponentSettings): preprocessor_inbound: PreprocessorSettings | None = None preprocessor_outbound_cond: PreprocessorSettings | None = None + 
explicit_chunk: int = 0 + """If > 0, use this chunk size instead of adaptive chunking.""" + @workflow.step def school_escorting( @@ -539,10 +542,10 @@ def school_escorting( spec=model_spec, log_alt_losers=log_alt_losers, locals_d=locals_dict, - chunk_size=state.settings.chunk_size, trace_label=stage_trace_label, trace_choice_name="school_escorting_" + "stage", estimator=estimator, + explicit_chunk_size=model_settings.explicit_chunk, ) if estimator: diff --git a/activitysim/abm/models/vehicle_type_choice.py b/activitysim/abm/models/vehicle_type_choice.py index 4527f50a3..8271ac6e8 100644 --- a/activitysim/abm/models/vehicle_type_choice.py +++ b/activitysim/abm/models/vehicle_type_choice.py @@ -481,10 +481,10 @@ def iterate_vehicle_type_choice( spec=model_spec, log_alt_losers=log_alt_losers, locals_d=locals_dict, - chunk_size=chunk_size, trace_label=trace_label, trace_choice_name="vehicle_type", estimator=estimator, + explicit_chunk_size=model_settings.explicit_chunk, ) # otherwise, "simple simulation" should suffice, with a model spec that enumerates @@ -583,6 +583,9 @@ class VehicleTypeChoiceSettings(LogitComponentSettings): FLEET_YEAR: int + explicit_chunk: int = 0 + """If > 0, use this chunk size instead of adaptive chunking.""" + @workflow.step def vehicle_type_choice( diff --git a/activitysim/core/configuration.py b/activitysim/core/configuration.py deleted file mode 100644 index 3174653b2..000000000 --- a/activitysim/core/configuration.py +++ /dev/null @@ -1,291 +0,0 @@ -from typing import Union - -try: - from pydantic import BaseModel as PydanticBase -except ModuleNotFoundError: - - class PydanticBase: - pass - - -class InputTable(PydanticBase): - """ - The features that define an input table to be read by ActivitySim. - """ - - tablename: str - """Name of the injected table""" - - filename: str = None - """ - Name of the CSV or HDF5 file to read. 
- - If not provided, defaults to `input_store` - """ - - index_col: str = None - """table column to use for the index""" - - rename_columns: dict[str, str] = None - """dictionary of column name mappings""" - - keep_columns: list[str] = None - """ - columns to keep once read in to memory. - - Save only the columns needed for modeling or analysis to save on memory - and file I/O - """ - - h5_tablename: str = None - """table name if reading from HDF5 and different from `tablename`""" - - -class Settings(PydanticBase): - """ - The overall settings for the ActivitySim model system. - - The input for these settings is typically stored in one main YAML file, - usually called ``settings.yaml``. - - Note that this implementation is presently used only for generating - documentation, but future work may migrate the settings implementation to - actually use this pydantic code to validate the settings before running - the model. - """ - - models: list[str] - """ - list of model steps to run - auto ownership, tour frequency, etc. - - See :ref:`model_steps` for more details about each step. - """ - - resume_after: str = None - """to resume running the data pipeline after the last successful checkpoint""" - - input_table_list: list[InputTable] - """list of table names, indices, and column re-maps for each table in `input_store`""" - - input_store: str = None - """HDF5 inputs file""" - - create_input_store: bool = False - """ - Write the inputs as read in back to an HDF5 store. - - If enabled, this writes the store to the outputs folder to use for subsequent - model runs, as reading HDF5 can be faster than reading CSV files.""" - - households_sample_size: int = None - """ - Number of households to sample and simulate - - If omitted or set to 0, ActivitySim will simulate all households. 
- """ - trace_hh_id: Union[int, list] = None - """ - Trace household id(s) - - If omitted, no tracing is written out - """ - - trace_od: list[int] = None - """ - Trace origin, destination pair in accessibility calculation - - If omitted, no tracing is written out. - """ - - chunk_training_mode: str = None - """ - The method to use for chunk training. - - Valid values include {disabled, training, production, adaptive}. - See :ref:`chunk_size` for more details. - """ - - chunk_size: int = None - """ - Approximate amount of RAM to allocate to ActivitySim for batch processing. - - See :ref:`chunk_size` for more details. - """ - - chunk_method: str = None - """ - Memory use measure to use for chunking. - - See :ref:`chunk_size`. - """ - - checkpoints: Union[bool, list] = True - """ - When to write checkpoint (intermediate table states) to disk. - - If True, checkpoints are written at each step. If False, no intermediate - checkpoints will be written before the end of run. Or, provide an explicit - list of models to checkpoint. - """ - - check_for_variability: bool = False - """ - Debugging feature to find broken model specifications. - - Enabling this check does not alter valid results but slows down model runs. - """ - - log_alt_losers: bool = False - """ - Write out expressions when all alternatives are unavailable. - - This can be useful for model development to catch errors in specifications. - Enabling this check does not alter valid results but slows down model runs. - """ - - use_shadow_pricing: bool = False - """turn shadow_pricing on and off for work and school location""" - - output_tables: list[str] = None - """list of output tables to write to CSV or HDF5""" - - want_dest_choice_sample_tables: bool = False - """turn writing of sample_tables on and off for all models""" - - cleanup_pipeline_after_run: bool = False - """ - Cleans up pipeline after successful run. 
- - This will clean up pipeline only after successful runs, by creating a - single-checkpoint pipeline file, and deleting any subprocess pipelines. - """ - - sharrow: Union[bool, str] = False - """ - Set the sharrow operating mode. - - .. versionadded:: 1.2 - - * `false` - Do not use sharrow. This is the default if no value is given. - * `true` - Use sharrow optimizations when possible, but fall back to - legacy `pandas.eval` systems when any error is encountered. This is the - preferred mode for running with sharrow if reliability is more important - than performance. - * `require` - Use sharrow optimizations, and raise an error if they fail - unexpectedly. This is the preferred mode for running with sharrow - if performance is a concern. - * `test` - Run every relevant calculation using both sharrow and legacy - systems, and compare them to ensure the results match. This is the slowest - mode of operation, but useful for development and debugging. - """ - - -class ZarrDigitalEncoding(PydanticBase): - """Digital encoding instructions for skim tables. - - .. versionadded:: 1.2 - """ - - regex: str - """A regular expression for matching skim matrix names. - - All skims with names that match under typical regular expression rules - for Python will be processed together. - """ - - joint_dict: str - """The name of the joint dictionary for this group. - - This must be a unique name for this set of skims, and a new array - will be added to the Dataset with this name. It will be an integer- - type array indicating the position of each element in the jointly - encoded dictionary.""" - - -class TAZ_Settings(PydanticBase): - """ - Complex settings for TAZ skims that are not just OMX file(s). - - .. versionadded:: 1.2 - """ - - omx: str = None - """The filename of the data stored in OMX format. - - This is treated as a fallback for the raw input data, if ZARR format data - is not available. - """ - - zarr: str = None - """The filename of the data stored in ZARR format. 
- - Reading ZARR data can be much faster than reading OMX format data, so if - this filename is given, the ZARR file format is preferred if it exists. If - it does not exist, then OMX data is read in and then ZARR data is written - out for future usage. - - .. versionadded:: 1.2 - """ - - zarr_digital_encoding: list[ZarrDigitalEncoding] = None - """ - A list of encodings to apply before saving skims in ZARR format. - - .. versionadded:: 1.2 - """ - - -class NetworkSettings(PydanticBase): - """ - Network level of service and skims settings - - The input for these settings is typically stored in one YAML file, - usually called ``network_los.yaml``. - """ - - zone_system: int - """Which zone system type is used. - - * 1 - TAZ only. - * 2 - MAZ and TAZ. - * 3 - MAZ, TAZ, and TAP - """ - - taz_skims: Union[str, TAZ_Settings] = None - """Instructions for how to load and pre-process skim matrices. - - If given as a string, it is interpreted as the location for OMX file(s), - either as a single file or as a glob-matching pattern for multiple files. - The time period for the matrix must be represented at the end of the matrix - name and be seperated by a double_underscore (e.g. `BUS_IVT__AM` indicates base - skim BUS_IVT with a time period of AM. - - Alternatively, this can be given as a nested dictionary defined via the - TAZ_Settings class, which allows for ZARR transformation and pre-processing. - """ - - skim_time_periods: dict - """time period upper bound values and labels - - * ``time_window`` - total duration (in minutes) of the modeled time span (Default: 1440 minutes (24 hours)) - * ``period_minutes`` - length of time (in minutes) each model time period represents. Must be whole factor of ``time_window``. 
(Default: 60 minutes) - * ``periods`` - Breakpoints that define the aggregate periods for skims and assignment - * ``labels`` - Labels to define names for aggregate periods for skims and assignment - """ - - read_skim_cache: bool = False - """Read cached skims (using numpy memmap) from output directory. - - Reading from memmap is much faster than omx, but the memmap is a huge - uncompressed file. - """ - - write_skim_cache: bool = False - """Write memmapped cached skims to output directory. - - This is needed if you want to use the cached skims to speed up subsequent - runs. - """ - - cache_dir: str = None - """alternate dir to read/write cache files (defaults to output_dir)""" diff --git a/activitysim/core/configuration/top.py b/activitysim/core/configuration/top.py index c24861138..0b6121037 100644 --- a/activitysim/core/configuration/top.py +++ b/activitysim/core/configuration/top.py @@ -296,7 +296,34 @@ class Settings(PydanticBase, extra="allow", validate_assignment=True): """ The method to use for chunk training. - Valid values include {disabled, training, production, adaptive}. + * "disabled" + All chunking is disabled. If you have enough RAM, this is the fastest + mode, but it requires potentially a lot of RAM. + * "training" + The model is run in training mode, which tracks the amount of memory + used by each table by submodel and writes the results to a cache file + that is then re-used for production runs. This mode is significantly + slower than production mode since it does significantly more memory + inspection. + * "production" + The model is run in production mode, using the cache file created in + training mode. If no such file is found, the model falls back to + training mode. This mode is significantly faster than training mode, as + it uses the cached memory inspection results to determine chunk sizes. 
+    * "adaptive"
+        Like production mode, any existing cache file is used to determine the
+        starting chunk settings, but the model also updates the cache settings
+        based on additional memory inspection. This may additionally improve the
+        cache settings to reduce runtimes when run in production mode, but at
+        the cost of some slowdown during the run to accommodate extra memory
+        inspection.
+    * "explicit"
+        The model is run without memory inspection, and the chunk cache file is
+        not used, even if it exists. Instead, the chunk size settings are
+        explicitly set in the settings file of each compatible model step. Only
+        those steps that have an "explicit_chunk" setting are chunkable with
+        this mode, all other steps are run without chunking.
+
     See :ref:`chunk_size` for more details.
     """
diff --git a/activitysim/core/interaction_simulate.py b/activitysim/core/interaction_simulate.py
index 451345f2f..30ec2f3f9 100644
--- a/activitysim/core/interaction_simulate.py
+++ b/activitysim/core/interaction_simulate.py
@@ -888,10 +888,10 @@ def interaction_simulate(
     skims=None,
     locals_d=None,
     sample_size=None,
-    chunk_size=0,
     trace_label=None,
     trace_choice_name=None,
     estimator=None,
+    explicit_chunk_size=0,
 ):
     """
     Run a simulation in the situation in which alternatives must
@@ -926,13 +926,14 @@
     sample_size : int, optional
         Sample alternatives with sample of given size. By default is None,
         which does not sample alternatives.
-    chunk_size : int
-        if chunk_size > 0 iterates over choosers in chunk_size chunks
     trace_label: str
        This is the label to be used for trace log file entries and dump file names
        when household tracing enabled. No tracing occurs if label is empty or None.
     trace_choice_name: str
        This is the column label to be used in trace file csv dump of choices
+    explicit_chunk_size : int, optional
+        If > 0, specifies the chunk size to use when chunking the interaction
+        simulation.

     Returns
     -------
@@ -952,7 +953,9 @@
         chooser_chunk,
         chunk_trace_label,
         chunk_sizer,
-    ) in chunk.adaptive_chunked_choosers(state, choosers, trace_label):
+    ) in chunk.adaptive_chunked_choosers(
+        state, choosers, trace_label, explicit_chunk_size=explicit_chunk_size
+    ):
         choices = _interaction_simulate(
             state,
             chooser_chunk,