diff --git a/docs/web/docs/guides/how_to_use/data_porter/reference.md b/docs/web/docs/guides/how_to_use/data_porter/reference.md index e90ce9354..a558bf2d9 100644 --- a/docs/web/docs/guides/how_to_use/data_porter/reference.md +++ b/docs/web/docs/guides/how_to_use/data_porter/reference.md @@ -31,6 +31,7 @@ mephisto db export --export-task-runs-since-date 2024-01-01 mephisto db export --export-task-runs-since-date 2023-01-01T00:00:00 mephisto db export --labels first_dump --labels second_dump mephisto db export --export-tasks-by-ids 1 --delete-exported-data --randomize-legacy-ids --export-indent 4 +mephisto db export --qualification-only ``` Options (all optional): @@ -43,11 +44,14 @@ Options (all optional): - `-del/--delete-exported-data` - after exporting data, delete it from local DB - `-r/--randomize-legacy-ids` - replace legacy autoincremented ids with new pseudo-random ids to avoid conflicts during data merging +- `-qo/--qualification-only` - export only data related to worker qualifications (by default it's disabled) +- `-qn/--qualification-names` - if specified with the `--qualification-only` option, only qualifications with these names will be exported - `-i/--export-indent` - make dump easy to read via formatting JSON with indentations (Default 2) - `-v/--verbosity` - write more informative messages about progress (Default 0. Values: 0, 1) Note that the following options cannot be used together: -`--export-tasks-by-names`, `--export-tasks-by-ids`, `--export-task-runs-by-ids`, `--export-task-runs-since-date`, `--labels`. +- `--export-tasks-by-names`, `--export-tasks-by-ids`, `--export-task-runs-by-ids`, `--export-task-runs-since-date`, `--labels` +- `-qo/--qualification-only` and `--delete-exported-data`, `--export-tasks-by-names`, `--export-tasks-by-ids`, `--export-task-runs-by-ids`, `--export-task-runs-since-date`, `--labels`, `--randomize-legacy-ids` ## Import @@ -62,6 +66,7 @@ mephisto db import --file 2024_01_01_00_00_01_mephisto_dump.json --verbosity mephisto db import --file 2024_01_01_00_00_01_mephisto_dump.json --labels my_first_dump mephisto db import --file 2024_01_01_00_00_01_mephisto_dump.json --conflict-resolver MyCustomMergeConflictResolver mephisto db import --file 2024_01_01_00_00_01_mephisto_dump.json --keep-import-metadata +mephisto db import --file 2024_01_01_00_00_01_mephisto_dump.json --qualification-only ``` Options: @@ -72,12 +77,16 @@ Options: - `-l/--labels` - one or more short strings serving as a reference for the ported data (stored in `imported_data` table), so later you can export the imported data with `--labels` export option - `-k/--keep-import-metadata` - write data from `imported_data` table of the dump (by default it's not imported) +- `-qo/--qualification-only` - import only data related to worker qualifications (by default it's disabled) - `-v/--verbosity` - level of logging (default: 0; values: 0, 1) Note that before every import we create a full snapshot copy of your local data, by archiving content of your `data` directory. If any data gets corrupted during the import, you can always return to the original state by replacing your `data` folder with the snapshot. +Note that the following options cannot be used together: +- `-qo/--qualification-only` and `--labels`, `--keep-import-metadata` + ## Backup Creates full backup of all current data (Mephisto DB, provider-specific datastores, and related files) on local machine.
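For readers who script this flow instead of using the CLI, the documented options map onto new keyword arguments of `DBDataPorter.export_dump` introduced further down in this patch. A minimal sketch of the programmatic equivalent, assuming a default local Mephisto setup and a hypothetical qualification named `my_qual`:

```python
# Sketch only: programmatic equivalent of
#   mephisto db export --qualification-only --qualification-names my_qual
# `DBDataPorter` and the `qualification_only` / `qualification_names` keyword
# arguments come from this change; the qualification name is a placeholder.
from mephisto.tools.db_data_porter import DBDataPorter

porter = DBDataPorter()
export_results = porter.export_dump(
    qualification_only=True,          # dump only workers, qualifications, granted_qualifications
    qualification_names=["my_qual"],  # optional filter; omit to export all qualifications
    verbosity=1,
)
```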
diff --git a/mephisto/client/cli_db_commands.py b/mephisto/client/cli_db_commands.py index cf457d3da..bb8c6c549 100644 --- a/mephisto/client/cli_db_commands.py +++ b/mephisto/client/cli_db_commands.py @@ -110,6 +110,22 @@ def db_cli(): "to avoid conflicts during data merging" ), ) +@click.option( + "-qo", + "--qualification-only", + type=bool, + default=False, + is_flag=True, + help="export only data related to worker qualifications", +) +@click.option( + "-qn", + "--qualification-names", + type=str, + multiple=True, + default=None, + help="names of worker qualifications to export; used together with the `--qualification-only` option", +) @click.option("-v", "--verbosity", type=int, default=VERBOSITY_DEFAULT_VALUE, help=VERBOSITY_HELP) def export(ctx: click.Context, **options: dict): """ @@ -128,9 +144,11 @@ def export(ctx: click.Context, **options: dict): export_tasks_by_ids: Optional[List[str]] = options.get("export_tasks_by_ids") export_task_runs_by_ids: Optional[List[str]] = options.get("export_task_runs_by_ids") export_task_runs_since_date: Optional[str] = options.get("export_task_runs_since_date") - export_labels: Optional[List[str]] = options.get("export_labels") + export_labels: Optional[List[str]] = options.get("labels") delete_exported_data: bool = options.get("delete_exported_data", False) randomize_legacy_ids: bool = options.get("randomize_legacy_ids", False) + qualification_only: bool = options.get("qualification_only", False) + qualification_names: Optional[List[str]] = options.get("qualification_names") verbosity: int = options.get("verbosity", VERBOSITY_DEFAULT_VALUE) porter = DBDataPorter() @@ -167,6 +185,54 @@ def export(ctx: click.Context, **options: dict): ) exit() + has_conflicting_qualification_only_options = ( + len( + list( + filter( + bool, + [ + delete_exported_data, + export_labels, + export_task_runs_by_ids, + export_task_runs_since_date, + export_tasks_by_ids, + export_tasks_by_names, + qualification_only, + randomize_legacy_ids, + ], + ) + ) + ) + > 1 + ) + + if qualification_only and has_conflicting_qualification_only_options: + logger.warning( + "[yellow]" + "You cannot use the following options together:" + "\n\t--qualification-only" + "\nand" + "\n\t--delete-exported-data" + "\n\t--export-task-runs-by-ids" + "\n\t--export-task-runs-since-date" + "\n\t--export-tasks-by-ids" + "\n\t--export-tasks-by-names" + "\n\t--labels" + "\n\t--randomize-legacy-ids" + "\nUse either `--qualification-only` or the other options to export data." + "[/yellow]" + ) + exit() + + if qualification_names and not qualification_only: + logger.warning( + "[yellow]" + "You cannot use option `--qualification-names` without `--qualification-only`."
+ "[/yellow]" + ) + exit() + export_results = porter.export_dump( json_indent=export_indent, task_names=export_tasks_by_names, @@ -176,6 +242,8 @@ def export(ctx: click.Context, **options: dict): task_run_labels=export_labels, delete_exported_data=delete_exported_data, randomize_legacy_ids=randomize_legacy_ids, + qualification_only=qualification_only, + qualification_names=qualification_names, metadata_export_options=get_export_options_for_metadata(ctx, options), verbosity=verbosity, ) @@ -236,6 +304,14 @@ def export(ctx: click.Context, **options: dict): is_flag=True, help="write data from `imported_data` table of the dump (by default it's not imported)", ) +@click.option( + "-qo", + "--qualification-only", + type=bool, + default=False, + is_flag=True, + help="import only data related to worker qualifications", +) @click.option("-v", "--verbosity", type=int, default=VERBOSITY_DEFAULT_VALUE, help=VERBOSITY_HELP) def _import(ctx: click.Context, **options: dict): """ @@ -249,21 +325,62 @@ def _import(ctx: click.Context, **options: dict): labels: Optional[str] = options.get("labels") conflict_resolver: Optional[str] = options.get("conflict_resolver", DEFAULT_CONFLICT_RESOLVER) keep_import_metadata: Optional[bool] = options.get("keep_import_metadata", False) + qualification_only: bool = options.get("qualification_only", False) verbosity: int = options.get("verbosity", VERBOSITY_DEFAULT_VALUE) + has_conflicting_qualification_only_options = ( + len( + list( + filter( + bool, + [ + keep_import_metadata, + labels, + qualification_only, + ], + ) + ) + ) + > 1 + ) + + if qualification_only and has_conflicting_qualification_only_options: + logger.warning( + "[yellow]" + "You cannot use the following options together:" + "\n\t--qualification-only" + "\nand" + "\n\t--labels" + "\n\t--keep-import-metadata" + "\nUse either `--qualification-only` or the other options to import data." + "[/yellow]" + ) + exit() + porter = DBDataPorter() results = porter.import_dump( dump_archive_file_name_or_path=file, conflict_resolver_name=conflict_resolver, labels=labels, keep_import_metadata=keep_import_metadata, + qualification_only=qualification_only, verbosity=verbosity, ) - logger.info( - f"[green]" - f"Finished successfully. Imported {results['imported_task_runs_number']} TaskRuns" - f"[/green]" - ) + if qualification_only: + logger.info( + f"[green]" + f"Finished successfully. Imported " + f"{results['workers_number']} Workers, " + f"{results['qualifications_number']} Qualifications, " + f"{results['granted_qualifications_number']} GrantedQualifications" + f"[/green]" + ) + else: + logger.info( + f"[green]" + f"Finished successfully. 
Imported {results['task_runs_number']} TaskRuns" + f"[/green]" + ) # --- BACKUP --- diff --git a/mephisto/tools/db_data_porter/constants.py b/mephisto/tools/db_data_porter/constants.py index 4e364344d..93cafd80f 100644 --- a/mephisto/tools/db_data_porter/constants.py +++ b/mephisto/tools/db_data_porter/constants.py @@ -13,8 +13,15 @@ BACKUP_OUTPUT_DIR = "outputs/backup" EXPORT_OUTPUT_DIR = "outputs/export" + MEPHISTO_DUMP_KEY = "mephisto" + METADATA_DUMP_KEY = "dump_metadata" +METADATA_MIGRATIONS_KEY = "migrations" +METADATA_EXPORT_OPTIONS_KEY = "export_options" +METADATA_TIMESTAMP_KEY = "timestamp" +METADATA_PK_SUBSTITUTIONS_KEY = "pk_substitutions" + AVAILABLE_PROVIDER_TYPES = [ MEPHISTO_DUMP_KEY, MOCK_PROVIDER_TYPE, @@ -216,3 +223,9 @@ LOCAL_DB_LABEL = "_" DEFAULT_ARCHIVE_FORMAT = "zip" + +TABLE_NAMES_RELATED_TO_QUALIFICATIONS = [ + "granted_qualifications", + "qualifications", + "workers", +] diff --git a/mephisto/tools/db_data_porter/db_data_porter.py b/mephisto/tools/db_data_porter/db_data_porter.py index 4ccd1b82d..e663fca7b 100644 --- a/mephisto/tools/db_data_porter/db_data_porter.py +++ b/mephisto/tools/db_data_porter/db_data_porter.py @@ -29,6 +29,10 @@ from mephisto.tools.db_data_porter.constants import IMPORTED_DATA_TABLE_NAME from mephisto.tools.db_data_porter.constants import MEPHISTO_DUMP_KEY from mephisto.tools.db_data_porter.constants import METADATA_DUMP_KEY +from mephisto.tools.db_data_porter.constants import METADATA_EXPORT_OPTIONS_KEY +from mephisto.tools.db_data_porter.constants import METADATA_MIGRATIONS_KEY +from mephisto.tools.db_data_porter.constants import METADATA_PK_SUBSTITUTIONS_KEY +from mephisto.tools.db_data_porter.constants import METADATA_TIMESTAMP_KEY from mephisto.tools.db_data_porter.constants import MIGRATIONS_TABLE_NAME from mephisto.tools.db_data_porter.randomize_ids import randomize_ids from mephisto.tools.db_data_porter.validation import validate_dump_data @@ -101,19 +105,37 @@ def _prepare_dump_data( task_run_labels: Optional[List[str]] = None, since_datetime: Optional[datetime] = None, randomize_legacy_ids: Optional[bool] = False, + qualification_only: Optional[bool] = False, + qualification_names: Optional[List[str]] = None, ) -> dict: - partial = bool(task_names or task_ids or task_run_ids or task_run_labels or since_datetime) + partial = any( + [ + task_names, + task_ids, + task_run_ids, + task_run_labels, + since_datetime, + qualification_only, + ] + ) + if not partial: dump_data = dumps.prepare_full_dump_data(self.db, self.provider_datastores) else: - dump_data = dumps.prepare_partial_dump_data( - self.db, - task_names=task_names, - task_ids=task_ids, - task_run_ids=task_run_ids, - task_run_labels=task_run_labels, - since_datetime=since_datetime, - ) + if not qualification_only: + dump_data = dumps.prepare_partial_dump_data( + self.db, + task_names=task_names, + task_ids=task_ids, + task_run_ids=task_run_ids, + task_run_labels=task_run_labels, + since_datetime=since_datetime, + ) + else: + dump_data = dumps.prepare_qualification_related_dump_data( + self.db, + qualification_names=qualification_names, + ) if randomize_legacy_ids: randomize_ids_results = randomize_ids(self.db, dump_data, legacy_only=True) @@ -179,9 +201,11 @@ def export_dump( task_run_ids: Optional[List[str]] = None, task_runs_since_date: Optional[str] = None, task_run_labels: Optional[List[str]] = None, - delete_exported_data: bool = False, - randomize_legacy_ids: bool = False, - metadata_export_options: dict = None, + delete_exported_data: Optional[bool] = False, + 
randomize_legacy_ids: Optional[bool] = False, + qualification_only: Optional[bool] = False, + qualification_names: Optional[List[str]] = None, + metadata_export_options: Optional[dict] = None, verbosity: int = 0, ) -> dict: # 1. Protect from accidental launches @@ -215,6 +239,8 @@ def export_dump( task_run_labels=task_run_labels, since_datetime=since_datetime, randomize_legacy_ids=randomize_legacy_ids, + qualification_only=qualification_only, + qualification_names=qualification_names, ) # 3. Prepare export dirs and get dump file path. @@ -228,18 +254,10 @@ def export_dump( # 4. Prepare metadata metadata = { - "migrations": self._get_latest_migrations(), - "export_options": metadata_export_options, - # "export_options": { - # "--export-indent": json_indent, - # "--export-tasks-by-names": task_names, - # "--export-tasks-by-ids": task_ids, - # "--export-task-runs-by-ids": task_run_ids, - # "--export-task-runs-since-date": task_runs_since_date, - # "--verbosity": verbosity, - # }, - "timestamp": dump_timestamp, - "pk_substitutions": self._pk_substitutions, + METADATA_MIGRATIONS_KEY: self._get_latest_migrations(), + METADATA_EXPORT_OPTIONS_KEY: metadata_export_options or {}, + METADATA_TIMESTAMP_KEY: dump_timestamp, + METADATA_PK_SUBSTITUTIONS_KEY: self._pk_substitutions, } dump_data_to_export[METADATA_DUMP_KEY] = metadata @@ -305,8 +323,11 @@ def import_dump( conflict_resolver_name: Optional[str] = DEFAULT_CONFLICT_RESOLVER, labels: Optional[List[str]] = None, keep_import_metadata: Optional[bool] = None, + qualification_only: Optional[bool] = False, verbosity: int = 0, ): + results = {} + # 1. Check dump file path if not dump_archive_file_name_or_path: error_message = "Option `-f/--file` is required." @@ -350,7 +371,11 @@ def import_dump( exit() # 3. Validate dump - dump_data_errors = validate_dump_data(self.db, dump_file_data) + dump_data_errors = validate_dump_data( + db=self.db, + dump_data=dump_file_data, + qualification_only=qualification_only, + ) if dump_data_errors: error_message = make_error_message( "Your dump file has incorrect format", @@ -382,7 +407,7 @@ def import_dump( # 7. Write dump data into local DBs logger.info(f"Started importing from dump file {dump_archive_file_name_or_path} ...") - imported_task_runs_number = 0 + results["task_runs_number"] = 0 for db_or_datastore_name, db_or_datastore_data in dump_file_data.items(): # Pop `imported_data` from dump content, to merge it into local `imported_data` @@ -396,7 +421,13 @@ def import_dump( IMPORTED_DATA_TABLE_NAME, [], ) - imported_task_runs_number = len(db_or_datastore_data.get("task_runs", [])) + + # Update results for printing results in the end + if qualification_only: + for table_name, table_data in db_or_datastore_data.items(): + results[f"{table_name}_number"] = len(table_data) + else: + results["task_runs_number"] = len(db_or_datastore_data.get("task_runs", [])) else: # Provider's datastore. # NOTE: It is being created if it does not exist (yes, here, magically) @@ -474,9 +505,7 @@ def import_dump( f"Finished importing into `{db_or_datastore_name}` database successfully!" 
) - return { - "imported_task_runs_number": imported_task_runs_number, - } + return results def create_backup(self, verbosity: int = 0) -> str: backup_dir = self._get_backup_dir() diff --git a/mephisto/tools/db_data_porter/dumps.py b/mephisto/tools/db_data_porter/dumps.py index 9d7fc9aa0..71df0f986 100644 --- a/mephisto/tools/db_data_porter/dumps.py +++ b/mephisto/tools/db_data_porter/dumps.py @@ -18,6 +18,7 @@ from mephisto.data_model.task_run import TaskRun from mephisto.tools.db_data_porter.constants import DATASTORE_EXPORT_METHOD_NAME from mephisto.tools.db_data_porter.constants import MEPHISTO_DUMP_KEY +from mephisto.tools.db_data_porter.constants import TABLE_NAMES_RELATED_TO_QUALIFICATIONS from mephisto.tools.db_data_porter.constants import TASK_RUNS_TABLE_NAME from mephisto.tools.db_data_porter.randomize_ids import get_old_pk_from_substitutions from mephisto.utils import db as db_utils @@ -209,6 +210,72 @@ def prepare_full_dump_data(db: "MephistoDB", provider_datastores: Dict[str, "Mep return dump_data_to_export +def prepare_qualification_related_dump_data( + db: "MephistoDB", + qualification_names: Optional[List[str]] = None, +) -> dict: + table_names = TABLE_NAMES_RELATED_TO_QUALIFICATIONS + dump_data_to_export = {} + + if not qualification_names: + dump_data_to_export[MEPHISTO_DUMP_KEY] = db_utils.db_tables_to_dict(db, table_names) + else: + dump_data = {} + + # Find and serialize `qualifications` + qualification_rows = db_utils.select_rows_by_list_of_field_values( + db, + "qualifications", + ["qualification_name"], + [qualification_names], + ) + + # Validate passed `qualification_names` + not_existing_qualification_names = set(qualification_names) - set( + [q["qualification_name"] for q in qualification_rows] + ) + if not_existing_qualification_names: + logger.error( + f"[yellow]" + f"You passed non-existing qualification names: " + f"{', '.join(not_existing_qualification_names)}" + f"[/yellow]" + ) + exit() + + dump_data["qualifications"] = db_utils.serialize_data_for_table(qualification_rows) + + # Find and serialize `granted_qualifications` + qualification_ids = list( + set(filter(bool, [i["qualification_id"] for i in dump_data["qualifications"]])) + ) + granted_qualification_rows = db_utils.select_rows_by_list_of_field_values( + db, + "granted_qualifications", + ["qualification_id"], + [qualification_ids], + ) + dump_data["granted_qualifications"] = db_utils.serialize_data_for_table( + granted_qualification_rows + ) + + # Find and serialize `workers` + worker_ids = list( + set(filter(bool, [i["worker_id"] for i in dump_data["granted_qualifications"]])) + ) + worker_rows = db_utils.select_rows_by_list_of_field_values( + db, + "workers", + ["worker_id"], + [worker_ids], + ) + dump_data["workers"] = db_utils.serialize_data_for_table(worker_rows) + + dump_data_to_export[MEPHISTO_DUMP_KEY] = dump_data + + return dump_data_to_export + + def delete_exported_data( db: "MephistoDB", dump_data_to_export: dict, diff --git a/mephisto/tools/db_data_porter/export_dump.py b/mephisto/tools/db_data_porter/export_dump.py index 6985d501f..bd92e2b7f 100644 --- a/mephisto/tools/db_data_porter/export_dump.py +++ b/mephisto/tools/db_data_porter/export_dump.py @@ -89,8 +89,6 @@ def _export_data_dir_for_task_runs( tmp_export_dir = make_tmp_export_dir() task_run_data_dirs = [i.get_run_dir() for i in task_runs] - if not task_run_data_dirs: - return False try: tmp_task_run_dirs = [] @@ -137,17 +135,22 @@ def archive_and_copy_data_files( if verbosity: logger.debug(f"Archiving data files started ...") + 
mephisto_dump_data = dump_data.get(MEPHISTO_DUMP_KEY, {}) + task_runs_dump_data = mephisto_dump_data.get(TASK_RUNS_TABLE_NAME, []) + # Get TaskRuns for PKs in dump task_runs: List[TaskRun] = [] task_runs_ids: List[str] = [] - for dump_task_run in dump_data[MEPHISTO_DUMP_KEY][TASK_RUNS_TABLE_NAME]: - task_runs_pk_field_name = db_utils.get_table_pk_field_name(db, TASK_RUNS_TABLE_NAME) - dump_pk = dump_task_run[task_runs_pk_field_name] - db_pk = get_old_pk_from_substitutions(dump_pk, pk_substitutions, TASK_RUNS_TABLE_NAME) - db_pk = db_pk or dump_pk - task_run: TaskRun = TaskRun.get(db, db_pk) - task_runs.append(task_run) - task_runs_ids.append(db_pk) + + if task_runs_dump_data: + for dump_task_run in task_runs_dump_data: + task_runs_pk_field_name = db_utils.get_table_pk_field_name(db, TASK_RUNS_TABLE_NAME) + dump_pk = dump_task_run[task_runs_pk_field_name] + db_pk = get_old_pk_from_substitutions(dump_pk, pk_substitutions, TASK_RUNS_TABLE_NAME) + db_pk = db_pk or dump_pk + task_run: TaskRun = TaskRun.get(db, db_pk) + task_runs.append(task_run) + task_runs_ids.append(db_pk) if verbosity: logger.debug(f"Archiving data files for TaskRuns: {', '.join(task_runs_ids)}") @@ -200,7 +203,11 @@ def unarchive_data_files( if verbosity: logger.debug("Copying TaskRuns files into {mephisto_data_runs_path} ...") - copy_tree(tmp_unarchive_task_runs_dir, mephisto_data_runs_path, verbose=0) + if os.path.exists(tmp_unarchive_task_runs_dir): + copy_tree(tmp_unarchive_task_runs_dir, mephisto_data_runs_path, verbose=0) + else: + if verbosity: + logger.debug("No files for TaskRuns in archive found, nothing to copy") if verbosity: logger.debug("Copying TaskRuns files finished") diff --git a/mephisto/tools/db_data_porter/import_dump.py b/mephisto/tools/db_data_porter/import_dump.py index 6530930b3..599527e72 100644 --- a/mephisto/tools/db_data_porter/import_dump.py +++ b/mephisto/tools/db_data_porter/import_dump.py @@ -112,7 +112,11 @@ def import_single_db( # They must be imported before other tables tables_with_special_unique_field = TABLES_UNIQUE_LOOKUP_FIELDS.get(provider_type) for table_name, unique_field_names in tables_with_special_unique_field.items(): - dump_table_rows = dump_data[table_name] + dump_table_rows = dump_data.get(table_name) + if not dump_table_rows: + # It can be partial dump without these tables, even empty lists + continue + table_pk_field_name = db_utils.get_table_pk_field_name(db, table_name) is_table_with_special_unique_field = unique_field_names is not None diff --git a/mephisto/tools/db_data_porter/validation.py b/mephisto/tools/db_data_porter/validation.py index 9bed8a161..2c535332c 100644 --- a/mephisto/tools/db_data_porter/validation.py +++ b/mephisto/tools/db_data_porter/validation.py @@ -12,22 +12,46 @@ from mephisto.tools.db_data_porter.constants import AVAILABLE_PROVIDER_TYPES from mephisto.tools.db_data_porter.constants import MEPHISTO_DUMP_KEY from mephisto.tools.db_data_porter.constants import METADATA_DUMP_KEY +from mephisto.tools.db_data_porter.constants import METADATA_EXPORT_OPTIONS_KEY from mephisto.utils import db as db_utils -def validate_dump_data(db: "MephistoDB", dump_data: dict) -> Optional[List[str]]: +def validate_dump_data( + db: "MephistoDB", + dump_data: dict, + qualification_only: Optional[bool] = False, +) -> Optional[List[str]]: errors = [] + # 1. 
Validate metadata + metadata = dump_data.get(METADATA_DUMP_KEY, {}) + if not metadata: + errors.append(f"Dump file has to contain metadata under `{METADATA_DUMP_KEY}` key.") + return errors + + metadata_qualification_only = metadata.get(METADATA_EXPORT_OPTIONS_KEY, {}).get( + "-qo/--qualification-only" + ) + if qualification_only and not metadata_qualification_only: + errors.append( + "You are attempting to import dump file prepared to import as a full dump, " + "but using `--qualification-only` option. Remove this option and try again." + ) + return errors + + if not qualification_only and metadata_qualification_only: + errors.append( + "You are attempting to import dump file prepared to import data related to " + "worker qualifications, but forgot to pass `--qualification-only` option. " + "Add this option and try again." + ) + return errors + db_dumps = {k: v for k, v in dump_data.items() if k != METADATA_DUMP_KEY} - # 1. Check provider names + # 2. Check provider names incorrect_db_names = list(filter(lambda i: i not in AVAILABLE_PROVIDER_TYPES, db_dumps.keys())) if incorrect_db_names: errors.append( f"Dump file cannot contain these database names: {', '.join(incorrect_db_names)}." ) - # 2. Check if dump file contains JSON-object + # 3. Check if dump file contains JSON-object db_values_are_not_dicts = list(filter(lambda i: not isinstance(i, dict), dump_data.values())) if db_values_are_not_dicts: errors.append( @@ -35,7 +59,7 @@ def validate_dump_data(db: "MephistoDB", dump_data: dict) -> Optional[List[str]] f"that are not JSON-objects." ) - # 3. Check dumps of DBs + # 4. Check dumps of DBs _db_dumps = [(n, d) for n, d in db_dumps.items() if n not in incorrect_db_names] for db_name, db_dump_data in _db_dumps: # Get or create DB/Datastore to request for available tables diff --git a/mephisto/utils/db.py b/mephisto/utils/db.py index 0fcdd3792..275933883 100644 --- a/mephisto/utils/db.py +++ b/mephisto/utils/db.py @@ -420,10 +420,9 @@ def get_providers_datastores(db: "MephistoDB") -> Dict[str, "MephistoDB"]: return provider_datastores -def db_or_datastore_to_dict(db: "MephistoDB") -> dict: - """Convert all kind of DBs to dict""" +def db_tables_to_dict(db: "MephistoDB", table_names: List[str]) -> dict: + """Convert the given DB tables to a dict, keyed by table name""" dump_data = {} - table_names = get_list_of_tables_to_export(db) for table_name in table_names: table_rows = select_all_table_rows(db, table_name) table_data = serialize_data_for_table(table_rows) @@ -432,6 +431,13 @@ def db_or_datastore_to_dict(db: "MephistoDB") -> dict: return dump_data +def db_or_datastore_to_dict(db: "MephistoDB") -> dict: + """Convert all kinds of DBs to dict""" + table_names = get_list_of_tables_to_export(db) + dump_data = db_tables_to_dict(db, table_names) + return dump_data + + def mephisto_db_to_dict_for_task_runs( db: "MephistoDB", task_run_ids: Optional[List[str]] = None, diff --git a/test/tools/db_data_porter/test_db_data_porter.py b/test/tools/db_data_porter/test_db_data_porter.py index 789fede07..79a65e839 100644 --- a/test/tools/db_data_porter/test_db_data_porter.py +++ b/test/tools/db_data_porter/test_db_data_porter.py @@ -25,6 +25,12 @@ from mephisto.data_model.task_run import TaskRun from mephisto.tools.db_data_porter import DBDataPorter from mephisto.tools.db_data_porter.constants import EXAMPLE_CONFLICT_RESOLVER +from mephisto.tools.db_data_porter.constants import MEPHISTO_DUMP_KEY +from mephisto.tools.db_data_porter.constants import METADATA_DUMP_KEY +from mephisto.tools.db_data_porter.constants import METADATA_EXPORT_OPTIONS_KEY +from mephisto.tools.db_data_porter.constants import METADATA_MIGRATIONS_KEY +from mephisto.tools.db_data_porter.constants import METADATA_PK_SUBSTITUTIONS_KEY +from 
mephisto.tools.db_data_porter.constants import METADATA_TIMESTAMP_KEY from mephisto.utils import db as db_utils from mephisto.utils.testing import get_test_qualification from mephisto.utils.testing import get_test_requester @@ -258,20 +264,27 @@ def test_export_dump_full( dump_file_data = json.loads(f.read()) # Test main keys - self.assertIn("dump_metadata", dump_file_data) - self.assertIn("mephisto", dump_file_data) + self.assertIn(METADATA_DUMP_KEY, dump_file_data) + self.assertIn(MEPHISTO_DUMP_KEY, dump_file_data) # Test `dump_metadata` - self.assertEqual(dump_file_data["dump_metadata"]["export_options"], None) self.assertEqual( - dump_file_data["dump_metadata"]["migrations"], - {"mephisto": "20240418_data_porter_feature"}, + dump_file_data[METADATA_DUMP_KEY][METADATA_EXPORT_OPTIONS_KEY], + {}, + ) + self.assertEqual( + dump_file_data[METADATA_DUMP_KEY][METADATA_MIGRATIONS_KEY], + {MEPHISTO_DUMP_KEY: "20240418_data_porter_feature"}, + ) + self.assertEqual( + dump_file_data[METADATA_DUMP_KEY][METADATA_PK_SUBSTITUTIONS_KEY], {} + ) + self.assertEqual( + dump_file_data[METADATA_DUMP_KEY][METADATA_TIMESTAMP_KEY], FILE_TIMESTAMP ) - self.assertEqual(dump_file_data["dump_metadata"]["pk_substitutions"], {}) - self.assertEqual(dump_file_data["dump_metadata"]["timestamp"], FILE_TIMESTAMP) # Test `mephisto` - mephisto_dump = dump_file_data["mephisto"] + mephisto_dump = dump_file_data[MEPHISTO_DUMP_KEY] tables_without_task_run_id = [ "workers", @@ -345,7 +358,7 @@ def test_export_dump_option_task_names( with archive.open(json_dump_file_name) as f: dump_file_data = json.loads(f.read()) - mephisto_dump = dump_file_data["mephisto"] + mephisto_dump = dump_file_data[MEPHISTO_DUMP_KEY] self.assertEqual(len(mephisto_dump["tasks"]), 1) self.assertEqual(len(db_utils.select_all_table_rows(self.db, "tasks")), 2) @@ -417,7 +430,7 @@ def test_export_dump_option_task_ids( with archive.open(json_dump_file_name) as f: dump_file_data = json.loads(f.read()) - mephisto_dump = dump_file_data["mephisto"] + mephisto_dump = dump_file_data[MEPHISTO_DUMP_KEY] self.assertEqual(len(mephisto_dump["tasks"]), 1) self.assertEqual(len(db_utils.select_all_table_rows(self.db, "tasks")), 2) @@ -489,7 +502,7 @@ def test_export_dump_option_task_run_ids( with archive.open(json_dump_file_name) as f: dump_file_data = json.loads(f.read()) - mephisto_dump = dump_file_data["mephisto"] + mephisto_dump = dump_file_data[MEPHISTO_DUMP_KEY] self.assertEqual(len(mephisto_dump["tasks"]), 1) self.assertEqual(len(db_utils.select_all_table_rows(self.db, "tasks")), 2) @@ -561,7 +574,7 @@ def test_export_dump_option_task_runs_since_date( with archive.open(json_dump_file_name) as f: dump_file_data = json.loads(f.read()) - mephisto_dump = dump_file_data["mephisto"] + mephisto_dump = dump_file_data[MEPHISTO_DUMP_KEY] self.assertEqual(len(mephisto_dump["tasks"]), 1) self.assertEqual(len(db_utils.select_all_table_rows(self.db, "tasks")), 2) @@ -635,7 +648,7 @@ def test_export_dump_option_task_runs_labels( with archive.open(json_dump_file_name) as f: dump_file_data = json.loads(f.read()) - mephisto_dump = dump_file_data["mephisto"] + mephisto_dump = dump_file_data[MEPHISTO_DUMP_KEY] self.assertEqual(len(mephisto_dump["tasks"]), 1) self.assertEqual(len(db_utils.select_all_table_rows(self.db, "tasks")), 2) @@ -725,7 +738,7 @@ def test_export_dump_option_delete_exported_data( with archive.open(json_dump_file_name) as f: dump_file_data = json.loads(f.read()) - mephisto_dump = dump_file_data["mephisto"] + mephisto_dump = dump_file_data[MEPHISTO_DUMP_KEY] # 
Tables where we deleted entries task_run_rows_after = db_utils.select_all_table_rows(self.db, "task_runs") @@ -820,10 +833,10 @@ def test_export_dump_option_randomize_legacy_ids( with archive.open(json_dump_file_name) as f: dump_file_data = json.loads(f.read()) - mephisto_dump = dump_file_data["mephisto"] - pk_substitutions = dump_file_data["dump_metadata"]["pk_substitutions"]["mephisto"][ - "task_runs" - ] + mephisto_dump = dump_file_data[MEPHISTO_DUMP_KEY] + pk_substitutions = dump_file_data[METADATA_DUMP_KEY][METADATA_PK_SUBSTITUTIONS_KEY][ + MEPHISTO_DUMP_KEY + ]["task_runs"] task_runs_dump = sorted( mephisto_dump["task_runs"], key=lambda k: k["creation_date"], @@ -942,7 +955,7 @@ def test_import_dump_full( # Test imported data in database mock__ask_user_if_they_are_sure.assert_called_once() - self.assertEqual(results["imported_task_runs_number"], 1) + self.assertEqual(results["task_runs_number"], 1) table_names = db_utils.get_list_of_tables_to_export(self.db) for table_name in table_names: rows = db_utils.select_all_table_rows(self.db, table_name) @@ -1052,7 +1065,7 @@ def test_import_dump_option_labels( # Test imported data in database mock__ask_user_if_they_are_sure.assert_called_once() - self.assertEqual(results["imported_task_runs_number"], 1) + self.assertEqual(results["task_runs_number"], 1) # Test labels available_labels = db_utils.get_list_of_available_labels(self.db) @@ -1104,7 +1117,7 @@ def test_import_dump_option_keep_import_metadata( # Test imported data in database mock__ask_user_if_they_are_sure.assert_called_once() - self.assertEqual(results["imported_task_runs_number"], 1) + self.assertEqual(results["task_runs_number"], 1) # Test labels available_labels = db_utils.get_list_of_available_labels(self.db) @@ -1164,7 +1177,7 @@ def test_import_dump_option_conflict_resolver_name( # Test imported data in database mock__ask_user_if_they_are_sure.assert_called_once() - self.assertEqual(results["imported_task_runs_number"], 1) + self.assertEqual(results["task_runs_number"], 1) # Test tasks were merged task_rows = db_utils.select_all_table_rows(self.db, "tasks") diff --git a/test/tools/db_data_porter/test_dumps.py b/test/tools/db_data_porter/test_dumps.py index de0efc6fa..73c80d6df 100644 --- a/test/tools/db_data_porter/test_dumps.py +++ b/test/tools/db_data_porter/test_dumps.py @@ -4,8 +4,10 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
+import io import os import shutil +import sys import tempfile import unittest from datetime import timedelta @@ -18,10 +20,13 @@ from mephisto.abstractions.database import MephistoDB from mephisto.abstractions.databases.local_database import LocalMephistoDB from mephisto.data_model.task_run import TaskRun +from mephisto.tools.db_data_porter.constants import MEPHISTO_DUMP_KEY +from mephisto.tools.db_data_porter.constants import TABLE_NAMES_RELATED_TO_QUALIFICATIONS from mephisto.tools.db_data_porter.dumps import _make_options_error_message from mephisto.tools.db_data_porter.dumps import delete_exported_data from mephisto.tools.db_data_porter.dumps import prepare_full_dump_data from mephisto.tools.db_data_porter.dumps import prepare_partial_dump_data +from mephisto.tools.db_data_porter.dumps import prepare_qualification_related_dump_data from mephisto.utils import db as db_utils from mephisto.utils.testing import get_test_qualification from mephisto.utils.testing import get_test_requester @@ -1080,3 +1085,80 @@ def test_delete_exported_data_pk_substitutions(self, mock_get_data_dir, *args): self.assertEqual(len(rows), 1) else: self.assertEqual(len(rows), 0) + + def test_prepare_qualification_related_dump_data_without_qualification_names(self): + _, worker_id = get_test_worker(self.db) + qualification_id = get_test_qualification(self.db, "qual_1") + grant_test_qualification(self.db, worker_id=worker_id, qualification_id=qualification_id) + + result = prepare_qualification_related_dump_data(self.db, qualification_names=None) + + self.assertIn(MEPHISTO_DUMP_KEY, result) + self.assertEqual( + len(result[MEPHISTO_DUMP_KEY].keys()), len(TABLE_NAMES_RELATED_TO_QUALIFICATIONS) + ) + for _, table_value in result[MEPHISTO_DUMP_KEY].items(): + self.assertEqual(len(table_value), 1) + + def test_prepare_qualification_related_dump_data_with_qualification_names(self): + qualification_1_name = "qual_1" + qualification_2_name = "qual_2" + + _, worker_1_id = get_test_worker(self.db, "worker_1") + _, worker_2_id = get_test_worker(self.db, "worker_2") + qualification_1_id = get_test_qualification(self.db, qualification_1_name) + qualification_2_id = get_test_qualification(self.db, qualification_2_name) + grant_test_qualification( + self.db, worker_id=worker_1_id, qualification_id=qualification_2_id + ) + grant_test_qualification( + self.db, worker_id=worker_2_id, qualification_id=qualification_2_id + ) + + # By first qualification + result_1 = prepare_qualification_related_dump_data( + self.db, qualification_names=[qualification_1_name] + ) + + self.assertIn(MEPHISTO_DUMP_KEY, result_1) + self.assertEqual( + len(result_1[MEPHISTO_DUMP_KEY].keys()), len(TABLE_NAMES_RELATED_TO_QUALIFICATIONS) + ) + self.assertEqual(len(result_1[MEPHISTO_DUMP_KEY]["workers"]), 0) + self.assertEqual(len(result_1[MEPHISTO_DUMP_KEY]["qualifications"]), 1) + self.assertEqual(len(result_1[MEPHISTO_DUMP_KEY]["granted_qualifications"]), 0) + + # By second qualification + result_2 = prepare_qualification_related_dump_data( + self.db, qualification_names=[qualification_2_name] + ) + + self.assertIn(MEPHISTO_DUMP_KEY, result_2) + self.assertEqual( + len(result_2[MEPHISTO_DUMP_KEY].keys()), len(TABLE_NAMES_RELATED_TO_QUALIFICATIONS) + ) + self.assertEqual(len(result_2[MEPHISTO_DUMP_KEY]["workers"]), 2) + self.assertEqual(len(result_2[MEPHISTO_DUMP_KEY]["qualifications"]), 1) + self.assertEqual(len(result_2[MEPHISTO_DUMP_KEY]["granted_qualifications"]), 2) + + def 
test_prepare_qualification_related_dump_data_non_existing_qualification_names(self): + qualification_name = "qual_1" + non_existing_qualification_name = "non_existing_qual" + + _, worker_id = get_test_worker(self.db) + qualification_id = get_test_qualification(self.db, qualification_name) + grant_test_qualification(self.db, worker_id=worker_id, qualification_id=qualification_id) + + with self.assertRaises(SystemExit) as cm: + captured_print_output = io.StringIO() + sys.stdout = captured_print_output + prepare_qualification_related_dump_data( + self.db, qualification_names=[qualification_name, non_existing_qualification_name] + ) + sys.stdout = sys.__stdout__ + + self.assertEqual(cm.exception.code, None) + self.assertIn( + "You passed non-existing qualification names", captured_print_output.getvalue() + ) + self.assertIn(non_existing_qualification_name, captured_print_output.getvalue()) diff --git a/test/tools/db_data_porter/test_export_dump.py b/test/tools/db_data_porter/test_export_dump.py index 7fa082b3b..30d2b02c1 100644 --- a/test/tools/db_data_porter/test_export_dump.py +++ b/test/tools/db_data_porter/test_export_dump.py @@ -234,7 +234,7 @@ def test__export_data_dir_for_task_runs_no_task_runs( pk_substitutions={}, ) - self.assertFalse(result) + self.assertTrue(result) mock__rename_dirs_with_new_pks.assert_not_called() @patch("mephisto.tools.db_data_porter.export_dump.make_tmp_export_dir") diff --git a/test/tools/db_data_porter/test_validation.py b/test/tools/db_data_porter/test_validation.py index 4acfb73f9..21d7a055e 100644 --- a/test/tools/db_data_porter/test_validation.py +++ b/test/tools/db_data_porter/test_validation.py @@ -8,16 +8,30 @@ import shutil import tempfile import unittest +from copy import deepcopy from typing import ClassVar from typing import Type +from unittest.mock import patch import pytest from mephisto.abstractions.database import MephistoDB from mephisto.abstractions.databases.local_database import LocalMephistoDB from mephisto.tools.db_data_porter.constants import MEPHISTO_DUMP_KEY +from mephisto.tools.db_data_porter.constants import METADATA_DUMP_KEY +from mephisto.tools.db_data_porter.constants import METADATA_EXPORT_OPTIONS_KEY +from mephisto.tools.db_data_porter.constants import METADATA_MIGRATIONS_KEY +from mephisto.tools.db_data_porter.constants import METADATA_PK_SUBSTITUTIONS_KEY +from mephisto.tools.db_data_porter.constants import METADATA_TIMESTAMP_KEY from mephisto.tools.db_data_porter.validation import validate_dump_data +MOCK_METADATA = { + METADATA_MIGRATIONS_KEY: {}, + METADATA_EXPORT_OPTIONS_KEY: {}, + METADATA_TIMESTAMP_KEY: "2024_05_01_00_00_00", + METADATA_PK_SUBSTITUTIONS_KEY: {}, +} + @pytest.mark.db_data_porter class TestValidation(unittest.TestCase): @@ -39,6 +53,7 @@ def tearDown(self): def test_validate_dump_data_incorrect_provider_name(self, *args): incorrect_provider_name = "incorrect_provider" dump_data = { + METADATA_DUMP_KEY: MOCK_METADATA, incorrect_provider_name: {}, } @@ -51,6 +66,7 @@ def test_validate_dump_data_incorrect_provider_name(self, *args): def test_validate_dump_data_db_values_are_not_dicts(self, *args): dump_data = { + METADATA_DUMP_KEY: MOCK_METADATA, MEPHISTO_DUMP_KEY: [], } @@ -64,6 +80,7 @@ def test_validate_dump_data_db_values_are_not_dicts(self, *args): def test_validate_dump_data_incorrect_format_table_name(self, *args): table_name = 1 dump_data = { + METADATA_DUMP_KEY: MOCK_METADATA, MEPHISTO_DUMP_KEY: { table_name: [], }, @@ -87,6 +104,7 @@ def test_validate_dump_data_incorrect_format_table_name(self, 
*args): def test_validate_dump_data_incorrect_format_table_rows(self, *args): table_rows = "1" dump_data = { + METADATA_DUMP_KEY: MOCK_METADATA, MEPHISTO_DUMP_KEY: { "tasks": table_rows, }, @@ -105,6 +123,7 @@ def test_validate_dump_data_incorrect_format_table_rows(self, *args): def test_validate_dump_data_incorrect_format_field_name(self, *args): field_name = 1 dump_data = { + METADATA_DUMP_KEY: MOCK_METADATA, MEPHISTO_DUMP_KEY: { "tasks": [ {field_name: ""}, @@ -121,6 +140,7 @@ def test_validate_dump_data_incorrect_format_field_name(self, *args): def test_validate_dump_data_success(self, *args): dump_data = { + METADATA_DUMP_KEY: MOCK_METADATA, MEPHISTO_DUMP_KEY: { "tasks": [ { @@ -134,3 +154,82 @@ def test_validate_dump_data_success(self, *args): result = validate_dump_data(db=self.db, dump_data=dump_data) self.assertEqual(result, []) + + @patch("mephisto.utils.db.get_list_of_db_table_names") + def test_validate_dump_data_metadata_absence(self, mock_get_list_of_db_table_names, *args): + dump_data = { + MEPHISTO_DUMP_KEY: { + "tasks": [ + { + "task_id": "1", + "task_anem": "test", + }, + ], + }, + } + + result = validate_dump_data(db=self.db, dump_data=dump_data) + + self.assertEqual( + result, [f"Dump file has to contain metadata under `{METADATA_DUMP_KEY}` key."] + ) + mock_get_list_of_db_table_names.assert_not_called() + + @patch("mephisto.utils.db.get_list_of_db_table_names") + def test_validate_dump_data_qualifications_only_true_option_without_it_in_metadata( + self, mock_get_list_of_db_table_names, *args + ): + dump_data = { + METADATA_DUMP_KEY: MOCK_METADATA, + MEPHISTO_DUMP_KEY: { + "tasks": [ + { + "task_id": "1", + "task_anem": "test", + }, + ], + }, + } + + result = validate_dump_data(db=self.db, dump_data=dump_data, qualification_only=True) + + self.assertEqual( + result, + [ + "You are attempting to import dump file prepared to import as a full dump, " + "but using `--qualification-only` option. Remove this option and try again." + ], + ) + mock_get_list_of_db_table_names.assert_not_called() + + @patch("mephisto.utils.db.get_list_of_db_table_names") + def test_validate_dump_data_qualifications_only_false_option_with_it_in_metadata( + self, mock_get_list_of_db_table_names, *args + ): + mock_metadata_with_qualifications_only = deepcopy(MOCK_METADATA) + mock_metadata_with_qualifications_only[METADATA_EXPORT_OPTIONS_KEY] = { + "-qo/--qualification-only": True, + } + dump_data = { + METADATA_DUMP_KEY: mock_metadata_with_qualifications_only, + MEPHISTO_DUMP_KEY: { + "tasks": [ + { + "task_id": "1", + "task_anem": "test", + }, + ], + }, + } + + result = validate_dump_data(db=self.db, dump_data=dump_data, qualification_only=False) + + self.assertEqual( + result, + [ + "You are attempting to import dump file prepared to import data related to " + "worker qualifications, but forgot to pass `--qualification-only` option. " + "Add this option and try again." + ], + ) + mock_get_list_of_db_table_names.assert_not_called()
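As a closing illustration, the import counterpart of the qualification-only flow looks roughly like this. This is a minimal sketch, assuming a dump archive produced with `--qualification-only`; the archive name is a placeholder, and the result keys follow the per-table counters that `DBDataPorter.import_dump` returns in this change:

```python
# Sketch only: importing a qualification-only dump programmatically.
# `qualification_only=True` mirrors the new `--qualification-only` CLI flag;
# the result keys come from the per-table counters added to `import_dump`.
from mephisto.tools.db_data_porter import DBDataPorter

porter = DBDataPorter()
results = porter.import_dump(
    dump_archive_file_name_or_path="2024_05_01_00_00_00_mephisto_dump.zip",  # placeholder name
    qualification_only=True,
    verbosity=1,
)
print(
    f"Imported {results.get('workers_number', 0)} Workers, "
    f"{results.get('qualifications_number', 0)} Qualifications, "
    f"{results.get('granted_qualifications_number', 0)} GrantedQualifications"
)
```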