ecmwf-projects · aperezpredictia · Oct 8, 2024 · Oct 8, 2024 · Oct 8, 2024 · Oct 8, 2024
diff --git a/cdsobs/data/insitu-observations-gruan-reference-network/service_definition.yml b/cdsobs/data/insitu-observations-gruan-reference-network/service_definition.yml
@@ -111,7 +111,7 @@ sources:
         units: See uncertainty_units1
       uncertainty_type1:
         description: Uncertainty type 1
-        dtype: float32
+        dtype: uint8
       uncertainty_units1:
         description: Uncertainty units 1
         dtype: object
@@ -121,7 +121,7 @@ sources:
         units: See uncertainty_units2
       uncertainty_type2:
         description: Uncertainty type 2
-        dtype: float32
+        dtype: uint8
       uncertainty_units2:
         description: Uncertainty units 2
         dtype: object
@@ -131,7 +131,7 @@ sources:
         units: See uncertainty_units5
       uncertainty_type5:
         description: Uncertainty type 5
-        dtype: float32
+        dtype: uint8
       uncertainty_units5:
         description: Uncertainty units 5
         dtype: object
@@ -225,7 +225,7 @@ sources:
         units: m
       z_coordinate_type:
         description: Type of z coordinate
-        dtype: float32
+        dtype: uint8
     space_columns:
       z: altitude
       y: latitude|station_configuration

diff --git a/cdsobs/ingestion/api.py b/cdsobs/ingestion/api.py
@@ -92,7 +92,23 @@ def validate_and_homogenise(
         )
     else:
         data_renamed = data
-    # Check mandatory columns are present
+    # Add z coordinate if needed
+    if (
+        "z_coordinate" not in data_renamed
+        and source_definition.space_columns is not None
+        and source_definition.space_columns.z is not None
+    ):
+        z_column = source_definition.space_columns.z
+        logger.info(f"Using {z_column} to define z_coordinate")
+        # We copy it so the original can still be melted as a main_variable.
+        data_renamed["z_coordinate"] = data_renamed.loc[:, z_column].copy()
+        zcol2zcoordtype = dict(altitude=0, pressure=1)
+        data_renamed["z_coordinate_type"] = zcol2zcoordtype[z_column]
+        data_renamed["z_coordinate_type"] = data_renamed["z_coordinate_type"].astype(
+            "int"
+        )
+
+        # Check mandatory columns are present
     check_mandatory_columns(data_renamed, source_definition)
     # Cast data types to those specified in Service Definition file.
     cast_to_descriptions(data_renamed, source_definition)
@@ -285,18 +301,11 @@ def _melt_variables(
         value_vars=variables,
         var_name="observed_variable",
         value_name="observation_value",
-    ).rename(dict(observation_id="original_observation_id"), axis=1, copy=False)
-    # New observation id unique for each observation value
-    logger.info("Adding new observation id (only unique for this chunk)")
-    if "original_observation_id" in homogenised_data_melted:
-        unique_keys = ["original_observation_id", "observed_variable"]
-    else:
-        unique_keys = ["primary_station_id", "report_timestamp", "observed_variable"]
-    homogenised_data_melted = homogenised_data_melted.assign(
-        observation_id=homogenised_data_melted.apply(
-            lambda x: "".join(x[unique_keys].astype(str)), axis=1
-        ).apply(hash_string)
     )
+    # New observation id unique for each observation value
+    if "observation_id" not in homogenised_data_melted:
+        logger.info("Adding new observation id (only unique for this chunk)")
+        homogenised_data_melted["observation_id"] = homogenised_data_melted.index
     # Handle auxiliary variables
     homogenised_data_melted = _handle_aux_variables(
         melt_columns, cdm_tables_location, homogenised_data_melted
@@ -342,7 +351,7 @@ def _handle_aux_variables(
             homogenised_data_melted = homogenised_data_melted.drop(qf_col.name, axis=1)
             # Ensure is int and fill nans with 3 (missing according to the CDM)
             homogenised_data_melted["quality_flag"] = (
-                homogenised_data_melted["quality_flag"].fillna(3).astype("int")
+                homogenised_data_melted["quality_flag"].fillna(3).astype("uint8")
             )
     # Add processing level
     if melt_columns.processing_level:
@@ -373,7 +382,13 @@ def _add_uncertainty_fields(
         uncertainty_type_name = f"uncertainty_type{unc_type_code}"
         uncertainty_units_name = f"uncertainty_units{unc_type_code}"
         homogenised_data_melted[uncertainty_value_name] = numpy.nan
+        homogenised_data_melted[uncertainty_value_name] = homogenised_data_melted[
+            uncertainty_value_name
+        ].astype("float32")
         homogenised_data_melted[uncertainty_type_name] = unc_type_code
+        homogenised_data_melted[uncertainty_type_name] = homogenised_data_melted[
+            uncertainty_type_name
+        ].astype("uint8")
         homogenised_data_melted[uncertainty_units_name] = "NA"
 
         for unc_col in unc_cols:

diff --git a/cdsobs/netcdf.py b/cdsobs/netcdf.py
@@ -18,8 +18,10 @@ def get_encoding_with_compression(
                 encoding[var].update(dict(dtype="S"))
             case "S" | "O", "char_to_str":
                 encoding[var].update(dict(dtype="str", compression=None))
+            case "M", _:
+                pass
             case _, _:
-                continue
+                encoding[var].update(dict(dtype=dataset[var].values.dtype))
     return encoding