rs_workflows/on_demand_conversion.md

Prefect flows and tasks for on-demand SAFE-to-Zarr conversion.

`cleanup_staged_safe_item_task(env, collection_id, item_id)` `async`

Remove the staged SAFE item from the catalog after conversion.

Source code in docs/rs-client-libraries/rs_workflows/on_demand_conversion.py

@task(name="Cleanup staged SAFE item")
async def cleanup_staged_safe_item_task(
    env: FlowEnvArgs,
    collection_id: str,
    item_id: str,
) -> None:
    """
    Delete the staged SAFE item from the catalog once the conversion is done.

    Args:
        env: Serialized flow environment used to rebuild the ``FlowEnv``.
        collection_id: Catalog collection the item is removed from.
        item_id: Identifier of the staged SAFE item to remove.
    """
    run_logger = get_run_logger()
    task_env = FlowEnv(env)

    with task_env.start_span(__name__, "cleanup-staged-safe-item"):
        catalog: CatalogClient = task_env.rs_client.get_catalog_client()

        run_logger.info(f"Removing staged SAFE item {item_id!r} from output collection {collection_id!r}.")
        # raise_for_status=False: the removal is requested without raising on an
        # HTTP error status, so the message below is logged either way.
        catalog.remove_item(collection_id, item_id, raise_for_status=False)
        run_logger.info(f"Removed staged SAFE item {item_id!r} from output collection {collection_id!r}.")

`on_demand_conversion(conversion_input, retry_config=RetryConfig())` `async`

Convert a legacy SAFE product to its EOPF (Zarr) counterpart on demand.

The flow stages the input SAFE product into the target catalog collection, prepares its assets (decompressing/unzipping archived items when needed), maps the legacy product type to the corresponding output product type, then runs the conversion and publishes the generated product.

Parameters:

Name	Type	Description	Default
`conversion_input`	`ConversionIn`	Conversion parameters (STAC input, environment, target collection mapping and optional selected assets).	required
`retry_config`	`RetryConfig`	Retry policy applied to the staging task.	`RetryConfig()`

Source code in docs/rs-client-libraries/rs_workflows/on_demand_conversion.py

@flow(name="convert-safe")
async def on_demand_conversion(
    conversion_input: ConversionIn,
    # NOTE: the default RetryConfig() is built once, when the function is defined,
    # and that single instance is shared by every call that omits retry_config.
    retry_config: RetryConfig = RetryConfig(),  # type: ignore
):
    """
    Convert a legacy SAFE product to its EOPF (Zarr) counterpart on demand.

    The flow stages the input SAFE product into the target catalog collection,
    prepares its assets (decompressing/unzipping archived items when needed),
    maps the legacy product type to the corresponding output product type, then
    runs the conversion and publishes the generated product. Finally the staged
    SAFE item is removed from the catalog.

    Args:
        conversion_input: Conversion parameters (STAC input, environment, target
            collection mapping and optional selected assets).
        retry_config: Retry policy applied to the staging task.

    Raises:
        ValueError: If the target mapping has no ``collection_name``, or (from
            ``resolve_generated_product``) if no mapping matches the computed
            output product type.
        RuntimeError: If a staging job does not report ``"successful"``, if no
            catalog item is found after staging, if asset preparation fails, if
            the product type mapping is empty, if the staged item has no asset,
            or if the SAFE root path cannot be derived from the asset href.
        KeyError: If a required environment variable (``RSPY_HOST_OSAM``, or
            ``JUPYTERHUB_API_TOKEN`` in cluster mode) is not set.
    """
    logger = get_run_logger()
    logger.info(f"Starting on-demand conversion flow with input: {conversion_input}")
    flow_env = FlowEnv(conversion_input.env)
    # The SAFE item is staged into (and later read back from) this collection.
    staging_collection = conversion_input.generated_product_to_collection_identifier.collection_name
    if staging_collection is None:
        raise ValueError("collection_name is required to stage and retrieve the SAFE item")

    with flow_env.start_span(__name__, "legacy-conversion"):
        # 1. Stage the input SAFE product into the catalog.
        # NOTE(review): this message is logged before the staging task is actually submitted.
        logger.info("Staging task submitted, waiting for completion...")

        # stac_input is either a string (href of the STAC item) or a dict (feature
        # collection). Derive the item id and the original href from whichever form we got.
        stac_item: str | dict[str, Any] = conversion_input.stac_input
        # String input: the id is the last "/"-separated segment. Dict input: id of the first feature.
        stac_item_id = stac_item.split("/")[-1] if isinstance(stac_item, str) else stac_item["features"][0]["id"]
        original_stac_href = None
        if isinstance(stac_item, str):
            original_stac_href = stac_item
        else:
            # Prefer the "self" link of the first feature; fall back to its id.
            input_feature = stac_item["features"][0]
            original_stac_href = next(
                (link["href"] for link in input_feature.get("links", []) if link.get("rel") == "self"),
                input_feature["id"],
            )

        legacy_product = staging_task.with_options(
            retries=retry_config.staging_retries,
            retry_delay_seconds=retry_config.staging_retry_delay,
        ).submit(
            flow_env.serialize(),
            stac_input=conversion_input.stac_input,
            catalog_collection_identifier=staging_collection,
            # When no asset is selected, only the "product" asset is staged.
            asset_names=conversion_input.selected_assets or {"product"},
            poll_interval=10,
        )
        staging_results = legacy_product.result()  # type: ignore[unused-coroutine]

        # Every staging job must report "successful"; the first one that does not aborts the flow.
        for job_name, job_result in staging_results.items():
            if job_result.get("status") != "successful":
                raise RuntimeError(
                    f"Staging job {job_name!r} failed with status {job_result.get('status')!r}: "
                    f"{job_result.get('message')}",
                )
        # Read the staged item back from the catalog, looked up by the id derived above.
        catalog_client: CatalogClient = flow_env.rs_client.get_catalog_client()
        catalog_items = ItemCollection(
            catalog_client.get_items(
                collection_id=staging_collection,
                items_ids=[stac_item_id],
            ),
        )
        logger.info(f"Retrieved catalog items after staging: {catalog_items.to_dict()}")

        # Staging can report a "successful" job while staging nothing (e.g. when the
        # requested assets don't match any asset of the input item), which leaves the
        # collection empty. Fail explicitly instead of raising an opaque IndexError.
        if not catalog_items.items:
            raise RuntimeError(
                f"Staging produced no catalog item for {stac_item_id!r} in collection "
                f"{staging_collection!r}. Check that 'selected_assets' matches an asset "
                f"name of the input item (e.g. 'product').",
            )

        # Start from the staged catalog item; if it contains an archived SAFE asset,
        # step 2 will replace this with the uncompressed item.
        safe_item = catalog_items.items[0]

        # 2. Prepare assets for conversion (e.g. unzip if needed).
        # safe_item is overwritten on every iteration, so when several archived
        # items are found the last one processed is the one used for conversion.
        try:
            for idx in get_archived_item_indexes(catalog_items):
                safe_zipped_item = catalog_items.items[idx]
                logger.info(f"Processing item {safe_zipped_item.id} for asset extraction...")
                safe_unzipped_item = asset_unzip_decompress_task.submit(safe_zipped_item, True)
                safe_item = safe_unzipped_item.result()  # type: ignore[assignment]
                # Drop the "product" asset from the unzipped item before updating the catalog
                # (presumably the archive entry, now superseded by the extracted assets — confirm).
                safe_item.assets.pop("product", None)
                catalog_client.update_item(safe_item)
        except Exception as err:
            # Any failure in this step (task result or catalog update) is wrapped in a
            # RuntimeError; the original exception stays attached as the cause.
            raise RuntimeError(
                "Error while trying to update the item collection with the uncompressed/unzipped items. "
                "This error is likely due to a failure in the asset_unzip_decompress_task. "
                "Check previous logs for more details.",
            ) from err
        logger.info(f"Asset preparation completed, proceeding with conversion... {safe_item.to_dict()}")
        logger.info("Staging task completed, proceeding with conversion...")

        # 3. Compute the output product type from the product type mapping.

        legacy_product_type = safe_item.properties["product:type"]  # ex: IW_SLC__1S
        logger.info(f"Legacy SAFE product type used for mapping: {legacy_product_type}")
        mapping = find_product_type(legacy_product_type)
        # NOTE(review): the subscript below runs before the emptiness check, so a mapping
        # without "productType" would raise from the lookup itself rather than the
        # RuntimeError below — confirm what find_product_type returns for unknown types.
        output_product_type = mapping["productType"]  # ex: S01SIWSLC
        if not output_product_type:
            raise RuntimeError(f"No product type mapping found for legacy product type {legacy_product_type!r}")
        logger.info(f"Resolved SAFE conversion output product type: {output_product_type}")

        # 4. Compute the output bucket from the provided generated_product_to_collection_identifier mapping.

        # Match the computed output product type with the flow input mapping.
        # This gives us the output collection requested by the caller.
        generated_product = resolve_generated_product(
            output_product_type,
            [conversion_input.generated_product_to_collection_identifier],
        )

        # Fall back to the product type when the mapping carries no (or an empty) collection name.
        output_collection = generated_product.collection_name or generated_product.product_type

        # Read the owner/collection/product-type to S3 bucket rules from OSAM.
        # RSPY_HOST_OSAM must be set; a missing variable raises KeyError here.
        bucket_configuration = fetch_csv_from_endpoint(os.environ["RSPY_HOST_OSAM"] + "/internal/configuration")

        # Resolve the final S3 bucket using the same rules as generic DPR processing.
        output_bucket = find_s3_output_bucket(
            bucket_configuration,
            conversion_input.owner_id,
            output_collection,
            output_product_type,
        )
        logger.info(f"Computed SAFE conversion output bucket: {output_bucket}")

        # DPR receives the output directory; EOPF appends the generated product name under it.
        # os.path.join uses the platform path separator, so the intended
        # "s3://<bucket>/<owner>/<collection>" form relies on "/" being that separator.
        output_zarr_dir_path = os.path.join(
            "s3://",
            output_bucket,
            conversion_input.owner_id,
            output_collection,
        )

        # 5. Convert to Zarr.
        # The staged catalog item must expose the SAFE product asset expected by the conversion step:
        # use the "product" asset when present, otherwise the first asset of the item.
        input_asset = safe_item.assets.get("product") or next(iter(safe_item.assets.values()), None)
        if input_asset is None:
            raise RuntimeError(f"No SAFE asset found for item {safe_item.id!r}")

        # Derive the SAFE root directory from the asset href: cut right after the first
        # ".SAFE/" when present, otherwise cut right after the "/<item id>/" path segment.
        href = input_asset.href.rstrip("/")
        if ".SAFE/" in href:
            input_safe_path = href.split(".SAFE/", 1)[0] + ".SAFE"
        else:
            marker = f"/{safe_item.id}/"
            if marker not in href:
                raise RuntimeError(
                    f"Cannot derive SAFE root path from asset href {href!r} and item id {safe_item.id!r}",
                )
            input_safe_path = href.split(marker, 1)[0] + f"/{safe_item.id}"
        logger.info(f"Using input SAFE path for conversion: {input_safe_path}")

        # Temporary local workaround (currently disabled): use the original input SAFE
        # location until staging keeps file:local_path.
        # input_safe_path = original_input_safe_path
        # logger.info(f"Using original input SAFE path for conversion: {input_safe_path}")

        payload = {
            "input_safe_path": input_safe_path,
            "output_zarr_dir_path": output_zarr_dir_path,
        }

        # Create cluster info from JUPYTERHUB_API_TOKEN env var (only in cluster mode, read from the
        # prefect blocks) and Dask cluster label. Outside cluster mode the token is an empty string.
        cluster_info = ClusterInfo(
            jupyter_token=os.environ["JUPYTERHUB_API_TOKEN"] if prefect_utils.CLUSTER_MODE else "",
            cluster_label=conversion_input.dask_cluster_label,
            cluster_instance=conversion_input.dask_cluster_instance or "",
        )

        conversion = safe_conversion_task.submit(
            flow_env.serialize(),
            payload,
            cluster_info,
        )
        conversion_result: dict[str, Any] = conversion.result()  # type: ignore[assignment]

        # 6. Read .zattrs to get the STAC item of the converted product.
        converted_zarr_uri = conversion_result["zarr_uri"]
        converted_item = read_zarr_stac_item(converted_zarr_uri)
        logger.info(f"Staged SAFE item geometry: {safe_item.geometry}")
        logger.info(f"Staged SAFE item bbox: {safe_item.bbox}")
        # The footprint and product type of the converted item are overridden with the
        # staged SAFE item's geometry/bbox and the output product type computed in step 3.
        converted_item.geometry = safe_item.geometry
        converted_item.bbox = safe_item.bbox
        converted_item.properties["product:type"] = output_product_type

        # Keep a reference to the staged SAFE item used as conversion input.
        # This reads the first item fetched right after staging (not the unzipped item from
        # step 2): its "self" link if any, else the original input href, else the staged item id.
        staged_item = catalog_items.to_dict()["features"][0]
        derived_from_href = next(
            (link["href"] for link in staged_item.get("links", []) if link.get("rel") == "self"),
            original_stac_href or staged_item["id"],
        )
        converted_item.add_link(Link(rel="derived_from", target=derived_from_href, media_type="application/geo+json"))
        logger.info(f"Created STAC item from converted Zarr metadata: {converted_item.to_dict()}")

        # 7./8. Publish the converted item through the catalog flow's publish task.
        processed_item = DprProcessedItemMetadata(
            output_product_id=generated_product.name,
            product_type=output_product_type,
            stac_item=converted_item,
        )
        published = catalog_flow.publish.submit(
            flow_env.serialize(),
            [conversion_input.generated_product_to_collection_identifier],
            [processed_item],
        )
        published.result()  # type: ignore[unused-coroutine]

        # 9. Cleanup: remove the staged SAFE item from the catalog.
        # NOTE(review): the item was staged into staging_collection but is removed from
        # output_collection; these presumably name the same collection here — confirm.
        cleanup = cleanup_staged_safe_item_task.submit(flow_env.serialize(), output_collection, safe_item.id)
        cleanup.result()  # type: ignore[unused-coroutine]

        logger.info("On-demand conversion flow completed successfully.")

`read_zarr_stac_item(zarr_uri)`

Read the Zarr .zattrs file and build the STAC item generated by EOPF.

Source code in docs/rs-client-libraries/rs_workflows/on_demand_conversion.py

def read_zarr_stac_item(zarr_uri: str) -> Item:
    """
    Build the STAC item described by the root ``.zattrs`` of a Zarr product.

    Args:
        zarr_uri: Location of the Zarr product; trailing slashes are ignored.

    Returns:
        The STAC item created from the ``stac_discovery`` metadata.

    Raises:
        RuntimeError: If ``stac_discovery`` is missing, is not a dict, or has
            no ``properties`` entry.
        KeyError: If the URI uses the ``s3`` scheme and one of the ``S3_*``
            environment variables is not set.
    """
    product_root = zarr_uri.rstrip("/")
    # EOPF stores the discovery metadata used for catalog publication in the
    # root .zattrs file of the product.
    zattrs_uri = f"{product_root}/.zattrs"

    # Credentials are only passed to fsspec for S3 locations; any other scheme
    # is opened without extra options.
    fs_options = {}
    if urlparse(zattrs_uri).scheme == "s3":
        fs_options = {
            "key": os.environ["S3_ACCESSKEY"],
            "secret": os.environ["S3_SECRETKEY"],
            "client_kwargs": {
                "endpoint_url": os.environ["S3_ENDPOINT"],
                "region_name": os.environ["S3_REGION"],
            },
        }

    with fsspec.open(zattrs_uri, "r", encoding="utf-8", **fs_options) as handle:
        attributes = json.load(handle)

    discovery = attributes.get("stac_discovery")
    discovery_is_usable = isinstance(discovery, dict) and "properties" in discovery
    if not discovery_is_usable:
        raise RuntimeError(f"Missing 'stac_discovery' metadata in {zattrs_uri}")

    # The ".zarr" suffix is stripped only from names starting with "S0" (taken
    # as EOPF ids); other names keep it to avoid clashing with an existing
    # legacy product in the catalogue.
    product_name = os.path.basename(product_root)
    if product_name.startswith("S0"):
        product_name = product_name.removesuffix(".zarr")

    # Delegate to the shared DPR STAC builder so the item has the same shape
    # as the other DPR products.
    return create_stac_item(
        eopf_origin_datetime=None,
        eopf_feature=discovery,
        s3_data_location=zattrs_uri,
        product_name=product_name,
        dpr_processor="safe_to_zarr",
    )

`resolve_generated_product(output_product_type, generated_products)`

Find the generated product mapping for the computed output product type.

Source code in docs/rs-client-libraries/rs_workflows/on_demand_conversion.py

def resolve_generated_product(
    output_product_type: str,
    generated_products: list[FlowGeneratedProduct],
) -> FlowGeneratedProduct:
    """
    Pick the generated product mapping that matches the output product type.

    Only mappings whose ``name`` equals ``output_product_type`` are considered.
    Among those, the first one whose ``product_type`` is also equal wins;
    otherwise the first one with the wildcard product type ``"*"`` is used.

    Args:
        output_product_type: Computed output product type.
        generated_products: Candidate mappings, in priority order.

    Returns:
        The selected mapping.

    Raises:
        ValueError: If no mapping matches, or if the wildcard mapping selected
            as fallback has no ``collection_name``.
    """
    # SAFE conversion uses the computed product type as the output product
    # id/name, so the name must match before the product type is looked at.
    same_name = [product for product in generated_products if product.name == output_product_type]

    # Strict mapping first: exact name and exact product type.
    strict_match = next(
        (product for product in same_name if product.product_type == output_product_type),
        None,
    )
    if strict_match is not None:
        return strict_match

    # Same fallback convention as catalog publishing: exact name, wildcard type.
    wildcard_match = next((product for product in same_name if product.product_type == "*"), None)
    if wildcard_match:
        if wildcard_match.collection_name:
            return wildcard_match
        raise ValueError(
            f"collection_name is mandatory when product_type is '*' for {output_product_type!r}",
        )

    raise ValueError(
        f"No generated product mapping found for output product type {output_product_type!r}: "
        f"{generated_products}",
    )

`safe_conversion_task(env, payload, cluster_info)` `async`

Submit and monitor the SAFE-to-Zarr DPR conversion job.

Source code in docs/rs-client-libraries/rs_workflows/on_demand_conversion.py

@task(name="SAFE conversion")
async def safe_conversion_task(
    env: FlowEnvArgs,
    payload: dict,
    cluster_info: ClusterInfo,
) -> dict[str, Any]:
    """
    Run the SAFE-to-Zarr conversion through the DPR service and wait for it.

    Args:
        env: Serialized flow environment used to rebuild the ``FlowEnv``.
        payload: Conversion request passed as-is to the DPR client.
        cluster_info: Dask cluster information passed as-is to the DPR client.

    Returns:
        The result returned by ``wait_for_job`` for the conversion job.
    """
    run_logger = get_run_logger()
    task_env = FlowEnv(env)

    with task_env.start_span(__name__, "safe-conversion"):
        # The DPR service client both submits the job and polls it until it ends.
        dpr: DprClient = task_env.rs_client.get_dpr_client()

        run_logger.info(f"Triggering SAFE conversion with payload: {payload}")
        submitted_job = dpr.run_conv_safe_zarr(payload, cluster_info)
        job_output = dpr.wait_for_job(submitted_job, run_logger, "SAFE conversion")
        run_logger.info(f"SAFE conversion completed with result: {job_output}")

        # wait_for_job is annotated as returning list[dict], but conv_safe_zarr
        # yields a single result dictionary; the cast only informs the type checker.
        return cast(dict[str, Any], job_output)

rs_workflows/on_demand_conversion.md

cleanup_staged_safe_item_task(env, collection_id, item_id) async

on_demand_conversion(conversion_input, retry_config=RetryConfig()) async

read_zarr_stac_item(zarr_uri)

resolve_generated_product(output_product_type, generated_products)

safe_conversion_task(env, payload, cluster_info) async

`cleanup_staged_safe_item_task(env, collection_id, item_id)` `async`

`on_demand_conversion(conversion_input, retry_config=RetryConfig())` `async`

`read_zarr_stac_item(zarr_uri)`

`resolve_generated_product(output_product_type, generated_products)`

`safe_conversion_task(env, payload, cluster_info)` `async`