Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bsweger/expand moto fixtures #66

Merged
merged 3 commits into from
Dec 6, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/cladetime/sequence.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def get_metadata(
# get sequence metadata from a URL
file_suffix = Path(urlparse(metadata_url).path).suffix
if file_suffix in [".tsv", ".zst"]:
metadata = pl.scan_csv(metadata_url, separator="\t", n_rows=num_rows)
metadata = pl.scan_csv(metadata_url, separator="\t", n_rows=num_rows, infer_schema_length=100000)
elif file_suffix == ".xz":
# Python's lzma module doesn't support opening via HTTP, so use requests
# to download the file in chunks and then decompress it
Expand All @@ -83,7 +83,7 @@ def get_metadata(
decompressed_chunk = decompressor.decompress(chunk)
buffer.write(decompressed_chunk)
buffer.seek(0)
metadata = pl.scan_csv(buffer, separator="\t", n_rows=num_rows)
metadata = pl.scan_csv(buffer, separator="\t", n_rows=num_rows, infer_schema_length=100000)
else:
raise ValueError(f"Unsupported compression type: {file_suffix}")

Expand Down
62 changes: 23 additions & 39 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import json
import lzma
from datetime import datetime, timezone
from pathlib import Path

import boto3
import pytest
Expand All @@ -11,6 +10,15 @@
from cladetime.util.config import Config


@pytest.fixture
def moto_file_path() -> Path:
"""
Return path to the directory of files uploaded to the moto mocked S3 bucket.
"""
moto_file_path = Path(__file__).parent.joinpath("data").joinpath("moto_fixture")
return moto_file_path


@pytest.fixture(scope="function")
def demo_mode(monkeypatch):
"Set demo mode to True for tests using the Nextstrain 100K sequence files."
Expand All @@ -37,22 +45,13 @@ def test_sequences():
return (file_name, set(sequences))


@pytest.fixture
def ncov_metadata():
return {
"schema_version": "v1",
"nextclade_dataset_name": "SARS-CoV-2",
"nextclade_dataset_version": "",
}


@pytest.fixture
def s3_object_keys():
return {
"sequence_metadata": "data/object-key/metadata.tsv.zst",
"sequence_metadata_xz": "data/object-key/metadata.tsv.xz",
"sequence": "data/object-key/sequences.fasta.zst",
"ncov_metadata": "data/object-key/metadata_version.json",
"sequence_metadata_zst": "data/metadata.tsv.zst",
"sequence_metadata_xz": "data/metadata.tsv.xz",
"sequences_xz": "data/sequences.fasta.xz",
"ncov_metadata": "data/metadata_version.json",
}


Expand All @@ -65,7 +64,7 @@ def mock_session(mocker):


@pytest.fixture
def s3_setup(s3_object_keys, ncov_metadata):
def s3_setup(moto_file_path, s3_object_keys):
"""
Setup mock S3 bucket with versioned objects that represent testing files for
sequence data, sequence metadata, and ncov pipeline metadata.
Expand Down Expand Up @@ -93,27 +92,12 @@ def s3_setup(s3_object_keys, ncov_metadata):
# Add versioned sequence, sequence metadata, and ncov metadata test objects
versions = ["2023-01-01 03:05:01", "2023-02-05 03:33:06", "2023-02-05 14:33:06", "2023-03-22 22:55:12"]
for i, version in enumerate(versions, start=1):
for key, value in s3_object_keys.items():
if key == "ncov_metadata":
ncov_metadata["nextclade_dataset_version"] = f"version-{i}"
ncov_metadata["nextclade_dataset_name"] = "sars-cov-2"
ncov_metadata["nextclade_dataset_name_full"] = "data/clades"
ncov_metadata["nextclade_version"] = "nexclade 3.8.2"
ncov_metadata["nextclade_version_num"] = "3.8.2"
ncov_metadata["greeting"] = "hello from pytest and moto"
content = json.dumps(ncov_metadata)
elif key == "sequence_metadata_xz":
content = lzma.compress(str.encode(f"{value} version {i}"))
else:
content = f"{value} version {i}"
# use freezegun to override system date, which in
# turn sets S3 object version LastModified date
with freeze_time(version):
s3_client.put_object(
Bucket=bucket_name,
Key=value,
Body=content,
)
extra_args = {"Metadata": {"version": str(i)}}
# use freezegun to override system date, which in
# turn sets S3 object version LastModified date
with freeze_time(version):
for file in moto_file_path.iterdir():
s3_client.upload_file(file, bucket_name, f"data/{file.name}", ExtraArgs=extra_args)

yield s3_client, bucket_name, s3_object_keys

Expand All @@ -127,8 +111,8 @@ def test_config(s3_setup):
test_config = Config()
test_config.nextstrain_min_seq_date = datetime(2023, 1, 1).replace(tzinfo=timezone.utc)
test_config.nextstrain_ncov_bucket = "versioned-bucket"
test_config.nextstrain_genome_metadata_key = s3_object_keys["sequence_metadata"]
test_config.nextstrain_genome_sequence_key = s3_object_keys["sequence"]
test_config.nextstrain_genome_metadata_key = s3_object_keys["sequence_metadata_zst"]
test_config.nextstrain_genome_sequence_key = s3_object_keys["sequences_xz"]
test_config.nextstrain_ncov_metadata_key = s3_object_keys["ncov_metadata"]

return test_config
5 changes: 3 additions & 2 deletions tests/data/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,12 @@

This directory contains test files used by CladeTime's test suite.

* `moto_fixture` directory contains files used when recreating Nextstrain/Nextclade data in the moto mocked S3 bucket
* `test_metadata.tsv` was used to test `get_clade_list` before that functionality moved to variant-nowcast-hub
* `metadata.tsv.zst` and `metadata.tsv.xz` are used to test setting CladeTime's sequence_metadata property.
* `test_sequence.xz` is used to test the sequence filter function
* `test_sequences.fasta`, `test_sequences.fasta`, and `test_nexclade_dataset.zip` are used in Nextclade integration tests
* `test_sequences_evolving.fasta` is used to test clade assignments with prior reference trees
* `test_sequences.fasta`, `test_sequences_fake.fasta`, and `test_nexclade_dataset.zip` are used in Nextclade integration tests
* `test_sequences_updated.fasta` is used to test clade assignments with prior reference trees
* it contains 3 sequence strains with clade assignments that changed between 2024-08-02 and 2024-11-07
* differing clade assignments were determined by comparing the 2024-08-02 and 2024-11-07 versions of Nextstrain's sequence metadata
* `USA/VA-CDC-LC1109961/2024` is assigned to `24C` as of 2024-08-02 and `24E` as of 2024-11-07
Expand Down
Binary file added tests/data/moto_fixture/metadata.tsv.xz
Binary file not shown.
Binary file added tests/data/moto_fixture/metadata.tsv.zst
Binary file not shown.
1 change: 1 addition & 0 deletions tests/data/moto_fixture/metadata_version.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"schema_version":"v1","nextclade_version":"nextclade 3.8.2","nextclade_dataset_name":"SARS-CoV-2","nextclade_dataset_version":"2024-11-19--14-18-53Z","nextclade_tsv_sha256sum":"1800155490bd925a85fbcb4a46d19c72311a0ed6d1cd58d7d26899673cca83f1","metadata_tsv_sha256sum":"dae40f81f1cef7cb4a246c4ad483d20bda91ed3c79f7bfb81de4f67cd4797156"}
Binary file added tests/data/moto_fixture/sequences.fasta.xz
Binary file not shown.
15 changes: 7 additions & 8 deletions tests/unit/test_cladetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,27 +114,27 @@ def test_cladetime_future_date():


@pytest.mark.parametrize(
"sequence_as_of, expected_content",
"sequence_as_of, expected_metadata",
[
(
"2024-09-01",
"version 4",
{"version": "4"},
),
(
None,
"version 4",
{"version": "4"},
),
(
datetime(2023, 2, 5, 5, 55),
"version 2",
{"version": "2"},
),
(
datetime(2023, 2, 5, 1, 22),
"version 1",
{"version": "1"},
),
],
)
def test_cladetime_urls(s3_setup, test_config, sequence_as_of, expected_content):
def test_cladetime_urls(s3_setup, test_config, sequence_as_of, expected_metadata):
s3_client, bucket_name, s3_object_keys = s3_setup

mock = MagicMock(return_value=test_config, name="CladeTime._get_config_mock")
Expand All @@ -147,7 +147,7 @@ def test_cladetime_urls(s3_setup, test_config, sequence_as_of, expected_content)
key = parsed_url.path.strip("/")
version_id = parse_qs(parsed_url.query)["versionId"][0]
object = s3_client.get_object(Bucket=bucket_name, Key=key, VersionId=version_id)
assert expected_content in object["Body"].read().decode("utf-8").lower()
assert object.get("Metadata") == expected_metadata

if ct.sequence_as_of < test_config.nextstrain_min_ncov_metadata_date:
assert ct.url_ncov_metadata is None
Expand All @@ -170,7 +170,6 @@ def test_cladetime_ncov_metadata(s3_setup, s3_object_keys, test_config):
)
ct.url_ncov_metadata = presigned_url

assert ct.ncov_metadata.get("nextclade_dataset_version") == "version-4"
assert ct.ncov_metadata.get("nextclade_dataset_name_full") == "nextstrain/sars-cov-2/wuhan-hu-1/orfs"
assert ct.ncov_metadata.get("nextclade_version_num") == "3.8.2"

Expand Down
44 changes: 22 additions & 22 deletions tests/unit/test_sequence.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,41 +53,41 @@ def test_get_metadata(test_file_path, metadata_file):
assert expected_cols.issubset(metadata_cols)


def test_get_metadata_url(s3_setup):
def test_get_metadata_url(s3_setup, test_file_path):
"""
Test get_metadata when used with an S3 URL instead of a local file.
Needs additional research into moto and S3 url access.
"""
s3_client, bucket_name, s3_object_keys = s3_setup

# get metadata file from S3 using ZSTD compression
presigned_url = s3_client.generate_presigned_url(
"get_object",
Params={"Bucket": bucket_name, "Key": s3_object_keys["sequence_metadata"]},
ExpiresIn=3600,
# For .zst files, get_metadata uses polars to access the file directly via scan_csv.
# However, that is difficult to test, because polars doesn't use requests or boto
# under the hood, so it doesn't work with moto. Thus, this hacky test passes a
# test file path as the metadata_url param.
test_file = test_file_path / "metadata.tsv.zst"
metadata = sequence.get_metadata(metadata_url=str(test_file))
# ensure lazyframe can be collected and check its shape and columns
metadata_df = metadata.collect()
assert metadata_df.shape == (99373, 58)
assert all(
col in metadata_df.columns for col in ["strain", "date", "country", "division", "location", "clade_nextstrain"]
)
metadata = sequence.get_metadata(metadata_url=presigned_url)
assert isinstance(metadata, pl.LazyFrame)
# ZNK 2024-11-25: I would like to test this, but I am not sure what the
# output should be and I am getting 403: no body errors with this.
# expected_metadata = pl.DataFrame(
# {"data/object-key/metadata.tsv.zst version 4": []}
# ).cast({"data/object-key/metadata.tsv.zst version 4": str})
# assert_frame_equal(expected_metadata, metadata.collect_schema(), check_column_order=False, check_row_order=False)

# get metadata file from S3 using XZ compression

# Get metadata file from S3 using XZ compression. Here we can use a presigned S3 URL
# because for .xz files, get_metadata uses requests to download the file in chunks
# before polars processes it.
presigned_url = s3_client.generate_presigned_url(
"get_object",
Params={"Bucket": bucket_name, "Key": s3_object_keys["sequence_metadata_xz"]},
ExpiresIn=3600,
)
metadata = sequence.get_metadata(metadata_url=presigned_url)
assert isinstance(metadata, pl.LazyFrame)
expected_metadata = pl.DataFrame(
{"data/object-key/metadata.tsv.xz version 4": []}
).cast({"data/object-key/metadata.tsv.xz version 4": str})

assert_frame_equal(expected_metadata, metadata.collect(), check_column_order=False, check_row_order=False)
# ensure lazyframe can be collected and check its shape and columns
metadata_df = metadata.collect()
assert metadata_df.shape == (99373, 58)
assert all(
col in metadata_df.columns for col in ["strain", "date", "country", "division", "location", "clade_nextstrain"]
)


def test_filter_metadata():
Expand Down
3 changes: 2 additions & 1 deletion tests/unit/util/test_reference.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,15 @@ def test__get_s3_object_url(s3_setup):
s3_client, bucket_name, s3_object_keys = s3_setup

target_date = datetime.strptime("2023-02-15", "%Y-%m-%d").replace(tzinfo=timezone.utc)
object_key = s3_object_keys["sequence_metadata"]
object_key = s3_object_keys["sequence_metadata_zst"]

version_id, version_url = _get_s3_object_url(bucket_name, object_key, target_date)

assert version_id is not None
s3_object = s3_client.get_object(Bucket=bucket_name, Key=object_key, VersionId=version_id)
last_modified = s3_object["LastModified"]

assert s3_object.get("Metadata") == {"version": "3"}
assert last_modified <= target_date
assert last_modified == datetime.strptime("2023-02-05 14:33:06", "%Y-%m-%d %H:%M:%S").replace(tzinfo=timezone.utc)
assert version_url == f"https://{bucket_name}.s3.amazonaws.com/{object_key}?versionId={version_id}"
Loading