diff --git a/README.md b/README.md index 86a087f..8459c24 100644 --- a/README.md +++ b/README.md @@ -57,13 +57,13 @@ files = os.listdir(bw_dir) bw_files = [f"{bw_dir}/{f}" for f in files] features = pd.DataFrame({ - "chrom": ["chr1", "chr1"], - "start": [1000, 2000], - "end": [1500, 2500] + "seqnames": ["chr1", "chr1"], + "starts": [1000, 2000], + "ends": [1500, 2500] }) # Build GenomicArray -garr.build_genomicarray( +dataset = garr.build_genomicarray( files=bw_files, output_path=tempdir, features=features, @@ -80,6 +80,48 @@ garr.build_genomicarray( The build process stores missing intervals from a bigwig file as `np.nan`. The default is to choose an aggregate functions that works with `np.nan`. +### Query a `GenomicArrayDataset` + +Users can either reuse the `dataset` object returned when building the arrays or create a `GenomicArrayDataset` object by initializing it with the path where the files were created. + +```python +# Create a GenomicArrayDataset object from the existing dataset +dataset = GenomicArrayDataset(dataset_path=tempdir) + +# Query data for the first 10 regions across all samples +coverage_data = dataset[0:10, :] + +print(coverage_data.matrix) +print(coverage_data.feature_annotation) +``` + + ## output 1 + array([[1. , 0.5], + [1. , 0.5], + [1. , 0.5], + [1. , 0.5], + [1. , 0.5], + [1. , 0.5], + [1. , 0.5], + [1. , 0.5], + [1. , 0.5], + [1. , 0.5], + [1. 
, nan]], dtype=float32) + + ## output 2 + seqnames starts ends genarr_feature_index + 0 chr1 300 315 0 + 1 chr1 320 335 1 + 2 chr1 340 355 2 + 3 chr1 360 375 3 + 4 chr1 380 395 4 + 5 chr1 400 415 5 + 6 chr1 420 435 6 + 7 chr1 440 455 7 + 8 chr1 460 475 8 + 9 chr1 480 495 9 + 10 chr1 500 515 10 + diff --git a/src/genomicarrays/GenomicArrayDataset.py b/src/genomicarrays/GenomicArrayDataset.py index 6e2c87c..7bfcaf5 100644 --- a/src/genomicarrays/GenomicArrayDataset.py +++ b/src/genomicarrays/GenomicArrayDataset.py @@ -11,8 +11,7 @@ from genomicarray import GenomicArrayDataset garr = GenomicArrayDataset(dataset_path="/path/to/genomicarray/dir") - feature_indices = 1:10 - result1 = garr[feature_indices, 0] + result1 = garr[0:10, 0] print(result1) """ diff --git a/src/genomicarrays/GenomicArrayDatasetSlice.py b/src/genomicarrays/GenomicArrayDatasetSlice.py index 80ad090..67788c6 100644 --- a/src/genomicarrays/GenomicArrayDatasetSlice.py +++ b/src/genomicarrays/GenomicArrayDatasetSlice.py @@ -11,7 +11,7 @@ from genomicarray import GenomicArrayDataset garr = GenomicArrayDataset(dataset_path="/path/to/genomicarray/dir") - feature_indices = slice(1, 10) + feature_indices = slice(0, 10) result1 = garr[feature_indices, 0] print(result1) diff --git a/src/genomicarrays/build_genomicarray.py b/src/genomicarrays/build_genomicarray.py index 65991e5..beb5b93 100644 --- a/src/genomicarrays/build_genomicarray.py +++ b/src/genomicarrays/build_genomicarray.py @@ -20,8 +20,15 @@ # or just provide the path bw2 = "path/to/object2.bw" + features = pd.DataFrame({ + "seqnames": ["chr1", "chr1"], + "starts": [1000, 2000], + "ends": [1500, 2500] + }) + # Build GenomicArray dataset = build_genomicarray( + features=features, output_path=tempdir, files=[bw1, bw2], matrix_options=MatrixOptions(dtype=np.float32),