Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

(stale) Add cyclic block bootstrap - alternate implementation #523

Draft
wants to merge 10 commits into
base: develop
Choose a base branch
from
74 changes: 74 additions & 0 deletions src/scores/emerging/__scaffolding/block_bootstrap/blkbtrp.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
#!/usr/bin/env cabal
{- cabal:
build-depends:
base,
transformers,
vector,
random
-}
{-# LANGUAGE TupleSections #-}

import Control.Monad.Trans.State.Strict
import qualified Data.Vector as V
import System.Random
import System.Random.Stateful

data AxisInfo = AxisInfo
{ _axisLen :: Int,
_axisBlockSize :: Int,
_axisNumBlocks :: Int
}
deriving (Eq, Show)

type AxisBlockIdx = (V.Vector (V.Vector Int))
type AxisBlockIdxExpanded = (V.Vector (V.Vector Int))

-- | Struct to hold axis information. Trims axis if block size is not a multiple of axis length.
mkAxisInfo :: Int -> Int -> AxisInfo
mkAxisInfo l b =
case l `divMod` b of
(0, 0) -> error "Empty block"
(n, 0) -> AxisInfo l b n
(n, _r) -> AxisInfo (n * b) b n -- trim

-- | Make cyclic block indices based on blocksize
mkCyclicBlockIdx :: AxisInfo -> Int -> V.Vector Int
mkCyclicBlockIdx (AxisInfo l b _) = V.unfoldrExactN b (\n -> (n `mod` l, n + 1))

-- | Indices for one block sample for an axis
mkAxisBlockSampleM :: AxisInfo -> State StdGen (V.Vector Int)
mkAxisBlockSampleM a@(AxisInfo l _ _) =
mkCyclicBlockIdx a <$> state (randomR (0, l - 1))

-- | Indices for N block samples for an axis
mkAxisBlockSampleNM :: AxisInfo -> State StdGen AxisBlockIdx
mkAxisBlockSampleNM a@(AxisInfo _ _ n) =
V.replicateM n (mkAxisBlockSampleM a)

-- | (Ordered) list containing n * b block sample indices for each axis,
-- where n = num blocks
-- b = block size
-- return = list containing n * b array of indices
btstrpIndices :: [AxisInfo] -> State StdGen [AxisBlockIdx]
btstrpIndices = mapM mkAxisBlockSampleNM

-- | Unfolds indices for axes based on axis length
indexRange :: [AxisInfo] -> AxisBlockIdx
indexRange = foldMap (\x -> V.singleton (V.iterateN (_axisLen x) (+ 1) 0))

-- | Expand indices into a vector of coordinates. A coordinate is a Vector of integers representing
-- the outermost -> innermost axis
expandIndices :: AxisBlockIdx -> AxisBlockIdxExpanded
expandIndices = foldr (\z acc -> foldMap (\x -> V.cons x <$> acc) z) (V.singleton V.empty)

main :: IO ()
main = do
let axi = mkAxisInfo 10 5
let axi2 = mkAxisInfo 21 7
let axi3 = mkAxisInfo 8 2
let pureGen = mkStdGen 32
let idxRange = indexRange [axi, axi3]
print axi
print idxRange
print $ expandIndices idxRange
print $ evalState (btstrpIndices [axi, axi2]) pureGen
Empty file.
197 changes: 197 additions & 0 deletions src/scores/emerging/block_bootstrap/_test_block_bootstrap_TO_REMOVE.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,197 @@
# pylint: disable=all

import pprint

import numpy as np
import xarray as xr

from scores.emerging.block_bootstrap.array_info import ArrayInfo, ArrayInfoCollection
from scores.emerging.block_bootstrap.axis_info import AxisInfo, AxisInfoCollection
from scores.emerging.block_bootstrap.block_sampler import BlockSampler
from scores.emerging.block_bootstrap.helpers import (
partial_linear_order_by_ref,
reorder_all_arr_dims,
reorder_dims,
)
from scores.emerging.block_bootstrap.methods import FitBlocksMethod


def _test_numpy_blk_bootstrap_single_iter():
print("\nBOOTSTRAPPING (single iteration, single array)...")
# generate test data
axis_len = [13, 10, 8, 7]
block_sizes = [4, 3, 2, 3]
method = FitBlocksMethod.PARTIAL
cyclic = True
rng = np.random.default_rng(seed=42)
# random dataset with integers so its easy to visualize
input_arr = rng.integers(low=0, high=10, size=axis_len)
da = xr.DataArray(input_arr)

print("\n--- array info ---")
array_info_cln = ArrayInfoCollection.make_from_arrays(
arrs=[da],
bootstrap_dims=da.dims[::-1], # reverse dimensions for fun
block_sizes=block_sizes[::-1],
fit_blocks_method=method,
)
pprint.pp(array_info_cln.array_info)

print("\n--- input array ---")
pprint.pp(input_arr.shape)
res = np.histogram(input_arr, bins=5)
pprint.pp(res)

print("\n--- reordered input array ---")
input_arr_ord = array_info_cln.data_arrays[0].values
pprint.pp(input_arr_ord.shape)
res = np.histogram(input_arr, bins=5)
pprint.pp(res)

print("\n--- sample axis block indices ---")
block_sampler = BlockSampler(
array_info_collection=array_info_cln,
cyclic=cyclic,
)
output_arr = block_sampler.sample_blocks_unchecked([input_arr_ord])[0]
pprint.pp(block_sampler.bootstrap_idx_map)

print("\n--- output array ---")
pprint.pp(output_arr.shape)
# print(output_arr)
res = np.histogram(output_arr, bins=5)
pprint.pp(res)

# print("\nBOOTSTRAPPING (single iteration, multiple arrays, same dimensions)...")

# print("\nBOOTSTRAPPING (single iteration, multiple arrays, differing/partial dimensions)...")


def _test_make_axis_info_collection():
print("\nAXIS COLLECTION...")

arr_test_1 = np.random.rand(10, 10, 12, 7, 5)
da_1 = xr.DataArray(
data=arr_test_1,
dims=["x", "y", "time", "lead_time", "height"],
)
arr_test_2 = np.random.rand(10, 10, 7, 5)
da_2 = xr.DataArray(
data=arr_test_2,
dims=["y", "x", "lead_time", "height"],
)
arr_test_3 = np.random.rand(10, 10, 7)
da_3 = xr.DataArray(
data=arr_test_3,
dims=["x", "y", "lead_time"],
)
axi_collection = make_axis_info_collection(
[da_1, da_2, da_3],
["lead_time", "x", "y"],
[2, 4, 5],
)

print("\n--- axis collection ---")
pprint.pp(axi_collection)

try:
make_axis_info_collection(
[da_1, da_2, da_3],
["lead_time", "x", "y"],
[2, 4, 12],
)
except AssertionError:
print(f"\n--- failure due to invalid block size ---")
print(f"Caught expected assertion error")

try:
arr_test_fail = np.random.rand(10, 11, 7)
da_fail = xr.DataArray(
data=arr_test_fail,
dims=["x", "y", "lead_time"],
)
make_axis_info_collection(
[da_1, da_2, da_fail],
["lead_time", "x", "y"],
[2, 4, 5],
)
except ValueError as e:
print(f"\n--- failure due to inconsistent dim size ---")
print(f"Caught expected value error: {e}")


def _test_reorder_all_dims():
print("\nREORDER ALL ARRAY DIMS...")

arr_test_1 = np.random.rand(10, 10, 12, 7, 5)
da_1 = xr.DataArray(
data=arr_test_1,
dims=["x", "y", "time", "lead_time", "height"],
)
arr_test_2 = np.random.rand(10, 10, 7, 5)
da_2 = xr.DataArray(
data=arr_test_2,
dims=["y", "x", "lead_time", "height"],
)
arr_test_3 = np.random.rand(10, 10, 7)
da_3 = xr.DataArray(
data=arr_test_3,
dims=["x", "y", "lead_time"],
)
(arrs_ord, all_dims) = reorder_all_arr_dims([da_1, da_2, da_3], ["lead_time", "x", "y"])

print("\n--- re-order all arrays ---")
print(f"expected shape for leading dimensions = {(7, 10, 10)}")
print(f"all dimensions ordered = {all_dims}")
print("---")
print(f"shape arr_1 original: {da_1.shape}")
print(f"shape arr_1 reordered: {arrs_ord[0].shape}")
print("---")
print(f"shape arr_2 original: {da_2.shape}")
print(f"shape arr_2 reordered: {arrs_ord[1].shape}")
print("---")
print(f"shape arr_3 original: {da_3.shape}")
print(f"shape arr_3 reordered: {arrs_ord[2].shape}")
print("---")

try:
reorder_all_arr_dims(
[da_1, da_2, da_3],
["lead_time", "x", "y", "potato"],
)
except ValueError as e:
print(f"\n--- failure due to missing dim ---")
print(f"Caught expected value error: {e}")


def _test_reorder_dims():
print("\nREORDER DIMS...")

arr_test = np.random.rand(10, 10, 12, 7, 5)
da = xr.DataArray(
data=arr_test,
dims=["x", "y", "time", "lead_time", "height"],
)

print("\n--- partial order ---")
da_ord = reorder_dims(da, ["y", "height", "x"])
print(f"shape_in: {da.shape}, shape_out: {da_ord.shape}")
print(f"dims_in: {da.dims}, dims_out: {da_ord.dims}")
try:
reorder_dims(da, ["y", "height", "x"], False)
except ValueError as e:
print(f"\n--- no auto ordering for unspecified dims ---")
print(f"Caught expected value error: {e}")

try:
reorder_dims(da, ["x", "height", "alpha", "y"])
except ValueError as e:
print(f"\n--- gap in ordering, could not find alpha ---")
print(f"Caught expected value error: {e}")


if __name__ == "__main__":
# _test_reorder_dims()
# _test_reorder_all_dims()
# _test_make_axis_info_collection()
_test_numpy_blk_bootstrap_single_iter()
116 changes: 116 additions & 0 deletions src/scores/emerging/block_bootstrap/array_info.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
from dataclasses import dataclass

import xarray as xr

from scores.emerging.block_bootstrap.axis_info import AxisInfo, AxisInfoCollection
from scores.emerging.block_bootstrap.helpers import reorder_all_arr_dims
from scores.emerging.block_bootstrap.methods import FitBlocksMethod


@dataclass
class ArrayInfo:
#: ordered list of axis information for array
axis_info: list[AxisInfo]
#: ordered list of dimensions for array that are bootstrapped
bootstrap_dims: list[str]

@property
def block_sizes(self) -> list[int]:
"""
list of block sizes for all unique dimensions
"""
return [x.block_size for x in self.axis_info]

@property
def num_blocks(self) -> list[int]:
"""
list of number of blocks for all unique dimensions
"""
return [x.num_blocks for x in self.axis_info]

@property
def output_axis_lengths(self) -> list[int]:
"""
list of desired output axis lengths for all unique dimensions
"""
return [x.length_out for x in self.axis_info]

@staticmethod
def make_from_axis_collection(
dims: list[str],
axi_collection: AxisInfoCollection,
):
"""
Args:
bootstrap_dims: dimensions to bootstrap for array
axi_collection: Ordered collection containing axis information
for all arrays, which this array must a subset of.
"""
assert len(set(dims) - set(axi_collection.dims_order)) == 0

axis_info = []
bootstrap_dims = []

# insert in the order of axi_collection
for d, axi in axi_collection.items():
if d in dims:
axis_info.append(axi)
bootstrap_dims.append(d)

return ArrayInfo(axis_info, bootstrap_dims)


@dataclass
class ArrayInfoCollection:
#: Ordered collection of axis information for all arrays
axis_info_collection: AxisInfoCollection
#: Ordered list of array information for each array
array_info: list[ArrayInfo]
#: Arrays with dimensions ordered according `make_from_arrays`
data_arrays: list[xr.DataArray]

@property
def fit_blocks_method(self) -> FitBlocksMethod:
return self.axis_info_collection.fit_blocks_method

@staticmethod
def make_from_arrays(
arrs: list[xr.DataArray],
bootstrap_dims: list[str],
block_sizes: list[int],
fit_blocks_method: FitBlocksMethod,
auto_order_missing: bool = True,
):
if len(block_sizes) != len(bootstrap_dims):
ValueError("`block_sizes` must be the same size as `bootstrap_dims`.")

# reorder dimensions to align them across all arrays
(arrs_reordered, _) = reorder_all_arr_dims(
arrs,
bootstrap_dims,
auto_order_missing,
)

axi_collection = AxisInfoCollection.make_from_arrays(
arrs_reordered,
bootstrap_dims,
block_sizes,
fit_blocks_method,
)

arr_info = []

for arr in arrs:
btrp_dims_for_arr = set(arr.dims) & set(bootstrap_dims)
arr_info.append(
ArrayInfo.make_from_axis_collection(
btrp_dims_for_arr,
axi_collection,
)
)

return ArrayInfoCollection(
axis_info_collection=axi_collection,
array_info=arr_info,
data_arrays=arrs_reordered,
)
Loading