Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

uniform stratified sampling of alts #66

Open
wants to merge 5 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 58 additions & 4 deletions choicemodels/tools/mergedchoicetable.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
"""
import numpy as np
import pandas as pd
import warnings


class MergedChoiceTable(object):
Expand Down Expand Up @@ -93,10 +94,24 @@ class MergedChoiceTable(object):
random_state : NOT YET IMPLEMENTED
Representation of random state, for replicability of the sampling.

sampling_regime : str, [None, 'stratified'], optional
Specify the sampling regime for construction of the MergedChoiceTable. Stratified
sampling as defined here involves creating a sample by sampling equally from
subsamples of the population as identified by membership in a specified group
or stratum.

strata: str, optional
If stratified sampling is specified as the sampling regime, then a column name
from the alternatives table must be provided upon which stratification will be
based. Because equal numbers of samples will be drawn from within each stratum,
the strata should be distributed roughly evenly across the population, e.g. if
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As long as I have used the same sampling strategy to fit the model, why does it need to be distributed roughly evenly across the population?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Only because I wrote the stratification code such that it defines one value for samp_size_per_strata (L388) and uses this number to sample from each stratum. A more generalized version of the code would compute strata proportions from the universe of alternatives and then sample from each stratum accordingly, so that the strata proportions in the sample match those of the universe, but I didn't code that up.

there are 10 strata then each stratum should contain roughly 10% of the population.

"""
def __init__(self, observations, alternatives, chosen_alternatives=None,
sample_size=None, replace=True, weights=None, availability=None,
interaction_terms=None, random_state=None):
interaction_terms=None, random_state=None, sampling_regime=None,
strata=None):

# Standardize and validate the inputs...

Expand Down Expand Up @@ -164,6 +179,8 @@ def __init__(self, observations, alternatives, chosen_alternatives=None,
self.chosen_alternatives = chosen_alternatives
self.sample_size = sample_size
self.replace = replace
self.sampling_regime = sampling_regime
self.strata = strata
self.weights = weights
self.interaction_terms = interaction_terms
self.random_state = random_state
Expand Down Expand Up @@ -348,13 +365,51 @@ def _build_table(self):
samp_size = self.sample_size - 1

obs_ids = np.repeat(self.observations.index.values, samp_size)


# STRATIFIED SAMPLING OF ALTS: for now we are only supporting stratified sampling
# with replacement and no sampling weights
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is this restriction needed? How hard is it to support the other configs?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not hard, just wasn't needed for my use case so I didn't spend the time to write the extra code.

if self.sampling_regime == 'stratified':

# stratified sampling requires obs_ids to appear in a different order for
# efficient sample generation: [1,1,1,2,2,2,3,3,3] --> [1,2,3,1,2,3,1,2,3]
obs_ids = list(self.observations.index.values) * samp_size

if (self.replace == False) or (self.weights is not None):
raise ValueError(
"Stratified sampling is currently only supported for sampling with "
"replacement with uniform sampling weights.")
elif self.strata is None:
raise ValueError(
"Must specify the name of the column to use for stratified sampling.")
else:
alt_ids = []
strata_vals = self.alternatives[self.strata].unique()
num_strata = float(len(strata_vals))
samp_size_per_strata = int(np.ceil(samp_size / num_strata))
new_samp_size = int(num_strata * samp_size_per_strata)

# if we've augmented the sample size, we must update the obs_ids object
if new_samp_size != samp_size:
warnings.warn(
"Total sample size will be {0} instead of {1} "
"after stratification".format(str(new_samp_size), str(samp_size)))
samp_size = new_samp_size
obs_ids = list(self.observations.index.values) * samp_size

for stratum in strata_vals:
stratum_alts = self.alternatives.loc[self.alternatives[self.strata] == stratum]
sampled_alts = np.random.choice(stratum_alts.index.values,
replace = True,
size = n_obs * samp_size_per_strata).tolist()
alt_ids += sampled_alts


# SINGLE SAMPLE: this covers cases where we can draw a single, large sample of
# alternatives and distribute them among the choosers, e.g. sampling without
# replacement, with optional alternative-specific weights but NOT weights that
# apply to combinations of observation x alternative

if (self.replace == True) & (self.weights is None):
elif (self.replace == True) & (self.weights is None):

alt_ids = np.random.choice(self.alternatives.index.values,
replace = True,
Expand Down Expand Up @@ -387,7 +442,6 @@ def _build_table(self):
p=w, size=samp_size).tolist()
alt_ids += sampled_alts


# Append chosen ids if applicable
if (self.chosen_alternatives is not None):
obs_ids = np.append(obs_ids, self.observations.index.values)
Expand Down
6 changes: 3 additions & 3 deletions choicemodels/tools/simulation.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
"""
import numpy as np
import pandas as pd
from multiprocessing import Process, Manager, Array, cpu_count
from multiprocessing import Process, Manager, Array, cpu_count, Pool
from tqdm import tqdm
import warnings

Expand Down Expand Up @@ -179,7 +179,7 @@ def iterative_lottery_choices(choosers, alternatives, mct_callable, probs_callab
break
if alts[capacity].max() < choosers[size].min():
print("{} choosers cannot be allocated.".format(len(choosers)))
print("\nRemaining capacity on alternatives but not enough to accodomodate choosers' sizes")
print("\nRemaining capacity on alternatives but not enough to accomodate choosers' sizes")
break
if chooser_batch_size is None or chooser_batch_size > len(choosers):
mct = mct_callable(choosers.sample(frac=1), alts)
Expand Down Expand Up @@ -429,7 +429,7 @@ def parallel_lottery_choices(
chooser_size = '_size'
choosers.loc[:, chooser_size] = 1

if chooser_batch_size is None or chooser_batch_size > len(choosers):
if chooser_batch_size is None or chooser_batch_size >= len(choosers):
obs_batches = [choosers.index.values]
else:
obs_batches = [
Expand Down
22 changes: 19 additions & 3 deletions tests/test_mct.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,16 @@ def alts():
return pd.DataFrame(d2).set_index('aid')


def test_mergedchoicetable(obs, alts):
@pytest.fixture
def stratified_alts():
    """Alternatives table for the stratified-sampling test.

    Ten alternatives (``aid`` 0-9) with ``altval`` running 10..100 in steps
    of ten and a ``stratum`` label cycling 1..5, so every stratum holds
    exactly two alternatives. The frame is indexed by ``aid``.
    """
    alt_ids = list(range(10))
    records = {
        'aid': alt_ids,
        'altval': [10 * (i + 1) for i in alt_ids],
        'stratum': [i % 5 + 1 for i in alt_ids],
    }
    frame = pd.DataFrame(records)
    return frame.set_index('aid')


def test_mergedchoicetable(obs, alts, stratified_alts):
# NO SAMPLING, TABLE FOR SIMULATION

mct = choicemodels.tools.MergedChoiceTable(obs, alts).to_frame()
Expand All @@ -43,11 +52,18 @@ def test_mergedchoicetable(obs, alts):
assert list(mct.columns.sort_values()) == list(sorted(['obsval', 'altval',
'w', 'chosen']))

# STRATIFIED SAMPLING
mct = choicemodels.tools.MergedChoiceTable(
obs, stratified_alts, sample_size=5, sampling_regime='stratified',
strata='stratum').to_frame()

for obs_id, obs_df in mct.groupby(level=0):
assert len(obs_df['stratum'].unique()) == 5

# REPLACEMENT, NO WEIGHTS, TABLE FOR SIMULATION

mct = choicemodels.tools.MergedChoiceTable(obs, alts,
sample_size = 2).to_frame()
mct = choicemodels.tools.MergedChoiceTable(
obs, alts, sample_size=2).to_frame()

assert len(mct) == 4
assert sum(mct.altval==30) < 4
Expand Down