From 586c59e727cbcbd95494ae8944b284e50f613809 Mon Sep 17 00:00:00 2001
From: Mark Rucker
Date: Thu, 16 May 2024 13:58:09 -0400
Subject: [PATCH] Added the Bietti benchmark and improved openml timeout handling.

---
 coba/environments/core.py              | 82 ++++++++++++++++++++++++--
 coba/environments/openml.py            | 16 +++--
 coba/tests/test_environments_core.py   | 38 +++++++++++-
 coba/tests/test_environments_openml.py |  6 +-
 examples/scripts/Getting Started.py    | 10 ++--
 5 files changed, 127 insertions(+), 25 deletions(-)

diff --git a/coba/environments/core.py b/coba/environments/core.py
index 64e0139b..708c2181 100644
--- a/coba/environments/core.py
+++ b/coba/environments/core.py
@@ -525,11 +525,10 @@ def from_feurer(drop_missing: bool = True) -> 'Environments':
             drop_missing: Exclude interactions with missing context features.
 
         Remarks:
-            The description of the benchmark is provided at https://arxiv.org/abs/2007.04074.
-            For Task ids 232, 3044, 75105, and 211723 every row has a missing feature. These
-            environments will be empty when drop_missing is True. Task id 189866 has been
-            updated to 361282, a new version of the original dataset that fixes api issues
-            with the old dataset.
+            The benchmark is described at https://arxiv.org/abs/2007.04074. For task ids
+            232, 3044, 75105, and 211723 every row has a missing feature. These environments
+            will be empty when drop_missing is True. Task id 189866 has been removed due to an
+            OpenML issue (see https://github.com/openml/OpenML/issues/1036 for more information).
 
         Returns:
             An Environments object.
@@ -552,13 +551,84 @@ def from_feurer(drop_missing: bool = True) -> 'Environments':
             167152,167161,167168,167181,167184,167185,167190,167200,167201,167202,167203,
             167204,167205,168785,168791,168792,168793,168794,168795,168796,168797,168798,
             189779,189786,189828,189829,189836,189840,189841,189843,189844,189845,189846,
-            189858,189859,189860,189861,189862,189863,189864,189865,361282,189869,189870,
+            189858,189859,189860,189861,189862,189863,189864,189865,189869,189870,
             189871,189872,189873,189874,189875,189878,189880,189881,189882,189883,189884,
             189887,189890,189893,189894,189899,189900,189902,189905,189906,189908,189909,
             190154,190155,190156,190157,190158,190159,211720,211721,211722,211723,211724]
 
         return Environments.from_openml(task_id=task_ids,drop_missing=drop_missing)
 
+    @staticmethod
+    def from_bietti(drop_missing: bool = True) -> 'Environments':
+        """Create Environments from the Bietti benchmark.
+
+        Args:
+            drop_missing: Exclude interactions with missing context features.
+
+        Remarks:
+            The benchmark is defined in https://www.jmlr.org/papers/volume22/18-863/18-863.pdf.
+
+            The benchmark contains many datasets that are repeated with small variations,
+            such as a multiclass version and a binary version of the same dataset. Some
+            datasets have far more variations than others (e.g., fri_c0_1000_10 has 79
+            variations). The benchmark also contains several synthetically generated
+            datasets such as RandomRBF_0_0, fri_c0_1000_10, and synthetic_control.
+
+            The following changes were made to the original data ids:
+                1. 21 was replaced with a newer version 40975
+                2. 292 was replaced with a newer version 40981
+                3. 478 was replaced with a newer version 40971
+                4. 822 was removed because it is an old version of 823
+                5. 872 was removed because it is an old version of 853
+                6. 948 was removed because it is an old version of 772
+                7. 1036 was replaced with a newer version 40992
+                8. 1043 was replaced with a newer version 40993
+                9. 1454 was removed because it is a duplicate of 1049
+                10. 1470 was replaced with a newer version 23381
+                11. 1217 was removed because it is a subsample of 1216
+                12. 1113 was removed because it is a subsample of 1110
+
+        Returns:
+            An Environments object.
+        """
+
+        data_ids = [3,6,10,11,12,14,16,18,20,22,23,26,28,30,31,32,36,37,39,40,41,43,44,46,
+                    48,50,53,54,59,60,61,62,150,151,153,154,155,156,157,158,159,160,161,162,
+                    180,181,182,183,184,187,273,275,276,277,278,279,285,293,300,307,310,312,313,
+                    329,333,334,335,336,337,338,339,343,346,351,354,357,375,377,383,384,385,386,
+                    387,388,389,390,391,392,393,394,395,396,397,398,399,400,401,444,446,448,450,
+                    457,458,459,461,462,463,464,465,467,468,469,472,475,476,477,479,480,554,
+                    679,682,683,685,694,713,714,715,716,717,718,719,720,721,722,723,724,725,
+                    726,727,728,729,730,731,732,733,734,735,736,737,740,741,742,743,744,745,746,
+                    747,748,749,750,751,752,753,754,755,756,758,759,761,762,763,764,765,766,767,
+                    768,769,770,771,772,773,774,775,776,777,778,779,780,782,783,784,785,787,788,
+                    789,790,791,792,793,794,795,796,797,799,800,801,803,804,805,806,807,808,811,
+                    812,813,814,815,816,817,818,819,820,821,823,824,825,826,827,828,829,830,
+                    832,833,834,835,836,837,838,841,843,845,846,847,848,849,850,851,853,855,857,
+                    859,860,862,863,864,865,866,867,868,869,870,871,873,874,875,876,877,878,
+                    879,880,881,882,884,885,886,888,891,892,893,894,895,896,900,901,902,903,904,
+                    905,906,907,908,909,910,911,912,913,914,915,916,917,918,919,920,921,922,923,
+                    924,925,926,927,928,929,931,932,933,934,935,936,937,938,941,942,943,945,946,
+                    947,949,950,951,952,953,954,955,956,958,959,962,964,965,969,970,971,973,
+                    974,976,977,978,979,980,983,987,988,991,994,995,996,997,1004,1005,1006,1009,
+                    1011,1012,1013,1014,1015,1016,1019,1020,1021,1022,1025,1026,1038,1040,1041,
+                    1044,1045,1046,1048,1049,1050,1054,1055,1056,1059,1060,1061,1062,1063,1064,
+                    1065,1066,1067,1068,1069,1071,1073,1075,1077,1078,1079,1080,1081,1082,1083,
+                    1084,1085,1086,1087,1088,1100,1104,1106,1107,1110,1115,1116,1117,1120,
+                    1121,1122,1123,1124,1125,1126,1127,1128,1129,1130,1131,1132,1133,1135,1136,
+                    1137,1138,1139,1140,1141,1142,1143,1144,1145,1146,1147,1148,1149,1150,1151,
+                    1152,1153,1154,1155,1156,1157,1158,1159,1160,1161,1162,1163,1164,1165,1166,
+                    1169,1216,1218,1233,1235,1236,1237,1238,1241,1242,1412,1413,1441,1442,
+                    1443,1444,1449,1451,1453,1455,1457,1459,1460,1464,1467,1471,1472,1473,1475,
+                    1481,1482,1483,1486,1487,1488,1489,1496,1498,1590,40975,40981,40971,23381]
+
+        env40992 = Environments.from_openml(data_id=40992,target='label',drop_missing=drop_missing)
+        env40993 = Environments.from_openml(data_id=40993,target='label',drop_missing=drop_missing)
+
+        return Environments.from_openml(data_id=data_ids,drop_missing=drop_missing) + env40992 + env40993
+
+
     def __init__(self, *environments: Union[Environment, Sequence[Environment]]):
         """Instantiate an Environments class.
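For reference, a minimal usage sketch of the new constructor (illustrative, not part of the patch; it assumes coba is installed and OpenML is reachable, and a full run downloads several hundred datasets):

    import coba as cb

    # Build every environment in the Bietti benchmark, excluding rows with
    # missing context features (the drop_missing=True default above).
    environments = cb.Environments.from_bietti(drop_missing=True)

    # Each environment records its OpenML data id in its params, which is
    # also how the accompanying unit test identifies the environments.
    for env in environments:
        print(env.params['openml_data'])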
diff --git a/coba/environments/openml.py b/coba/environments/openml.py
index 24a532f8..9601948c 100644
--- a/coba/environments/openml.py
+++ b/coba/environments/openml.py
@@ -89,7 +89,7 @@ def read(self) -> Iterable[Union[Dense,Sparse]]:
             raise CobaException(f"We were unable to find an appropriate target column for the given openml source.")
 
         if data_descr.get('status') == 'deactivated':
-            raise CobaException(f"Openml {self._data_id} has been deactivated. This is often due to flags on the data.")
+            raise CobaException(f"Openml {self._data_id} has been deactivated (see https://docs.openml.org/#dataset-status).")
 
         is_ignore = lambda feat_descr:(
             feat_descr['is_ignore' ] == 'true' or
@@ -139,25 +139,24 @@ def _get_data(self, url:str, key:str, checksum:str=None) -> Iterable[str]:
             self._clear_cache()
             raise
 
-    def _http_request(self, url: str, tries: int = 0) -> Iterable[str]:
+    def _http_request(self, url: str, tries: int = 1, timeout: int = 5) -> Iterable[str]:
         api_key   = CobaContext.api_keys.get('openml')
         semaphore = CobaContext.store.get("openml_semaphore")
 
-        # In an attempt to be considerate we stagger/limit our hits of the REST API.
-        # Openml doesn't publish any rate-limiting guidelines, so this is just a guess.
         # if semaphore is not None it indicates that we are in a CobaMultiprocessor.
+        # When this is the case we stagger/limit our hits of the REST API to be considerate.
+        # Openml doesn't publish any rate-limiting guidelines, so our staggering is a guess.
         if semaphore: time.sleep(2*random())
 
         try:
             KB = 1024
             MB = 1024*KB
             if api_key: url = f"{url}?api_key={api_key}"
-            yield from HttpSource(url, timeout=20, chunk_size=10*MB).read()
+            yield from HttpSource(url, timeout=timeout, chunk_size=10*MB).read()
 
         except TimeoutError:
-            if tries >= 3: raise
-            yield from self._http_request(url, tries+1)
-            return
+            if tries == 3: raise
+            yield from self._http_request(url, timeout=5**(tries+1), tries=tries+1)
 
         except request.HTTPError as e:
             status, content = e.code, e.fp.read()
@@ -179,7 +178,6 @@ def _http_request(self, url: str, tries: int = 0) -> Iterable[str]:
 
         raise CobaException(f"An error was returned by openml: {content}")
 
-
     def _get_data_descr(self, data_id:int) -> Dict[str,Any]:
         descr_txt = " ".join(self._get_data(f'https://openml.org/api/v1/json/data/{data_id}', self._cache_keys['data']))
         descr_obj = json.loads(descr_txt)["data_set_description"]
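The retry logic above replaces a flat 20 second timeout with a geometric schedule: the first try uses the 5 second default, each retry passes timeout=5**(tries+1), and the third timeout is re-raised. A standalone sketch of the resulting schedule (the helper is illustrative, not coba API):

    # Illustrative helper, not coba API: the timeout used on each successive
    # try of _http_request, which starts at 5 seconds and gives up after three.
    def timeout_schedule(tries: int = 1, timeout: int = 5, max_tries: int = 3):
        schedule = [timeout]
        while tries < max_tries:
            tries, timeout = tries + 1, 5 ** (tries + 1)
            schedule.append(timeout)
        return schedule

    assert timeout_schedule() == [5, 25, 125]

So a slow request gets three chances and up to 155 seconds in total before the TimeoutError propagates to the caller.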
diff --git a/coba/tests/test_environments_core.py b/coba/tests/test_environments_core.py
index d3b84a7c..a8b776da 100644
--- a/coba/tests/test_environments_core.py
+++ b/coba/tests/test_environments_core.py
@@ -482,13 +482,49 @@ def test_from_feurer(self):
             167097,167099,167100,167101,167103,167104,167105,167106,167149,167152,167161,167168,167181,
             167184,167185,167190,167200,167201,167202,167203,167204,167205,168785,168791,168792,168793,
             168794,168795,168796,168797,168798,189779,189786,189828,189829,189836,189840,189841,189843,
-            189844,189845,189846,189858,189859,189860,189861,189862,189863,189864,189865,361282,189869,
+            189844,189845,189846,189858,189859,189860,189861,189862,189863,189864,189865,189869,
             189870,189871,189872,189873,189874,189875,189878,189880,189881,189882,189883,189884,189887,
             189890,189893,189894,189899,189900,189902,189905,189906,189908,189909,190154,190155,190156,
             190157,190158,190159,211720,211721,211722,211723,211724}
 
         self.assertEqual(actual_tasks,expected_tasks)
 
+    def test_from_bietti(self):
+        actual_tasks = set([e.params['openml_data'] for e in Environments.from_bietti()])
+
+        expected_tasks = {3,6,10,11,12,14,16,18,20,22,23,26,28,30,31,32,36,37,39,40,41,43,44,46,
+                          48,50,53,54,59,60,61,62,150,151,153,154,155,156,157,158,159,160,161,162,
+                          180,181,182,183,184,187,273,275,276,277,278,279,285,293,300,307,310,312,313,
+                          329,333,334,335,336,337,338,339,343,346,351,354,357,375,377,383,384,385,386,
+                          387,388,389,390,391,392,393,394,395,396,397,398,399,400,401,444,446,448,450,
+                          457,458,459,461,462,463,464,465,467,468,469,472,475,476,477,479,480,554,
+                          679,682,683,685,694,713,714,715,716,717,718,719,720,721,722,723,724,725,
+                          726,727,728,729,730,731,732,733,734,735,736,737,740,741,742,743,744,745,746,
+                          747,748,749,750,751,752,753,754,755,756,758,759,761,762,763,764,765,766,767,
+                          768,769,770,771,772,773,774,775,776,777,778,779,780,782,783,784,785,787,788,
+                          789,790,791,792,793,794,795,796,797,799,800,801,803,804,805,806,807,808,811,
+                          812,813,814,815,816,817,818,819,820,821,823,824,825,826,827,828,829,830,
+                          832,833,834,835,836,837,838,841,843,845,846,847,848,849,850,851,853,855,857,
+                          859,860,862,863,864,865,866,867,868,869,870,871,873,874,875,876,877,878,
+                          879,880,881,882,884,885,886,888,891,892,893,894,895,896,900,901,902,903,904,
+                          905,906,907,908,909,910,911,912,913,914,915,916,917,918,919,920,921,922,923,
+                          924,925,926,927,928,929,931,932,933,934,935,936,937,938,941,942,943,945,946,
+                          947,949,950,951,952,953,954,955,956,958,959,962,964,965,969,970,971,973,
+                          974,976,977,978,979,980,983,987,988,991,994,995,996,997,1004,1005,1006,1009,
+                          1011,1012,1013,1014,1015,1016,1019,1020,1021,1022,1025,1026,1038,1040,1041,
+                          1044,1045,1046,1048,1049,1050,1054,1055,1056,1059,1060,1061,1062,1063,1064,
+                          1065,1066,1067,1068,1069,1071,1073,1075,1077,1078,1079,1080,1081,1082,1083,
+                          1084,1085,1086,1087,1088,1100,1104,1106,1107,1110,1115,1116,1117,1120,
+                          1121,1122,1123,1124,1125,1126,1127,1128,1129,1130,1131,1132,1133,1135,1136,
+                          1137,1138,1139,1140,1141,1142,1143,1144,1145,1146,1147,1148,1149,1150,1151,
+                          1152,1153,1154,1155,1156,1157,1158,1159,1160,1161,1162,1163,1164,1165,1166,
+                          1169,1216,1218,1233,1235,1236,1237,1238,1241,1242,1412,1413,1441,1442,
+                          1443,1444,1449,1451,1453,1455,1457,1459,1460,1464,1467,1471,1472,1473,1475,
+                          1481,1482,1483,1486,1487,1488,1489,1496,1498,1590,40975,40981,40971,23381,
+                          40992,40993}
+
+        self.assertEqual(actual_tasks,expected_tasks)
+
     def test_from_lambda(self):
         context = lambda index,rng : [ round(r,2) for r in rng.randoms(5) ]
         actions = lambda index,context,rng : [rng.randoms(5) for _ in range(3)]
diff --git a/coba/tests/test_environments_openml.py b/coba/tests/test_environments_openml.py
index 48980de5..caf84890 100644
--- a/coba/tests/test_environments_openml.py
+++ b/coba/tests/test_environments_openml.py
@@ -923,7 +923,7 @@ def thread_1():
         self.assertIn('openml_042693_arff', CobaContext.cacher)
 
     @unittest.mock.patch('coba.environments.openml.HttpSource')
-    def test_three_timeouts(self,mock):
+    def test_two_timeouts(self,mock):
 
         task = {
             "task":{
@@ -978,7 +978,6 @@ def test_two_timeouts(self,mock):
         """
 
         responses = [
-            TimeoutError(),
             TimeoutError(),
             TimeoutError(),
             json.dumps(task).splitlines(),
@@ -1013,7 +1012,7 @@ def test_two_timeouts(self,mock):
         self.assertIn('openml_042693_arff', CobaContext.cacher)
 
     @unittest.mock.patch('coba.environments.openml.HttpSource')
-    def test_four_timeouts(self,mock):
+    def test_three_timeouts(self,mock):
 
         task = {
             "task":{
@@ -1071,7 +1070,6 @@ def test_three_timeouts(self,mock):
             TimeoutError(),
             TimeoutError(),
             TimeoutError(),
-            TimeoutError(),
             json.dumps(task).splitlines(),
             json.dumps(data).splitlines(),
             json.dumps(feat).splitlines(),
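The renamed tests track the new retry budget: two timeouts still succeed on the third try, while a third timeout exhausts the budget and raises. They drive the patched-in HttpSource mock from a queue of responses; a simplified, self-contained sketch of that pattern (illustrative names, not coba's actual fixtures):

    import unittest.mock

    # A queue of outcomes: each read() raises the next queued exception or
    # returns the next queued payload, so retry logic runs with no network.
    responses = [TimeoutError(), TimeoutError(), ["line one", "line two"]]

    def next_response():
        item = responses.pop(0)
        if isinstance(item, Exception): raise item
        return item

    http_source = unittest.mock.MagicMock()
    http_source.return_value.read.side_effect = next_response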
diff --git a/examples/scripts/Getting Started.py b/examples/scripts/Getting Started.py
index bd5ad434..4153d787 100644
--- a/examples/scripts/Getting Started.py
+++ b/examples/scripts/Getting Started.py
@@ -1,18 +1,18 @@
 """
 This is an example script that creates and executes an Experiment.
-This script requires that the matplotlib and vowpalwabbit packages be installed.
+This script depends on the matplotlib and vowpalwabbit packages.
 """
 
 import coba as cb
 
-#First, we define the learners that we want to test
+#First, we define the learners that we wish to evaluate
 learners = [ cb.VowpalEpsilonLearner(), cb.RandomLearner() ]
 
-#Next we create an environment we'd like to evaluate against
+#Next, we create the environments we'd like to evaluate against
 environments = cb.Environments.from_linear_synthetic(1000, n_action_features=0).shuffle([1,2,3])
 
-#We then create and run our experiment from our environments and learners
+#We then create and run an experiment using our environments and learners
 result = cb.Experiment(environments,learners).run()
 
-#After evaluating can create a quick summary plot to get a sense of how the learners performed
+#Finally, we can plot the results of our experiment
 result.plot_learners(y='reward',err='se',xlim=(10,None))
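The benchmark constructors added in this patch drop into the same flow. A hedged sketch (trimming the benchmark to a handful of environments via list() and the *environments constructor is an assumption made here to keep downloads small; a full run fetches hundreds of OpenML datasets):

    import coba as cb

    learners = [ cb.VowpalEpsilonLearner(), cb.RandomLearner() ]

    # A small slice of the Bietti benchmark in place of the synthetic environments.
    environments = cb.Environments(*list(cb.Environments.from_bietti())[:5])

    result = cb.Experiment(environments, learners).run()
    result.plot_learners(y='reward', err='se', xlim=(10,None))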