From 586c59e727cbcbd95494ae8944b284e50f613809 Mon Sep 17 00:00:00 2001
From: Mark Rucker
Date: Thu, 16 May 2024 13:58:09 -0400
Subject: [PATCH] Added the Bietti benchmark and improved openml timeout handling.

---
 coba/environments/core.py              | 82 ++++++++++++++++++++++++--
 coba/environments/openml.py            | 16 +++--
 coba/tests/test_environments_core.py   | 38 +++++++++++-
 coba/tests/test_environments_openml.py |  6 +-
 examples/scripts/Getting Started.py    | 10 ++--
 5 files changed, 127 insertions(+), 25 deletions(-)

diff --git a/coba/environments/core.py b/coba/environments/core.py
index 64e0139b..708c2181 100644
--- a/coba/environments/core.py
+++ b/coba/environments/core.py
@@ -525,11 +525,10 @@ def from_feurer(drop_missing: bool = True) -> 'Environments':
             drop_missing: Exclude interactions with missing context features.
 
         Remarks:
-            The description of the benchmark is provided at https://arxiv.org/abs/2007.04074.
-            For Task ids 232, 3044, 75105, and 211723 every row has a missing feature. These
-            environments will be empty when drop_missing is True. Task id 189866 has been
-            updated to 361282, a new version of the original dataset that fixes api issues
-            with the old dataset.
+            The benchmark is described at https://arxiv.org/abs/2007.04074. For task ids
+            232, 3044, 75105, and 211723 every row has a missing feature. These environments
+            will be empty when drop_missing is True. Task id 189866 has been removed due to an
+            OpenML issue (see https://github.com/openml/OpenML/issues/1036 for more information).
 
         Returns:
             An Environments object.
@@ -552,13 +551,84 @@ def from_feurer(drop_missing: bool = True) -> 'Environments':
             167152,167161,167168,167181,167184,167185,167190,167200,167201,167202,167203,
             167204,167205,168785,168791,168792,168793,168794,168795,168796,168797,168798,
             189779,189786,189828,189829,189836,189840,189841,189843,189844,189845,189846,
-            189858,189859,189860,189861,189862,189863,189864,189865,361282,189869,189870,
+            189858,189859,189860,189861,189862,189863,189864,189865,189869,189870,
             189871,189872,189873,189874,189875,189878,189880,189881,189882,189883,189884,
             189887,189890,189893,189894,189899,189900,189902,189905,189906,189908,189909,
             190154,190155,190156,190157,190158,190159,211720,211721,211722,211723,211724]
 
         return Environments.from_openml(task_id=task_ids,drop_missing=drop_missing)
 
+    @staticmethod
+    def from_bietti(drop_missing: bool = True) -> 'Environments':
+        """Create Environments from the Bietti benchmark.
+
+        Args:
+            drop_missing: Exclude interactions with missing context features.
+
+        Remarks:
+            The benchmark is defined in https://www.jmlr.org/papers/volume22/18-863/18-863.pdf.
+
+            The benchmark contains many datasets that are repeated with small variations,
+            such as a multiclass version and a binary version of the same dataset. Some
+            datasets have far more variations than others (e.g., fri_c0_1000_10 has 79
+            variations). The benchmark also contains several synthetically generated
+            datasets such as RandomRBF_0_0, fri_c0_1000_10, and synthetic_control.
+
+            The following changes were made to the original data ids:
+                1. 21 was replaced with a newer version 40975
+                2. 292 was replaced with a newer version 40981
+                3. 478 was replaced with a newer version 40971
+                4. 822 was removed because it is an old version of 823
+                5. 872 was removed because it is an old version of 853
+                6. 948 was removed because it is an old version of 772
+                7. 1036 was replaced with a newer version 40992
+                8. 1043 was replaced with a newer version 40993
+                9. 1454 was removed because it is a duplicate of 1049
+                10. 1470 was replaced with a newer version 23381
+                11. 1217 was removed because it is a subsample of 1216
+                12. 1113 was removed because it is a subsample of 1110
+
+        Returns:
+            An Environments object.
+        """
+
+        data_ids = [3,6,10,11,12,14,16,18,20,22,23,26,28,30,31,32,36,37,39,40,41,43,44,46,
+                    48,50,53,54,59,60,61,62,150,151,153,154,155,156,157,158,159,160,161,162,
+                    180,181,182,183,184,187,273,275,276,277,278,279,285,293,300,307,310,312,313,
+                    329,333,334,335,336,337,338,339,343,346,351,354,357,375,377,383,384,385,386,
+                    387,388,389,390,391,392,393,394,395,396,397,398,399,400,401,444,446,448,450,
+                    457,458,459,461,462,463,464,465,467,468,469,472,475,476,477,479,480,554,
+                    679,682,683,685,694,713,714,715,716,717,718,719,720,721,722,723,724,725,
+                    726,727,728,729,730,731,732,733,734,735,736,737,740,741,742,743,744,745,746,
+                    747,748,749,750,751,752,753,754,755,756,758,759,761,762,763,764,765,766,767,
+                    768,769,770,771,772,773,774,775,776,777,778,779,780,782,783,784,785,787,788,
+                    789,790,791,792,793,794,795,796,797,799,800,801,803,804,805,806,807,808,811,
+                    812,813,814,815,816,817,818,819,820,821,823,824,825,826,827,828,829,830,
+                    832,833,834,835,836,837,838,841,843,845,846,847,848,849,850,851,853,855,857,
+                    859,860,862,863,864,865,866,867,868,869,870,871,873,874,875,876,877,878,
+                    879,880,881,882,884,885,886,888,891,892,893,894,895,896,900,901,902,903,904,
+                    905,906,907,908,909,910,911,912,913,914,915,916,917,918,919,920,921,922,923,
+                    924,925,926,927,928,929,931,932,933,934,935,936,937,938,941,942,943,945,946,
+                    947,949,950,951,952,953,954,955,956,958,959,962,964,965,969,970,971,973,
+                    974,976,977,978,979,980,983,987,988,991,994,995,996,997,1004,1005,1006,1009,
+                    1011,1012,1013,1014,1015,1016,1019,1020,1021,1022,1025,1026,1038,1040,1041,
+                    1044,1045,1046,1048,1049,1050,1054,1055,1056,1059,1060,1061,1062,1063,1064,
+                    1065,1066,1067,1068,1069,1071,1073,1075,1077,1078,1079,1080,1081,1082,1083,
+                    1084,1085,1086,1087,1088,1100,1104,1106,1107,1110,1115,1116,1117,1120,
+                    1121,1122,1123,1124,1125,1126,1127,1128,1129,1130,1131,1132,1133,1135,1136,
+                    1137,1138,1139,1140,1141,1142,1143,1144,1145,1146,1147,1148,1149,1150,1151,
+                    1152,1153,1154,1155,1156,1157,1158,1159,1160,1161,1162,1163,1164,1165,1166,
+                    1169,1216,1218,1233,1235,1236,1237,1238,1241,1242,1412,1413,1441,1442,
+                    1443,1444,1449,1451,1453,1455,1457,1459,1460,1464,1467,1471,1472,1473,1475,
+                    1481,1482,1483,1486,1487,1488,1489,1496,1498,1590,40975,40981,40971,23381]
+
+        env40992 = Environments.from_openml(data_id=40992,target='label',drop_missing=drop_missing)
+        env40993 = Environments.from_openml(data_id=40993,target='label',drop_missing=drop_missing)
+
+        return Environments.from_openml(data_id=data_ids,drop_missing=drop_missing) + env40992 + env40993
+
+
     def __init__(self, *environments: Union[Environment, Sequence[Environment]]):
         """Instantiate an Environments class.
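For reference, a minimal usage sketch of the new constructor (illustrative, not part of the patch; it assumes coba is installed and OpenML is reachable, and a full run downloads several hundred datasets):

    import coba as cb

    # Build every environment in the Bietti benchmark, excluding rows with
    # missing context features (the drop_missing=True default above).
    environments = cb.Environments.from_bietti(drop_missing=True)

    # Each environment records its OpenML data id in its params, which is
    # also how the accompanying unit test identifies the environments.
    for env in environments:
        print(env.params['openml_data'])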
diff --git a/coba/environments/openml.py b/coba/environments/openml.py
index 24a532f8..9601948c 100644
--- a/coba/environments/openml.py
+++ b/coba/environments/openml.py
@@ -89,7 +89,7 @@ def read(self) -> Iterable[Union[Dense,Sparse]]:
             raise CobaException(f"We were unable to find an appropriate target column for the given openml source.")
 
         if data_descr.get('status') == 'deactivated':
-            raise CobaException(f"Openml {self._data_id} has been deactivated. This is often due to flags on the data.")
+            raise CobaException(f"Openml {self._data_id} has been deactivated (see https://docs.openml.org/#dataset-status).")
 
         is_ignore = lambda feat_descr:(
             feat_descr['is_ignore' ] == 'true' or
@@ -139,25 +139,24 @@ def _get_data(self, url:str, key:str, checksum:str=None) -> Iterable[str]:
             self._clear_cache()
             raise
 
-    def _http_request(self, url: str, tries: int = 0) -> Iterable[str]:
+    def _http_request(self, url: str, tries: int = 1, timeout: int = 5) -> Iterable[str]:
         api_key   = CobaContext.api_keys.get('openml')
         semaphore = CobaContext.store.get("openml_semaphore")
 
-        # In an attempt to be considerate we stagger/limit our hits of the REST API.
-        # Openml doesn't publish any rate-limiting guidelines, so this is just a guess.
         # if semaphore is not None it indicates that we are in a CobaMultiprocessor.
+        # When this is the case we stagger/limit our hits of the REST API to be considerate.
+        # Openml doesn't publish any rate-limiting guidelines, so our staggering is a guess.
         if semaphore: time.sleep(2*random())
 
         try:
             KB = 1024
             MB = 1024*KB
             if api_key: url = f"{url}?api_key={api_key}"
-            yield from HttpSource(url, timeout=20, chunk_size=10*MB).read()
+            yield from HttpSource(url, timeout=timeout, chunk_size=10*MB).read()
 
         except TimeoutError:
-            if tries >= 3: raise
-            yield from self._http_request(url, tries+1)
-            return
+            if tries == 3: raise
+            yield from self._http_request(url, timeout=5**(tries+1), tries=tries+1)
 
         except request.HTTPError as e:
             status, content = e.code, e.fp.read()
@@ -179,7 +178,6 @@ def _http_request(self, url: str, tries: int = 0) -> Iterable[str]:
 
         raise CobaException(f"An error was returned by openml: {content}")
 
-
     def _get_data_descr(self, data_id:int) -> Dict[str,Any]:
         descr_txt = " ".join(self._get_data(f'https://openml.org/api/v1/json/data/{data_id}', self._cache_keys['data']))
         descr_obj = json.loads(descr_txt)["data_set_description"]
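The retry logic above replaces a flat 20 second timeout with a geometric schedule: the first try uses the 5 second default, each retry passes timeout=5**(tries+1), and the third timeout is re-raised. A standalone sketch of the resulting schedule (the helper is illustrative, not coba API):

    # Illustrative helper, not coba API: the timeout used on each successive
    # try of _http_request, which starts at 5 seconds and gives up after three.
    def timeout_schedule(tries: int = 1, timeout: int = 5, max_tries: int = 3):
        schedule = [timeout]
        while tries < max_tries:
            tries, timeout = tries + 1, 5 ** (tries + 1)
            schedule.append(timeout)
        return schedule

    assert timeout_schedule() == [5, 25, 125]

So a slow request gets three chances and up to 155 seconds in total before the TimeoutError propagates to the caller.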
diff --git a/coba/tests/test_environments_core.py b/coba/tests/test_environments_core.py
index d3b84a7c..a8b776da 100644
--- a/coba/tests/test_environments_core.py
+++ b/coba/tests/test_environments_core.py
@@ -482,13 +482,49 @@ def test_from_feurer(self):
             167097,167099,167100,167101,167103,167104,167105,167106,167149,167152,167161,167168,167181,
             167184,167185,167190,167200,167201,167202,167203,167204,167205,168785,168791,168792,168793,
             168794,168795,168796,168797,168798,189779,189786,189828,189829,189836,189840,189841,189843,
-            189844,189845,189846,189858,189859,189860,189861,189862,189863,189864,189865,361282,189869,
+            189844,189845,189846,189858,189859,189860,189861,189862,189863,189864,189865,189869,
             189870,189871,189872,189873,189874,189875,189878,189880,189881,189882,189883,189884,189887,
             189890,189893,189894,189899,189900,189902,189905,189906,189908,189909,190154,190155,190156,
             190157,190158,190159,211720,211721,211722,211723,211724}
 
         self.assertEqual(actual_tasks,expected_tasks)
 
+    def test_from_bietti(self):
+        actual_tasks = set([e.params['openml_data'] for e in Environments.from_bietti()])
+
+        expected_tasks = {3,6,10,11,12,14,16,18,20,22,23,26,28,30,31,32,36,37,39,40,41,43,44,46,
+                          48,50,53,54,59,60,61,62,150,151,153,154,155,156,157,158,159,160,161,162,
+                          180,181,182,183,184,187,273,275,276,277,278,279,285,293,300,307,310,312,313,
+                          329,333,334,335,336,337,338,339,343,346,351,354,357,375,377,383,384,385,386,
+                          387,388,389,390,391,392,393,394,395,396,397,398,399,400,401,444,446,448,450,
+                          457,458,459,461,462,463,464,465,467,468,469,472,475,476,477,479,480,554,
+                          679,682,683,685,694,713,714,715,716,717,718,719,720,721,722,723,724,725,
+                          726,727,728,729,730,731,732,733,734,735,736,737,740,741,742,743,744,745,746,
+                          747,748,749,750,751,752,753,754,755,756,758,759,761,762,763,764,765,766,767,
+                          768,769,770,771,772,773,774,775,776,777,778,779,780,782,783,784,785,787,788,
+                          789,790,791,792,793,794,795,796,797,799,800,801,803,804,805,806,807,808,811,
+                          812,813,814,815,816,817,818,819,820,821,823,824,825,826,827,828,829,830,
+                          832,833,834,835,836,837,838,841,843,845,846,847,848,849,850,851,853,855,857,
+                          859,860,862,863,864,865,866,867,868,869,870,871,873,874,875,876,877,878,
+                          879,880,881,882,884,885,886,888,891,892,893,894,895,896,900,901,902,903,904,
+                          905,906,907,908,909,910,911,912,913,914,915,916,917,918,919,920,921,922,923,
+                          924,925,926,927,928,929,931,932,933,934,935,936,937,938,941,942,943,945,946,
+                          947,949,950,951,952,953,954,955,956,958,959,962,964,965,969,970,971,973,
+                          974,976,977,978,979,980,983,987,988,991,994,995,996,997,1004,1005,1006,1009,
+                          1011,1012,1013,1014,1015,1016,1019,1020,1021,1022,1025,1026,1038,1040,1041,
+                          1044,1045,1046,1048,1049,1050,1054,1055,1056,1059,1060,1061,1062,1063,1064,
+                          1065,1066,1067,1068,1069,1071,1073,1075,1077,1078,1079,1080,1081,1082,1083,
+                          1084,1085,1086,1087,1088,1100,1104,1106,1107,1110,1115,1116,1117,1120,
+                          1121,1122,1123,1124,1125,1126,1127,1128,1129,1130,1131,1132,1133,1135,1136,
+                          1137,1138,1139,1140,1141,1142,1143,1144,1145,1146,1147,1148,1149,1150,1151,
+                          1152,1153,1154,1155,1156,1157,1158,1159,1160,1161,1162,1163,1164,1165,1166,
+                          1169,1216,1218,1233,1235,1236,1237,1238,1241,1242,1412,1413,1441,1442,
+                          1443,1444,1449,1451,1453,1455,1457,1459,1460,1464,1467,1471,1472,1473,1475,
+                          1481,1482,1483,1486,1487,1488,1489,1496,1498,1590,40975,40981,40971,23381,
+                          40992,40993}
+
+        self.assertEqual(actual_tasks,expected_tasks)
+
     def test_from_lambda(self):
         context = lambda index,rng : [ round(r,2) for r in rng.randoms(5) ]
         actions = lambda index,context,rng : [rng.randoms(5) for _ in range(3)]
diff --git a/coba/tests/test_environments_openml.py b/coba/tests/test_environments_openml.py
index 48980de5..caf84890 100644
--- a/coba/tests/test_environments_openml.py
+++ b/coba/tests/test_environments_openml.py
@@ -923,7 +923,7 @@ def thread_1():
         self.assertIn('openml_042693_arff', CobaContext.cacher)
 
     @unittest.mock.patch('coba.environments.openml.HttpSource')
-    def test_three_timeouts(self,mock):
+    def test_two_timeouts(self,mock):
 
         task = {
             "task":{
@@ -978,7 +978,6 @@ def test_two_timeouts(self,mock):
         """
 
         responses = [
-            TimeoutError(),
             TimeoutError(),
             TimeoutError(),
             json.dumps(task).splitlines(),
@@ -1013,7 +1012,7 @@ def test_two_timeouts(self,mock):
         self.assertIn('openml_042693_arff', CobaContext.cacher)
 
     @unittest.mock.patch('coba.environments.openml.HttpSource')
-    def test_four_timeouts(self,mock):
+    def test_three_timeouts(self,mock):
 
         task = {
             "task":{
@@ -1071,7 +1070,6 @@ def test_three_timeouts(self,mock):
             TimeoutError(),
             TimeoutError(),
             TimeoutError(),
-            TimeoutError(),
             json.dumps(task).splitlines(),
             json.dumps(data).splitlines(),
             json.dumps(feat).splitlines(),
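The renamed tests track the new retry budget: two timeouts still succeed on the third try, while a third timeout exhausts the budget and raises. They drive the patched-in HttpSource mock from a queue of responses; a simplified, self-contained sketch of that pattern (illustrative names, not coba's actual fixtures):

    import unittest.mock

    # A queue of outcomes: each read() raises the next queued exception or
    # returns the next queued payload, so retry logic runs with no network.
    responses = [TimeoutError(), TimeoutError(), ["line one", "line two"]]

    def next_response():
        item = responses.pop(0)
        if isinstance(item, Exception): raise item
        return item

    http_source = unittest.mock.MagicMock()
    http_source.return_value.read.side_effect = next_response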
diff --git a/examples/scripts/Getting Started.py b/examples/scripts/Getting Started.py
index bd5ad434..4153d787 100644
--- a/examples/scripts/Getting Started.py
+++ b/examples/scripts/Getting Started.py
@@ -1,18 +1,18 @@
 """
 This is an example script that creates and executes an Experiment.
-This script requires that the matplotlib and vowpalwabbit packages be installed.
+This script depends on the matplotlib and vowpalwabbit packages.
 """
 
 import coba as cb
 
-#First, we define the learners that we want to test
+#First, we define the learners that we wish to evaluate
 learners = [ cb.VowpalEpsilonLearner(), cb.RandomLearner() ]
 
-#Next we create an environment we'd like to evaluate against
+#Next, we create the environments we'd like to evaluate against
 environments = cb.Environments.from_linear_synthetic(1000, n_action_features=0).shuffle([1,2,3])
 
-#We then create and run our experiment from our environments and learners
+#We then create and run an experiment using our environments and learners
 result = cb.Experiment(environments,learners).run()
 
-#After evaluating can create a quick summary plot to get a sense of how the learners performed
+#Finally, we can plot the results of our experiment
 result.plot_learners(y='reward',err='se',xlim=(10,None))
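The benchmark constructors added in this patch drop into the same flow. A hedged sketch (trimming the benchmark to a handful of environments via list() and the *environments constructor is an assumption made here to keep downloads small; a full run fetches hundreds of OpenML datasets):

    import coba as cb

    learners = [ cb.VowpalEpsilonLearner(), cb.RandomLearner() ]

    # A small slice of the Bietti benchmark in place of the synthetic environments.
    environments = cb.Environments(*list(cb.Environments.from_bietti())[:5])

    result = cb.Experiment(environments, learners).run()
    result.plot_learners(y='reward', err='se', xlim=(10,None))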