split and build coco datasets from raw features

erikbern · Jan 17, 2025 · 8fe409e · 8fe409e
1 parent 233c397
commit 8fe409e
Showing 1 changed file with 16 additions and 2 deletions.
diff --git a/ann_benchmarks/datasets.py b/ann_benchmarks/datasets.py
@@ -571,8 +571,22 @@ def dbpedia_entities_openai_1M(out_fn, n = None):
 
 def coco(out_fn: str, kind: str):
     assert kind in ('t2i', 'i2i')
-    url = "https://github.com/fabiocarrara/str-encoders/releases/download/v0.1.3/coco-%s-512-angular.hdf5" % kind
-    download(url, out_fn)
+
+    local_fn = "coco-clip-b16-512-features.hdf5"  # raw feature file: used as both the release asset name and the local path
+    url = "https://github.com/fabiocarrara/str-encoders/releases/download/v0.1.3/%s" % local_fn
+    download(url, local_fn)
+
+    with h5py.File(local_fn, "r") as f:
+        img_X = f['img_feats'][:]  # [:] reads the whole image-feature dataset into memory
+
+        X_train, X_test = train_test_split(img_X, test_size=10_000)  # image/image split; X_test is the 'i2i' query set
+
+        if kind == 't2i':
+            # there are 5 captions per image; a stride of 5 keeps the first caption of each (assumes captions are stored grouped by image)
+            txt_X = f['txt_feats'][::5]
+            _, X_test = train_test_split(txt_X, test_size=10_000)  # queries become text vectors; the rest of the text split is discarded
+
+    write_output(X_train, X_test, out_fn, "angular")  # X_train is always image features, for both kinds
 
 
 DATASETS: Dict[str, Callable[[str], None]] = {