diff --git a/benchs/bench_fw/benchmark.py b/benchs/bench_fw/benchmark.py index 8ca68c4cd8..237d08bd9a 100644 --- a/benchs/bench_fw/benchmark.py +++ b/benchs/bench_fw/benchmark.py @@ -4,8 +4,7 @@ # LICENSE file in the root directory of this source tree. import logging -from copy import copy -from dataclasses import dataclass +from dataclasses import dataclass, field from operator import itemgetter from statistics import mean, median from typing import Any, Dict, List, Optional @@ -16,7 +15,16 @@ from scipy.optimize import curve_fit -from .descriptors import DatasetDescriptor, IndexDescriptor +from .benchmark_io import BenchmarkIO + +from .descriptors import ( + CodecDescriptor, + DatasetDescriptor, + IndexDescriptor, + IndexDescriptorClassic, + KnnDescriptor, +) + from .index import Index, IndexFromCodec, IndexFromFactory from .utils import dict_merge @@ -185,15 +193,9 @@ def sigmoid(x, a, b, c): @dataclass -class Benchmark: +class IndexOperator: num_threads: int - training_vectors: Optional[DatasetDescriptor] = None - database_vectors: Optional[DatasetDescriptor] = None - query_vectors: Optional[DatasetDescriptor] = None - index_descs: Optional[List[IndexDescriptor]] = None - range_ref_index_desc: Optional[str] = None - k: Optional[int] = None - distance_metric: str = "L2" + distance_metric: str def __post_init__(self): if self.distance_metric == "IP": @@ -203,20 +205,167 @@ def __post_init__(self): else: raise ValueError - def set_io(self, benchmark_io): + def set_io(self, benchmark_io: BenchmarkIO): self.io = benchmark_io self.io.distance_metric = self.distance_metric self.io.distance_metric_type = self.distance_metric_type - def get_index_desc(self, factory_or_codec: str) -> Optional[IndexDescriptor]: + +@dataclass +class TrainOperator(IndexOperator): + codec_descs: List[CodecDescriptor] = field(default_factory=lambda: []) + + def get_desc(self, name: str) -> Optional[CodecDescriptor]: + for desc in self.codec_descs: + if desc.get_name() == name: + return desc + elif desc.factory == name: + return desc + return None + + def get_flat_desc(self, name=None) -> Optional[CodecDescriptor]: + for desc in self.codec_descs: + desc_name = desc.get_name() + if desc_name == name: + return desc + if desc_name.startswith("Flat"): + return desc + return None + + def build_index_wrapper(self, codec_desc: CodecDescriptor): + if hasattr(codec_desc, "index"): + return + + if codec_desc.factory is not None: + assert ( + codec_desc.factory == "Flat" or codec_desc.training_vectors is not None + ) + index = IndexFromFactory( + num_threads=self.num_threads, + d=codec_desc.d, + metric=self.distance_metric, + construction_params=codec_desc.construction_params, + factory=codec_desc.factory, + training_vectors=codec_desc.training_vectors, + codec_name=codec_desc.get_name(), + ) + index.set_io(self.io) + codec_desc.index = index + else: + assert codec_desc.is_trained() + + def train( + self, codec_desc: CodecDescriptor, results: Dict[str, Any], dry_run=False + ): + self.build_index_wrapper(codec_desc) + if codec_desc.is_trained(): + return results, None + + if dry_run: + meta, requires = codec_desc.index.fetch_meta(dry_run=dry_run) + else: + codec_desc.index.get_codec() + meta, requires = codec_desc.index.fetch_meta(dry_run=dry_run) + assert requires is None + + if requires is None: + results["indices"][codec_desc.get_name()] = meta + return results, requires + + +@dataclass +class BuildOperator(IndexOperator): + index_descs: List[IndexDescriptor] = field(default_factory=lambda: []) + + def get_desc(self, name: str) -> Optional[IndexDescriptor]: + for desc in self.index_descs: + if desc.get_name() == name: + return desc + return None + + def get_flat_desc(self, name=None) -> Optional[IndexDescriptor]: for desc in self.index_descs: - if desc.factory == factory_or_codec: + desc_name = desc.get_name() + if desc_name == name: return desc - if desc.codec_alias == factory_or_codec: + if desc_name.startswith("Flat"): + return desc + return None + + def build_index_wrapper(self, index_desc: IndexDescriptor): + if hasattr(index_desc, "index"): + return + + if hasattr(index_desc.codec_desc, "index"): + index_desc.index = index_desc.codec_desc.index + index_desc.index.database_vectors = index_desc.database_desc + index_desc.index.index_name = index_desc.get_name() + return + + if index_desc.codec_desc is not None: + index = IndexFromCodec( + num_threads=self.num_threads, + d=index_desc.d, + metric=self.distance_metric, + database_vectors=index_desc.database_desc, + bucket=index_desc.codec_desc.bucket, + path=index_desc.codec_desc.path, + index_name=index_desc.get_name(), + codec_name=index_desc.codec_desc.get_name(), + ) + index.set_io(self.io) + index_desc.index = index + else: + assert index_desc.is_built() + + def build(self, index_desc: IndexDescriptor, results: Dict[str, Any]): + self.build_index_wrapper(index_desc) + if index_desc.is_built(): + return + index_desc.index.get_index() + + +@dataclass +class SearchOperator(IndexOperator): + knn_descs: List[KnnDescriptor] = field(default_factory=lambda: []) + range: bool = False + + def get_desc(self, name: str) -> Optional[KnnDescriptor]: + for desc in self.knn_descs: + if desc.get_name() == name: + return desc + return None + + def get_flat_desc(self, name=None) -> Optional[KnnDescriptor]: + for desc in self.knn_descs: + if desc.get_name().startswith("Flat"): return desc return None - def range_search_reference(self, index, parameters, range_metric): + def build_index_wrapper(self, knn_desc: KnnDescriptor): + if hasattr(knn_desc, "index"): + return + + if knn_desc.index_desc.index is not None: + knn_desc.index = knn_desc.index_desc.index + knn_desc.index.knn_name = knn_desc.get_name() + knn_desc.index.search_params = knn_desc.search_params + else: + index = Index( + num_threads=self.num_threads, + d=knn_desc.d, + metric=self.distance_metric, + bucket=knn_desc.index_desc.bucket, + index_path=knn_desc.index_desc.path, + # knn_name=knn_desc.get_name(), + search_params=knn_desc.search_params, + ) + index.set_io(self.io) + knn_desc.index = index + + knn_desc.index.get_index() + + def range_search_reference(self, index, parameters, range_metric, query_dataset): logger.info("range_search_reference: begin") if isinstance(range_metric, list): assert len(range_metric) > 0 @@ -233,6 +382,7 @@ def range_search_reference(self, index, parameters, range_metric): index, parameters, radius=m_radius, + query_dataset=query_dataset, ) flat = index.is_flat_index() ( @@ -253,11 +403,11 @@ def range_search_reference(self, index, parameters, range_metric): coefficients_training_data, ) - def estimate_range(self, index, parameters, range_scoring_radius): + def estimate_range(self, index, parameters, range_scoring_radius, query_dataset): D, I, R, P, _ = index.knn_search( False, parameters, - self.query_vectors, + query_dataset, self.k, ) samples = [] @@ -275,6 +425,7 @@ def range_search( dry_run, index: Index, search_parameters: Optional[Dict[str, int]], + query_dataset: DatasetDescriptor, radius: Optional[float] = None, gt_radius: Optional[float] = None, range_search_metric_function=None, @@ -287,25 +438,21 @@ def range_search( gt_radius if index.is_flat() else self.estimate_range( - index, - search_parameters, - gt_radius, + index, search_parameters, gt_radius, query_dataset ) ) logger.info(f"Radius={radius}") lims, D, I, R, P, requires = index.range_search( dry_run=dry_run, search_parameters=search_parameters, - query_vectors=self.query_vectors, + query_vectors=query_dataset, radius=radius, ) if requires is not None: return None, None, None, None, None, requires if range_search_metric_function is not None: range_search_metric = range_search_metric_function(R) - range_search_pr = range_search_pr_curve( - D, range_search_metric, gt_rsm - ) + range_search_pr = range_search_pr_curve(D, range_search_metric, gt_rsm) range_score_sum = np.sum(range_search_metric).item() P |= { "range_score_sum": range_score_sum, @@ -314,23 +461,29 @@ def range_search( } return lims, D, I, R, P, requires - def range_ground_truth(self, gt_radius, range_search_metric_function): + def range_ground_truth( + self, gt_radius, range_search_metric_function, flat_desc=None + ): logger.info("range_ground_truth: begin") - flat_desc = self.get_index_desc("Flat") + if flat_desc is None: + flat_desc = self.get_flat_desc() lims, D, I, R, P, _ = self.range_search( False, flat_desc.index, search_parameters=None, radius=gt_radius, + query_dataset=flat_desc.query_dataset, ) gt_rsm = np.sum(range_search_metric_function(R)).item() logger.info("range_ground_truth: end") return gt_rsm - def knn_ground_truth(self): + def knn_ground_truth(self, flat_desc=None): logger.info("knn_ground_truth: begin") - flat_desc = self.get_index_desc("Flat") + if flat_desc is None: + flat_desc = self.get_flat_desc() self.build_index_wrapper(flat_desc) + # TODO(kuarora): Consider moving gt results(gt_knn_D, gt_knn_I) to the index as there can be multiple ground truths. ( self.gt_knn_D, self.gt_knn_I, @@ -340,8 +493,8 @@ def knn_ground_truth(self): ) = flat_desc.index.knn_search( dry_run=False, search_parameters=None, - query_vectors=self.query_vectors, - k=self.k, + query_vectors=flat_desc.query_dataset, + k=flat_desc.k, ) assert requires is None logger.info("knn_ground_truth: end") @@ -371,6 +524,7 @@ def experiment(parameters, cost_metric, perf_metric): results["experiments"][key] = metrics return metrics[cost_metric], metrics[perf_metric], None + requires = None for cost_metric in cost_metrics: for perf_metric in perf_metrics: op = index.get_operating_points() @@ -386,52 +540,52 @@ def experiment(parameters, cost_metric, perf_metric): return results, requires def knn_search_benchmark( - self, dry_run, results: Dict[str, Any], index: Index + self, dry_run, results: Dict[str, Any], knn_desc: KnnDescriptor ): return self.search_benchmark( name="knn_search", - search_func=lambda parameters: index.knn_search( + search_func=lambda parameters: knn_desc.index.knn_search( dry_run, parameters, - self.query_vectors, - self.k, + knn_desc.query_dataset, + knn_desc.k, self.gt_knn_I, self.gt_knn_D, )[3:], - key_func=lambda parameters: index.get_knn_search_name( + key_func=lambda parameters: knn_desc.index.get_knn_search_name( search_parameters=parameters, - query_vectors=self.query_vectors, - k=self.k, + query_vectors=knn_desc.query_dataset, + k=knn_desc.k, reconstruct=False, ), cost_metrics=["time"], perf_metrics=["knn_intersection", "distance_ratio"], results=results, - index=index, + index=knn_desc.index, ) def reconstruct_benchmark( - self, dry_run, results: Dict[str, Any], index: Index + self, dry_run, results: Dict[str, Any], knn_desc: KnnDescriptor ): return self.search_benchmark( name="reconstruct", - search_func=lambda parameters: index.reconstruct( + search_func=lambda parameters: knn_desc.index.reconstruct( dry_run, parameters, - self.query_vectors, - self.k, + knn_desc.query_dataset, + knn_desc.k, self.gt_knn_I, ), - key_func=lambda parameters: index.get_knn_search_name( + key_func=lambda parameters: knn_desc.index.get_knn_search_name( search_parameters=parameters, - query_vectors=self.query_vectors, - k=self.k, + query_vectors=knn_desc.query_dataset, + k=knn_desc.k, reconstruct=True, ), cost_metrics=["encode_time"], perf_metrics=["sym_recall"], results=results, - index=index, + index=knn_desc.index, ) def range_search_benchmark( @@ -444,6 +598,7 @@ def range_search_benchmark( gt_radius: float, range_search_metric_function, gt_rsm: float, + query_dataset: DatasetDescriptor, ): return self.search_benchmark( name="range_search", @@ -455,10 +610,11 @@ def range_search_benchmark( gt_radius=gt_radius, range_search_metric_function=range_search_metric_function, gt_rsm=gt_rsm, + query_dataset=query_dataset, )[4:], key_func=lambda parameters: index.get_range_search_name( search_parameters=parameters, - query_vectors=self.query_vectors, + query_vectors=query_dataset, radius=radius, ) + metric_key, @@ -468,69 +624,88 @@ def range_search_benchmark( index=index, ) - def build_index_wrapper(self, index_desc: IndexDescriptor): - if hasattr(index_desc, "index"): - return - if index_desc.factory is not None: - training_vectors = copy(self.training_vectors) - if index_desc.training_size is not None: - training_vectors.num_vectors = index_desc.training_size - index = IndexFromFactory( - num_threads=self.num_threads, - d=self.d, - metric=self.distance_metric, - database_vectors=self.database_vectors, - search_params=index_desc.search_params, - construction_params=index_desc.construction_params, - factory=index_desc.factory, - training_vectors=training_vectors, - ) + +@dataclass +class ExecutionOperator: + distance_metric: str = "L2" + num_threads: int = 1 + train_op: Optional[TrainOperator] = None + build_op: Optional[BuildOperator] = None + search_op: Optional[SearchOperator] = None + + def __post_init__(self): + if self.distance_metric == "IP": + self.distance_metric_type = faiss.METRIC_INNER_PRODUCT + elif self.distance_metric == "L2": + self.distance_metric_type = faiss.METRIC_L2 else: - index = IndexFromCodec( - num_threads=self.num_threads, - d=self.d, - metric=self.distance_metric, - database_vectors=self.database_vectors, - search_params=index_desc.search_params, - construction_params=index_desc.construction_params, - path=index_desc.path, - bucket=index_desc.bucket, - ) - index.set_io(self.io) - index_desc.index = index + raise ValueError - def clone_one(self, index_desc): - benchmark = Benchmark( - num_threads=self.num_threads, - training_vectors=self.training_vectors, - database_vectors=self.database_vectors, - query_vectors=self.query_vectors, - index_descs=[self.get_index_desc("Flat"), index_desc], - range_ref_index_desc=self.range_ref_index_desc, - k=self.k, - distance_metric=self.distance_metric, - ) - benchmark.set_io(self.io.clone()) - return benchmark + def set_io(self, io: BenchmarkIO): + self.io = io + self.io.distance_metric = self.distance_metric + self.io.distance_metric_type = self.distance_metric_type + if self.train_op: + self.train_op.set_io(io) + if self.build_op: + self.build_op.set_io(io) + if self.search_op: + self.search_op.set_io(io) + + def train_one(self, codec_desc: CodecDescriptor, results: Dict[str, Any], dry_run): + faiss.omp_set_num_threads(self.num_threads) + assert self.train_op is not None + self.train_op.train(codec_desc, results, dry_run) + + def train(self, results, dry_run=False): + faiss.omp_set_num_threads(self.num_threads) + if self.train_op is None: + return + + for codec_desc in self.train_op.codec_descs: + self.train_one(codec_desc, results, dry_run) + + def build_one(self, results: Dict[str, Any], index_desc: IndexDescriptor): + faiss.omp_set_num_threads(self.num_threads) + assert self.build_op is not None + self.build_op.build(index_desc, results) + + def build(self, results: Dict[str, Any]): + faiss.omp_set_num_threads(self.num_threads) + if self.build_op is None: + return + + for index_desc in self.build_op.index_descs: + self.build_one(index_desc, results) + + def search(self): + faiss.omp_set_num_threads(self.num_threads) + if self.search_op is None: + return - def benchmark_one( + for index_desc in self.search_op.knn_descs: + self.search_one(index_desc) + + def search_one( self, - dry_run, + knn_desc: KnnDescriptor, results: Dict[str, Any], - index_desc: IndexDescriptor, - train, - reconstruct, - knn, - range, + dry_run=False, + range=False, ): faiss.omp_set_num_threads(self.num_threads) + assert self.search_op is not None + if not dry_run: - self.knn_ground_truth() - self.build_index_wrapper(index_desc) - meta, requires = index_desc.index.fetch_meta(dry_run=dry_run) + self.create_gt_knn(knn_desc) + self.create_range_ref_knn(knn_desc) + + self.search_op.build_index_wrapper(knn_desc) + meta, requires = knn_desc.index.fetch_meta(dry_run=dry_run) if requires is not None: - return results, (requires if train else None) - results["indices"][index_desc.index.get_codec_name()] = meta + # return results, (requires if train else None) + return results, requires + results["indices"][knn_desc.index.get_codec_name()] = meta # results, requires = self.reconstruct_benchmark( # dry_run=True, @@ -547,33 +722,32 @@ def benchmark_one( # index=index_desc.index, # ) # assert requires is None - - results, requires = self.knn_search_benchmark( + results, requires = self.search_op.knn_search_benchmark( dry_run=True, results=results, - index=index_desc.index, + knn_desc=knn_desc, ) - if knn and requires is not None: + if requires is not None: if dry_run: return results, requires else: - results, requires = self.knn_search_benchmark( + results, requires = self.search_op.knn_search_benchmark( dry_run=False, results=results, - index=index_desc.index, + knn_desc=knn_desc, ) assert requires is None if ( - self.range_ref_index_desc is None - or not index_desc.index.supports_range_search() + knn_desc.range_ref_index_desc is None or + not knn_desc.index.supports_range_search() ): return results, None - ref_index_desc = self.get_index_desc(self.range_ref_index_desc) + ref_index_desc = self.search_op.get_desc(knn_desc.range_ref_index_desc) if ref_index_desc is None: raise ValueError( - f"Unknown range index {self.range_ref_index_desc}" + f"{knn_desc.get_name()}: Unknown range index {knn_desc.range_ref_index_desc}" ) if ref_index_desc.range_metrics is None: raise ValueError( @@ -585,92 +759,360 @@ def benchmark_one( range_search_metric_function, coefficients, coefficients_training_data, - ) = self.range_search_reference( + ) = self.search_op.range_search_reference( ref_index_desc.index, ref_index_desc.search_params, range_metric, ) - gt_rsm = self.range_ground_truth( + gt_rsm = self.search_op.range_ground_truth( gt_radius, range_search_metric_function ) - results, requires = self.range_search_benchmark( + results, requires = self.search_op.range_search_benchmark( dry_run=True, results=results, - index=index_desc.index, + index=knn_desc.index, metric_key=metric_key, - radius=index_desc.radius, + radius=knn_desc.radius, gt_radius=gt_radius, range_search_metric_function=range_search_metric_function, gt_rsm=gt_rsm, + query_vectors=knn_desc.query_dataset, ) if range and requires is not None: if dry_run: return results, requires else: - results, requires = self.range_search_benchmark( + results, requires = self.search_op.range_search_benchmark( dry_run=False, results=results, - index=index_desc.index, + index=knn_desc.index, metric_key=metric_key, - radius=index_desc.radius, + radius=knn_desc.radius, gt_radius=gt_radius, range_search_metric_function=range_search_metric_function, gt_rsm=gt_rsm, + query_vectors=knn_desc.query_dataset, ) assert requires is None return results, None - def benchmark( - self, - result_file=None, - local=False, - train=False, - reconstruct=False, - knn=False, - range=False, - ): - logger.info("begin evaluate") + def create_gt_codec( + self, codec_desc, results, train=True + ) -> Optional[CodecDescriptor]: + gt_codec_desc = None + if self.train_op: + gt_codec_desc = self.train_op.get_flat_desc(codec_desc.flat_name()) + if gt_codec_desc is None: + gt_codec_desc = CodecDescriptor( + factory="Flat", + d=codec_desc.d, + metric=codec_desc.metric, + num_threads=self.num_threads, + ) + self.train_op.codec_descs.insert(0, gt_codec_desc) + if train: + self.train_op.train(gt_codec_desc, results, dry_run=False) - faiss.omp_set_num_threads(self.num_threads) - results = {"indices": {}, "experiments": {}} - xq = self.io.get_dataset(self.query_vectors) - self.d = xq.shape[1] - if self.get_index_desc("Flat") is None: - self.index_descs.append(IndexDescriptor(factory="Flat")) + return gt_codec_desc - self.knn_ground_truth() + def create_gt_index( + self, index_desc: IndexDescriptor, results: Dict[str, Any], build=True + ) -> Optional[IndexDescriptor]: + gt_index_desc = None + if self.build_op: + gt_index_desc = self.build_op.get_flat_desc(index_desc.flat_name()) + if gt_index_desc is None: + gt_codec_desc = self.train_op.get_flat_desc( + index_desc.codec_desc.flat_name() + ) + assert gt_codec_desc is not None + gt_index_desc = IndexDescriptor( + d=index_desc.d, + metric=index_desc.metric, + num_threads=self.num_threads, + codec_desc=gt_codec_desc, + database_desc=index_desc.database_desc, + ) + self.build_op.index_descs.insert(0, gt_index_desc) + if build: + self.build_op.build(gt_index_desc, results) - if self.range_ref_index_desc is not None: - index_desc = self.get_index_desc(self.range_ref_index_desc) - if index_desc is None: - raise ValueError( - f"Unknown range index {self.range_ref_index_desc}" + return gt_index_desc + + def create_gt_knn(self, knn_desc, search=True) -> Optional[KnnDescriptor]: + gt_knn_desc = None + if self.search_op: + gt_knn_desc = self.search_op.get_flat_desc(knn_desc.flat_name()) + if gt_knn_desc is None: + gt_index_desc = self.build_op.get_flat_desc( + knn_desc.index_desc.flat_name() + ) + assert gt_index_desc is not None + gt_knn_desc = KnnDescriptor( + d=knn_desc.d, + metric=knn_desc.metric, + num_threads=self.num_threads, + index_desc=gt_index_desc, + query_dataset=knn_desc.query_dataset, + k=knn_desc.k, ) - if index_desc.range_metrics is None: + self.search_op.knn_descs.insert(0, gt_knn_desc) + if search: + self.search_op.build_index_wrapper(gt_knn_desc) + self.search_op.knn_ground_truth(gt_knn_desc) + + return gt_knn_desc + + def create_range_ref_knn(self, knn_desc): + if ( + knn_desc.range_ref_index_desc is None or + not knn_desc.index.supports_range_search() + ): + return + + if knn_desc.range_ref_index_desc is not None: + ref_index_desc = self.get_desc(knn_desc.range_ref_index_desc) + if ref_index_desc is None: + raise ValueError(f"Unknown range index {knn_desc.range_ref_index_desc}") + if ref_index_desc.range_metrics is None: raise ValueError( - f"Range index {index_desc.factory} has no radius_score" + f"Range index {knn_desc.get_name()} has no radius_score" ) results["metrics"] = {} - self.build_index_wrapper(index_desc) - for metric_key, range_metric in index_desc.range_metrics.items(): + self.build_index_wrapper(ref_index_desc) + for metric_key, range_metric in ref_index_desc.range_metrics.items(): ( - gt_radius, + knn_desc.gt_radius, range_search_metric_function, coefficients, coefficients_training_data, ) = self.range_search_reference( - index_desc.index, index_desc.search_params, range_metric + knn_desc.index, knn_desc.search_params, range_metric ) results["metrics"][metric_key] = { "coefficients": coefficients, "training_data": coefficients_training_data, } - gt_rsm = self.range_ground_truth( - gt_radius, range_search_metric_function + knn_desc.gt_rsm = self.range_ground_truth( + knn_desc.gt_radius, range_search_metric_function + ) + + def create_ground_truths(self, results: Dict[str, Any]): + # TODO: Create all ground truth descriptors and put them in index descriptor as reference + if self.train_op is not None: + for codec_desc in self.train_op.codec_descs: + self.create_gt_codec(codec_desc, results) + + if self.build_op is not None: + for index_desc in self.build_op.index_descs: + self.create_gt_index( + index_desc, results + ) # may need to pass results in future + + if self.search_op is not None: + for knn_desc in self.search_op.knn_descs: + self.create_gt_knn(knn_desc, results) + self.create_range_ref_knn(knn_desc) + + def execute(self, results: Dict[str, Any], dry_run: False): + if self.train_op is not None: + for desc in self.train_op.codec_descs: + results, requires = self.train_op.train(desc, results, dry_run=dry_run) + if dry_run: + if requires is None: + continue + return results, requires + assert requires is None + + if self.build_op is not None: + for desc in self.build_op.index_descs: + self.build_op.build(desc, results) + if self.search_op is not None: + for desc in self.search_op.knn_descs: + results, requires = self.search_one( + knn_desc=desc, results=results, dry_run=dry_run, range=self.search_op.range + ) + if dry_run: + if requires is None: + continue + return results, requires + + assert requires is None + return results, None + + def execute_2(self, result_file=None): + results = {"indices": {}, "experiments": {}} + results, requires = self.execute(results=results) + assert requires is None + if result_file is not None: + self.io.write_json(results, result_file, overwrite=True) + + def add_index_descs(self, codec_desc, index_desc, knn_desc): + if codec_desc is not None: + self.train_op.codec_descs.append(codec_desc) + if index_desc is not None: + self.build_op.index_descs.append(index_desc) + if knn_desc is not None: + self.search_op.knn_descs.append(knn_desc) + + +@dataclass +class Benchmark: + num_threads: int + training_vectors: Optional[DatasetDescriptor] = None + database_vectors: Optional[DatasetDescriptor] = None + query_vectors: Optional[DatasetDescriptor] = None + index_descs: Optional[List[IndexDescriptorClassic]] = None + range_ref_index_desc: Optional[str] = None + k: int = 1 + distance_metric: str = "L2" + + def set_io(self, benchmark_io): + self.io = benchmark_io + + def get_embedding_dimension(self): + if self.training_vectors is not None: + xt = self.io.get_dataset(self.training_vectors) + return xt.shape[1] + if self.database_vectors is not None: + xb = self.io.get_dataset(self.database_vectors) + return xb.shape[1] + if self.query_vectors is not None: + xq = self.io.get_dataset(self.query_vectors) + return xq.shape[1] + raise ValueError("Failed to determine dimension of dataset") + + def create_descriptors( + self, ci_desc: IndexDescriptorClassic, train, build, knn, reconstruct, range + ): + codec_desc = None + index_desc = None + knn_desc = None + dim = self.get_embedding_dimension() + if train and ci_desc.factory is not None: + codec_desc = CodecDescriptor( + d=dim, + metric=self.distance_metric, + num_threads=self.num_threads, + factory=ci_desc.factory, + construction_params=ci_desc.construction_params, + training_vectors=self.training_vectors, + ) + if build: + if codec_desc is None: + assert ci_desc.path is not None + codec_desc = CodecDescriptor( + d=dim, + metric=self.distance_metric, + num_threads=self.num_threads, + bucket=ci_desc.bucket, + path=ci_desc.path, + ) + index_desc = IndexDescriptor( + d=codec_desc.d, + metric=self.distance_metric, + num_threads=self.num_threads, + codec_desc=codec_desc, + database_desc=self.database_vectors, + ) + if knn or range: + if index_desc is None: + assert ci_desc.path is not None + index_desc = IndexDescriptor( + d=dim, + metric=self.distance_metric, + num_threads=self.num_threads, + bucket=ci_desc.bucket, + path=ci_desc.path, ) + knn_desc = KnnDescriptor( + d=dim, + metric=self.distance_metric, + num_threads=self.num_threads, + index_desc=index_desc, + query_dataset=self.query_vectors, + search_params=ci_desc.search_params, + range_metrics=ci_desc.range_metrics, + radius=ci_desc.radius, + k=self.k, + ) - self.index_descs = list(dict.fromkeys(self.index_descs)) + return codec_desc, index_desc, knn_desc + + def create_execution_operator( + self, + train, + build, + knn, + reconstruct, + range, + ) -> ExecutionOperator: + # all operators are created, as ground truth are always created in benchmarking + train_op = TrainOperator( + num_threads=self.num_threads, distance_metric=self.distance_metric + ) + build_op = BuildOperator( + num_threads=self.num_threads, distance_metric=self.distance_metric + ) + search_op = SearchOperator( + num_threads=self.num_threads, distance_metric=self.distance_metric + ) + search_op.range = range + + exec_op = ExecutionOperator( + train_op=train_op, + build_op=build_op, + search_op=search_op, + num_threads=self.num_threads, + ) + assert hasattr(self, "io") + exec_op.set_io(self.io) + + # iterate over classic descriptors + for ci_desc in self.index_descs: + codec_desc, index_desc, knn_desc = self.create_descriptors( + ci_desc, train, build, knn, reconstruct, range + ) + exec_op.add_index_descs(codec_desc, index_desc, knn_desc) + + return exec_op + + def clone_one(self, index_desc): + benchmark = Benchmark( + num_threads=self.num_threads, + training_vectors=self.training_vectors, + database_vectors=self.database_vectors, + query_vectors=self.query_vectors, + # index_descs=[self.get_flat_desc("Flat"), index_desc], + index_descs=[index_desc], # Should automatically find flat descriptors + range_ref_index_desc=self.range_ref_index_desc, + k=self.k, + distance_metric=self.distance_metric, + ) + benchmark.set_io(self.io.clone()) + return benchmark + + def benchmark( + self, + result_file=None, + local=False, + train=False, + reconstruct=False, + knn=False, + range=False, + ): + logger.info("begin evaluate") + results = {"indices": {}, "experiments": {}} + faiss.omp_set_num_threads(self.num_threads) + exec_op = self.create_execution_operator( + train=train, + build=knn or range, + knn=knn, + reconstruct=reconstruct, + range=range, + ) + exec_op.create_ground_truths(results) todo = self.index_descs for index_desc in self.index_descs: @@ -681,15 +1123,7 @@ def benchmark( current_todo = [] next_todo = [] for index_desc in todo: - results, requires = self.benchmark_one( - dry_run=True, - results=results, - index_desc=index_desc, - train=train, - reconstruct=reconstruct, - knn=knn, - range=range, - ) + results, requires = exec_op.execute(results, dry_run=False) if requires is None: continue if requires in queued: @@ -731,15 +1165,14 @@ def benchmark( def run_benchmark_one(params): logger.info(params) index_desc, benchmark, results, train, reconstruct, knn, range = params - results, requires = benchmark.benchmark_one( - dry_run=False, - results=results, - index_desc=index_desc, + exec_op = benchmark.create_execution_operator( train=train, - reconstruct=reconstruct, + build=knn, knn=knn, + reconstruct=reconstruct, range=range, ) + results, requires = exec_op.execute(results=results, dry_run=False) assert requires is None assert results is not None return results diff --git a/benchs/bench_fw/benchmark_io.py b/benchs/bench_fw/benchmark_io.py index b39bb60290..5ee3eb3a6a 100644 --- a/benchs/bench_fw/benchmark_io.py +++ b/benchs/bench_fw/benchmark_io.py @@ -53,6 +53,7 @@ def clone(self): def __post_init__(self): self.cached_ds = {} + # TODO(kuarora): rename it as get_local_file def get_local_filename(self, filename): if len(filename) > 184: fn, ext = os.path.splitext(filename) @@ -61,6 +62,9 @@ def get_local_filename(self, filename): ) return os.path.join(self.path, filename) + def get_remote_filepath(self, filename) -> Optional[str]: + return None + def download_file_from_blobstore( self, filename: str, @@ -219,7 +223,7 @@ def read_index( fn = self.download_file_from_blobstore(filename, bucket, path) logger.info(f"Loading index {fn}") ext = os.path.splitext(fn)[1] - if ext in [".faiss", ".codec"]: + if ext in [".faiss", ".codec", ".index"]: index = faiss.read_index(fn) elif ext == ".pkl": with open(fn, "rb") as model_file: diff --git a/benchs/bench_fw/descriptors.py b/benchs/bench_fw/descriptors.py index 173b07ce16..e76278cedc 100644 --- a/benchs/bench_fw/descriptors.py +++ b/benchs/bench_fw/descriptors.py @@ -3,18 +3,21 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -from dataclasses import dataclass import logging +import os +from dataclasses import dataclass from typing import Any, Dict, List, Optional import faiss # @manual=//faiss/python:pyfaiss_gpu + +from .benchmark_io import BenchmarkIO from .utils import timer logger = logging.getLogger(__name__) @dataclass -class IndexDescriptor: +class IndexDescriptorClassic: bucket: Optional[str] = None # either path or factory should be set, # but not both at the same time. @@ -45,7 +48,6 @@ class IndexDescriptor: def __hash__(self): return hash(str(self)) - @dataclass class DatasetDescriptor: # namespace possible values: @@ -81,7 +83,7 @@ def __hash__(self): def get_filename( self, - prefix: str = None, + prefix: Optional[str] = None, ) -> str: filename = "" if prefix is not None: @@ -116,3 +118,208 @@ def k_means(self, io, k, dry_run): else: t = io.read_json(meta_filename)["k_means_time"] return kmeans_vectors, t, None + +@dataclass +class IndexBaseDescriptor: + d: int + metric: str + desc_name: Optional[str] = None + flat_desc_name: Optional[str] = None + bucket: Optional[str] = None + path: Optional[str] = None + num_threads: int = 1 + + def get_name(self) -> str: + raise NotImplementedError() + + def get_path(self, benchmark_io: BenchmarkIO) -> Optional[str]: + if self.path is not None: + return self.path + self.path = benchmark_io.get_remote_filepath(self.desc_name) + return self.path + + @staticmethod + def param_dict_list_to_name(param_dict_list): + if not param_dict_list: + return "" + l = 0 + n = "" + for param_dict in param_dict_list: + n += IndexBaseDescriptor.param_dict_to_name(param_dict, f"cp{l}") + l += 1 + return n + + @staticmethod + def param_dict_to_name(param_dict, prefix="sp"): + if not param_dict: + return "" + n = prefix + for name, val in param_dict.items(): + if name == "snap": + continue + if name == "lsq_gpu" and val == 0: + continue + if name == "use_beam_LUT" and val == 0: + continue + n += f"_{name}_{val}" + if n == prefix: + return "" + n += "." + return n + + +@dataclass +class CodecDescriptor(IndexBaseDescriptor): + # either path or factory should be set, + # but not both at the same time. + factory: Optional[str] = None + construction_params: Optional[List[Dict[str, int]]] = None + training_vectors: Optional[DatasetDescriptor] = None + + def __post_init__(self): + self.get_name() + + def is_trained(self): + return self.factory is None and self.path is not None + + def is_valid(self): + return self.factory is not None or self.path is not None + + def get_name(self) -> str: + if self.desc_name is not None: + return self.desc_name + if self.factory is not None: + self.desc_name = self.name_from_factory() + return self.desc_name + if self.path is not None: + self.desc_name = self.name_from_path() + return self.desc_name + raise ValueError("name, factory or path must be set") + + def flat_name(self) -> str: + if self.flat_desc_name is not None: + return self.flat_desc_name + self.flat_desc_name = f"Flat.d_{self.d}.{self.metric.upper()}." + return self.flat_desc_name + + def path(self, benchmark_io) -> str: + if self.path is not None: + return self.path + return benchmark_io.get_remote_filepath(self.get_name()) + + def name_from_factory(self) -> str: + assert self.factory is not None + name = f"{self.factory.replace(',', '_')}." + assert self.d is not None + assert self.metric is not None + name += f"d_{self.d}.{self.metric.upper()}." + if self.factory != "Flat": + assert self.training_vectors is not None + name += self.training_vectors.get_filename("xt") + name += IndexBaseDescriptor.param_dict_list_to_name(self.construction_params) + return name + + def name_from_path(self): + assert self.path is not None + filename = os.path.basename(self.path) + ext = filename.split(".")[-1] + if filename.endswith(ext): + name = filename[:-len(ext)] + else: # should never hit this rather raise value error + name = filename + return name + + def alias(self, benchmark_io : BenchmarkIO): + if hasattr(benchmark_io, "bucket"): + return CodecDescriptor(desc_name=self.get_name(), bucket=benchmark_io.bucket, path=self.get_path(benchmark_io), d=self.d, metric=self.metric) + return CodecDescriptor(desc_name=self.get_name(), d=self.d, metric=self.metric) + + +@dataclass +class IndexDescriptor(IndexBaseDescriptor): + codec_desc: Optional[CodecDescriptor] = None + database_desc: Optional[DatasetDescriptor] = None + + def __hash__(self): + return hash(str(self)) + + def __post_init__(self): + self.get_name() + + def is_built(self): + return self.codec_desc is None and self.database_desc is None + + def get_name(self) -> str: + if self.desc_name is None: + self.desc_name = self.codec_desc.get_name() + self.database_desc.get_filename(prefix="xb") + + return self.desc_name + + def flat_name(self): + if self.flat_desc_name is not None: + return self.flat_desc_name + self.flat_desc_name = self.codec_desc.flat_name() + self.database_desc.get_filename(prefix="xb") + return self.flat_desc_name + + # alias is used to refer when index is uploaded to blobstore and refered again + def alias(self, benchmark_io: BenchmarkIO): + if hasattr(benchmark_io, "bucket"): + return IndexDescriptor(desc_name=self.get_name(), bucket=benchmark_io.bucket, path=self.get_path(benchmark_io), d=self.d, metric=self.metric) + return IndexDescriptor(desc_name=self.get_name(), d=self.d, metric=self.metric) + +@dataclass +class KnnDescriptor(IndexBaseDescriptor): + index_desc: Optional[IndexDescriptor] = None + gt_index_desc: Optional[IndexDescriptor] = None + query_dataset: Optional[DatasetDescriptor] = None + search_params: Optional[Dict[str, int]] = None + reconstruct: bool = False + # range metric definitions + # key: name + # value: one of the following: + # + # radius + # [0..radius) -> 1 + # [radius..inf) -> 0 + # + # [[radius1, score1], ...] + # [0..radius1) -> score1 + # [radius1..radius2) -> score2 + # + # [[radius1_from, radius1_to, score1], ...] + # [radius1_from, radius1_to) -> score1, + # [radius2_from, radius2_to) -> score2 + range_metrics: Optional[Dict[str, Any]] = None + radius: Optional[float] = None + k: int = 1 + + range_ref_index_desc: Optional[str] = None + + def __hash__(self): + return hash(str(self)) + + def get_name(self): + name = self.index_desc.get_name() + name += IndexBaseDescriptor.param_dict_to_name(self.search_params) + name += self.query_dataset.get_filename("q") + name += f"k_{self.k}." + name += f"t_{self.num_threads}." + if self.reconstruct: + name += "rec." + else: + name += "knn." + return name + + def flat_name(self): + if self.flat_desc_name is not None: + return self.flat_desc_name + name = self.index_desc.flat_name() + name += self.query_dataset.get_filename("q") + name += f"k_{self.k}." + name += f"t_{self.num_threads}." + if self.reconstruct: + name += "rec." + else: + name += "knn." + self.flat_desc_name = name + return name diff --git a/benchs/bench_fw/index.py b/benchs/bench_fw/index.py index 3deaa4afcf..6b6c2d93af 100644 --- a/benchs/bench_fw/index.py +++ b/benchs/bench_fw/index.py @@ -13,6 +13,7 @@ import faiss # @manual=//faiss/python:pyfaiss_gpu import numpy as np +from faiss.benchs.bench_fw.descriptors import IndexBaseDescriptor from faiss.contrib.evaluation import ( # @manual=//faiss/contrib:faiss_contrib_gpu knn_intersection_measure, @@ -49,35 +50,6 @@ class IndexBase: def set_io(self, benchmark_io): self.io = benchmark_io - @staticmethod - def param_dict_list_to_name(param_dict_list): - if not param_dict_list: - return "" - l = 0 - n = "" - for param_dict in param_dict_list: - n += IndexBase.param_dict_to_name(param_dict, f"cp{l}") - l += 1 - return n - - @staticmethod - def param_dict_to_name(param_dict, prefix="sp"): - if not param_dict: - return "" - n = prefix - for name, val in param_dict.items(): - if name == "snap": - continue - if name == "lsq_gpu" and val == 0: - continue - if name == "use_beam_LUT" and val == 0: - continue - n += f"_{name}_{val}" - if n == prefix: - return "" - n += "." - return n - @staticmethod def set_index_param_dict_list(index, param_dict_list, assert_same=False): if not param_dict_list: @@ -282,7 +254,7 @@ def get_knn_search_name( reconstruct: bool = False, ): name = self.get_index_name() - name += Index.param_dict_to_name(search_parameters) + name += IndexBaseDescriptor.param_dict_to_name(search_parameters) name += query_vectors.get_filename("q") name += f"k_{k}." name += f"t_{self.num_threads}." @@ -582,14 +554,21 @@ class Index(IndexBase): num_threads: int d: int metric: str - database_vectors: DatasetDescriptor - construction_params: List[Dict[str, int]] - search_params: Dict[str, int] + codec_name: Optional[str] = None + index_name: Optional[str] = None + database_vectors: Optional[DatasetDescriptor] = None + construction_params: Optional[List[Dict[str, int]]] = None + search_params: Optional[Dict[str, int]] = None + serialize_full_index: bool = False + + bucket: Optional[str] = None + index_path: Optional[str] = None cached_codec: ClassVar[OrderedDict[str, faiss.Index]] = OrderedDict() cached_index: ClassVar[OrderedDict[str, faiss.Index]] = OrderedDict() def __post_init__(self): + logger.info(f"Initializing metric_type to {self.metric}") if isinstance(self.metric, str): if self.metric == "IP": self.metric_type = faiss.METRIC_INNER_PRODUCT @@ -628,13 +607,31 @@ def get_codec(self): Index.cached_codec.popitem(last=False) return Index.cached_codec[codec_name] - def get_index_name(self): - name = self.get_codec_name() - assert self.database_vectors is not None - name += self.database_vectors.get_filename("xb") - return name + def get_codec_name(self) -> Optional[str]: + return self.codec_name + + def get_index_name(self) -> Optional[str]: + return self.index_name def fetch_index(self): + # read index from file if it is already available + if self.index_path: + index_filename = os.path.basename(self.index_path) + else: + index_filename = self.index_name + "index" + if self.io.file_exist(index_filename): + if self.index_path: + index = self.io.read_index( + index_filename, + self.bucket, + os.path.dirname(self.index_path), + ) + else: + index = self.io.read_index(index_filename) + assert self.d == index.d + assert self.metric_type == index.metric_type + return index, 0 + index = self.get_codec() index.reset() assert index.ntotal == 0 @@ -664,10 +661,15 @@ def fetch_index(self): ) assert index.ntotal == xb.shape[0] or index_ivf.ntotal == xb.shape[0] logger.info("Added vectors to index") + if self.serialize_full_index: + codec_size = self.io.write_index(index, index_filename) + assert codec_size is not None + return index, t def get_index(self): - index_name = self.get_index_name() + index_name = self.index_name + # TODO(kuarora) : retrieve file from bucket and path. if index_name not in Index.cached_index: Index.cached_index[index_name], _ = self.fetch_index() if len(Index.cached_index) > 3: @@ -784,8 +786,12 @@ def is_flat_index(self): # are used to wrap pre-trained Faiss indices (codecs) @dataclass class IndexFromCodec(Index): - path: str - bucket: Optional[str] = None + path: Optional[str] = None + + def __post_init__(self): + super().__post_init__() + if self.path is None: + raise ValueError("path is not set") def get_quantizer(self): if not self.is_ivf(): @@ -804,12 +810,6 @@ def get_pretransform(self): def get_model_name(self): return os.path.basename(self.path) - def get_codec_name(self): - assert self.path is not None - name = os.path.basename(self.path) - name += Index.param_dict_list_to_name(self.construction_params) - return name - def fetch_meta(self, dry_run=False): return None, None @@ -871,20 +871,15 @@ def get_codec(self): # IndexFromFactory is for creating and training indices from scratch @dataclass class IndexFromFactory(Index): - factory: str - training_vectors: DatasetDescriptor + factory: Optional[str] = None + training_vectors: Optional[DatasetDescriptor] = None - def get_codec_name(self): - assert self.factory is not None - name = f"{self.factory.replace(',', '_')}." - assert self.d is not None - assert self.metric is not None - name += f"d_{self.d}.{self.metric.upper()}." - if self.factory != "Flat": - assert self.training_vectors is not None - name += self.training_vectors.get_filename("xt") - name += Index.param_dict_list_to_name(self.construction_params) - return name + def __post_init__(self): + super().__post_init__() + if self.factory is None: + raise ValueError("factory is not set") + if self.factory != "Flat" and self.training_vectors is None: + raise ValueError(f"training_vectors is not set for {self.factory}") def fetch_meta(self, dry_run=False): meta_filename = self.get_codec_name() + "json" diff --git a/benchs/bench_fw/optimize.py b/benchs/bench_fw/optimize.py index a2653b7144..b3d62980c3 100644 --- a/benchs/bench_fw/optimize.py +++ b/benchs/bench_fw/optimize.py @@ -14,7 +14,7 @@ # ) from .benchmark import Benchmark -from .descriptors import DatasetDescriptor, IndexDescriptor +from .descriptors import DatasetDescriptor, IndexDescriptorClassic from .utils import dict_merge, filter_results, ParetoMetric, ParetoMode logger = logging.getLogger(__name__) @@ -78,7 +78,7 @@ def benchmark_and_filter_candidates( ) assert filtered index_descs = [ - IndexDescriptor( + IndexDescriptorClassic( factory=v["factory"], construction_params=v["construction_params"], search_params=v["search_params"], @@ -103,8 +103,8 @@ def optimize_quantizer( dry_run=False, ) - descs = [IndexDescriptor(factory="Flat"),] + [ - IndexDescriptor( + descs = [IndexDescriptorClassic(factory="Flat"),] + [ + IndexDescriptorClassic( factory="HNSW32", construction_params=[{"efConstruction": 2**i}], ) @@ -131,7 +131,7 @@ def optimize_ivf( training_vectors: DatasetDescriptor, database_vectors: DatasetDescriptor, query_vectors: DatasetDescriptor, - quantizers: Dict[int, List[IndexDescriptor]], + quantizers: Dict[int, List[IndexDescriptorClassic]], codecs: List[Tuple[str, str]], min_accuracy: float, ): @@ -159,7 +159,7 @@ def optimize_ivf( quantizer_desc.search_params, ) ivf_descs.append( - IndexDescriptor( + IndexDescriptorClassic( factory=f"{pretransform}IVF{nlist}({quantizer_desc.factory}),{fine_ivf}", construction_params=construction_params, ) @@ -188,7 +188,7 @@ def ivf_flat_nprobe_required_for_accuracy( ): _, results = self.benchmark_and_filter_candidates( index_descs=[ - IndexDescriptor(factory=f"IVF{nlist}(Flat),Flat"), + IndexDescriptorClassic(factory=f"IVF{nlist}(Flat),Flat"), ], training_vectors=training_vectors, database_vectors=database_vectors, @@ -255,7 +255,7 @@ def optimize_codec( _, filtered = self.benchmark_and_filter_candidates( index_descs=[ - IndexDescriptor( + IndexDescriptorClassic( factory=f"IVF{nlist},{pq}" if opq is None else f"{opq},IVF{nlist},{pq}", diff --git a/benchs/bench_fw_codecs.py b/benchs/bench_fw_codecs.py index 80741e23f7..d3efc2da0f 100644 --- a/benchs/bench_fw_codecs.py +++ b/benchs/bench_fw_codecs.py @@ -7,10 +7,10 @@ import argparse import os -from bench_fw.benchmark import Benchmark -from bench_fw.benchmark_io import BenchmarkIO -from bench_fw.descriptors import DatasetDescriptor, IndexDescriptor -from bench_fw.index import IndexFromFactory +from faiss.benchs.bench_fw.benchmark import Benchmark +from faiss.benchs.bench_fw.benchmark_io import BenchmarkIO +from faiss.benchs.bench_fw.descriptors import DatasetDescriptor, IndexDescriptorClassic +from faiss.benchs.bench_fw.index import IndexFromFactory logging.basicConfig(level=logging.INFO) @@ -107,7 +107,7 @@ def run_local(rp): database_vectors=database_vectors, query_vectors=query_vectors, index_descs=[ - IndexDescriptor( + IndexDescriptorClassic( factory=factory, construction_params=construction_params, training_size=training_size, diff --git a/benchs/bench_fw_ivf.py b/benchs/bench_fw_ivf.py index e9e144c569..b0c108b7de 100644 --- a/benchs/bench_fw_ivf.py +++ b/benchs/bench_fw_ivf.py @@ -11,7 +11,7 @@ from faiss.benchs.bench_fw.benchmark_io import BenchmarkIO from faiss.benchs.bench_fw.descriptors import ( DatasetDescriptor, - IndexDescriptor, + IndexDescriptorClassic, ) logging.basicConfig(level=logging.INFO) @@ -30,7 +30,7 @@ def sift1M(bio): namespace="std_q", tablename="sift1M" ), index_descs=[ - IndexDescriptor( + IndexDescriptorClassic( factory=f"IVF{2 ** nlist},Flat", ) for nlist in range(8, 15) @@ -38,8 +38,8 @@ def sift1M(bio): k=1, distance_metric="L2", ) - benchmark.set_io(bio) - benchmark.benchmark(result_file="result.json", local=False, train=True, reconstruct=False, knn=True, range=False) + benchmark.io = bio + benchmark.benchmark(result_file="result.json", local=True, train=True, reconstruct=False, knn=True, range=False) def bigann(bio): @@ -56,11 +56,11 @@ def bigann(bio): namespace="std_q", tablename="bigann1M" ), index_descs=[ - IndexDescriptor( + IndexDescriptorClassic( factory=f"IVF{2 ** nlist},Flat", ) for nlist in range(11, 19) ] + [ - IndexDescriptor( + IndexDescriptorClassic( factory=f"IVF{2 ** nlist}_HNSW32,Flat", construction_params=[None, {"efConstruction": 200, "efSearch": 40}], ) for nlist in range(11, 19) @@ -84,18 +84,18 @@ def ssnpp(bio): tablename="ssnpp_queries_10K.npy" ), index_descs=[ - IndexDescriptor( + IndexDescriptorClassic( factory=f"IVF{2 ** nlist},PQ256x4fs,Refine(SQfp16)", ) for nlist in range(9, 16) ] + [ - IndexDescriptor( + IndexDescriptorClassic( factory=f"IVF{2 ** nlist},Flat", ) for nlist in range(9, 16) ] + [ - IndexDescriptor( + IndexDescriptorClassic( factory=f"PQ256x4fs,Refine(SQfp16)", ), - IndexDescriptor( + IndexDescriptorClassic( factory=f"HNSW32", ), ], diff --git a/benchs/bench_fw_optimize.py b/benchs/bench_fw_optimize.py index 31b56f9f51..11e625e23c 100644 --- a/benchs/bench_fw_optimize.py +++ b/benchs/bench_fw_optimize.py @@ -7,9 +7,9 @@ import logging import os -from bench_fw.benchmark_io import BenchmarkIO -from bench_fw.descriptors import DatasetDescriptor -from bench_fw.optimize import Optimizer +from faiss.benchs.bench_fw.benchmark_io import BenchmarkIO +from faiss.benchs.bench_fw.descriptors import DatasetDescriptor +from faiss.benchs.bench_fw.optimize import Optimizer logging.basicConfig(level=logging.INFO) diff --git a/benchs/bench_fw_range.py b/benchs/bench_fw_range.py index f38de114f9..0d4b65afa6 100644 --- a/benchs/bench_fw_range.py +++ b/benchs/bench_fw_range.py @@ -3,28 +3,29 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -import logging import argparse +import logging import os -from bench_fw.benchmark import Benchmark -from bench_fw.benchmark_io import BenchmarkIO -from bench_fw.descriptors import DatasetDescriptor, IndexDescriptor +from faiss.benchs.bench_fw.benchmark import Benchmark +from faiss.benchs.bench_fw.benchmark_io import BenchmarkIO +from faiss.benchs.bench_fw.descriptors import DatasetDescriptor, IndexDescriptorClassic logging.basicConfig(level=logging.INFO) + def ssnpp(bio): benchmark = Benchmark( num_threads=32, training_vectors=DatasetDescriptor( - tablename="ssnpp_training_5M.npy", + tablename="training.npy", ), database_vectors=DatasetDescriptor( - tablename="ssnpp_xb_range_filtered_119201.npy", + tablename="database.npy", ), - query_vectors=DatasetDescriptor(tablename="ssnpp_xq_range_filtered_33615.npy"), + query_vectors=DatasetDescriptor(tablename="query.npy"), index_descs=[ - IndexDescriptor( + IndexDescriptorClassic( factory="Flat", range_metrics={ "weighted": [ @@ -56,7 +57,7 @@ def ssnpp(bio): ] }, ), - IndexDescriptor( + IndexDescriptorClassic( factory="IVF262144(PQ256x4fs),PQ32", ), ], @@ -67,6 +68,7 @@ def ssnpp(bio): benchmark.set_io(bio) benchmark.benchmark("result.json", local=False, train=True, reconstruct=False, knn=False, range=True) + if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument('experiment')