From ed74f1e9cd4a528c6b375d299d54e45765cd0a05 Mon Sep 17 00:00:00 2001 From: mathislucka Date: Fri, 21 Jan 2022 18:06:13 +0100 Subject: [PATCH 01/10] ranker should return scores for later usage --- haystack/nodes/ranker/sentence_transformers.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/haystack/nodes/ranker/sentence_transformers.py b/haystack/nodes/ranker/sentence_transformers.py index 7593d440e8..15af237c7f 100644 --- a/haystack/nodes/ranker/sentence_transformers.py +++ b/haystack/nodes/ranker/sentence_transformers.py @@ -72,6 +72,10 @@ def __init__( revision=model_version) self.transformer_model.eval() + # activation functions to normalize scores after prediction + self.single_label_activation = torch.nn.Sigmoid() + self.multi_label_activation = torch.nn.Identity() + if len(self.devices) > 1: self.model = DataParallel(self.transformer_model, device_ids=self.devices) @@ -119,6 +123,14 @@ def predict(self, query: str, documents: List[Document], top_k: Optional[int] = similarity_document_tuple[0][-1] if logits_dim >= 2 else similarity_document_tuple[0], reverse=True) - # rank documents according to scores - sorted_documents = [doc for _, doc in sorted_scores_and_documents] - return sorted_documents[:top_k] + # add normalized scores to documents + sorted_documents = [] + for doc, raw_score in sorted_scores_and_documents[:top_k]: + if logits_dim >= 2: + score = self.multi_label_activation(raw_score)[-1] + else: + score = self.single_label_activation(raw_score)[0] + doc.score = score.detach().cpu().numpy().tolist() + sorted_documents.append(doc) + + return sorted_documents From a54c55cca9d0ba6aaf017b8767d107680072e582 Mon Sep 17 00:00:00 2001 From: mathislucka Date: Fri, 21 Jan 2022 19:05:30 +0100 Subject: [PATCH 02/10] fix wrong tuple order --- haystack/nodes/ranker/sentence_transformers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haystack/nodes/ranker/sentence_transformers.py b/haystack/nodes/ranker/sentence_transformers.py index 15af237c7f..dc7d614343 100644 --- a/haystack/nodes/ranker/sentence_transformers.py +++ b/haystack/nodes/ranker/sentence_transformers.py @@ -125,7 +125,7 @@ def predict(self, query: str, documents: List[Document], top_k: Optional[int] = # add normalized scores to documents sorted_documents = [] - for doc, raw_score in sorted_scores_and_documents[:top_k]: + for raw_score, doc in sorted_scores_and_documents[:top_k]: if logits_dim >= 2: score = self.multi_label_activation(raw_score)[-1] else: From 3cfcfca95b3b68b0880dad3188e13f44870727b7 Mon Sep 17 00:00:00 2001 From: mathislucka Date: Thu, 23 Jun 2022 07:15:53 +0200 Subject: [PATCH 03/10] adjust ranker scores; add tests --- .../nodes/ranker/sentence_transformers.py | 39 ++++++++++---- test/nodes/test_ranker.py | 52 +++++++++++++++++++ 2 files changed, 81 insertions(+), 10 deletions(-) diff --git a/haystack/nodes/ranker/sentence_transformers.py b/haystack/nodes/ranker/sentence_transformers.py index 133829716f..2fed690adb 100644 --- a/haystack/nodes/ranker/sentence_transformers.py +++ b/haystack/nodes/ranker/sentence_transformers.py @@ -44,6 +44,7 @@ def __init__( use_gpu: bool = True, devices: Optional[List[Union[str, torch.device]]] = None, batch_size: Optional[int] = None, + scale_score: bool = True ): """ :param model_name_or_path: Directory of a saved model or the name of a public model e.g. @@ -57,6 +58,9 @@ def __init__( https://pytorch.org/docs/stable/tensor_attributes.html?highlight=torch%20device#torch.torch.device (e.g. ["cuda:0"]). :param batch_size: Number of documents to process at a time. + :param scale_score: The raw predictions will be transformed using a Sigmoid activation function in case the model + only predicts a single label. For multi-label predictions, no scaling is applied. Set this + to False if you do not want any scaling of the raw predictions. """ super().__init__() @@ -76,9 +80,13 @@ def __init__( ) self.transformer_model.eval() - # activation functions to normalize scores after prediction - self.single_label_activation = torch.nn.Sigmoid() - self.multi_label_activation = torch.nn.Identity() + # we use sigmoid activation function to scale the score in case there is only a single label + # we do not apply any scaling when scale_score is set to False + num_labels = self.transformer_model.num_labels + if num_labels == 1 and scale_score: + self.activation_function == torch.nn.Sigmoid() + else: + self.activation_function == torch.nn.Identity() if len(self.devices) > 1: self.model = DataParallel(self.transformer_model, device_ids=self.devices) @@ -124,12 +132,18 @@ def predict(self, query: str, documents: List[Document], top_k: Optional[int] = ) # add normalized scores to documents + sorted_documents = self._add_scores_to_documents(sorted_scores_and_documents[:top_k], logits_dim) + + return sorted_documents + + def _add_scores_to_documents(self, sorted_scores_and_documents, logits_dim): sorted_documents = [] - for raw_score, doc in sorted_scores_and_documents[:top_k]: + for raw_score, doc in sorted_scores_and_documents: if logits_dim >= 2: - score = self.multi_label_activation(raw_score)[-1] + score = self.activation_function(raw_score)[-1] else: - score = self.single_label_activation(raw_score)[0] + score = self.activation_function(raw_score)[0] + doc.score = score.detach().cpu().numpy().tolist() sorted_documents.append(doc) @@ -197,9 +211,12 @@ def predict_batch( reverse=True, ) - # rank documents according to scores - sorted_documents = [doc for _, doc in sorted_scores_and_documents if isinstance(doc, Document)] - return sorted_documents[:top_k] + + # is this step needed? + sorted_documents = [(score, doc) for score, doc in sorted_scores_and_documents if isinstance(doc, Document)] + sorted_documents = self._add_scores_to_documents(sorted_documents[:top_k], logits_dim) + + return sorted_documents else: # Group predictions together grouped_predictions = [] @@ -221,7 +238,9 @@ def predict_batch( ) # rank documents according to scores - sorted_documents = [doc for _, doc in sorted_scores_and_documents if isinstance(doc, Document)][:top_k] + sorted_documents = [(score, doc) for score, doc in sorted_scores_and_documents if isinstance(doc, Document)] + sorted_documents = self._add_scores_to_documents(sorted_documents[:top_k], logits_dim) + result.append(sorted_documents) return result diff --git a/test/nodes/test_ranker.py b/test/nodes/test_ranker.py index c4836b3bec..3784a282c3 100644 --- a/test/nodes/test_ranker.py +++ b/test/nodes/test_ranker.py @@ -1,4 +1,5 @@ import pytest +import math from haystack.errors import HaystackError from haystack.schema import Document @@ -173,3 +174,54 @@ def test_ranker_two_logits(ranker_two_logits): ] results = ranker_two_logits.predict(query=query, documents=docs) assert results[0] == docs[4] + + +def test_ranker_returns_normalized_score(ranker): + query = "What is the most important building in King's Landing that has a religious background?" + + docs = [ + Document( + content="""Aaron Aaron ( or ; ""Ahärôn"") is a prophet, high priest, and the brother of Moses in the Abrahamic religions. Knowledge of Aaron, along with his brother Moses, comes exclusively from religious texts, such as the Bible and Quran. The Hebrew Bible relates that, unlike Moses, who grew up in the Egyptian royal court, Aaron and his elder sister Miriam remained with their kinsmen in the eastern border-land of Egypt (Goshen). When Moses first confronted the Egyptian king about the Israelites, Aaron served as his brother's spokesman (""prophet"") to the Pharaoh. Part of the Law (Torah) that Moses received from""", + meta={"name": "0"}, + id="1", + ), + ] + + results = ranker.predict(query=query, documents=docs) + score = results[0].score + precomputed_score = 5.8601767e-05 + assert math.isclose(precomputed_score, score) + + +def test_ranker_returns_raw_score_when_no_scaling(): + ranker = SentenceTransformersRanker(model_name_or_path="cross-encoder/ms-marco-MiniLM-L-12-v2", scale_score=False) + query = "What is the most important building in King's Landing that has a religious background?" + + docs = [ + Document( + content="""Aaron Aaron ( or ; ""Ahärôn"") is a prophet, high priest, and the brother of Moses in the Abrahamic religions. Knowledge of Aaron, along with his brother Moses, comes exclusively from religious texts, such as the Bible and Quran. The Hebrew Bible relates that, unlike Moses, who grew up in the Egyptian royal court, Aaron and his elder sister Miriam remained with their kinsmen in the eastern border-land of Egypt (Goshen). When Moses first confronted the Egyptian king about the Israelites, Aaron served as his brother's spokesman (""prophet"") to the Pharaoh. Part of the Law (Torah) that Moses received from""", + meta={"name": "0"}, + id="1", + ), + ] + + results = ranker.predict(query=query, documents=docs) + score = results[0].score + precomputed_score = -9.744687 + assert math.isclose(precomputed_score, score) + + +def test_ranker_returns_raw_score_for_two_logits(ranker_two_logits): + query = "Welches ist das wichtigste Gebäude in Königsmund, das einen religiösen Hintergrund hat?" + docs = [ + Document( + content="""Aaron Aaron (oder ; "Ahärôn") ist ein Prophet, Hohepriester und der Bruder von Moses in den abrahamitischen Religionen. Aaron ist ebenso wie sein Bruder Moses ausschließlich aus religiösen Texten wie der Bibel und dem Koran bekannt. Die hebräische Bibel berichtet, dass Aaron und seine ältere Schwester Mirjam im Gegensatz zu Mose, der am ägyptischen Königshof aufwuchs, bei ihren Verwandten im östlichen Grenzland Ägyptens (Goschen) blieben. Als Mose den ägyptischen König zum ersten Mal mit den Israeliten konfrontierte, fungierte Aaron als Sprecher ("Prophet") seines Bruders gegenüber dem Pharao. Ein Teil des Gesetzes (Tora), das Mose von""", + meta={"name": "0"}, + id="1", + ), + ] + + results = ranker_two_logits.predict(query=query, documents=docs) + score = results[0].score + precomputed_score = -3.61354 + assert math.isclose(precomputed_score, score) From 0402bf3db88d5024104f847d6adba0638dcd6327 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 23 Jun 2022 05:21:09 +0000 Subject: [PATCH 04/10] Update Documentation & Code Style --- docs/_src/api/api/ranker.md | 5 +- docs/_src/tutorials/tutorials/14.md | 350 ++++++++---------- .../haystack-pipeline-master.schema.json | 5 + .../nodes/ranker/sentence_transformers.py | 7 +- test/nodes/test_ranker.py | 6 +- 5 files changed, 175 insertions(+), 198 deletions(-) diff --git a/docs/_src/api/api/ranker.md b/docs/_src/api/api/ranker.md index 39253dbb9f..b5efcbe2dd 100644 --- a/docs/_src/api/api/ranker.md +++ b/docs/_src/api/api/ranker.md @@ -92,7 +92,7 @@ p.add_node(component=ranker, name="Ranker", inputs=["ESRetriever"]) #### SentenceTransformersRanker.\_\_init\_\_ ```python -def __init__(model_name_or_path: Union[str, Path], model_version: Optional[str] = None, top_k: int = 10, use_gpu: bool = True, devices: Optional[List[Union[str, torch.device]]] = None, batch_size: Optional[int] = None) +def __init__(model_name_or_path: Union[str, Path], model_version: Optional[str] = None, top_k: int = 10, use_gpu: bool = True, devices: Optional[List[Union[str, torch.device]]] = None, batch_size: Optional[int] = None, scale_score: bool = True) ``` **Arguments**: @@ -108,6 +108,9 @@ The strings will be converted into pytorch devices, so use the string notation d https://pytorch.org/docs/stable/tensor_attributes.html?highlight=torch%20device#torch.torch.device (e.g. ["cuda:0"]). - `batch_size`: Number of documents to process at a time. +- `scale_score`: The raw predictions will be transformed using a Sigmoid activation function in case the model +only predicts a single label. For multi-label predictions, no scaling is applied. Set this +to False if you do not want any scaling of the raw predictions. diff --git a/docs/_src/tutorials/tutorials/14.md b/docs/_src/tutorials/tutorials/14.md index fcdd7cdfa3..34a19b37d4 100644 --- a/docs/_src/tutorials/tutorials/14.md +++ b/docs/_src/tutorials/tutorials/14.md @@ -10,39 +10,29 @@ id: "tutorial14md" # Query Classifier Tutorial [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deepset-ai/haystack/blob/master/tutorials/Tutorial14_Query_Classifier.ipynb) -In this tutorial we introduce the query classifier the goal of introducing this feature was to optimize the overall flow of Haystack pipeline by detecting the nature of user queries. Now, the Haystack can detect primarily three types of queries using both light-weight SKLearn Gradient Boosted classifier or Transformer based more robust classifier. The three categories of queries are as follows: +One of the great benefits of using state-of-the-art NLP models like those available in Haystack is that it allows users to state their queries as *plain natural language questions*: rather than trying to come up with just the right set of keywords to find the answer to their question, users can simply ask their question in much the same way that they would ask it of a (very knowledgeable!) person. +But just because users *can* ask their questions in "plain English" (or "plain German", etc.), that doesn't mean they always *will*. For instance, a user might input a few keywords rather than a complete question because they don't understand the pipeline's full capabilities, or because they are so accustomed to keyword search. While a standard Haystack pipeline might handle such queries with reasonable accuracy, for a variety of reasons we still might prefer that our pipeline be sensitive to the type of query it is receiving, so that it behaves differently when a user inputs, say, a collection of keywords instead of a question. -### 1. Keyword Queries: -Such queries don't have semantic meaning and merely consist of keywords. For instance these three are the examples of keyword queries. +For this reason, Haystack comes with built-in capabilities to distinguish between three types of queries: **keyword queries**, **interrogative queries**, and **statement queries**, described below. -* arya stark father -* jon snow country -* arya stark younger brothers +1. **Keyword queries** can be thought of more or less as lists of words, such as "Alaska cruises summer". While the meanings of individual words may matter in a keyword query, the linguistic connections *between* words do not. Hence, in a keyword query the order of words is largely irrelevant: "Alaska cruises summer", "summer Alaska cruises", and "summer cruises Alaska" are functionally the same. -### 2. Interrogative Queries: -In such queries users usually ask a question, regardless of presence of "?" in the query the goal here is to detect the intent of the user whether any question is asked or not in the query. For example: +2. **Interrogative queries** (or **question queries**) are queries phrased as natural language questions, such as "Who was the father of Eddard Stark?". Unlike with keyword queries, word order very much matters here: "Who was the father of Eddard Stark?" and "Who was Eddard Stark the father of?" are very different questions, despite having exactly the same words. (Note that while we often write questions with question marks, Haystack can find interrogative queries without such a dead giveaway!) -* who is the father of arya stark ? -* which country was jon snow filmed ? -* who are the younger brothers of arya stark ? +3. **Statement queries** are just declarative sentences, such as "Daenerys loved Jon". These are like interrogative queries in that word order matters—again, "Daenerys loved Jon" and "Jon loved Daenerys" mean very different things—but they are statements instead of questions. -### 3. Declarative Queries: -Such queries are variation of keyword queries, however, there is semantic relationship between words. Fo example: +In this tutorial you will learn how to use **query classifiers** to branch your Haystack pipeline based on the type of query it receives. Haystack comes with two out-of-the-box query classification schemas, each of which routes a given query into one of two branches: -* Arya stark was a daughter of a lord. -* Jon snow was filmed in a country in UK. -* Bran was brother of a princess. +1. **Keyword vs. Question/Statement** — routes a query into one of two branches depending on whether it is a full question/statement or a collection of keywords. -In this tutorial, you will learn how the `TransformersQueryClassifier` and `SklearnQueryClassifier` classes can be used to intelligently route your queries, based on the nature of the user query. Also, you can choose between a lightweight Gradients boosted classifier or a transformer based classifier. +2. **Question vs. Statement** — routes a natural language query into one of two branches depending on whether it is a question or a statement. -Furthermore, there are two types of classifiers you can use out of the box from Haystack. -1. Keyword vs Statement/Question Query Classifier -2. Statement vs Question Query Classifier +Furthermore, for each classification schema there are two types of nodes capable of performing this classification: a **`TransformersQueryClassifier`** that uses a transformer model, and an **`SklearnQueryClassifier`** that uses a more lightweight model built in `sklearn`. -As evident from the name the first classifier detects the keywords search queries and semantic statements like sentences/questions. The second classifier differentiates between question based queries and declarative sentences. +With all of that explanation out of the way, let's dive in! -### Prepare environment +### Prepare the Environment #### Colab: Enable the GPU runtime Make sure you enable the GPU runtime to experience decent speed in this tutorial. @@ -50,21 +40,105 @@ Make sure you enable the GPU runtime to experience decent speed in this tutorial -These lines are to install Haystack through pip +Next we make sure the latest version of Haystack is installed: ```python # Install the latest release of Haystack in your own environment #! pip install farm-haystack -# Install the latest master of Haystack +# Install the latest master of Haystack (Colab) !pip install --upgrade pip !pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab] -# Install pygraphviz +# Install these to allow pipeline visualization !apt install libgraphviz-dev !pip install pygraphviz +``` + +### Trying Some Query Classifiers on their Own + +Before integrating query classifiers into our pipelines, let's test them out on their own and see what they actually do. First we initiate a simple, out-of-the-box **keyword vs. question/statement** `SklearnQueryClassifier`: + + +```python +# Here we create the keyword vs question/statement query classifier +from haystack.nodes import SklearnQueryClassifier + +keyword_classifier = SklearnQueryClassifier() +``` + +Now let's feed some queries into this query classifier. We'll test with one keyword query, one interrogative query, and one statement query. Notice that we don't use any punctuation, such as question marks; this illustrates that the classifier doesn't need punctuation in order to make the right decision. + + +```python +queries = [ + "Arya Stark father", # Keyword Query + "Who was the father of Arya Stark", # Interrogative Query + "Lord Eddard was the father of Arya Stark", # Statement Query +] +``` + +We can see below what our classifier does with these queries: "Arya Stark father" is rightly determined to be a keyword query and is sent to branch 2, while both the interrogative query "Who was the father of Arya Stark" and the statement query "Lord Eddard was the father of Arya Stark" are correctly labeled as non-keyword queries, and are thus shipped off to branch 1. + + +```python +import pandas as pd + +k_vs_qs_results = {"Query": [], "Output Branch": [], "Class": []} + +for query in queries: + result = keyword_classifier.run(query=query) + k_vs_qs_results["Query"].append(query) + k_vs_qs_results["Output Branch"].append(result[1]) + k_vs_qs_results["Class"].append("Question/Statement" if result[1] == "output_1" else "Keyword") + +pd.DataFrame.from_dict(k_vs_qs_results) +``` + +Next we will illustrate a **question vs. statement** `SklearnQueryClassifier`. We define our classifier below; notice that this time we have to explicitly specify the model and vectorizer, since the default for an `SklearnQueryClassifier` (and a `TransformersQueryClassifier`) is keyword vs. question/statement classification. + + +```python +# Here we create the question vs statement query classifier +model_url = ( + "https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier_statements/model.pickle" +) +vectorizer_url = ( + "https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier_statements/vectorizer.pickle" +) + +question_classifier = SklearnQueryClassifier(model_name_or_path=model_url, vectorizer_name_or_path=vectorizer_url) +``` +We will test this classifier on the two question/statement queries from the last go-round: + + +```python +queries = [ + "Who was the father of Arya Stark", # Interrogative Query + "Lord Eddard was the father of Arya Stark", # Statement Query +] + +q_vs_s_results = {"Query": [], "Output Branch": [], "Class": []} + +for query in queries: + result = question_classifier.run(query=query) + q_vs_s_results["Query"].append(query) + q_vs_s_results["Output Branch"].append(result[1]) + q_vs_s_results["Class"].append("Question" if result[1] == "output_1" else "Statement") + +pd.DataFrame.from_dict(q_vs_s_results) +``` + +And as we see, the question "Who was the father of Arya Stark" is sent to branch 1, while the statement "Lord Eddard was the father of Arya Stark" is sent to branch 2, so we can have our pipeline treat statements and questions differently. + +### Using Query Classifiers in a Pipeline + +Now let's see how we can use query classifiers in a question-answering (QA) pipeline. We start by initiating Elasticsearch: + + +```python # In Colab / No Docker environments: Start Elasticsearch from source ! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q ! tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz @@ -80,14 +154,7 @@ es_server = Popen( ! sleep 30 ``` -If running from Colab or a no Docker environment, you will want to start Elasticsearch from source - -## Initialization - -Here are some core imports - -Then let's fetch some data (in this case, pages from the Game of Thrones wiki) and prepare it so that it can -be used indexed into our `DocumentStore` +Next we fetch some data—for our example we'll use pages from the Game of Thrones wiki—and index it in our `DocumentStore`: ```python @@ -101,13 +168,7 @@ from haystack.utils import ( ) from haystack.pipelines import Pipeline from haystack.document_stores import ElasticsearchDocumentStore -from haystack.nodes import ( - BM25Retriever, - EmbeddingRetriever, - FARMReader, - TransformersQueryClassifier, - SklearnQueryClassifier, -) +from haystack.nodes import BM25Retriever, EmbeddingRetriever, FARMReader, TransformersQueryClassifier # Download and prepare data - 517 Wikipedia articles for Game of Thrones doc_dir = "data/tutorial14" @@ -118,15 +179,24 @@ fetch_archive_from_http(url=s3_url, output_dir=doc_dir) got_docs = convert_files_to_docs(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True) # Initialize DocumentStore and index documents -launch_es() +# launch_es() # Uncomment this line for local Elasticsearch document_store = ElasticsearchDocumentStore() document_store.delete_documents() document_store.write_documents(got_docs) +``` + +#### Pipelines with Keyword vs. Question/Statement Classification -# Initialize Sparse retriever +Our first illustration will be a simple retriever-reader QA pipeline, but the choice of which retriever we use will depend on the type of query received: **keyword** queries will use a sparse **`BM25Retriever`**, while **question/statement** queries will use the more accurate but also more computationally expensive **`EmbeddingRetriever`**. + +We start by initializing our retrievers and reader: + + +```python +# Initialize sparse retriever for keyword queries bm25_retriever = BM25Retriever(document_store=document_store) -# Initialize dense retriever +# Initialize dense retriever for question/statement queries embedding_retriever = EmbeddingRetriever( document_store=document_store, embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1" ) @@ -135,19 +205,7 @@ document_store.update_embeddings(embedding_retriever, update_existing_embeddings reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2") ``` -## Keyword vs Question/Statement Classifier - -The keyword vs question/statement query classifier essentially distinguishes between the keyword queries and statements/questions. So you can intelligently route to different retrieval nodes based on the nature of the query. Using this classifier can potentially yield the following benefits: - -* Getting better search results (e.g. by routing only proper questions to DPR / QA branches and not keyword queries) -* Less GPU costs (e.g. if 50% of your traffic is only keyword queries you could just use elastic here and save the GPU resources for the other 50% of traffic with semantic queries) - -![image]() - - -Below, we define a `SklearnQueryClassifier` and show how to use it: - -Read more about the trained model and dataset used [here](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier/readme.txt) +Now we define our pipeline. As promised, the question/statement branch `output_1` from the query classifier is fed into an `EmbeddingRetriever`, while the keyword branch `output_2` from the same classifier is fed into a `BM25Retriever`. Both of these retrievers are then fed into our reader. Our pipeline can thus be thought of as having something of a diamond shape: all queries are sent into the classifier, which splits those queries into two different retrievers, and those retrievers feed their outputs to the same reader. ```python @@ -157,57 +215,33 @@ sklearn_keyword_classifier.add_node(component=SklearnQueryClassifier(), name="Qu sklearn_keyword_classifier.add_node( component=embedding_retriever, name="EmbeddingRetriever", inputs=["QueryClassifier.output_1"] ) -sklearn_keyword_classifier.add_node(component=bm25_retriever, name="ESRetriever", inputs=["QueryClassifier.output_2"]) -sklearn_keyword_classifier.add_node(component=reader, name="QAReader", inputs=["ESRetriever", "EmbeddingRetriever"]) -sklearn_keyword_classifier.draw("pipeline_classifier.png") +sklearn_keyword_classifier.add_node(component=bm25_retriever, name="BM25Retriever", inputs=["QueryClassifier.output_2"]) +sklearn_keyword_classifier.add_node(component=reader, name="QAReader", inputs=["BM25Retriever", "EmbeddingRetriever"]) + +# Visualization of the pipeline +sklearn_keyword_classifier.draw("sklearn_keyword_classifier.png") ``` +Below we can see some results from this choice in branching structure: the keyword query "arya stark father" and the question query "Who is the father of Arya Stark?" generate noticeably different results, a distinction that is likely due to the use of different retrievers for keyword vs. question/statement queries. + ```python +# Useful for framing headers +equal_line = "=" * 30 + # Run only the dense retriever on the full sentence query res_1 = sklearn_keyword_classifier.run(query="Who is the father of Arya Stark?") -print("Embedding Retriever Results" + "\n" + "=" * 15) +print(f"\n\n{equal_line}\nQUESTION QUERY RESULTS\n{equal_line}") print_answers(res_1, details="minimum") +print("\n\n") # Run only the sparse retriever on a keyword based query res_2 = sklearn_keyword_classifier.run(query="arya stark father") -print("ES Results" + "\n" + "=" * 15) +print(f"\n\n{equal_line}\nKEYWORD QUERY RESULTS\n{equal_line}") print_answers(res_2, details="minimum") ``` - -```python -# Run only the dense retriever on the full sentence query -res_3 = sklearn_keyword_classifier.run(query="which country was jon snow filmed ?") -print("Embedding Retriever Results" + "\n" + "=" * 15) -print_answers(res_3, details="minimum") - -# Run only the sparse retriever on a keyword based query -res_4 = sklearn_keyword_classifier.run(query="jon snow country") -print("ES Results" + "\n" + "=" * 15) -print_answers(res_4, details="minimum") -``` - - -```python -# Run only the dense retriever on the full sentence query -res_5 = sklearn_keyword_classifier.run(query="who are the younger brothers of arya stark ?") -print("Embedding Retriever Results" + "\n" + "=" * 15) -print_answers(res_5, details="minimum") - -# Run only the sparse retriever on a keyword based query -res_6 = sklearn_keyword_classifier.run(query="arya stark younger brothers") -print("ES Results" + "\n" + "=" * 15) -print_answers(res_6, details="minimum") -``` - -## Transformer Keyword vs Question/Statement Classifier - -Firstly, it's essential to understand the trade-offs between SkLearn and Transformer query classifiers. The transformer classifier is more accurate than SkLearn classifier however, it requires more memory and most probably GPU for faster inference however the transformer size is roughly `50 MBs`. Whereas, SkLearn is less accurate however is much more faster and doesn't require GPU for inference. - -Below, we define a `TransformersQueryClassifier` and show how to use it: - -Read more about the trained model and dataset used [here](https://huggingface.co/shahrukhx01/bert-mini-finetune-question-detection) +The above example uses an `SklearnQueryClassifier`, but of course we can do precisely the same thing with a `TransformersQueryClassifier`. This is illustrated below, where we have constructed the same diamond-shaped pipeline. ```python @@ -220,61 +254,40 @@ transformer_keyword_classifier.add_node( component=embedding_retriever, name="EmbeddingRetriever", inputs=["QueryClassifier.output_1"] ) transformer_keyword_classifier.add_node( - component=bm25_retriever, name="ESRetriever", inputs=["QueryClassifier.output_2"] + component=bm25_retriever, name="BM25Retriever", inputs=["QueryClassifier.output_2"] +) +transformer_keyword_classifier.add_node( + component=reader, name="QAReader", inputs=["BM25Retriever", "EmbeddingRetriever"] ) -transformer_keyword_classifier.add_node(component=reader, name="QAReader", inputs=["ESRetriever", "EmbeddingRetriever"]) -transformer_keyword_classifier.draw("pipeline_classifier.png") -``` -```python +# Useful for framing headers +equal_line = "=" * 30 + # Run only the dense retriever on the full sentence query res_1 = transformer_keyword_classifier.run(query="Who is the father of Arya Stark?") -print("Embedding Retriever Results" + "\n" + "=" * 15) +print(f"\n\n{equal_line}\nQUESTION QUERY RESULTS\n{equal_line}") print_answers(res_1, details="minimum") +print("\n\n") # Run only the sparse retriever on a keyword based query res_2 = transformer_keyword_classifier.run(query="arya stark father") -print("ES Results" + "\n" + "=" * 15) +print(f"\n\n{equal_line}\nKEYWORD QUERY RESULTS\n{equal_line}") print_answers(res_2, details="minimum") ``` +#### Pipeline with Question vs. Statement Classification -```python -# Run only the dense retriever on the full sentence query -res_3 = transformer_keyword_classifier.run(query="which country was jon snow filmed ?") -print("Embedding Retriever Results" + "\n" + "=" * 15) -print_answers(res_3, details="minimum") - -# Run only the sparse retriever on a keyword based query -res_4 = transformer_keyword_classifier.run(query="jon snow country") -print("ES Results" + "\n" + "=" * 15) -print_answers(res_4, details="minimum") -``` - - -```python -# Run only the dense retriever on the full sentence query -res_5 = transformer_keyword_classifier.run(query="who are the younger brothers of arya stark ?") -print("Embedding Retriever Results" + "\n" + "=" * 15) -print_answers(res_5, details="minimum") - -# Run only the sparse retriever on a keyword based query -res_6 = transformer_keyword_classifier.run(query="arya stark younger brothers") -print("ES Results" + "\n" + "=" * 15) -print_answers(res_6, details="minimum") -``` +Above we saw a potential use for keyword vs. question/statement classification: we might choose to use a less resource-intensive retriever for keyword queries than for question/statement queries. But what about question vs. statement classification? -## Question vs Statement Classifier +To illustrate one potential use for question vs. statement classification, we will build a pipeline that looks as follows: -One possible use case of this classifier could be to route queries after the document retrieval to only send questions to QA reader and in case of declarative sentence, just return the DPR/ES results back to user to enhance user experience and only show answers when user explicitly asks it. +1. The pipeline will start with a retriever that **every query** will go through. +2. The pipeline will end with a reader that **only question queries** will go through. -![image]() +In other words, our pipeline will be a **retriever-only pipeline for statement queries**—given the statement "Arya Stark was the daughter of a Lord", all we will get back are the most relevant documents—but it will be a **retriever-reader pipeline for question queries**. - -Below, we define a `TransformersQueryClassifier` and show how to use it: - -Read more about the trained model and dataset used [here](https://huggingface.co/shahrukhx01/question-vs-statement-classifier) +To make things more concrete, our pipeline will start with a retriever, which is then fed into a `TransformersQueryClassifier` that is set to do question vs. statement classification. Note that this means we need to explicitly choose the model, since as mentioned previously a default `TransformersQueryClassifier` performs keyword vs. question/statement classification. The classifier's first branch, which handles question queries, will then be sent to the reader, while the second branch will not be connected to any other nodes. As a result, the last node of the pipeline depends on the type of query: questions go all the way through the reader, while statements only go through the retriever. This pipeline is illustrated below: ```python @@ -287,75 +300,30 @@ transformer_question_classifier.add_node( inputs=["EmbeddingRetriever"], ) transformer_question_classifier.add_node(component=reader, name="QAReader", inputs=["QueryClassifier.output_1"]) -transformer_question_classifier.draw("question_classifier.png") - -# Run only the QA reader on the question query -res_1 = transformer_question_classifier.run(query="Who is the father of Arya Stark?") -print("Embedding Retriever Results" + "\n" + "=" * 15) -print_answers(res_1, details="minimum") -res_2 = transformer_question_classifier.run(query="Arya Stark was the daughter of a Lord.") -print("ES Results" + "\n" + "=" * 15) -print_documents(res_2) +# Visualization of the pipeline +transformer_question_classifier.draw("transformer_question_classifier.png") ``` -## Standalone Query Classifier -Below we run queries classifiers standalone to better understand their outputs on each of the three types of queries - - -```python -# Here we create the keyword vs question/statement query classifier -from haystack.nodes import TransformersQueryClassifier - -queries = [ - "arya stark father", - "jon snow country", - "who is the father of arya stark", - "which country was jon snow filmed?", -] - -keyword_classifier = TransformersQueryClassifier() - -for query in queries: - result = keyword_classifier.run(query=query) - if result[1] == "output_1": - category = "question/statement" - else: - category = "keyword" - - print(f"Query: {query}, raw_output: {result}, class: {category}") -``` +And below we see the results of this pipeline: with a question query like "Who is the father of Arya Stark?" we get back answers returned by a reader, but with a statement query like "Arya Stark was the daughter of a Lord" we just get back documents returned by a retriever. ```python -# Here we create the question vs statement query classifier -from haystack.nodes import TransformersQueryClassifier +# Useful for framing headers +equal_line = "=" * 30 -queries = [ - "Lord Eddard was the father of Arya Stark.", - "Jon Snow was filmed in United Kingdom.", - "who is the father of arya stark?", - "Which country was jon snow filmed in?", -] - -question_classifier = TransformersQueryClassifier(model_name_or_path="shahrukhx01/question-vs-statement-classifier") - -for query in queries: - result = question_classifier.run(query=query) - if result[1] == "output_1": - category = "question" - else: - category = "statement" +# Run the retriever + reader on the question query +res_1 = transformer_question_classifier.run(query="Who is the father of Arya Stark?") +print(f"\n\n{equal_line}\nQUESTION QUERY RESULTS\n{equal_line}") +print_answers(res_1, details="minimum") +print("\n\n") - print(f"Query: {query}, raw_output: {result}, class: {category}") +# Run only the retriever on the statement query +res_2 = transformer_question_classifier.run(query="Arya Stark was the daughter of a Lord.") +print(f"\n\n{equal_line}\nSTATEMENT QUERY RESULTS\n{equal_line}") +print_documents(res_2) ``` -## Conclusion - -The query classifier gives you more possibility to be more creative with the pipelines and use different retrieval nodes in a flexible fashion. Moreover, as in the case of Question vs Statement classifier you can also choose the queries which you want to send to the reader. - -Finally, you also have the possible of bringing your own classifier and plugging it into either `TransformersQueryClassifier(model_name_or_path="")` or using the `SklearnQueryClassifier(model_name_or_path="url_to_classifier_or_file_path_as_pickle", vectorizer_name_or_path="url_to_vectorizer_or_file_path_as_pickle")` - ## About us This [Haystack](https://github.com/deepset-ai/haystack/) notebook was made with love by [deepset](https://deepset.ai/) in Berlin, Germany diff --git a/haystack/json-schemas/haystack-pipeline-master.schema.json b/haystack/json-schemas/haystack-pipeline-master.schema.json index ff55116564..866b310490 100644 --- a/haystack/json-schemas/haystack-pipeline-master.schema.json +++ b/haystack/json-schemas/haystack-pipeline-master.schema.json @@ -3775,6 +3775,11 @@ "batch_size": { "title": "Batch Size", "type": "integer" + }, + "scale_score": { + "title": "Scale Score", + "default": true, + "type": "boolean" } }, "required": [ diff --git a/haystack/nodes/ranker/sentence_transformers.py b/haystack/nodes/ranker/sentence_transformers.py index 2fed690adb..69319bc91d 100644 --- a/haystack/nodes/ranker/sentence_transformers.py +++ b/haystack/nodes/ranker/sentence_transformers.py @@ -44,7 +44,7 @@ def __init__( use_gpu: bool = True, devices: Optional[List[Union[str, torch.device]]] = None, batch_size: Optional[int] = None, - scale_score: bool = True + scale_score: bool = True, ): """ :param model_name_or_path: Directory of a saved model or the name of a public model e.g. @@ -211,7 +211,6 @@ def predict_batch( reverse=True, ) - # is this step needed? sorted_documents = [(score, doc) for score, doc in sorted_scores_and_documents if isinstance(doc, Document)] sorted_documents = self._add_scores_to_documents(sorted_documents[:top_k], logits_dim) @@ -238,7 +237,9 @@ def predict_batch( ) # rank documents according to scores - sorted_documents = [(score, doc) for score, doc in sorted_scores_and_documents if isinstance(doc, Document)] + sorted_documents = [ + (score, doc) for score, doc in sorted_scores_and_documents if isinstance(doc, Document) + ] sorted_documents = self._add_scores_to_documents(sorted_documents[:top_k], logits_dim) result.append(sorted_documents) diff --git a/test/nodes/test_ranker.py b/test/nodes/test_ranker.py index 3784a282c3..8531a1bebb 100644 --- a/test/nodes/test_ranker.py +++ b/test/nodes/test_ranker.py @@ -184,7 +184,7 @@ def test_ranker_returns_normalized_score(ranker): content="""Aaron Aaron ( or ; ""Ahärôn"") is a prophet, high priest, and the brother of Moses in the Abrahamic religions. Knowledge of Aaron, along with his brother Moses, comes exclusively from religious texts, such as the Bible and Quran. The Hebrew Bible relates that, unlike Moses, who grew up in the Egyptian royal court, Aaron and his elder sister Miriam remained with their kinsmen in the eastern border-land of Egypt (Goshen). When Moses first confronted the Egyptian king about the Israelites, Aaron served as his brother's spokesman (""prophet"") to the Pharaoh. Part of the Law (Torah) that Moses received from""", meta={"name": "0"}, id="1", - ), + ) ] results = ranker.predict(query=query, documents=docs) @@ -202,7 +202,7 @@ def test_ranker_returns_raw_score_when_no_scaling(): content="""Aaron Aaron ( or ; ""Ahärôn"") is a prophet, high priest, and the brother of Moses in the Abrahamic religions. Knowledge of Aaron, along with his brother Moses, comes exclusively from religious texts, such as the Bible and Quran. The Hebrew Bible relates that, unlike Moses, who grew up in the Egyptian royal court, Aaron and his elder sister Miriam remained with their kinsmen in the eastern border-land of Egypt (Goshen). When Moses first confronted the Egyptian king about the Israelites, Aaron served as his brother's spokesman (""prophet"") to the Pharaoh. Part of the Law (Torah) that Moses received from""", meta={"name": "0"}, id="1", - ), + ) ] results = ranker.predict(query=query, documents=docs) @@ -218,7 +218,7 @@ def test_ranker_returns_raw_score_for_two_logits(ranker_two_logits): content="""Aaron Aaron (oder ; "Ahärôn") ist ein Prophet, Hohepriester und der Bruder von Moses in den abrahamitischen Religionen. Aaron ist ebenso wie sein Bruder Moses ausschließlich aus religiösen Texten wie der Bibel und dem Koran bekannt. Die hebräische Bibel berichtet, dass Aaron und seine ältere Schwester Mirjam im Gegensatz zu Mose, der am ägyptischen Königshof aufwuchs, bei ihren Verwandten im östlichen Grenzland Ägyptens (Goschen) blieben. Als Mose den ägyptischen König zum ersten Mal mit den Israeliten konfrontierte, fungierte Aaron als Sprecher ("Prophet") seines Bruders gegenüber dem Pharao. Ein Teil des Gesetzes (Tora), das Mose von""", meta={"name": "0"}, id="1", - ), + ) ] results = ranker_two_logits.predict(query=query, documents=docs) From c6f363e3979954fb6a6190a5bf76a9d32823090c Mon Sep 17 00:00:00 2001 From: mathislucka Date: Thu, 23 Jun 2022 07:28:51 +0200 Subject: [PATCH 05/10] fix mypy --- haystack/nodes/ranker/sentence_transformers.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/haystack/nodes/ranker/sentence_transformers.py b/haystack/nodes/ranker/sentence_transformers.py index 69319bc91d..63cc6e9c17 100644 --- a/haystack/nodes/ranker/sentence_transformers.py +++ b/haystack/nodes/ranker/sentence_transformers.py @@ -136,7 +136,17 @@ def predict(self, query: str, documents: List[Document], top_k: Optional[int] = return sorted_documents - def _add_scores_to_documents(self, sorted_scores_and_documents, logits_dim): + def _add_scores_to_documents( + self, + sorted_scores_and_documents: List[Tuple], + logits_dim: int + ) -> List[Document]: + """ + Normalize and add scores to retrieved result documents. + + :param sorted_scores_and_documents: List of score, Document Tuples. + :param logits_dim: Dimensionality of the returned scores. + """ sorted_documents = [] for raw_score, doc in sorted_scores_and_documents: if logits_dim >= 2: From 33f5d5cb3acd7785df988864e7e1b99bb67a0cda Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 23 Jun 2022 05:31:33 +0000 Subject: [PATCH 06/10] Update Documentation & Code Style --- haystack/nodes/ranker/sentence_transformers.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/haystack/nodes/ranker/sentence_transformers.py b/haystack/nodes/ranker/sentence_transformers.py index 63cc6e9c17..1e3e488508 100644 --- a/haystack/nodes/ranker/sentence_transformers.py +++ b/haystack/nodes/ranker/sentence_transformers.py @@ -136,11 +136,7 @@ def predict(self, query: str, documents: List[Document], top_k: Optional[int] = return sorted_documents - def _add_scores_to_documents( - self, - sorted_scores_and_documents: List[Tuple], - logits_dim: int - ) -> List[Document]: + def _add_scores_to_documents(self, sorted_scores_and_documents: List[Tuple], logits_dim: int) -> List[Document]: """ Normalize and add scores to retrieved result documents. From a6428effec9701e03f0214dc35e7e7290a444f9a Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Mon, 27 Jun 2022 10:16:49 +0200 Subject: [PATCH 07/10] fix mypy --- haystack/nodes/ranker/sentence_transformers.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/haystack/nodes/ranker/sentence_transformers.py b/haystack/nodes/ranker/sentence_transformers.py index 1e3e488508..8beca169be 100644 --- a/haystack/nodes/ranker/sentence_transformers.py +++ b/haystack/nodes/ranker/sentence_transformers.py @@ -1,4 +1,4 @@ -from typing import List, Optional, Union, Tuple, Iterator +from typing import List, Optional, Union, Tuple, Iterator, Any import logging from pathlib import Path @@ -83,10 +83,11 @@ def __init__( # we use sigmoid activation function to scale the score in case there is only a single label # we do not apply any scaling when scale_score is set to False num_labels = self.transformer_model.num_labels + self.activation_function: torch.nn.Module if num_labels == 1 and scale_score: - self.activation_function == torch.nn.Sigmoid() + self.activation_function = torch.nn.Sigmoid() else: - self.activation_function == torch.nn.Identity() + self.activation_function = torch.nn.Identity() if len(self.devices) > 1: self.model = DataParallel(self.transformer_model, device_ids=self.devices) @@ -136,7 +137,7 @@ def predict(self, query: str, documents: List[Document], top_k: Optional[int] = return sorted_documents - def _add_scores_to_documents(self, sorted_scores_and_documents: List[Tuple], logits_dim: int) -> List[Document]: + def _add_scores_to_documents(self, sorted_scores_and_documents: List[Tuple[Any, Document]], logits_dim: int) -> List[Document]: """ Normalize and add scores to retrieved result documents. @@ -219,9 +220,9 @@ def predict_batch( # is this step needed? sorted_documents = [(score, doc) for score, doc in sorted_scores_and_documents if isinstance(doc, Document)] - sorted_documents = self._add_scores_to_documents(sorted_documents[:top_k], logits_dim) + sorted_documents_with_scores = self._add_scores_to_documents(sorted_documents[:top_k], logits_dim) - return sorted_documents + return sorted_documents_with_scores else: # Group predictions together grouped_predictions = [] @@ -246,9 +247,9 @@ def predict_batch( sorted_documents = [ (score, doc) for score, doc in sorted_scores_and_documents if isinstance(doc, Document) ] - sorted_documents = self._add_scores_to_documents(sorted_documents[:top_k], logits_dim) + sorted_documents_with_scores = self._add_scores_to_documents(sorted_documents[:top_k], logits_dim) - result.append(sorted_documents) + result.append(sorted_documents_with_scores) return result From df4e13db577d8c4f1bba4b925166e2953e7bfa76 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 27 Jun 2022 08:22:31 +0000 Subject: [PATCH 08/10] Update Documentation & Code Style --- haystack/nodes/ranker/sentence_transformers.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/haystack/nodes/ranker/sentence_transformers.py b/haystack/nodes/ranker/sentence_transformers.py index 8beca169be..cb8d7a2f90 100644 --- a/haystack/nodes/ranker/sentence_transformers.py +++ b/haystack/nodes/ranker/sentence_transformers.py @@ -137,7 +137,9 @@ def predict(self, query: str, documents: List[Document], top_k: Optional[int] = return sorted_documents - def _add_scores_to_documents(self, sorted_scores_and_documents: List[Tuple[Any, Document]], logits_dim: int) -> List[Document]: + def _add_scores_to_documents( + self, sorted_scores_and_documents: List[Tuple[Any, Document]], logits_dim: int + ) -> List[Document]: """ Normalize and add scores to retrieved result documents. From 333ee79074cc6d230c89b80ea80d4d126bdecad1 Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Mon, 27 Jun 2022 10:49:56 +0200 Subject: [PATCH 09/10] relax ranker test tolerance --- test/nodes/test_ranker.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/nodes/test_ranker.py b/test/nodes/test_ranker.py index 8531a1bebb..a532528a4c 100644 --- a/test/nodes/test_ranker.py +++ b/test/nodes/test_ranker.py @@ -190,7 +190,7 @@ def test_ranker_returns_normalized_score(ranker): results = ranker.predict(query=query, documents=docs) score = results[0].score precomputed_score = 5.8601767e-05 - assert math.isclose(precomputed_score, score) + assert math.isclose(precomputed_score, score, rel_tol=0.001) def test_ranker_returns_raw_score_when_no_scaling(): @@ -208,7 +208,7 @@ def test_ranker_returns_raw_score_when_no_scaling(): results = ranker.predict(query=query, documents=docs) score = results[0].score precomputed_score = -9.744687 - assert math.isclose(precomputed_score, score) + assert math.isclose(precomputed_score, score, rel_tol=0.001) def test_ranker_returns_raw_score_for_two_logits(ranker_two_logits): @@ -224,4 +224,4 @@ def test_ranker_returns_raw_score_for_two_logits(ranker_two_logits): results = ranker_two_logits.predict(query=query, documents=docs) score = results[0].score precomputed_score = -3.61354 - assert math.isclose(precomputed_score, score) + assert math.isclose(precomputed_score, score, rel_tol=0.001) From 4079eda2b938911b3bb07272be32f46f823165e7 Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Mon, 27 Jun 2022 11:40:24 +0200 Subject: [PATCH 10/10] update ranker test score --- test/nodes/test_ranker.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/nodes/test_ranker.py b/test/nodes/test_ranker.py index a532528a4c..d7b8e9a19a 100644 --- a/test/nodes/test_ranker.py +++ b/test/nodes/test_ranker.py @@ -189,8 +189,8 @@ def test_ranker_returns_normalized_score(ranker): results = ranker.predict(query=query, documents=docs) score = results[0].score - precomputed_score = 5.8601767e-05 - assert math.isclose(precomputed_score, score, rel_tol=0.001) + precomputed_score = 5.8796231e-05 + assert math.isclose(precomputed_score, score, rel_tol=0.01) def test_ranker_returns_raw_score_when_no_scaling():