Commit

Implemented BLEU score, wrote unit tests and documentation for it.
kadamrahul18 committed Jan 9, 2025
1 parent 9ae31b9 commit e7ff1dc
Showing 5 changed files with 546 additions and 5 deletions.
@@ -157,3 +157,4 @@
"nbformat": 4,
"nbformat_minor": 4
}

@@ -16,6 +16,7 @@ You can use the following heuristic metrics:
| RegexMatch | Checks if the output matches a specified regular expression pattern |
| IsJson | Checks if the output is a valid JSON object |
| Levenshtein | Calculates the Levenshtein distance between the output and an expected string |
| BLEU | Calculates the BLEU score for output text against one or more reference texts |

## Score an LLM response

@@ -97,3 +98,46 @@ metric = LevenshteinRatio()
score = metric.score(output="Hello world !", reference="hello")
print(score)
```

### BLEU

The `BLEU` metric measures how closely the output of an LLM matches one or more reference texts, and is commonly used to evaluate machine translation quality. `score()` computes the sentence-level BLEU score for a single candidate against one or more reference translations. It can be used in the following way:

```python
from opik.evaluation.metrics import BLEU

metric = BLEU()

score = metric.score(output="Hello world!", reference="Hello world")
print(score)
```

You can also configure the `BLEU` metric when instantiating it:

```python
from opik.evaluation.metrics import BLEU

metric = BLEU(n_grams=4, smoothing_method="method1", epsilon=0.1, alpha=5.0, k=5.0)

score = metric.score(output="Hello world !", reference="Hello world")
print(score)
```
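
For intuition, here is a simplified, framework-free sketch of the computation behind a sentence-level BLEU score: clipped n-gram precisions combined with a brevity penalty. This is not Opik's implementation and it applies no smoothing, so a candidate with no matching higher-order n-grams scores 0, which is exactly the situation the smoothing options above are meant to handle.

```python
import math
from collections import Counter


def simple_sentence_bleu(candidate: str, references: list[str], max_n: int = 4) -> float:
    """Illustrative sentence-level BLEU: geometric mean of clipped n-gram
    precisions multiplied by a brevity penalty. No smoothing is applied."""
    cand_tokens = candidate.split()
    ref_token_lists = [ref.split() for ref in references]

    precisions = []
    for n in range(1, max_n + 1):
        cand_ngrams = Counter(
            tuple(cand_tokens[i : i + n]) for i in range(len(cand_tokens) - n + 1)
        )
        # Clip each candidate n-gram count by its maximum count in any reference.
        max_ref_counts: Counter = Counter()
        for ref_tokens in ref_token_lists:
            ref_ngrams = Counter(
                tuple(ref_tokens[i : i + n]) for i in range(len(ref_tokens) - n + 1)
            )
            for ngram, count in ref_ngrams.items():
                max_ref_counts[ngram] = max(max_ref_counts[ngram], count)
        clipped = sum(min(count, max_ref_counts[ngram]) for ngram, count in cand_ngrams.items())
        total = sum(cand_ngrams.values())
        precisions.append(clipped / total if total > 0 else 0.0)

    # Without smoothing, any zero precision collapses the geometric mean to 0.
    if min(precisions) == 0.0:
        return 0.0

    # Brevity penalty: penalise candidates shorter than the closest reference.
    cand_len = len(cand_tokens)
    closest_ref_len = min(
        (len(ref) for ref in ref_token_lists),
        key=lambda length: (abs(length - cand_len), length),
    )
    bp = 1.0 if cand_len > closest_ref_len else math.exp(1 - closest_ref_len / cand_len)

    return bp * math.exp(sum(math.log(p) for p in precisions) / max_n)


print(simple_sentence_bleu(
    "this is a small test sentence for bleu",
    ["this is a small test sentence for the bleu metric"],
))
```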

`score_corpus()` computes the corpus-level BLEU score for multiple candidate sentences and their corresponding references. It can be used in the following way:

```python
from opik.evaluation.metrics import BLEU

bleu_metric = BLEU()

outputs = ["This is a test.", "Another test sentence."]

references_list = [
["This is a test.", "This is also a test."],
["Another test sentence.", "Yet another test sentence."],
]

result = bleu_metric.score_corpus(outputs, references_list)

print(f"Corpus BLEU score: {result.value:.4f}, Reason: {result.reason}")
```
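
Note that corpus-level BLEU pools n-gram statistics across all sentences rather than averaging the per-sentence scores. If it helps to compare, the same data can be scored sentence by sentence with `score()`. The sketch below assumes that `reference` accepts a list of reference strings, as described above, and that `score()` returns a result object with a `value` field like `score_corpus()` does.

```python
from opik.evaluation.metrics import BLEU

bleu_metric = BLEU()

outputs = ["This is a test.", "Another test sentence."]
references_list = [
    ["This is a test.", "This is also a test."],
    ["Another test sentence.", "Yet another test sentence."],
]

# Sentence-level scores for the same data; these are not expected to
# average out to the corpus-level value printed above.
for output, references in zip(outputs, references_list):
    sentence_result = bleu_metric.score(output=output, reference=references)
    print(f"Sentence BLEU: {sentence_result.value:.4f}")
```
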
2 changes: 2 additions & 0 deletions sdks/python/src/opik/evaluation/metrics/__init__.py
@@ -3,6 +3,7 @@
from .heuristics.is_json import IsJson
from .heuristics.levenshtein_ratio import LevenshteinRatio
from .heuristics.regex_match import RegexMatch
from .heuristics.bleu import BLEU
from .llm_judges.answer_relevance.metric import AnswerRelevance
from .llm_judges.context_precision.metric import ContextPrecision
from .llm_judges.context_recall.metric import ContextRecall
@@ -29,4 +30,5 @@
"RegexMatch",
"MetricComputationError",
"BaseMetric",
"BLEU",
]