Skip to content

Commit

Permalink
fix(metrics): fixed NDCG calculation and updated previous tests (#17236)
Browse files (browse the repository at this point in the history)
  • Loading branch information
alexeyrodriguez authored Dec 11, 2024
1 parent b004ea0 commit 21f6e34
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 39 deletions.
Original file line number | Diff line number | Diff line change
Expand Up @@ -371,24 +371,16 @@ def compute(
mode = self.mode
expected_set = set(expected_ids)

# Calculate DCG
dcg = sum(
discounted_gain(rel=docid in expected_set, i=i, mode=mode)
for i, docid in enumerate(retrieved_ids, start=1)
)

# Calculate IDCG using min(len(retrieved_ids), len(expected_ids))
# Since we can't achieve better than perfect ranking of all relevant docs
ideal_length = min(len(retrieved_ids), len(expected_ids))
idcg = sum(
discounted_gain(rel=True, i=i, mode=mode)
for i in range(1, ideal_length + 1)
for i in range(1, len(expected_ids) + 1)
)

# Handle edge case where there are no relevant documents
if idcg == 0:
return RetrievalMetricResult(score=0.0)

ndcg_score = dcg / idcg
return RetrievalMetricResult(score=ndcg_score)

Expand Down
52 changes: 22 additions & 30 deletions llama-index-core/tests/evaluation/test_metrics.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -147,65 +147,57 @@ def test_ap(expected_ids, retrieved_ids, expected_result):
@pytest.mark.parametrize(
("expected_ids", "retrieved_ids", "mode", "expected_result"),
[
# Case 1: Perfect ranking
(
["id1", "id2"],
["id1", "id2", "id3"],
["id3", "id1", "id2", "id4"],
"linear",
1.0, # Perfect ranking of all relevant docs
(1 / log2(1 + 1) + 1 / log2(2 + 1) + 1 / log2(3 + 1))
/ (1 / log2(1 + 1) + 1 / log2(2 + 1) + 1 / log2(3 + 1)),
),
# Case 2: Partial match with imperfect ranking
(
["id1", "id2", "id3"],
["id2", "id4", "id1"],
["id1", "id2", "id3", "id4"],
["id5", "id1"],
"linear",
(1 / log2(2) + 1 / log2(4)) / (1 / log2(2) + 1 / log2(3) + 1 / log2(4)),
(1 / log2(2 + 1))
/ (1 / log2(1 + 1) + 1 / log2(2 + 1) + 1 / log2(3 + 1) + 1 / log2(4 + 1)),
),
# Case 3: No relevant docs retrieved
(
["id1", "id2"],
["id3", "id4", "id5"],
["id3", "id4"],
"linear",
0.0,
),
# Case 4: More relevant docs than retrieved
(
["id1", "id2", "id3", "id4"],
["id1", "id2"],
["id2", "id1", "id7"],
"linear",
1.0, # Perfect ranking within retrieved limit
(1 / log2(1 + 1) + 1 / log2(2 + 1)) / (1 / log2(1 + 1) + 1 / log2(2 + 1)),
),
# Case 5: Single relevant doc
(
["id1"],
["id1", "id2", "id3"],
"linear",
1.0,
["id3", "id1", "id2", "id4"],
"exponential",
(1 / log2(1 + 1) + 1 / log2(2 + 1) + 1 / log2(3 + 1))
/ (1 / log2(1 + 1) + 1 / log2(2 + 1) + 1 / log2(3 + 1)),
),
# Case 6: Exponential mode test
(
["id1", "id2"],
["id2", "id1", "id3"],
["id1", "id2", "id3", "id4"],
["id1", "id2", "id5"],
"exponential",
((2**1 - 1) / log2(2) + (2**1 - 1) / log2(3))
/ ((2**1 - 1) / log2(2) + (2**1 - 1) / log2(3)),
(1 / log2(1 + 1) + 1 / log2(2 + 1))
/ (1 / log2(1 + 1) + 1 / log2(2 + 1) + 1 / log2(3 + 1) + 1 / log2(4 + 1)),
),
# Case 7: All irrelevant docs
(
[],
["id1", "id2", "id3"],
"linear",
1.0, # When no relevant docs exist, any ranking is perfect
["id1", "id2"],
["id1", "id7", "id15", "id2"],
"exponential",
(1 / log2(1 + 1) + 1 / log2(4 + 1)) / (1 / log2(1 + 1) + 1 / log2(2 + 1)),
),
],
)
def test_ndcg(expected_ids, retrieved_ids, mode, expected_result):
    """Check the NDCG score for one parametrized retrieval scenario.

    The metric is built with its default constructor and its ``mode``
    attribute is overwritten with the parametrized value before scoring.

    NOTE(review): when ``expected_ids`` is empty, ``compute`` is never
    called; the test only checks that the parametrized expectation is 1.0.
    """
    metric = NDCG()
    metric.mode = mode

    if expected_ids:
        outcome = metric.compute(
            expected_ids=expected_ids,
            retrieved_ids=retrieved_ids,
        )
        # Scores are floating-point sums of log-discounted gains, so compare
        # approximately rather than exactly.
        assert outcome.score == pytest.approx(expected_result)
    else:
        # For empty expected_ids, return 1.0 as any ranking is perfect
        assert expected_result == 1.0

Expand Down

0 comments on commit 21f6e34

Please sign in to comment.