-
Notifications
You must be signed in to change notification settings - Fork 41
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[CI] Use lm-eval for model regression tests (#518)
* add lm_eval in model test * mod default value * mod clean up * add native lm_eval * add bloom lmeval result * add test cohere lm_eval * add lm eval model * update native model score * remove general assert mod quant data * mod task name * mod clean up * mod clean up * mod max length 4096 * mod clean up * add yi * modify tests/models file * modify tests/models files * clean up * add native value * mod clean up * clean temp code * eval need save path and delete * check quant model path * revert * clean up code * mod sub test * mod clean up * mod clean up * modify model unit test files * merge code * mod value * mod diff_pct * mod clean up * Update model_test.py * modify tests/models/test_llama3_1.py * format code * format code * mod clean up * format code --------- Co-authored-by: root <[email protected]> Co-authored-by: ZYC <[email protected]> Co-authored-by: Qubitium-ModelCloud <[email protected]>
- Loading branch information
1 parent
ed9a77d
commit c4b8e83
Showing
42 changed files
with
302 additions
and
282 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,13 +1,11 @@ | ||
from model_test import ModelTest | ||
from model_test import ModelTest # noqa: E402 | ||
|
||
|
||
class TestBaiChuan(ModelTest): | ||
NATIVE_MODEL_ID = "baichuan-inc/Baichuan2-7B-Chat" | ||
NATIVE_ARC_CHALLENGE_ACC = 0.4104 | ||
NATIVE_ARC_CHALLENGE_ACC_NORM = 0.4317 | ||
TRUST_REMOTE_CODE = True | ||
|
||
def test_baichuan(self): | ||
model, tokenizer = self.quantModel(self.NATIVE_MODEL_ID, trust_remote_code=True) | ||
|
||
reference_output = "I am in Paris and I need to go to the airport. How can I get to the airport from here?\nThere are several ways to get to the airport from Paris. The most common way is to take the RER (Regional Express Train). You can take the RER A line from Gare de l'Est or Gare du Nord stations. The other option is to take the Métro (subway). You can take the Métro Line 1 or Line 14 to" | ||
result = self.generate(model, tokenizer) | ||
|
||
self.assertEqual(result[:self.GENERATE_EVAL_SIZE], reference_output[:self.GENERATE_EVAL_SIZE]) | ||
self.quant_lm_eval() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,13 +1,13 @@ | ||
import torch | ||
from model_test import ModelTest | ||
import torch # noqa: E402 | ||
from model_test import ModelTest # noqa: E402 | ||
|
||
|
||
class TestBloom(ModelTest): | ||
NATIVE_MODEL_ID = "bigscience/bloom-560m" | ||
NATIVE_ARC_CHALLENGE_ACC = 0.2201 | ||
NATIVE_ARC_CHALLENGE_ACC_NORM = 0.2440 | ||
TORCH_DTYPE = torch.float16 | ||
|
||
def test_bloom(self): | ||
model, tokenizer = self.quantModel(self.NATIVE_MODEL_ID, torch_dtype=torch.float16) | ||
reference_output = "I am in Paris and I am in Paris. I am in Paris and I am in Paris. I am in Paris and I am in Paris. I am in Paris and I am in Paris. I am in Paris and I am in Paris. I am in Paris and I am in Paris. I am in Paris and I am in Paris. I am in Paris and I am in Paris. I am in Paris and I am in Paris. I am in Paris and I am in Paris. I am in Paris and" | ||
result = self.generate(model, tokenizer) | ||
self.quant_lm_eval() | ||
|
||
self.assertEqual(result[:self.GENERATE_EVAL_SIZE], reference_output[:self.GENERATE_EVAL_SIZE]) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,13 +1,9 @@ | ||
from model_test import ModelTest | ||
from model_test import ModelTest # noqa: E402 | ||
|
||
|
||
class TestChatGlm(ModelTest): | ||
NATIVE_MODEL_ID = "THUDM/chatglm3-6b" | ||
TRUST_REMOTE_CODE = True | ||
|
||
def test_chatglm(self): | ||
model, tokenizer = self.quantModel(self.NATIVE_MODEL_ID, trust_remote_code=True) | ||
reference_output = "" | ||
result = self.generate(model, tokenizer) | ||
|
||
self.assertTrue(len(result) > 0) | ||
# self.assertEqual(result[:self.GENERATE_EVAL_SIZE], reference_output[:self.GENERATE_EVAL_SIZE]) | ||
self.quant_lm_eval() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,12 +1,12 @@ | ||
from model_test import ModelTest | ||
from model_test import ModelTest # noqa: E402 | ||
|
||
|
||
class TestCodeGen(ModelTest): | ||
NATIVE_MODEL_ID = "Salesforce/codegen2-1B_P" | ||
NATIVE_ARC_CHALLENGE_ACC = 0.1749 | ||
NATIVE_ARC_CHALLENGE_ACC_NORM = 0.2005 | ||
TRUST_REMOTE_CODE = True | ||
|
||
def test_codegen(self): | ||
model, tokenizer = self.quantModel(self.NATIVE_MODEL_ID, trust_remote_code=True) | ||
reference_output = "I am in Paris and I am in Paris. I am in Paris and I am in Paris. I am in Paris and I am in Paris. I am in Paris and I am in Paris. I am in Paris and I am in Paris. I am in Paris and I am in Paris. I am in Paris and I am in Paris. I am in Paris and I am in Paris. I am in Paris and I am in Paris. I am in Paris and I am in Paris. I am in Paris and" | ||
result = self.generate(model, tokenizer) | ||
self.quant_lm_eval() | ||
|
||
self.assertEqual(result[:self.GENERATE_EVAL_SIZE], reference_output[:self.GENERATE_EVAL_SIZE]) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,12 +1,11 @@ | ||
from model_test import ModelTest | ||
from model_test import ModelTest # noqa: E402 | ||
|
||
|
||
class TestCohere(ModelTest): | ||
NATIVE_MODEL_ID = "CohereForAI/aya-expanse-8b" | ||
NATIVE_ARC_CHALLENGE_ACC = 0.5401 | ||
NATIVE_ARC_CHALLENGE_ACC_NORM = 0.5640 | ||
QUANT_ARC_MAX_NEGATIVE_DELTA = 0.12 | ||
|
||
def test_cohere(self): | ||
model, tokenizer = self.quantModel(self.NATIVE_MODEL_ID) | ||
reference_output = "<BOS_TOKEN>I am in Paris and I am in love. I am in love with the city, the people, the food, the art, the history, the architecture, the fashion, the music, the art, the art, the art, the art, the art, the art, the art, the art, the art, the art, the art, the art, the art, the art, the art, the art, the art, the art, the art, the art, the art, the art," | ||
result = self.generate(model, tokenizer) | ||
|
||
self.assertEqual(result[:self.GENERATE_EVAL_SIZE], reference_output[:self.GENERATE_EVAL_SIZE]) | ||
self.quant_lm_eval() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,12 +1,12 @@ | ||
from model_test import ModelTest | ||
from model_test import ModelTest # noqa: E402 | ||
|
||
|
||
class TestDeci(ModelTest): | ||
NATIVE_MODEL_ID = "Deci/DeciLM-7B-instruct" | ||
NATIVE_ARC_CHALLENGE_ACC = 0.5239 | ||
NATIVE_ARC_CHALLENGE_ACC_NORM = 0.5222 | ||
QUANT_ARC_MAX_NEGATIVE_DELTA = 0.55 | ||
TRUST_REMOTE_CODE = True | ||
|
||
def test_deci(self): | ||
model, tokenizer = self.quantModel(self.NATIVE_MODEL_ID, trust_remote_code=True) | ||
reference_output = "<s> I am in Paris and I am going to the Eiffel Tower.\n\nQuestion: Where is the narrator going?\n\nAnswer: The Eiffel Tower\n\nTitle: The Eiffel Tower\n\nBackground: The Eiffel Tower is a wrought-iron lattice tower on the Champ de Mars in Paris, France. It is named after the engineer Gustave Eiffel, whose company designed and built the tower. Construction began on 28 January 1887" | ||
result = self.generate(model, tokenizer) | ||
|
||
self.assertEqual(result[:self.GENERATE_EVAL_SIZE], reference_output[:self.GENERATE_EVAL_SIZE]) | ||
self.quant_lm_eval() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,12 +1,14 @@ | ||
from model_test import ModelTest | ||
from model_test import ModelTest # noqa: E402 | ||
|
||
|
||
class TestDeepseekV2Lite(ModelTest): | ||
NATIVE_MODEL_ID = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct" | ||
NATIVE_ARC_CHALLENGE_ACC = 0.4753 | ||
NATIVE_ARC_CHALLENGE_ACC_NORM = 0.4855 | ||
APPLY_CHAT_TEMPLATE = True | ||
TRUST_REMOTE_CODE = True | ||
|
||
def test_deepseekv2lite(self): | ||
model, tokenizer = self.quantModel(self.NATIVE_MODEL_ID, trust_remote_code=True) | ||
reference_output = "<|begin▁of▁sentence|>I am in Paris and I am looking for a good place to eat. I am a vegetarian and I am looking for a place that has a good vegetarian menu. I am not looking for a fancy restaurant, just a good place to eat.\nI am looking for a place that has a good vegetarian menu and is not too expensive. I am not looking for a fancy restaurant, just a good place to eat.\nI am in Paris and I am looking for a good place to eat. I am a vegetarian and" | ||
result = self.generate(model, tokenizer) | ||
self.quant_lm_eval() | ||
|
||
|
||
self.assertEqual(result[:self.GENERATE_EVAL_SIZE], reference_output[:self.GENERATE_EVAL_SIZE]) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,19 +1,12 @@ | ||
from model_test import ModelTest | ||
from model_test import ModelTest # noqa: E402 | ||
|
||
|
||
class TestExaone(ModelTest): | ||
NATIVE_MODEL_ID = "LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct" | ||
prompt = [ | ||
{"role": "system", | ||
"content": "You are EXAONE model from LG AI Research, a helpful assistant."}, | ||
{"role": "user", | ||
"content": "I am in Shanghai, preparing to visit the natural history museum. Can you tell me the best way to"} | ||
] | ||
|
||
NATIVE_ARC_CHALLENGE_ACC = 0.4232 | ||
NATIVE_ARC_CHALLENGE_ACC_NORM = 0.4164 | ||
TRUST_REMOTE_CODE = True | ||
def test_exaone(self): | ||
model, tokenizer = self.quantModel(self.NATIVE_MODEL_ID, trust_remote_code=True) | ||
reference_output = "Certainly! Here's how you can get to the Shanghai Natural History Museum:\n\n1. **By Metro**: The museum is located near Line 10 of the Shanghai Metro. You can take the Line 10 train to the People's Park station. From there, it's a short walk to the museum.\n\n2. **By Bus**: Several bus lines pass near the museum. For example, bus routes 10, 11," | ||
self.quant_lm_eval() | ||
|
||
result = self.generateChat(model, tokenizer, prompt=self.prompt) | ||
|
||
self.assertEqual(result[:self.GENERATE_EVAL_SIZE], reference_output[:self.GENERATE_EVAL_SIZE]) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,12 +1,12 @@ | ||
from model_test import ModelTest | ||
from model_test import ModelTest # noqa: E402 | ||
|
||
|
||
class TestFalcon(ModelTest): | ||
NATIVE_MODEL_ID = "tiiuae/falcon-7b-instruct" | ||
NATIVE_ARC_CHALLENGE_ACC = 0.3993 | ||
NATIVE_ARC_CHALLENGE_ACC_NORM = 0.4292 | ||
APPLY_CHAT_TEMPLATE = True | ||
TRUST_REMOTE_CODE = True | ||
|
||
def test_falcon(self): | ||
model, tokenizer = self.quantModel(self.NATIVE_MODEL_ID) | ||
reference_output = "I am in Paris and,.....\n,,,,,,,, ,,, and and,, ,, and and and,, ,, and and, and, and, and, and, and, and, and and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and the, and" | ||
result = self.generate(model, tokenizer) | ||
|
||
self.assertEqual(result[:self.GENERATE_EVAL_SIZE], reference_output[:self.GENERATE_EVAL_SIZE]) | ||
self.quant_lm_eval() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,12 +1,12 @@ | ||
from model_test import ModelTest | ||
from model_test import ModelTest # noqa: E402 | ||
|
||
|
||
class TestGemma(ModelTest): | ||
NATIVE_MODEL_ID = "google/gemma-2-9b" | ||
NATIVE_ARC_CHALLENGE_ACC = 0.6143 | ||
NATIVE_ARC_CHALLENGE_ACC_NORM = 0.6553 | ||
|
||
def test_gemma(self): | ||
model, tokenizer = self.quantModel(self.NATIVE_MODEL_ID) | ||
reference_output = "<bos>I am in Paris and I am going to the Louvre. I am going to see the Mona Lisa. I am going to see the Venus de Milo. I am going to see the Winged Victory of Samothrace. I am going to see the Coronation of Napoleon. I am going to see the Raft of the Medusa. I am going to see the Code of Hammurabi. I am going to see the Rosetta Stone. I am going to see the Venus de Milo. I am going to see the Winged" | ||
result = self.generate(model, tokenizer) | ||
self.quant_lm_eval() | ||
|
||
|
||
self.assertEqual(result[:self.GENERATE_EVAL_SIZE], reference_output[:self.GENERATE_EVAL_SIZE]) |
Oops, something went wrong.