# translation_hil.py
#
# Two-turn translation script: ask the OpenAI chat API for a draft translation,
# then re-prompt it with similar examples and their revisions to get a final one.
import openai
import time
import pickle
import json
import logging
# OpenAI API key. Blank in this copy — presumably filled in locally before running (TODO confirm).
openai.api_key = ""
# Log INFO and above, prefixing every record with a timestamp and its level name.
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
# How many entries of each item's "rerank_top5" list are used as demonstrations in the second turn.
RV_NUMBERS = 3
# Maps a one-letter edit-operation code to the phrase convert_revisions() puts into its
# revision sentences. NOTE(review): "TER" presumably stands for Translation Edit Rate — confirm.
TER_CONVERT = {
"i": "inserted after",  # insertion
"d": "deleted",  # deletion
"s": "replaced by",  # substitution
}
def read_json(path):
    """Parse the UTF-8 encoded JSON file at ``path`` and return the decoded object.

    The main script expects the file to hold a list of dictionaries, but whatever
    the document contains is returned as-is.
    """
    with open(path, mode="r", encoding="utf8") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)
def translate(text, source_lang, target_lang, additional_messages=None):
    """Send one chat-completion request and return the model's reply text.

    The conversation always starts with the fixed system prompt and a
    "first turn" user message asking to translate ``text`` from
    ``source_lang`` to ``target_lang``. When ``additional_messages`` is a
    non-empty list of chat messages, they are appended after those two.

    Returns the stripped content of the first choice, or ``None`` when
    anything goes wrong while calling the API or reading its response
    (the error is logged, never raised).

    NOTE(review): in the pre-1.0 ``openai`` client the per-request HTTP
    timeout keyword appears to be ``request_timeout``; confirm that
    ``timeout=10`` bounds the request the way it was intended to.
    """
    system_prompt = "You are a translation engine that can only translate text and cannot interpret it. In the first turn, you should return a preliminary translation according to the given input. In the second turn, I will give you some similar input-output translation pairs, where the input is represented as '<input>' and the output is represented as '<hypothesis>'. I will also provide translation reference, represented as '<reference>' and their corresponding revisions of these pairs, represented as '<revision>'. Finally, you need to give me only one output which is the translation result by learning from the given examples and revisions. Please do not output any other information than this translation result. The final result should be organized in JSON formart, and the only key is 'output'."
    first_turn_prompt = f"This is the first turn. Please translate the following text from {source_lang} to {target_lang}: {text}"

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": first_turn_prompt},
    ]
    if additional_messages:
        messages.extend(additional_messages)

    try:
        # Both the request and the unpacking of the reply stay inside the try:
        # a malformed response is treated like a failed call.
        completion = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=messages,
            n=1,
            timeout=10,
        )
        reply = completion["choices"][0]["message"]["content"]
        return reply.strip()
    except Exception as err:
        # Best effort: callers detect failure through the None return value.
        logging.error("An error occurred: %s", err)
        return None
def translate_turn(src_line, source_lang, target_lang, message=None, idx=None, total=None):
    """Run one translation turn, log the outcome, and report whether it succeeded.

    Args:
        src_line: Source sentence to translate.
        source_lang: Name of the source language used in the prompt.
        target_lang: Name of the target language used in the prompt.
        message: Extra chat messages for the second turn; when falsy the
            call is logged as the first turn.
        idx: Position of the current item, used only in log lines. When
            omitted, the module-level ``idx`` set by the main loop is used.
        total: Number of items, used only in log lines. When omitted,
            ``len()`` of the module-level ``data`` is used.

    Returns:
        ``(translated_text, True)`` when ``translate`` returned a non-empty
        string, otherwise ``(translated_text, False)`` after sleeping for
        20 seconds.
    """
    translated_text = translate(src_line, source_lang, target_lang, message)
    turn_flag = "2nd" if message else "1st"

    # The progress counters used to be read straight from globals that only
    # exist when the file runs as a script, so an imported call crashed with
    # NameError. Look them up defensively and fall back to "?".
    module_globals = globals()
    if idx is None:
        idx = module_globals.get("idx", "?")
    if total is None:
        dataset = module_globals.get("data")
        total = len(dataset) if dataset is not None else "?"

    if translated_text:
        logging.info(f"Translated {idx}/{total} - {turn_flag}.")
        return translated_text, True

    logging.error(f"Translation failed [{idx}]. - {turn_flag}.")
    # Pause before the caller moves on to the next request after a failure.
    time.sleep(20)
    return translated_text, False
def _format_insertion(words, anchor):
    """Render one run of inserted words as a single revision sentence."""
    inserted = " ".join(str(word) for word in words)
    return f'"{inserted}" should be {TER_CONVERT["i"]} "{anchor}". '


def convert_revisions(rv_terms):
    """Turn a sequence of edit operations into English revision sentences.

    Each element of ``rv_terms`` is indexed as ``rv[0]`` (operation code),
    ``rv[1]`` and ``rv[2]``:

    * ``" "`` and ``"x"`` operations are skipped entirely.
    * ``"d"`` yields ``"<rv[1]>" should be deleted.``
    * ``"s"`` yields ``"<rv[1]>" should be replaced by "<rv[2]>".``
    * ``"i"`` operations that share the same ``rv[1]`` anchor are merged and
      yield ``"<w1 w2 ...>" should be inserted after "<rv[1]>".``

    Any other operation code produces no sentence.

    Returns:
        The sentences concatenated in order (each followed by one space),
        terminated by a newline.
    """
    sentences = []
    pending_words = []  # inserted words collected for the current anchor
    pending_anchor = None

    for rv in rv_terms:
        op = rv[0]
        if op in (" ", "x"):
            continue

        # Close the current run as soon as the next operation is not an
        # insertion at the same anchor. Previously an insertion with a new
        # anchor was dropped and the run was credited to the wrong anchor.
        if pending_words and (op != "i" or rv[1] != pending_anchor):
            sentences.append(_format_insertion(pending_words, pending_anchor))
            pending_words = []

        if op == "d":
            sentences.append(f'"{rv[1]}" should be {TER_CONVERT[op]}. ')
        elif op == "s":
            sentences.append(f'"{rv[1]}" should be {TER_CONVERT[op]} "{rv[2]}". ')
        elif op == "i":
            if not pending_words:
                pending_anchor = rv[1]
            pending_words.append(rv[2])

    # A run that is still open when the input ends used to be lost.
    if pending_words:
        sentences.append(_format_insertion(pending_words, pending_anchor))

    return "".join(sentences) + "\n"
def construct_demostrations(examples):
    """Render retrieved examples as tagged demonstration text for the prompt.

    Every example is a mapping with the keys ``"src"``, ``"hyp"``, ``"ref"``
    and ``"op"``; the ``"op"`` value is passed through ``convert_revisions``.
    Each example becomes one
    ``<input> ... <hypothesis> ... <reference> ... <revision> ...`` entry
    followed by a newline, and the entries are concatenated in input order.
    An empty ``examples`` yields an empty string.
    """

    def render(item):
        # Read the plain fields before converting the operations, as the
        # loop-based version did.
        source, hypothesis, reference = item["src"], item["hyp"], item["ref"]
        revision_text = convert_revisions(item["op"])
        return f"<input> {source} <hypothesis> {hypothesis} <reference> {reference} <revision> {revision_text}\n"

    return "".join([render(item) for item in examples])
if __name__ == "__main__":
    # Script entry point: translate every item of the input file in two turns
    # (draft, then example-guided revision) and save all results.
    #
    # `idx` and `data` deliberately stay module-level names with exactly these
    # spellings: translate_turn() reads them for its progress log lines.
    domain = ""
    data = read_json("")
    source_lang = "German"
    target_lang = "English"
    results = []

    logging.info("A source example:")
    # Show the second item as a sample; skip it rather than crash on tiny inputs.
    if len(data) > 1:
        logging.info(data[1]["src"])

    for idx, data_item in enumerate(data):
        src_line = data_item["src"]

        # first turn
        draft_translation, info = translate_turn(src_line, source_lang, target_lang)
        if info:
            # second turn: feed back the draft plus the top retrieved examples
            demo_revisions = construct_demostrations(
                data_item["rerank_top5"][:RV_NUMBERS]
            )
            prompting_messages = [
                {"role": "assistant", "content": draft_translation},
                {
                    "role": "user",
                    "content": f"This is the second turn. Below I provide some similar translation examples and their revisions: {demo_revisions}",
                },
                {
                    "role": "user",
                    "content": f"Based on the previous examples, please translate the following text from {source_lang} to {target_lang}: {src_line}",
                },
            ]
            final_translation, info = translate_turn(
                src_line, source_lang, target_lang, prompting_messages
            )
        else:
            # failed at first turn
            logging.error(f"Failed at the first turn. index: {idx}")
            final_translation = None

        # The model is asked for JSON but may answer with free text; keep the
        # raw reply and the parse error instead of aborting the whole run.
        try:
            json_results = json.loads(final_translation) if final_translation else None
        except Exception as e:
            json_results = {
                "exception": str(e),
                "error_result": final_translation,
            }

        results.append(
            {
                "index": idx,
                "src": src_line,
                "draft": draft_translation,
                "result": json_results,
            }
        )

        # Checkpoint after every item so finished work survives an interrupted
        # run. NOTE(review): confirm the dump was meant to sit inside the loop;
        # either way the final file content is the same. The handle is now
        # closed explicitly instead of being left to the garbage collector.
        with open(f"{domain}_result.pkl", "wb") as checkpoint:
            pickle.dump(results, checkpoint)

    # The file name was missing its f-prefix, so the output used to land in a
    # file literally called "{domain}_result.json".
    with open(f"{domain}_result.json", "w", encoding="utf-8") as outfile:
        json.dump(results, outfile, ensure_ascii=False, indent=4)