-
Notifications
You must be signed in to change notification settings - Fork 4
/
gpt_bodyparts.py
130 lines (112 loc) · 4.98 KB
/
gpt_bodyparts.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
import os
import openai
import logging
import hydra
from omegaconf import DictConfig, OmegaConf
import sinc.launch.prepare # noqa
from hydra.utils import to_absolute_path
from pathlib import Path
from tqdm import tqdm
from sinc.utils.file_io import read_json, write_json
import time
from rich import print
logger = logging.getLogger(__name__)
# Prefer a key supplied through the environment so that no secret has to be
# committed to the source. The original placeholder is kept as the fallback,
# so behaviour is unchanged when OPENAI_API_KEY is not set.
openai.api_key = os.environ.get('OPENAI_API_KEY', '<api-key>')
@hydra.main(config_path="configs", config_name="gpt_bdpts")
def _gpt_extract(cfg: DictConfig):
    """Hydra entry point: forward the composed config to ``gpt_extract``.

    Hydra builds ``cfg`` from ``configs/gpt_bdpts`` (plus any command-line
    overrides) and this wrapper simply hands it over, returning whatever
    ``gpt_extract`` returns.
    """
    outcome = gpt_extract(cfg)
    return outcome
# body_parts = ['left arm', 'right arm', 'left leg', 'global orientation',
# 'right leg', 'torso', 'left hand', 'right hand', 'left ankle', 'right ankle', 'left foot',
# 'right foot', 'head', 'neck', 'right shoulder', 'left shoulder', 'pelvis', 'spine']
# # fine_bp = list(body_parts)
# coarse_bp = ['left arm', 'right arm', 'left leg', 'global orientation', 'right leg', 'torso']
# coarse_bp_v2 = ['left arm', 'right arm', 'left leg', 'global orientation',
# 'right leg', 'torso', 'head', 'neck', 'pelvis']
# 'What parts of the body are moving when someone is doing the action:'
def gpt_extract(cfg: DictConfig):
    """Collect GPT body-part answers for action texts, or post-process saved ones.

    Fields read from ``cfg``:
        gpt_model: short model name; must be ``'curie'``, ``'ada'`` or
            ``'davinci'``.
        ntokeep: maximum number of action texts to query; ``None`` means all
            remaining texts.
        crawl_data: when truthy, query the OpenAI completion API and write the
            answers to ``gpt-labels_batch<N>.json`` / ``gpt-labels_full.json``.
            When falsy, parse a previously exported HTML table and write
            ``./acts.json``, ``./promp1.json`` and ``./promp2.json``.

    Returns:
        None. All results are written to JSON files in the working directory.

    Raises:
        AssertionError: if ``cfg.gpt_model`` is not a known short name, or the
            API returns a different number of completions than prompts.
    """
    from sinc.utils.text_constants import final_prompts_to_gpt
    from sinc.utils.text_constants import unique_texts_babel_train_val

    # The first `start_offset` texts are skipped.
    # NOTE(review): presumably they were crawled in an earlier run - confirm
    # before launching a fresh crawl.
    start_offset = 6360
    unique_texts = list(unique_texts_babel_train_val)[start_offset:]

    models_GPT = {
        'curie': 'text-curie-001',
        'ada': 'text-ada-001',
        'davinci': 'text-davinci-003',
    }
    assert cfg.gpt_model in models_GPT, (
        f'unknown gpt_model {cfg.gpt_model!r}; expected one of {list(models_GPT)}'
    )
    gpt_model = models_GPT[cfg.gpt_model]
    n_to_keep = cfg.ntokeep if cfg.ntokeep is not None else len(unique_texts)

    if cfg.crawl_data:
        _crawl_gpt_labels(unique_texts[:n_to_keep], final_prompts_to_gpt, gpt_model)
    else:
        _extract_body_parts('/home/nathanasiou/Desktop/example-batch.html')


def _answers_in_prompt_order(choices):
    """Return the stripped completion texts ordered by the prompt they answer."""
    # A multi-prompt request is not guaranteed to return its choices in prompt
    # order; each choice carries an `index` naming the prompt it belongs to.
    # Fall back to the positional order if the field is missing.
    ordered = sorted(enumerate(choices),
                     key=lambda item: item[1].get('index', item[0]))
    return [choice['text'].strip() for _, choice in ordered]


def _crawl_gpt_labels(action_texts, prompt_templates, gpt_model, write_every=20):
    """Query the completion API for every action text and dump the answers.

    Answers are flushed to ``gpt-labels_batch<N>.json`` every ``write_every``
    texts, and the complete mapping goes to ``gpt-labels_full.json`` at the end.
    """
    from sinc.utils.file_io import write_json

    pending = {}          # answers not yet flushed to a batch file
    responses_full = {}   # every answer collected in this run
    # Initialised up front: the original only bound it inside the flush branch,
    # so a run with fewer than `write_every` texts crashed on the final batch.
    batch_id = 0

    for elems, action_text in enumerate(tqdm(action_texts), start=1):
        prompts = [p.replace('[ACTION]', action_text) for p in prompt_templates]
        response_compl = openai.Completion.create(model=gpt_model,
                                                  prompt=prompts,
                                                  temperature=0.0,
                                                  max_tokens=256,
                                                  top_p=1,
                                                  frequency_penalty=0,
                                                  presence_penalty=0)
        choices = response_compl['choices']
        assert len(choices) == len(prompts)
        answers = _answers_in_prompt_order(choices)
        pending[action_text] = answers
        responses_full[action_text] = answers

        if elems % write_every == 0:
            batch_id = elems // write_every
            write_json(pending, f'gpt-labels_batch{batch_id}.json')
            pending = {}
            # NOTE(review): the source this was recovered from had lost its
            # indentation; the 30 s pause is assumed to be per batch rather
            # than per request - confirm.
            time.sleep(30)
        if elems == 100:
            # One-off longer pause once 100 texts have been processed.
            time.sleep(120)

    # Final, partially filled batch. Skipped when nothing is pending so that no
    # empty batch file is produced.
    if pending:
        batch_id += 1
        write_json(pending, f'gpt-labels_batch{batch_id}.json')

    pathout = 'gpt-labels_full.json'
    write_json(responses_full, pathout)


def _extract_body_parts(html_path):
    """Parse an exported HTML table of GPT answers and tag body-part words.

    The non-empty lines of the file, once the table tags are removed, are read
    as repeating triples: action text, answer to prompt 1, answer to prompt 2.
    Tokens of each answer whose lower-cased text is in ``bp_gpt`` are collected.
    """
    html_toks = ['<tr>', '<td>', '</tr>', '</td>', '<th>', '</th>']
    all_texts = []
    with open(html_path, 'r') as f:
        for line in f:
            for tk in html_toks:
                line = line.replace(tk, '')
            # Strip once, after every tag is gone, so whitespace exposed by the
            # last removed tag does not survive.
            text = line.strip()
            if text:
                all_texts.append(text)

    actions = all_texts[::3]
    p1 = all_texts[1::3]
    p2 = all_texts[2::3]

    import spacy
    nlp = spacy.load("en_core_web_sm")
    # NOTE(review): these two imports come from `space` while the rest of the
    # file imports from `sinc` - confirm the package name.
    from space.info.joints import bp_gpt
    from space.utils.file_io import write_json

    p1_annot = {k: [] for k in p1}
    p2_annot = {k: [] for k in p2}
    for annot in (p1_annot, p2_annot):
        # Iterate the unique answers (the dict keys) so that a repeated answer
        # does not get its tokens appended twice.
        for doc in nlp.pipe(list(annot), disable=["tok2vec", "ner"]):
            for token in doc:
                if token.text.lower() in bp_gpt:
                    annot[doc.text].append(token.text)

    act_labels = {k: [] for k in actions}
    # Look annotations up by answer text rather than by position in
    # `dict.values()`: identical answers collapse to a single key, which would
    # otherwise shift every later action onto the wrong annotation. Indexing
    # p1/p2 still raises IndexError when the last row of the table is
    # incomplete, as before.
    for i, act in enumerate(actions):
        # sorted() keeps the written JSON stable from run to run.
        act_labels[act].append(sorted(set(p1_annot[p1[i]])))
        act_labels[act].append(sorted(set(p2_annot[p2[i]])))

    write_json(act_labels, './acts.json')
    write_json(p1_annot, './promp1.json')
    write_json(p2_annot, './promp2.json')
# Script entry point: run the Hydra-decorated wrapper, which composes the
# config and calls into gpt_extract.
if __name__ == '__main__':
    _gpt_extract()