Showing 60,022 changed files with 21,997,428 additions and 0 deletions.
@@ -0,0 +1,80 @@
import subprocess
import shlex


def tokenize_class_file_with_gradle(project_dir, class_file_path, additional_args):
    """Run the Gradle `run` task on a class file and return the parsed token groups."""
    gradle_cmd = ['gradle', 'run']

    # Quote each argument so paths with spaces survive the single --args string.
    args_list = [class_file_path] + additional_args
    args_str = ' '.join(shlex.quote(arg) for arg in args_list)
    gradle_cmd.append(f'--args={args_str}')

    process = subprocess.run(
        gradle_cmd,
        cwd=project_dir,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
    )

    if process.returncode != 0:
        print(f"Error in Gradle execution: {process.stderr}")
        return None

    return extract_tokens_from_output(process.stdout)


def extract_tokens_from_output(output):
    # The tokenized output sits between the `run` task banner and the trailing
    # "BUILD SUCCESSFUL"/"BUILD FAILED" line of the Gradle log.
    start_marker = 'run'
    end_marker = 'BUILD'

    start_idx = output.find(start_marker)
    end_idx = output.find(end_marker, start_idx)

    if start_idx == -1 or end_idx == -1:
        print("Tokenized output markers not found.")
        return None

    tokenized_output = output[start_idx + len(start_marker):end_idx].strip()
    tokens = [line.strip() for line in tokenized_output.splitlines() if line.strip()]

    return split_tokens_by_marker(tokens, marker='[marker]')


def split_tokens_by_marker(tokens, marker='[marker]'):
    """Split a flat token list into groups separated by `marker` lines."""
    if tokens is None:
        return []

    split_groups = []
    current_group = []

    for token in tokens:
        if token == marker:
            if current_group:
                split_groups.append(current_group)
                current_group = []
        else:
            current_group.append(token)

    # Flush the final group if the output did not end with a marker.
    if current_group:
        split_groups.append(current_group)

    return split_groups
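
Below is a minimal usage sketch of the runner. The paths and the Gradle project layout are assumptions: it presumes a project whose `run` task prints one token per line with a literal `[marker]` line between groups, which is the format `extract_tokens_from_output` and `split_tokens_by_marker` expect.

# Hypothetical example; paths and project layout are placeholders.
from byte_tok_runner import tokenize_class_file_with_gradle

token_groups = tokenize_class_file_with_gradle(
    project_dir='/path/to/tokenizer-project',  # assumed Gradle project root
    class_file_path='/path/to/Example.class',  # assumed compiled class file
    additional_args=['t'],                     # same flag CustomTokenizer passes
)
if token_groups is not None:
    for i, group in enumerate(token_groups):
        print(i, group)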
@@ -0,0 +1,70 @@
from transformers import PreTrainedTokenizer
from .byte_tok_runner import tokenize_class_file_with_gradle


class CustomTokenizer(PreTrainedTokenizer):
    def __init__(self, vocab_file, **kwargs):
        # Build the vocabulary before calling super().__init__():
        # PreTrainedTokenizer may consult get_vocab()/convert_tokens_to_ids
        # while registering special tokens, and those need token_to_id.
        with open(vocab_file, 'r', encoding='utf-8') as f:
            self.vocab = [line.strip() for line in f]

        self.special_tokens = {
            'pad_token': '<pad>',
            'unk_token': '<unk>',
            'bos_token': '<bos>',
            'eos_token': '<eos>',
            'mask_token': '<mask>',
        }

        # Append any special token the vocab file does not already contain.
        for token in self.special_tokens.values():
            if token not in self.vocab:
                self.vocab.append(token)

        self.token_to_id = {token: idx for idx, token in enumerate(self.vocab)}
        self.id_to_token = {idx: token for idx, token in enumerate(self.vocab)}

        super().__init__(**kwargs)

        self.pad_token = self.special_tokens['pad_token']
        self.pad_token_id = self.token_to_id[self.pad_token]

        self.unk_token = self.special_tokens['unk_token']
        self.unk_token_id = self.token_to_id[self.unk_token]

        self.bos_token = self.special_tokens['bos_token']
        self.bos_token_id = self.token_to_id[self.bos_token]

        self.eos_token = self.special_tokens['eos_token']
        self.eos_token_id = self.token_to_id[self.eos_token]

        self.mask_token = self.special_tokens['mask_token']
        self.mask_token_id = self.token_to_id[self.mask_token]

        # Extra argument forwarded to the Gradle runner.
        self.option = ['t']
        self.unk_token_id_direct = self.token_to_id.get(self.unk_token)

    def tokenize_file(self, project_dir, file_path):
        """Take a file path as input and return the tokenized string sequence."""
        return tokenize_class_file_with_gradle(project_dir, file_path, self.option)

    def convert_tokens_to_ids(self, tokens):
        if isinstance(tokens, list):
            return [self.convert_tokens_to_ids(token) for token in tokens]
        return self.token_to_id.get(tokens, self.unk_token_id_direct)

    def convert_ids_to_tokens(self, ids):
        if isinstance(ids, list):
            return [self.convert_ids_to_tokens(id_) for id_ in ids]
        return self.id_to_token.get(ids, self.unk_token)

    def get_vocab(self):
        return self.token_to_id
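
A short end-to-end sketch of the tokenizer, under the same assumptions as above; the module name and the vocab file (one token per line) are hypothetical:

from custom_tokenizer import CustomTokenizer  # hypothetical module name

tokenizer = CustomTokenizer(vocab_file='vocab.txt')  # newline-delimited vocab

token_groups = tokenizer.tokenize_file(
    '/path/to/tokenizer-project',  # assumed Gradle project root
    '/path/to/Example.class',      # assumed compiled class file
)
if token_groups:
    ids = tokenizer.convert_tokens_to_ids(token_groups[0])
    print(ids)
    print(tokenizer.convert_ids_to_tokens(ids))  # maps ids back to tokens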