
Commit: init
ehdrb01 committed Dec 3, 2024
1 parent 11f8882 commit 23ce186
Showing 60,022 changed files with 21,997,428 additions and 0 deletions. (Only the first 3000 changed files are loaded here.)
80 changes: 80 additions & 0 deletions Data/byte_tok_runner.py
@@ -0,0 +1,80 @@
import subprocess
import shlex


def tokenize_class_file_with_gradle(project_dir, class_file_path, additional_args):
    # Build the Gradle 'run' invocation, forwarding the class file path and
    # any extra options through --args (shell-quoted to survive spaces).
    gradle_cmd = ['gradle', 'run']
    args_list = [class_file_path] + additional_args
    args_str = ' '.join(shlex.quote(arg) for arg in args_list)
    gradle_cmd.append(f'--args={args_str}')

    process = subprocess.run(
        gradle_cmd,
        cwd=project_dir,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True
    )

    if process.returncode != 0:
        print(f"Error in Gradle execution: {process.stderr}")
        return None

    # Parse the token groups out of Gradle's stdout.
    return extract_tokens_from_output(process.stdout)


def extract_tokens_from_output(output):
    # The tokenized output sits between the 'run' task banner and the
    # 'BUILD' status line that Gradle prints around the program's stdout.
    start_marker = 'run'
    end_marker = 'BUILD'

    start_idx = output.find(start_marker)
    end_idx = output.find(end_marker, start_idx)

    if start_idx == -1 or end_idx == -1:
        print("Tokenized output markers not found.")
        return None

    tokenized_output = output[start_idx + len(start_marker):end_idx].strip()

    # One token per line; drop blank lines and surrounding whitespace.
    tokens = [line.strip() for line in tokenized_output.splitlines() if line.strip()]
    return split_tokens_by_marker(tokens, marker='[marker]')


def split_tokens_by_marker(tokens, marker='[marker]'):
    # Split the flat token stream into groups wherever the sentinel marker
    # appears; empty groups are discarded.
    if tokens is None:
        return []

    split_groups = []
    current_group = []

    for token in tokens:
        if token == marker:
            if current_group:
                split_groups.append(current_group)
            current_group = []
        else:
            current_group.append(token)

    # Flush the trailing group, if any.
    if current_group:
        split_groups.append(current_group)

    return split_groups
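
A minimal usage sketch for the runner above, assuming the Gradle project's run task prints one token per line between the 'run' and 'BUILD' markers and that the script runs from the repository root; both paths below are hypothetical placeholders:

    from Data.byte_tok_runner import tokenize_class_file_with_gradle

    # Hypothetical paths; substitute a real Gradle project and .class file.
    token_groups = tokenize_class_file_with_gradle(
        project_dir='/path/to/tokenizer-project',
        class_file_path='/path/to/Example.class',
        additional_args=['t'],  # the option flag tokenizer.py passes
    )
    if token_groups is not None:
        for group in token_groups:
            print(group)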
70 changes: 70 additions & 0 deletions Data/tokenizer.py
@@ -0,0 +1,70 @@
from transformers import PreTrainedTokenizer
from .byte_tok_runner import tokenize_class_file_with_gradle


class CustomTokenizer(PreTrainedTokenizer):
    def __init__(self, vocab_file, **kwargs):
        super().__init__(**kwargs)

        # Load the vocabulary, one token per line.
        with open(vocab_file, 'r', encoding='utf-8') as f:
            self.vocab = [line.strip() for line in f]

        self.special_tokens = {
            'pad_token': '<pad>',
            'unk_token': '<unk>',
            'bos_token': '<bos>',
            'eos_token': '<eos>',
            'mask_token': '<mask>',
        }

        # Append any special tokens missing from the vocabulary file.
        for token in self.special_tokens.values():
            if token not in self.vocab:
                self.vocab.append(token)

        self.token_to_id = {token: idx for idx, token in enumerate(self.vocab)}
        self.id_to_token = {idx: token for idx, token in enumerate(self.vocab)}

        self.pad_token = self.special_tokens['pad_token']
        self.pad_token_id = self.token_to_id[self.pad_token]

        self.unk_token = self.special_tokens['unk_token']
        self.unk_token_id = self.token_to_id[self.unk_token]

        self.bos_token = self.special_tokens['bos_token']
        self.bos_token_id = self.token_to_id[self.bos_token]

        self.eos_token = self.special_tokens['eos_token']
        self.eos_token_id = self.token_to_id[self.eos_token]

        self.mask_token = self.special_tokens['mask_token']
        self.mask_token_id = self.token_to_id[self.mask_token]

        # Option flags passed through to the Gradle tokenizer runner.
        self.option = ['t']
        # Direct lookup of the <unk> id, used as the fallback in
        # convert_tokens_to_ids.
        self.unk_token_id_direct = self.token_to_id.get(self.unk_token, None)

    def tokenize_file(self, project_dir, file_path):
        """Takes a file path as input and returns the tokenized string sequences."""
        return tokenize_class_file_with_gradle(project_dir, file_path, self.option)

    def convert_tokens_to_ids(self, tokens):
        # Recurse over nested lists; unknown tokens map to the <unk> id.
        if isinstance(tokens, list):
            return [self.convert_tokens_to_ids(token) for token in tokens]
        return self.token_to_id.get(tokens, self.unk_token_id_direct)

    def convert_ids_to_tokens(self, ids):
        # Recurse over nested lists; unknown ids map back to the <unk> token.
        if isinstance(ids, list):
            return [self.convert_ids_to_tokens(id_) for id_ in ids]
        return self.id_to_token.get(ids, self.unk_token)

    def get_vocab(self):
        return self.token_to_id
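
A hedged usage sketch of CustomTokenizer, assuming a newline-delimited vocab file and a transformers version that accepts this constructor pattern; 'vocab.txt' and both paths are placeholders:

    from Data.tokenizer import CustomTokenizer

    tokenizer = CustomTokenizer(vocab_file='vocab.txt')  # placeholder vocab file
    groups = tokenizer.tokenize_file('/path/to/tokenizer-project', '/path/to/Example.class')
    if groups:
        ids = tokenizer.convert_tokens_to_ids(groups)  # nested lists of ids
        print(tokenizer.convert_ids_to_tokens(ids))    # round-trips back to tokens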
70 changes: 70 additions & 0 deletions Data/tokenizer_bert.py
@@ -0,0 +1,70 @@
from transformers import PreTrainedTokenizer
from .byte_tok_runner import tokenize_class_file_with_gradle


class CustomTokenizer(PreTrainedTokenizer):
    def __init__(self, vocab_file, **kwargs):
        super().__init__(**kwargs)

        # Load the vocabulary, one token per line.
        with open(vocab_file, 'r', encoding='utf-8') as f:
            self.vocab = [line.strip() for line in f]

        self.special_tokens = {
            'pad_token': '<pad>',
            'unk_token': '<unk>',
            'bos_token': '<bos>',
            'eos_token': '<eos>',
            'mask_token': '<mask>',
        }

        # Append any special tokens missing from the vocabulary file.
        for token in self.special_tokens.values():
            if token not in self.vocab:
                self.vocab.append(token)

        self.token_to_id = {token: idx for idx, token in enumerate(self.vocab)}
        self.id_to_token = {idx: token for idx, token in enumerate(self.vocab)}

        self.pad_token = self.special_tokens['pad_token']
        self.pad_token_id = self.token_to_id[self.pad_token]

        self.unk_token = self.special_tokens['unk_token']
        self.unk_token_id = self.token_to_id[self.unk_token]

        self.bos_token = self.special_tokens['bos_token']
        self.bos_token_id = self.token_to_id[self.bos_token]

        self.eos_token = self.special_tokens['eos_token']
        self.eos_token_id = self.token_to_id[self.eos_token]

        self.mask_token = self.special_tokens['mask_token']
        self.mask_token_id = self.token_to_id[self.mask_token]

        # Option flags passed through to the Gradle tokenizer runner.
        self.option = ['t']
        # Direct lookup of the <unk> id, used as the fallback in
        # convert_tokens_to_ids.
        self.unk_token_id_direct = self.token_to_id.get(self.unk_token, None)

    def tokenize_file(self, project_dir, file_path):
        """Takes a file path as input and returns the tokenized string sequences."""
        return tokenize_class_file_with_gradle(project_dir, file_path, self.option)

    def convert_tokens_to_ids(self, tokens):
        # Recurse over nested lists; unknown tokens map to the <unk> id.
        if isinstance(tokens, list):
            return [self.convert_tokens_to_ids(token) for token in tokens]
        return self.token_to_id.get(tokens, self.unk_token_id_direct)

    def convert_ids_to_tokens(self, ids):
        # Recurse over nested lists; unknown ids map back to the <unk> token.
        if isinstance(ids, list):
            return [self.convert_ids_to_tokens(id_) for id_ in ids]
        return self.id_to_token.get(ids, self.unk_token)

    def get_vocab(self):
        return self.token_to_id
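
Since this BERT-oriented variant exposes a mask token, a small sketch of MLM-style masking over one id group; it assumes an already-constructed tokenizer instance, and the 15% rate is the usual BERT default, not something set in this commit:

    import random

    def mask_ids(ids, tokenizer, prob=0.15):
        # Replace each id with the mask token id with probability `prob`.
        return [tokenizer.mask_token_id if random.random() < prob else i
                for i in ids]

    # e.g. masked = mask_ids(ids[0], tokenizer)  # ids from convert_tokens_to_ids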
