Merge pull request #139 from shibing624/dev
fix #132
shibing624 authored Aug 1, 2023
2 parents 025a159 + 20ecca9 commit 92dc8c9
Showing 15 changed files with 649,705 additions and 1,058 deletions.
59 changes: 59 additions & 0 deletions build_domain_tokenizer.py
@@ -0,0 +1,59 @@
# -*- coding: utf-8 -*-
"""
@author:XuMing([email protected])
@description: Build a Chinese domain tokenizer from a corpus txt file.
Trains a sentencepiece model from the corpus and produces `m.model` and `m.vocab`;
`m.vocab` is only a reference and is not used during segmentation, e.g.:
spm.SentencePieceTrainer.train('--input=data/pretrain/tianlongbabu.txt --model_prefix=m --vocab_size=20000')
"""
import argparse

import sentencepiece as spm


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--in_file', default='data/pretrain/tianlongbabu.txt', type=str)
    parser.add_argument('--domain_sp_model_name', default='domain_sp', type=str)
    parser.add_argument('--max_sentence_length', default=16384, type=int)
    parser.add_argument('--pad_id', default=3, type=int)
    parser.add_argument('--vocab_size', default=10000, type=int)
    parser.add_argument('--model_type', default="BPE", type=str)

    args = parser.parse_args()
    print(args)

    spm.SentencePieceTrainer.train(
        input=args.in_file,
        model_prefix=args.domain_sp_model_name,
        shuffle_input_sentence=False,
        train_extremely_large_corpus=True,
        max_sentence_length=args.max_sentence_length,
        pad_id=args.pad_id,
        model_type=args.model_type,
        vocab_size=args.vocab_size,
        split_digits=True,
        split_by_unicode_script=True,
        byte_fallback=True,
        allow_whitespace_only_pieces=True,
        remove_extra_whitespaces=False,
        normalization_rule_name="nfkc",
    )

    # Make a segmenter instance and load the trained model file (domain_sp.model by default)
    sp = spm.SentencePieceProcessor()
    model_file = args.domain_sp_model_name + '.model'
    sp.load(model_file)

    # encode: text => id
    print(sp.encode_as_pieces('慕容复来到河边,this is a test'))
    print(sp.encode_as_ids('this is a test'))

    # decode: id => text
    print(sp.decode_pieces(['▁This', '▁is', '▁a', '▁t', 'est']))
    # print(sp.decode_ids([209, 31, 9, 375, 586]))


if __name__ == '__main__':
    main()
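
For reference, a quick sanity check one might run after training with the defaults above (a minimal sketch, not part of the commit; it assumes the default --domain_sp_model_name of domain_sp, and the sample sentence is invented):

# Sanity-check sketch: assumes domain_sp.model was produced by the script's defaults.
import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.load('domain_sp.model')

text = '段誉在2023年看了一本书'  # invented sample; split_digits=True should split 2023 into single-digit pieces
print(sp.encode_as_pieces(text))
print(sp.encode_as_ids(text))

# With byte_fallback=True, characters unseen during training remain encodable,
# so an encode/decode round trip should give back the original text.
print(sp.decode_ids(sp.encode_as_ids(text)))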
29 changes: 17 additions & 12 deletions convert_alpaca.py → convert_dataset.py
@@ -12,31 +12,36 @@
 parser = argparse.ArgumentParser()
 parser.add_argument("--in_file", type=str)
 parser.add_argument("--out_file", type=str)
+parser.add_argument("--data_type", type=str, default='alpaca')
 args = parser.parse_args()

 print(args)
 data_files = {"train": args.in_file}
 raw_datasets = load_dataset('json', data_files=data_files)
+ds = raw_datasets['train']


-def process(examples):
-    ids = []
+def process_alpaca(examples):
     convs = []
-    langs = []
-    id = 0
     for instruction, inp, output in zip(examples['instruction'], examples['input'], examples['output']):
         if len(inp.strip()) > 1:
             instruction = instruction + '\nInput:\n' + inp
         q = instruction
         a = output
         convs.append([
             {"from": "human", "value": q},
-            {"from": "gpt", "value": a},
+            {"from": "gpt", "value": a}
         ])
-        id += 1
-        ids.append(f'alpaca_{id}')
-        langs.append('zh')
-    return {'id': ids, 'conversations': convs, 'lang': langs}
+    return {"conversations": convs}


+if args.data_type in ['alpaca']:
+    ds = ds.map(process_alpaca, batched=True, remove_columns=ds.column_names, desc="Running process")
+else:
+    # Other sharegpt dataset, need rename to conversations and remove unused columns
+    if "items" in ds.column_names:
+        ds = ds.rename(columns={"items": "conversations"})
+    columns_to_remove = ds.column_names.copy()
+    columns_to_remove.remove('conversations')
+    ds = ds.remove_columns(columns_to_remove)

-dataset = raw_datasets['train'].map(process, batched=True, remove_columns=raw_datasets['train'].column_names)
-dataset.to_json(f"{args.out_file}", lines=True, force_ascii=False)
+ds.to_json(f"{args.out_file}", lines=True, force_ascii=False)
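
To illustrate the conversion, a minimal sketch of how one alpaca-style record maps to the sharegpt-style "conversations" structure written by convert_dataset.py (the sample text is invented; the field names and the '\nInput:\n' rule follow process_alpaca above):

# Illustrative sketch of the alpaca -> conversations mapping performed by process_alpaca.
alpaca_record = {
    "instruction": "把下面的句子翻译成英文",  # invented sample record
    "input": "慕容复来到河边",
    "output": "Murong Fu came to the river.",
}

instruction = alpaca_record["instruction"]
if len(alpaca_record["input"].strip()) > 1:
    # Same rule as the script: fold a non-empty input into the instruction text.
    instruction = instruction + '\nInput:\n' + alpaca_record["input"]

converted = {
    "conversations": [
        {"from": "human", "value": instruction},
        {"from": "gpt", "value": alpaca_record["output"]},
    ]
}
print(converted)  # one such JSON object per line in the output .jsonl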
1,000 changes: 0 additions & 1,000 deletions data/finetune/sharegpt_zh_1K.json

This file was deleted.

1,000 changes: 1,000 additions & 0 deletions data/finetune/sharegpt_zh_1K_format.jsonl

Large diffs are not rendered by default.
