Skip to content

Commit

Permalink
added sampling for pretraining data
Browse files Browse the repository at this point in the history
  • Loading branch information
VahidooX committed Nov 19, 2023
1 parent 8e37c9d commit b4cef7e
Showing 1 changed file with 41 additions and 41 deletions.
82 changes: 41 additions & 41 deletions scripts/speechlm_sft/sampling_pretraining_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,47 +100,47 @@
# Path constant for a dataset file under the "english/" prefix (JSON Lines
# extension). NOTE(review): the rest of this group of constants sits above the
# visible hunk — confirm how the path is resolved against the full script.
SEC = "english/sec_tdd_shuf.jsonl"

# CODING DATASETS
# Path constants for the code datasets, one per language, all under the
# "starcoder-repo-len-filt/" prefix and all ending in ".json" here. Most follow
# "<lang>/<lang>_repo_shuf.json"; the exceptions are marked inline.
# The same names are assigned again further down with ".jsonl" suffixes.
# NOTE(review): the base directory these paths are joined to is not visible in
# this hunk — confirm against the rest of the script.
ASMB = "starcoder-repo-len-filt/assembly/assembly_repo_shuf.json"
CPLA = "starcoder-repo-len-filt/c/c_repo_shuf.json"
CSHA = "starcoder-repo-len-filt/c-sharp/c-sharp_repo_shuf.json"
CLIS = "starcoder-repo-len-filt/common-lisp/common-lisp_repo_shuf.json"
CPPP = "starcoder-repo-len-filt/cpp/cpp_repo_shuf.json"
CSSL = "starcoder-repo-len-filt/css/css_repo_shuf.json"
CUDA = "starcoder-repo-len-filt/cuda/cuda_repo_shuf.json"
DART = "starcoder-repo-len-filt/dart/dart_repo_shuf.json"
DOCK = "starcoder-repo-len-filt/dockerfile/dockerfile_repo_shuf.json"
FORT = "starcoder-repo-len-filt/fortran/fortran_repo_shuf.json"
GOPL = "starcoder-repo-len-filt/go/go_repo_shuf.json"
HASK = "starcoder-repo-len-filt/haskell/haskell_repo_shuf.json"
HTML = "starcoder-repo-len-filt/html/html_repo_shuf.json"
JAVA = "starcoder-repo-len-filt/java/java_repo_shuf.json"
JASC = "starcoder-repo-len-filt/javascript/javascript_repo_shuf.json"
JSON = "starcoder-repo-len-filt/json/json_repo_shuf.json"
JULI = "starcoder-repo-len-filt/julia/julia_repo_shuf.json"
JUPY = "starcoder-repo-len-filt/jupyter-scripts-dedup-filtered/jupyter-scripts-dedup-filtered_repo_shuf.json"
LUAL = "starcoder-repo-len-filt/lua/lua_repo_shuf.json"
MAKE = "starcoder-repo-len-filt/makefile/makefile_repo_shuf.json"
MARD = "starcoder-repo-len-filt/markdown/markdown_repo_shuf.json"
MATH = "starcoder-repo-len-filt/mathematica/mathematica_repo_shuf.json"
# Exception to the pattern: sits directly under the prefix, no per-language
# subdirectory and no "_repo_shuf" suffix.
OMNI = "starcoder-repo-len-filt/python_merged_piiremoval.json"
PASC = "starcoder-repo-len-filt/pascal/pascal_repo_shuf.json"
PERL = "starcoder-repo-len-filt/perl/perl_repo_shuf.json"
PHPL = "starcoder-repo-len-filt/php/php_repo_shuf.json"
PYTH = "starcoder-repo-len-filt/python/python_repo_shuf.json"
RSTL = "starcoder-repo-len-filt/restructuredtext/restructuredtext_repo_shuf.json"
RUBY = "starcoder-repo-len-filt/ruby/ruby_repo_shuf.json"
RUST = "starcoder-repo-len-filt/rust/rust_repo_shuf.json"
SCAL = "starcoder-repo-len-filt/scala/scala_repo_shuf.json"
SHEL = "starcoder-repo-len-filt/shell/shell_repo_shuf.json"
SQLP = "starcoder-repo-len-filt/sql/sql_repo_shuf.json"
# Exception to the pattern: "swift_shuf" directly under the prefix.
SWIF = "starcoder-repo-len-filt/swift_shuf.json"
SYSV = "starcoder-repo-len-filt/systemverilog/systemverilog_repo_shuf.json"
TEXP = "starcoder-repo-len-filt/tex/tex_repo_shuf.json"
TYPE = "starcoder-repo-len-filt/typescript/typescript_repo_shuf.json"
VHDL = "starcoder-repo-len-filt/vhdl/vhdl_repo_shuf.json"
# NOTE(review): the directory is spelled "vidual-basic" while the file name
# uses "visual-basic" — possibly a typo; confirm the on-disk directory name
# before changing this path.
VISU = "starcoder-repo-len-filt/vidual-basic/visual-basic_repo_shuf.json"
# Exception to the pattern: "xml_shuf" directly under the prefix.
XMLL = "starcoder-repo-len-filt/xml_shuf.json"
YAML = "starcoder-repo-len-filt/yaml/yaml_repo_shuf.json"
# Path constants for the code datasets, one per language, all under the
# "starcoder-repo-len-filt/" prefix and all ending in ".jsonl". Most follow
# "<lang>/<lang>_repo_shuf.jsonl"; the exceptions are marked inline.
# These reuse the names of the ".json" assignments above; only the extension
# differs.
# NOTE(review): the base directory these paths are joined to is not visible in
# this hunk — confirm against the rest of the script.
ASMB = "starcoder-repo-len-filt/assembly/assembly_repo_shuf.jsonl"
CPLA = "starcoder-repo-len-filt/c/c_repo_shuf.jsonl"
CSHA = "starcoder-repo-len-filt/c-sharp/c-sharp_repo_shuf.jsonl"
CLIS = "starcoder-repo-len-filt/common-lisp/common-lisp_repo_shuf.jsonl"
CPPP = "starcoder-repo-len-filt/cpp/cpp_repo_shuf.jsonl"
CSSL = "starcoder-repo-len-filt/css/css_repo_shuf.jsonl"
CUDA = "starcoder-repo-len-filt/cuda/cuda_repo_shuf.jsonl"
DART = "starcoder-repo-len-filt/dart/dart_repo_shuf.jsonl"
DOCK = "starcoder-repo-len-filt/dockerfile/dockerfile_repo_shuf.jsonl"
FORT = "starcoder-repo-len-filt/fortran/fortran_repo_shuf.jsonl"
GOPL = "starcoder-repo-len-filt/go/go_repo_shuf.jsonl"
HASK = "starcoder-repo-len-filt/haskell/haskell_repo_shuf.jsonl"
HTML = "starcoder-repo-len-filt/html/html_repo_shuf.jsonl"
JAVA = "starcoder-repo-len-filt/java/java_repo_shuf.jsonl"
JASC = "starcoder-repo-len-filt/javascript/javascript_repo_shuf.jsonl"
JSON = "starcoder-repo-len-filt/json/json_repo_shuf.jsonl"
JULI = "starcoder-repo-len-filt/julia/julia_repo_shuf.jsonl"
JUPY = "starcoder-repo-len-filt/jupyter-scripts-dedup-filtered/jupyter-scripts-dedup-filtered_repo_shuf.jsonl"
LUAL = "starcoder-repo-len-filt/lua/lua_repo_shuf.jsonl"
MAKE = "starcoder-repo-len-filt/makefile/makefile_repo_shuf.jsonl"
MARD = "starcoder-repo-len-filt/markdown/markdown_repo_shuf.jsonl"
MATH = "starcoder-repo-len-filt/mathematica/mathematica_repo_shuf.jsonl"
# Exception to the pattern: sits directly under the prefix, no per-language
# subdirectory and no "_repo_shuf" suffix.
OMNI = "starcoder-repo-len-filt/python_merged_piiremoval.jsonl"
PASC = "starcoder-repo-len-filt/pascal/pascal_repo_shuf.jsonl"
PERL = "starcoder-repo-len-filt/perl/perl_repo_shuf.jsonl"
PHPL = "starcoder-repo-len-filt/php/php_repo_shuf.jsonl"
PYTH = "starcoder-repo-len-filt/python/python_repo_shuf.jsonl"
RSTL = "starcoder-repo-len-filt/restructuredtext/restructuredtext_repo_shuf.jsonl"
RUBY = "starcoder-repo-len-filt/ruby/ruby_repo_shuf.jsonl"
RUST = "starcoder-repo-len-filt/rust/rust_repo_shuf.jsonl"
SCAL = "starcoder-repo-len-filt/scala/scala_repo_shuf.jsonl"
SHEL = "starcoder-repo-len-filt/shell/shell_repo_shuf.jsonl"
SQLP = "starcoder-repo-len-filt/sql/sql_repo_shuf.jsonl"
# Exception to the pattern: "swift_shuf" directly under the prefix.
SWIF = "starcoder-repo-len-filt/swift_shuf.jsonl"
SYSV = "starcoder-repo-len-filt/systemverilog/systemverilog_repo_shuf.jsonl"
TEXP = "starcoder-repo-len-filt/tex/tex_repo_shuf.jsonl"
TYPE = "starcoder-repo-len-filt/typescript/typescript_repo_shuf.jsonl"
VHDL = "starcoder-repo-len-filt/vhdl/vhdl_repo_shuf.jsonl"
# NOTE(review): the directory is spelled "vidual-basic" while the file name
# uses "visual-basic" — possibly a typo; confirm the on-disk directory name
# before changing this path.
VISU = "starcoder-repo-len-filt/vidual-basic/visual-basic_repo_shuf.jsonl"
# Exception to the pattern: "xml_shuf" directly under the prefix.
XMLL = "starcoder-repo-len-filt/xml_shuf.jsonl"
YAML = "starcoder-repo-len-filt/yaml/yaml_repo_shuf.jsonl"

DATA_BLEND_NONENGLISH = {
AR2240: 0.0015,
Expand Down

0 comments on commit b4cef7e

Please sign in to comment.