Code clean-ups (#171)
* misc optimization
* clear cache after translation during scoring
* allow more recompiles
* set rope / position_embeddings at model build
* remove BPTT
* clarify pad_mask (true = yes we pad, so we won't attend) and attn_mask (true = yes we attend); see the sketch after this list
* preallocate KV cache even in "pytorch" path (same as flash)
* reduce config updates
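
For readers skimming the diff, here is a minimal sketch of the mask convention described in the pad_mask/attn_mask bullet. This illustrates the convention only, with made-up tensor names, not the exact eole API:

```python
import torch

# pad_mask: True = "we padded this position, do NOT attend to it"
# attn_mask: True = "yes, we attend to this position"
lengths = torch.tensor([3, 5])                  # true lengths of two sequences
max_len = 5
positions = torch.arange(max_len).unsqueeze(0)  # shape (1, max_len)

pad_mask = positions >= lengths.unsqueeze(1)    # shape (batch, max_len)
attn_mask = ~pad_mask                           # one is the negation of the other

print(pad_mask[0])   # tensor([False, False, False,  True,  True])
print(attn_mask[0])  # tensor([ True,  True,  True, False, False])
```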
vince62s authored Jan 3, 2025
1 parent b5e2266 commit 8a8987f
Showing 42 changed files with 562 additions and 588 deletions.
22 changes: 11 additions & 11 deletions .github/workflows/push.yml
@@ -48,7 +48,7 @@ jobs:
-src_vocab /tmp/eole.vocab.src \
-tgt_vocab /tmp/eole.vocab.tgt \
&& rm -rf /tmp/sample
- name: Test field/transform dump
- name: Testing architecture rnn sample dump...
run: |
# The dumped fields are used later when testing tools
python eole/bin/main.py train \
@@ -61,7 +61,7 @@ jobs:
-tgt_vocab /tmp/eole.vocab.tgt \
-src_vocab_size 1000 \
-tgt_vocab_size 1000
- name: Test RNN training
- name: Testing architecture rnn training
run: |
python eole/bin/main.py train \
-config eole/tests/data/data.yaml \
@@ -75,7 +75,7 @@ jobs:
-tensorboard \
-tensorboard_log_dir /tmp/logs_train
python eole/tests/test_events.py --logdir /tmp/logs_train -tensorboard_checks train
- name: Test RNN training and validation
- name: Testing architecture rnn training and validation
run: |
python eole/bin/main.py train \
-config eole/tests/data/data.yaml \
@@ -90,7 +90,7 @@ jobs:
-tensorboard_log_dir /tmp/logs_train_and_valid
python eole/tests/test_events.py --logdir /tmp/logs_train_and_valid -tensorboard_checks train
python eole/tests/test_events.py --logdir /tmp/logs_train_and_valid -tensorboard_checks valid
- name: Test RNN training with coverage
- name: Testing architecture rnn training w/ coverage
run: |
python eole/bin/main.py train \
-config eole/tests/data/data.yaml \
@@ -101,7 +101,7 @@ jobs:
-report_every 5 \
-model '{"architecture": "rnn", "hidden_size": 10, "embeddings": {"word_vec_size": 5, "position_encoding_type": None}, "decoder": {"coverage_attn": True, "lambda_coverage": 0.1}}' \
-training '{"batch_size": 10, "num_workers": 0, "bucket_size": 1024, "train_steps": 10}'
- name: Test Transformer training with align
- name: Testing architecture custom transformer training w/ align
run: |
python eole/bin/main.py train \
-config eole/tests/data/align_data.yaml \
@@ -112,7 +112,7 @@ jobs:
-model '{"layers": 4, "hidden_size": 16, "transformer_ff": 64, "embeddings": {"word_vec_size": 16}, "encoder": {"encoder_type": "transformer", "heads": 2}, "decoder": {"decoder_type": "transformer", "lambda_align": 0.05, "alignment_layer": 2, "alignment_heads": 0, "heads": 2}}' \
-training '{"batch_size": 10, "num_workers": 0, "bucket_size": 1024, "train_steps": 10, "dropout_steps": [0, 3, 7], "dropout": [0.3, 0.2, 0.1], "attention_dropout": [0.2, 0.2, 0.1]}' \
-report_every 5 \
- name : Test Transformer training and validation with dynamic scoring
- name : Testing architecture custom transformer training w/ validation with dynamic scoring
run: |
python3 eole/bin/main.py train \
-config eole/tests/data/data.yaml \
@@ -129,7 +129,7 @@ jobs:
-tensorboard_log_dir /tmp/logs_dynamic-scoring \
-dump_preds /tmp/dump_preds
python eole/tests/test_events.py --logdir /tmp/logs_dynamic-scoring -tensorboard_checks valid_metrics
- name : Test Transformer training and validation with dynamic scoring and maxrelative
- name : Testing architecture transformer training w/ validation with dynamic scoring and maxrelative
run: |
python3 eole/bin/main.py train \
-config eole/tests/data/data.yaml \
@@ -146,15 +146,15 @@ jobs:
-tensorboard_log_dir /tmp/logs_dynamic-scoring_and_relative \
-dump_preds /tmp/dump_preds
python eole/tests/test_events.py --logdir /tmp/logs_dynamic-scoring_and_relative -tensorboard_checks valid_metrics
- name : Test Transformer training and validation with dynamic scoring and rotary
- name : Testing architecture transformer training w/ validation with dynamic scoring and rotary
run: |
python3 eole/bin/main.py train \
-config eole/tests/data/data.yaml \
-src_vocab /tmp/eole.vocab.src \
-tgt_vocab /tmp/eole.vocab.tgt \
-src_vocab_size 1000 \
-tgt_vocab_size 1000 \
-model '{"architecture": "transformer", "layers": 4, "heads": 2, "hidden_size": 16, "transformer_ff": 64, "embeddings": {"word_vec_size": 16, "position_encoding_type": "Rotary"}}' \
-model '{"architecture": "transformer", "layers": 4, "heads": 2, "hidden_size": 16, "transformer_ff": 64, "rope_config": {}, "embeddings": {"word_vec_size": 16, "position_encoding_type": "Rotary"}}' \
-training '{"batch_size": 10, "num_workers": 0, "bucket_size": 1024, "train_steps": 10, "valid_steps": 5}' \
-report_every 2 \
-valid_metrics "BLEU" "TER" \
@@ -163,7 +163,7 @@ jobs:
-tensorboard_log_dir /tmp/logs_dynamic-scoring_and_rotary \
-dump_preds /tmp/dump_preds
python eole/tests/test_events.py --logdir /tmp/logs_dynamic-scoring_and_rotary -tensorboard_checks valid_metrics
- name : Test Transformer training and validation with dynamic scoring and alibi
- name : Testing architecture transformer training w/ validation with dynamic scoring and alibi
run: |
python3 eole/bin/main.py train \
-config eole/tests/data/data.yaml \
@@ -180,7 +180,7 @@ jobs:
-tensorboard_log_dir /tmp/logs_dynamic-scoring_and_alibi \
-dump_preds /tmp/dump_preds
python eole/tests/test_events.py --logdir /tmp/logs_dynamic-scoring_and_alibi -tensorboard_checks valid_metrics
- name: Test LM training
- name: Testing architecture custom decoder only training
run: |
python eole/bin/main.py train \
-config eole/tests/data/lm_data.yaml \
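
Note how the rotary test above now passes an explicit `"rope_config": {}` in the model JSON, in line with the "set rope / position_embeddings at model build" bullet. A minimal sketch of that idea, assuming simplified config classes (the real ones live in eole/config/models.py):

```python
from dataclasses import dataclass
from typing import Optional

@dataclass
class RotaryPositionConfig:
    # Illustrative default; see eole/config for the real fields.
    rotary_theta: float = 10000.0

@dataclass
class ModelConfig:
    position_encoding_type: str = "Rotary"
    rope_config: Optional[RotaryPositionConfig] = None

    def resolve(self) -> "ModelConfig":
        # At model-build time, default the rotary config exactly once
        # instead of patching it later during training or inference.
        if self.position_encoding_type == "Rotary" and self.rope_config is None:
            self.rope_config = RotaryPositionConfig()
        return self

cfg = ModelConfig().resolve()
print(cfg.rope_config)  # RotaryPositionConfig(rotary_theta=10000.0)
```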
21 changes: 13 additions & 8 deletions README.md
@@ -2,30 +2,35 @@

[![Documentation](https://img.shields.io/badge/docs-latest-blue.svg)](https://eole-nlp.github.io/eole)

Open language modeling toolkit based on [PyTorch](https://pytorch.org).
Open language modeling toolkit based on [PyTorch](https://pytorch.org), initially spun off from OpenNMT-py.

## 👷‍♂️🚧 Work in Progress
We aim to maintain the research-friendly approach of the original project while including the latest architectures (LLMs) and various other techniques.
Our goal is to provide a comprehensive yet compact and modular codebase for experimenting with various types of language models (encoder, decoder, seq2seq).

[EOLE](https://github.com/eole-nlp/eole) is a spin-off of the [OpenNMT-py](https://github.com/opennmt/opennmt-py) project. We aim to maintain the research-friendly approach of the original project while updating the structure and expanding it to include new topics related to large language models (LLMs) and various other techniques. Our goal is to provide a comprehensive yet compact and modular codebase for experimenting with various types of language models (encoder, decoder, seq2seq).
## Latest developments

---
- **Web-based (Google Translate-like) interface** featuring the latest EuroLLM-9B-Instruct LLM: read more [here](https://github.com/eole-nlp/eole/tree/main/recipes/eurollm)
- **Estimator layer**, which enables rescoring multiple beams within the same model. Read the articles [here](https://medium.com/p/05b00b271a47) and [here](https://medium.com/p/7dccfe167814)
- **Support for Hugging Face tokenizers** for better compatibility
- **New recipes** for TowerInstruct-Llama2 and TowerInstruct-Mistral
- **Support for the latest models**: Llama3.1, Gemma2, Pixtral
- **Replicated CometKiwi (XL/XXL)** Encoder+Estimator models

### Current State
## Work completed

We have made significant progress in several areas:

- **Configuration Management**: Streamlined through [pydantic](https://docs.pydantic.dev) models (see the sketch after this list).
- **Command Line Entry Points**: Improved using structured subparsers for better organization.
- **Reproducible Recipes**: Provided for widely used models and tasks, ensuring consistency and reliability.
- **Core API Simplification**: Refined around the new configuration objects for ease of use.
- **Revamped FastAPI-based server**: see the EuroLLM-9B-Instruct example above.
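
As a rough illustration of the pydantic-based configuration mentioned in the first bullet (the field names here are invented; the real schemas live in eole/config/):

```python
from pydantic import BaseModel, Field, model_validator

class TrainingConfig(BaseModel):
    # Made-up fields for illustration only.
    batch_size: int = Field(default=10, description="Batch size.")
    train_steps: int = Field(default=1000, description="Number of training steps.")

    @model_validator(mode="after")
    def _validate(self):
        if self.batch_size <= 0:
            raise ValueError("batch_size must be positive")
        return self

cfg = TrainingConfig(batch_size=32)
print(cfg.model_dump())  # {'batch_size': 32, 'train_steps': 1000}
```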

### Future Directions

There are still several exciting avenues to explore:

- **Further Simplification and Refactoring**: Continue enhancing the codebase for clarity and efficiency.
- **Inference Server**: Develop a robust solution for model inference.
- **Additional Recipes**: Expand the library of reproducible recipes.
- **Documentation**: Enhance and expand the documentation for better user guidance.
- **Test Coverage**: Improve testing to ensure code reliability and performance.
- **Logging Enhancements**: Implement more sophisticated logging mechanisms.
@@ -37,7 +42,7 @@ There are still several exciting avenues to explore:

- **Versatile Training and Inference**: Train from scratch, finetune, and infer models of various architectures including Transformer Encoder/Decoder/EncoderDecoder and RNN EncoderDecoder.
- **Dynamic Data Transforms**: Apply on-the-fly transformations in the dataloading logic for both training and inference.
- **Comprehensive LLM Support**: Includes converters for Llama, Mistral, Phi, OpenLlama, Redpajama, MPT-7B, and Falcon models.
- **Comprehensive LLM Support**: Includes converters for Llama, Mistral, Phi, Gemma, and more.
- **Advanced Quantization**: Support for 8-bit and 4-bit quantization, along with LoRA adapters, with or without checkpointing, as well as mixed precision (FP16); see the sketch after this list.
- **Efficient Finetuning**: Finetune 7B and 13B models on a single RTX 24GB GPU using 4-bit quantization.
- **Flexible Inference**: Perform inference in 4-bit or 8-bit using the same layer quantization methods as in finetuning.
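
As a sketch of what the quantization and LoRA bullets above mean in practice, here is an illustrative training-config fragment in the same spirit as the inline `-training` JSON used in the CI workflow. `quant_type` does appear in the eole/config/run.py diff further down, but treat the rest of the field names as assumptions and check eole/config/training.py for the authoritative schema:

```python
# Illustrative only, not a verified eole configuration.
training = {
    "quant_type": "bnb_NF4",       # 4-bit NormalFloat quantization (assumed value)
    "quant_layers": ["linear_values", "linear_query"],  # assumed layer names
    "lora_layers": ["linear_values", "linear_query"],   # assumed layer names
    "lora_rank": 8,
    "lora_dropout": 0.05,
    "batch_size": 8,
    "train_steps": 1000,
}
```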
9 changes: 4 additions & 5 deletions eole/bin/convert/convert_HF.py
@@ -836,7 +836,7 @@ def get_weight(checkpoint, tensor_name):
for target in targetlist:
if target in key_maps[arch].keys():
source = key_maps[arch][target]
if type(source) == tuple:
if isinstance(source, tuple):
srckey = source[0]
srcmap = source[1]
else:
@@ -847,7 +847,7 @@ def get_weight(checkpoint, tensor_name):
)

if w is not None:
if type(source) == tuple:
if isinstance(source, tuple):
w = eval("w" + srcmap).contiguous()
eole_safetensor[
eole_prefix + str(i) + target + param
@@ -859,9 +859,8 @@ def get_weight(checkpoint, tensor_name):
idx = 1
for p in ["weight", "bias"]:
if ".input_layernorm." + p in key_maps[arch].keys():
if (
type(key_maps[arch][".input_layernorm." + p])
== tuple
if isinstance(
key_maps[arch][".input_layernorm." + p], tuple
):
w = get_weight(
checkpoint,
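
The convert_HF.py changes above (and the serve.py change below) replace exact-type checks like `type(source) == tuple` with `isinstance(source, tuple)`. A short demonstration of why isinstance is the idiomatic choice: it also accepts subclasses such as namedtuples.

```python
from collections import namedtuple

Point = namedtuple("Point", ["x", "y"])
p = Point(1, 2)

print(type(p) == tuple)      # False: exact-type comparison misses subclasses
print(isinstance(p, tuple))  # True: a namedtuple is a tuple subclass
```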
2 changes: 1 addition & 1 deletion eole/bin/run/serve.py
@@ -267,7 +267,7 @@ def infer(self, inputs, settings={}, is_chat=False):
"""
Run inference on the given inputs.
"""
if type(inputs) == str:
if isinstance(inputs, str):
inputs = [inputs]
if not (self.loaded):
self.load()
70 changes: 51 additions & 19 deletions eole/config/models.py
@@ -287,9 +287,11 @@ class TransformerConfig(Config):

@model_validator(mode="after")
def _validate_transformer_config(self):
"""
if self.position_encoding_type == PositionEncodingType.Rotary:
if self.rope_config is None:
self.rope_config = RotaryPositionConfig()
"""
if self.add_qkvbias and "add_final_linear_bias" not in self.model_fields_set:
self.update(add_final_linear_bias=True)
return self
@@ -503,40 +505,70 @@ def default_architecture(cls, data: Any) -> Any:
return data

def update_model_opts(self):
if self.embeddings is not None and self.embeddings.word_vec_size > 0:
self.embeddings.src_word_vec_size = self.embeddings.word_vec_size
self.embeddings.tgt_word_vec_size = self.embeddings.word_vec_size
update_dict = {}
if self.embeddings.position_encoding_type == PositionEncodingType.Rotary:
if not self.rope_config:
update_dict["rope_config"] = RotaryPositionConfig()
rope_config = update_dict["rope_config"]
else:
rope_config = self.rope_config
else:
rope_config = None

# Backward compatibility with "fix_word_vecs_*" opts
# We can probably drop this now...
# if hasattr(self, "fix_word_vecs_enc"):
# self.embeddings.freeze_word_vecs_enc = self.embeddings.fix_word_vecs_enc
# if hasattr(self, "fix_word_vecs_dec"):
# self.embeddings.freeze_word_vecs_dec = self.embeddings.fix_word_vecs_dec
if self.embeddings is not None and self.embeddings.word_vec_size > 0:
update_dict["embeddings"] = {
"src_word_vec_size": self.embeddings.word_vec_size,
"tgt_word_vec_size": self.embeddings.word_vec_size,
}
if self.embeddings is not None and "embeddings" in update_dict.keys():
self.embeddings.update(**update_dict.pop("embeddings"))

if (
getattr(self.encoder, "encoder_type", None) == "brnn"
and self.decoder.decoder_type == "rnn"
):
self.decoder.bidirectional_encoder = True
update_dict["decoder"] = {"bidirectional_encoder": True}

if self.encoder is not None:
self.encoder.src_word_vec_size = self.embeddings.src_word_vec_size
update_dict["encoder"] = {
"src_word_vec_size": self.embeddings.src_word_vec_size
}
if getattr(self.encoder, "encoder_type", None) == "transformer":
self.encoder.position_encoding_type = (
self.embeddings.position_encoding_type
update_dict["encoder"].update(
{
"position_encoding_type": self.embeddings.position_encoding_type,
"n_positions": self.embeddings.n_positions,
"rope_config": rope_config,
}
)
self.encoder.n_positions = self.embeddings.n_positions
update_dict[
"position_encoding_type"
] = self.embeddings.position_encoding_type
if self.encoder is not None and "encoder" in update_dict.keys():
self.encoder.update(**update_dict.pop("encoder"))

if self.decoder is not None:
self.decoder.tgt_word_vec_size = self.embeddings.tgt_word_vec_size
update_dict["decoder"] = {
"tgt_word_vec_size": self.embeddings.tgt_word_vec_size
}
if getattr(self.decoder, "decoder_type", None) in [
"transformer",
"transformer_lm",
]:
self.decoder.position_encoding_type = (
self.embeddings.position_encoding_type
update_dict["decoder"].update(
{
"position_encoding_type": self.embeddings.position_encoding_type,
"n_positions": self.embeddings.n_positions,
"rope_config": rope_config,
}
)
self.decoder.n_positions = self.embeddings.n_positions
update_dict[
"position_encoding_type"
] = self.embeddings.position_encoding_type
if self.decoder is not None and "decoder" in update_dict.keys():
self.decoder.update(**update_dict.pop("decoder"))

self.update(**update_dict)

# causing some weird recursion issue in unit test, to investigate
# if self.encoder is not None:
@@ -584,7 +616,7 @@ def _validate_model_config(self):
return self


class CustomModelConfig(BaseModelConfig):
class CustomModelConfig(TransformerConfig, BaseModelConfig):
"""
Wrap anything that does not fit a set common architecture.
"""
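
The update_model_opts rewrite above collects all changes in a local update_dict and applies them with a single self.update(**update_dict) call at the end, which is what the "reduce config updates" bullet refers to; the run.py diff below follows the same pattern. A minimal sketch of the idea, with a stand-in update() helper (the real one goes through pydantic validation):

```python
class Config:
    """Stand-in for eole's pydantic-based config objects."""

    def __init__(self, **kwargs):
        self.__dict__.update(kwargs)

    def update(self, **kwargs):
        # The real helper re-validates; here we just assign,
        # to show the control flow.
        for key, value in kwargs.items():
            setattr(self, key, value)

cfg = Config(hidden_size=16, rope_config=None)

update_dict = {}                      # collect every change first...
if cfg.rope_config is None:
    update_dict["rope_config"] = {}   # placeholder value
update_dict["hidden_size"] = 32

cfg.update(**update_dict)             # ...then apply them in one call
print(cfg.hidden_size, cfg.rope_config)  # 32 {}
```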
14 changes: 8 additions & 6 deletions eole/config/run.py
@@ -187,24 +187,26 @@ def _update_with_model_config(self):
quant_type=training_config.quant_type,
)

model_config._validate_model_config()
# training_config._validate_running_config() # not sure it's needed

self.update(
model=model_config,
)

update_dict = {}
if "transforms" not in self.model_fields_set:
self.transforms = self._all_transform = transforms
update_dict["transforms"] = transforms
update_dict["_all_transform"] = transforms
if "transforms_configs" not in self.model_fields_set:
self.transforms_configs = config_dict.get("transforms_configs", {})
update_dict["transforms_configs"] = NestedAllTransformsConfig(
**config_dict.get("transforms_configs", {})
)
if "compute_dtype" not in self.model_fields_set:
self.compute_dtype = config_dict.get("training", {}).get(
"compute_dtype", "fp16"
)
for key, value in config_dict.get("inference", {}).items():
if key not in self.model_fields_set:
setattr(self, key, value)
update_dict[key] = value
self.update(**update_dict)


class BuildVocabConfig(
4 changes: 0 additions & 4 deletions eole/config/training.py
@@ -212,10 +212,6 @@ class TrainingConfig(
dropout_steps: List[int] = Field(
default=[0], description="Steps at which dropout changes."
)
truncated_decoder: int = Field(
default=0, description="Truncated bptt."
) # deprecated?

label_smoothing: float = Field(
default=0.0,
description="Label smoothing value epsilon. "
