
Adding padding to the script that generates embeddings #28

Closed
zas97 opened this issue Jul 3, 2024 · 4 comments

zas97 commented Jul 3, 2024

Is there an option to add padding to the sequence used to generate the ESM3 embeddings? I'm currently using this script:

from esm.models.esm3 import ESM3
from esm.sdk.api import ESMProtein, SamplingConfig
from esm.utils.constants.models import ESM3_OPEN_SMALL


client = ESM3.from_pretrained(ESM3_OPEN_SMALL, device="cpu")
protein = ESMProtein(
    sequence=(
        "FIFLALLGAAVAFPVDDDDKIVGGYTCGANTVPYQVSLNSGYHFCGGSLINSQWVVSAAHCYKSGIQVRLGEDNINVVEG"
        "NEQFISASKSIVHPSYNSNTLNNDIMLIKLKSAASLNSRVASISLPTSCASAGTQCLISGWGNTKSSGTSYPDVLKCLKAP"
        "ILSDSSCKSAYPGQITSNMFCAGYLEGGKDSCQGDSGGPVVCSGKLQGIVSWGSGCAQKNKPGVYTKVCNYVSWIKQTIASN"
    )
)
protein_tensor = client.encode(protein)
print(protein_tensor)
output = client.forward_and_sample(
    protein_tensor, SamplingConfig(return_per_residue_embeddings=True)
)
zas97 changed the title from "Adding padding to the script that generates sequences" to "Adding padding to the script that generates embeddings" on Jul 8, 2024
ebetica (Contributor) commented Jul 9, 2024

What do you mean? Why do you need to add padding? The API doesn't support batched inference yet, but it should hopefully be out in the next week or so.

santiag0m (Contributor) commented Jul 9, 2024

As @ebetica said, batching is in the works. In case you need padding for something else, here is how you can do it:

import attr
import torch
import torch.nn.functional as F

from esm.models.esm3 import ESM3
from esm.sdk.api import (
    ESMProtein,
    ESMProteinTensor,
    SamplingConfig,
    SamplingTrackConfig,
)
from esm.tokenization import get_model_tokenizers
from esm.utils.constants.models import ESM3_OPEN_SMALL


def add_padding(protein_tensor: ESMProteinTensor, max_length: int) -> ESMProteinTensor:
    tokenizers = get_model_tokenizers(ESM3_OPEN_SMALL)

    current_length = len(protein_tensor)

    if current_length >= max_length:
        raise ValueError(
            f"Protein length is {current_length} which is greater than the maximum length of {max_length}"
        )

    left_pad = 0
    right_pad = max_length - current_length

    empty_protein_tensor = ESMProteinTensor.empty(
        current_length - 2,  # Account for BOS/EOS that our input already has
        tokenizers=tokenizers,
        device=protein_tensor.device,
    )

    for track in attr.fields(ESMProteinTensor):
        track_tensor = getattr(protein_tensor, track.name)

        if track_tensor is None:
            if track.name == "coordinates":
                continue
            else:
                # Initialize from empty tensor
                track_tensor = getattr(empty_protein_tensor, track.name)

        if track.name == "coordinates":
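            # Coordinates are a 3D tensor (residues x atoms x xyz); F.pad's pad
            # tuple runs from the last dimension backwards, so only the residue
            # dimension is padded (with inf).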
            pad_token = torch.inf
            new_tensor = F.pad(
                track_tensor,
                (0, 0, 0, 0, left_pad, right_pad),
                value=pad_token,
            )
        elif track.name in ["function", "residue_annotations"]:
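            # Function and residue-annotation tokens are 2D (residues x annotation
            # depth); pad only the residue dimension with the track's pad token.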
            pad_token = getattr(tokenizers, track.name).pad_token_id
            new_tensor = F.pad(
                track_tensor,
                (0, 0, left_pad, right_pad),
                value=pad_token,
            )
        else:
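            # The remaining tracks (e.g. sequence, structure, sasa) are 1D
            # per-residue token tensors; pad along that single dimension.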
            pad_token = getattr(tokenizers, track.name).pad_token_id
            new_tensor = F.pad(
                track_tensor,
                (
                    left_pad,
                    right_pad,
                ),
                value=pad_token,
            )
        protein_tensor = attr.evolve(protein_tensor, **{track.name: new_tensor})

    return protein_tensor


client = ESM3.from_pretrained(ESM3_OPEN_SMALL, device="cuda")
protein = ESMProtein(
    sequence=(
        "FIFLALLGAAVAFPVDDDDKIVGGYTCGANTVPYQVSLNSGYHFCGGSLINSQWVVSAAHCYKSGIQVRLGEDNINVVEG"
        "NEQFISASKSIVHPSYNSNTLNNDIMLIKLKSAASLNSRVASISLPTSCASAGTQCLISGWGNTKSSGTSYPDVLKCLKAP"
        "ILSDSSCKSAYPGQITSNMFCAGYLEGGKDSCQGDSGGPVVCSGKLQGIVSWGSGCAQKNKPGVYTKVCNYVSWIKQTIASN"
    )
)
protein_tensor = client.encode(protein)
protein_tensor_padded = add_padding(protein_tensor, 1024)
output = client.forward_and_sample(
    protein_tensor_padded,
    SamplingConfig(sequence=SamplingTrackConfig(), return_per_residue_embeddings=True),
)
print(protein_tensor.sequence.shape)
print(protein_tensor_padded.sequence.shape)
print(output.per_residue_embedding.shape)
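
If you only need the embeddings for the original residues, here is a minimal sketch of slicing the padding back off, assuming right-padding as in add_padding above (variable names are just for illustration):

# The first len(protein_tensor) positions (BOS + residues + EOS) correspond to
# the original protein; everything after is padding added by add_padding.
original_length = protein_tensor.sequence.shape[0]
unpadded_embedding = output.per_residue_embedding[:original_length]
print(unpadded_embedding.shape)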

zas97 (Author) commented Jul 10, 2024

Thank you, that's what I needed =)

zas97 closed this as completed on Jul 10, 2024
@pia-francesca

Super helpful, thanks! Is 1024 the max length the model can handle?
