Skip to content

Commit

Permalink
[Bugfix] Fix Idefics3 bug (#10778)
Browse files (browse the repository at this point in the history)
Signed-off-by: Jee Jee Li <[email protected]>
  • Loading branch information
jeejeelee authored Nov 29, 2024
1 parent c82b432 commit 3132aac
Showing 1 changed file with 47 additions and 45 deletions.
92 changes: 47 additions & 45 deletions vllm/model_executor/models/idefics3.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -267,54 +267,56 @@ def input_processor_for_idefics3(ctx: InputContext,
n_images_in_text = []

text = inputs.get("prompt")
if text is not None:
if isinstance(text, str):
text = [text]
elif not isinstance(text, list) and not isinstance(text[0], str):
raise ValueError("Invalid input text. Please provide a string, "
"or a list of strings")

fake_image_token = processor.fake_image_token.content
image_token = processor.image_token.content
global_img_token = processor.global_image_tag

prompt_strings = []
for sample, sample_rows, sample_cols in zip(text, image_rows,
image_cols):
n_images_in_text.append(sample.count(image_token))

# Replace the image token with fake tokens around the expanded
# image token sequence of length `image_seq_len`
image_prompt_strings = []
for n_rows, n_cols in zip(sample_rows, sample_cols):
image_prompt_string = _get_image_prompt_string(
n_rows,
n_cols,
processor.image_seq_len,
image_token=image_token,
fake_token_around_image=fake_image_token,
global_img_token=global_img_token,
)
image_prompt_strings.append(image_prompt_string)

split_sample = sample.split(image_token)
if len(split_sample) == 0:
raise ValueError(
"The image token should be present in the text.")
if text is None:
prompt_token_ids = inputs.get("prompt_token_ids", [])
assert prompt_token_ids
text = tokenizer.decode(prompt_token_ids)

if isinstance(text, str):
text = [text]
elif not isinstance(text, list) and not isinstance(text[0], str):
raise ValueError("Invalid input text. Please provide a string, "
"or a list of strings")

fake_image_token = processor.fake_image_token.content
image_token = processor.image_token.content
global_img_token = processor.global_image_tag

prompt_strings = []
for sample, sample_rows, sample_cols in zip(text, image_rows, image_cols):
n_images_in_text.append(sample.count(image_token))

# Replace the image token with fake tokens around the expanded
# image token sequence of length `image_seq_len`
image_prompt_strings = []
for n_rows, n_cols in zip(sample_rows, sample_cols):
image_prompt_string = _get_image_prompt_string(
n_rows,
n_cols,
processor.image_seq_len,
image_token=image_token,
fake_token_around_image=fake_image_token,
global_img_token=global_img_token,
)
image_prompt_strings.append(image_prompt_string)

# Place in the image prompt strings where the image tokens are
sample = split_sample[0]
for i, image_prompt_string in enumerate(image_prompt_strings):
sample += image_prompt_string + split_sample[i + 1]
prompt_strings.append(sample)
split_sample = sample.split(image_token)
if len(split_sample) == 0:
raise ValueError("The image token should be present in the text.")

prompt_token_ids = tokenizer(text=prompt_strings[0]).input_ids
# Place in the image prompt strings where the image tokens are
sample = split_sample[0]
for i, image_prompt_string in enumerate(image_prompt_strings):
sample += image_prompt_string + split_sample[i + 1]
prompt_strings.append(sample)

return token_inputs(
prompt_token_ids=prompt_token_ids,
prompt=prompt_strings[0],
multi_modal_data=multi_modal_data,
)
prompt_token_ids = tokenizer(text=prompt_strings[0]).input_ids

return token_inputs(
prompt_token_ids=prompt_token_ids,
prompt=prompt_strings[0],
multi_modal_data=multi_modal_data,
)


def _get_max_num_image_patch(image_processor: Idefics3ImageProcessor) -> int:
Expand Down

0 comments on commit 3132aac

Please sign in to comment.