Skip to content

Commit

Permalink
Enhance video processing by adding debug prints and increasing max_pixels and max_num_frames for improved performance
Browse files Browse the repository at this point in the history
  • Loading branch information
pufanyi committed Dec 31, 2024
1 parent 8164025 commit 90e27f5
Showing 1 changed file with 3 additions and 19 deletions.
22 changes: 3 additions & 19 deletions lmms_eval/models/qwen2_vl.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,9 @@ def __init__(
batch_size: Optional[Union[int, str]] = 1,
use_cache=True,
use_flash_attention_2: Optional[bool] = True,
max_pixels: int = 1605632,
max_pixels: int = 12845056,
min_pixels: int = 3136,
max_num_frames: int = 32,
max_num_frames: int = 256,
use_custom_video_loader: Optional[bool] = False,
fps: Optional[float] = None, # Only applicable if use_custom_video_loader is True
max_image_size: Optional[int] = None, # Only applicable if use_custom_video_loader is True
Expand Down Expand Up @@ -233,16 +233,6 @@ def _collate(x):
height, width = first_frame.shape[:2]
# max_pixels = height * width
message.append({"role": "user", "content": [{"type": "video", "video": visual, "max_pixels": self.max_pixels}, {"type": "text", "text": context}]})
if self.use_custom_video_loader:
visual = read_video_pyav_base64(visual, num_frm=self.max_num_frames, fps=self.fps, img_format="JPEG")
image_contents = list(map(lambda x: f"data:image/jpeg;base64,{x}", visual))
message.append({"role": "user", "content": [{"type": "video", "video": image_contents}, {"type": "text", "text": context}]})
else:
vr = decord.VideoReader(visual)
first_frame = vr[0].asnumpy()
height, width = first_frame.shape[:2]
# max_pixels = height * width
message.append({"role": "user", "content": [{"type": "video", "video": visual, "max_pixels": self.max_pixels}, {"type": "text", "text": context}]})
elif isinstance(visual, Image.Image): # Single image
base64_image = visual.convert("RGB")
buffer = BytesIO()
Expand All @@ -269,13 +259,7 @@ def _collate(x):

texts = [self.processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True) for msg in messages]
image_inputs, video_inputs = process_vision_info(messages)
if video_inputs is not None:
total_frames = video_inputs[0].shape[0]
indices = np.linspace(0, total_frames - 1, self.max_num_frames, dtype=int)
# Append the last frame index if not already included
if total_frames - 1 not in indices:
indices = np.append(indices, total_frames - 1)
video_inputs[0] = video_inputs[0][indices]

inputs = self.processor(text=texts, images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt")

if self.device_map == "auto":
Expand Down

0 comments on commit 90e27f5

Please sign in to comment.