Image-Text-to-Text
Transformers
Safetensors
English
Chinese
feature-extraction
conversational
custom_code
Instructions to use FlashVL/FlashVL-2B-Dynamic-ISS with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use FlashVL/FlashVL-2B-Dynamic-ISS with Transformers:
# Use a pipeline as a high-level helper from transformers import pipeline pipe = pipeline("image-text-to-text", model="FlashVL/FlashVL-2B-Dynamic-ISS", trust_remote_code=True) messages = [ { "role": "user", "content": [ {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"}, {"type": "text", "text": "What animal is on the candy?"} ] }, ] pipe(text=messages)# Load model directly from transformers import AutoModel model = AutoModel.from_pretrained("FlashVL/FlashVL-2B-Dynamic-ISS", trust_remote_code=True, dtype="auto") - Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use FlashVL/FlashVL-2B-Dynamic-ISS with vLLM:
Install from pip and serve model
# Install vLLM from pip: pip install vllm # Start the vLLM server: vllm serve "FlashVL/FlashVL-2B-Dynamic-ISS" # Call the server using curl (OpenAI-compatible API): curl -X POST "http://localhost:8000/v1/chat/completions" \ -H "Content-Type: application/json" \ --data '{ "model": "FlashVL/FlashVL-2B-Dynamic-ISS", "messages": [ { "role": "user", "content": [ { "type": "text", "text": "Describe this image in one sentence." }, { "type": "image_url", "image_url": { "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg" } } ] } ] }'Use Docker
docker model run hf.co/FlashVL/FlashVL-2B-Dynamic-ISS
- SGLang
How to use FlashVL/FlashVL-2B-Dynamic-ISS with SGLang:
Install from pip and serve model
# Install SGLang from pip: pip install sglang # Start the SGLang server: python3 -m sglang.launch_server \ --model-path "FlashVL/FlashVL-2B-Dynamic-ISS" \ --host 0.0.0.0 \ --port 30000 # Call the server using curl (OpenAI-compatible API): curl -X POST "http://localhost:30000/v1/chat/completions" \ -H "Content-Type: application/json" \ --data '{ "model": "FlashVL/FlashVL-2B-Dynamic-ISS", "messages": [ { "role": "user", "content": [ { "type": "text", "text": "Describe this image in one sentence." }, { "type": "image_url", "image_url": { "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg" } } ] } ] }'Use Docker images
docker run --gpus all \ --shm-size 32g \ -p 30000:30000 \ -v ~/.cache/huggingface:/root/.cache/huggingface \ --env "HF_TOKEN=<secret>" \ --ipc=host \ lmsysorg/sglang:latest \ python3 -m sglang.launch_server \ --model-path "FlashVL/FlashVL-2B-Dynamic-ISS" \ --host 0.0.0.0 \ --port 30000 # Call the server using curl (OpenAI-compatible API): curl -X POST "http://localhost:30000/v1/chat/completions" \ -H "Content-Type: application/json" \ --data '{ "model": "FlashVL/FlashVL-2B-Dynamic-ISS", "messages": [ { "role": "user", "content": [ { "type": "text", "text": "Describe this image in one sentence." }, { "type": "image_url", "image_url": { "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg" } } ] } ] }' - Docker Model Runner
How to use FlashVL/FlashVL-2B-Dynamic-ISS with Docker Model Runner:
docker model run hf.co/FlashVL/FlashVL-2B-Dynamic-ISS
| import torch | |
| import os | |
| import asyncio | |
| import requests | |
| from io import BytesIO | |
| from PIL import Image | |
| from urllib.parse import urlparse | |
| import numpy as np | |
| def split_image_ur(img, max_slice_num, image_size, vit_image_size, force_min_size=False): | |
| if force_min_size: | |
| img = resize_by_patch_size_ur(img, min_size= image_size, max_size= image_size * max_slice_num, patch_size=14) | |
| slice_config = { | |
| "max_slice_nums": max_slice_num, | |
| "scale_resolution": image_size, | |
| "patch_size": 14 | |
| } | |
| source_image, sub_images, _ = do_slice_by_minicpmv_strategy_ur( | |
| img, max_slice_nums=slice_config["max_slice_nums"], scale_resolution=slice_config["scale_resolution"], patch_size=slice_config["patch_size"], vit_image_size=vit_image_size) | |
| splits = [] | |
| splits.append(source_image) | |
| for i in range(len(sub_images)): | |
| for j in range(len(sub_images[0])): | |
| splits.append(sub_images[i][j]) | |
| sliced_images, sliced_shapes = [], [] | |
| for slice_image in splits: | |
| sliced_images.append(slice_image) | |
| sliced_shapes.append(np.array((slice_image.size[0] // slice_config["patch_size"], slice_image.size[1] // slice_config["patch_size"]))) | |
| return sliced_images, sliced_shapes | |
| import math | |
| from PIL import Image | |
| import torch | |
| import torchvision.transforms.functional as F | |
| from torchvision.transforms import InterpolationMode | |
| # Strategy: MiniCPM-V | |
| def do_slice_by_minicpmv_strategy_ur(image, max_slice_nums=9, scale_resolution=1120, patch_size=14, vit_image_size=448, never_split=False): | |
| original_size = image.size | |
| original_width, original_height = original_size | |
| log_ratio = math.log(original_width / original_height) | |
| ratio = original_width * original_height / (scale_resolution * scale_resolution) | |
| multiple = min(math.ceil(ratio), max_slice_nums) | |
| source_image = None | |
| best_grid = None | |
| patches = [] | |
| if multiple <= 1 or never_split: | |
| # dont need to slice, upsample | |
| # best_size = find_best_resize( | |
| # original_size, scale_resolution, patch_size, allow_upscale=True | |
| # ) | |
| best_size = (scale_resolution, scale_resolution) | |
| source_image = image.resize(best_size, Image.BICUBIC) | |
| border_size = (vit_image_size-scale_resolution)/2 | |
| from PIL import ImageOps | |
| source_image = ImageOps.expand(source_image, border=int(border_size), fill=(0,0,0)) | |
| else: | |
| candidate_split_grids_nums = [] | |
| for i in [multiple - 1, multiple, multiple + 1]: | |
| if i == 1 or i > max_slice_nums: | |
| continue | |
| candidate_split_grids_nums.append(i) | |
| # source image, down-sampling and ensure divided by patch_size | |
| # best_resize = find_best_resize(original_size, scale_resolution, patch_size) | |
| # source_image = image.copy().resize(best_resize, Image.BICUBIC) | |
| source_image = image.copy().resize((scale_resolution,scale_resolution), Image.BICUBIC) | |
| candidate_grids = [] | |
| # find best grid | |
| for split_grids_nums in candidate_split_grids_nums: | |
| m = 1 | |
| while m <= split_grids_nums: | |
| if split_grids_nums % m == 0: | |
| candidate_grids.append([m, split_grids_nums // m]) | |
| m += 1 | |
| # print("candidate_grids: ", candidate_grids) | |
| best_grid = [1, 1] | |
| min_error = float("inf") | |
| for grid in candidate_grids: | |
| error = abs(log_ratio - math.log(grid[0] / grid[1])) | |
| if error < min_error: | |
| best_grid = grid | |
| min_error = error | |
| refine_size = get_refine_size( | |
| original_size, best_grid, scale_resolution, patch_size, allow_upscale=True | |
| ) | |
| refine_image = image.resize(refine_size, Image.BICUBIC) | |
| patches = split_to_patches(refine_image, best_grid, scale_resolution, vit_image_size) | |
| return source_image, patches, best_grid | |
| def ensure_divide(length, patch_size): | |
| return max(round(length / patch_size) * patch_size, patch_size) | |
| def find_best_resize(original_size, scale_resolution, patch_size, allow_upscale=False): | |
| width, height = original_size | |
| if (width * height > scale_resolution * scale_resolution) or allow_upscale: | |
| r = width / height | |
| height = int(scale_resolution / math.sqrt(r)) | |
| width = int(height * r) | |
| best_width = ensure_divide(width, patch_size) | |
| best_height = ensure_divide(height, patch_size) | |
| # print(best_width, best_height, scale_resolution) | |
| while best_width * best_height > scale_resolution ** 2: | |
| # print(best_width) | |
| best_width -= patch_size | |
| return (best_width, best_height) | |
| def get_refine_size(original_size, grid, scale_resolution, patch_size, allow_upscale=False): | |
| width, height = original_size | |
| grid_x, grid_y = grid | |
| # refine_width = ensure_divide(width, grid_x) | |
| # refine_height = ensure_divide(height, grid_y) | |
| # grid_width = refine_width / grid_x | |
| # grid_height = refine_height / grid_y | |
| # best_grid_size = find_best_resize( | |
| # (grid_width, grid_height), | |
| # scale_resolution, | |
| # patch_size, | |
| # allow_upscale=allow_upscale, | |
| # ) | |
| refine_size = (scale_resolution * grid_x, scale_resolution * grid_y) | |
| return refine_size | |
| def split_to_patches(image, grid, scale_resolution, vit_image_size): | |
| patches = [] | |
| width, height = image.size | |
| grid_x = int(width / grid[0]) | |
| grid_y = int(height / grid[1]) | |
| from PIL import ImageOps | |
| border_size = (vit_image_size - scale_resolution)/2 | |
| padded_img = ImageOps.expand(image, border=int(border_size), fill=(0,0,0)) | |
| padded_width, padded_height = padded_img.size | |
| for i in range(0, padded_height-vit_image_size+1, scale_resolution): | |
| images = [] | |
| for j in range(0, padded_width-vit_image_size+1, scale_resolution): | |
| box = (j, i, j + vit_image_size, i + vit_image_size) | |
| patch = padded_img.crop(box) | |
| images.append(patch) | |
| patches.append(images) | |
| return patches | |
| def resize_by_patch_size_ur(img, min_size=1152, max_size=2240, patch_size=14): | |
| interpolation=InterpolationMode.BICUBIC | |
| # min_size=756, max_size=756 * 4, patch_size=14 | |
| if isinstance(img, torch.Tensor): | |
| height, width = img.shape[:2] | |
| else: | |
| width, height = img.size | |
| # Check if the shorter side is less than min_size | |
| if min(height, width) < min_size: | |
| # print('less than min_size') | |
| scale_factor = min_size / min(height, width) | |
| new_height = max(min_size, round(height * scale_factor)) | |
| new_width = max(min_size, round(width * scale_factor)) | |
| # print(self.max_size) | |
| # Check if the longer side after resizing is greater than max_size | |
| if max(new_height, new_width) > max_size: | |
| scale_factor = max_size / max(new_height, new_width) | |
| new_height = min(max_size, round(new_height * scale_factor)) | |
| new_width = min(max_size, round(new_width * scale_factor)) | |
| else: | |
| scale_factor = min(max_size / max(height, width), 1) | |
| new_height = round(height * scale_factor) | |
| new_width = round(width * scale_factor) | |
| # # Make sure the new height and width are divisible by patch_size | |
| # new_height = (new_height // patch_size) * patch_size | |
| # new_width = (new_width // patch_size) * patch_size | |
| # Resize the image | |
| # img = F.resize(img, (new_height, new_width), interpolation) | |
| img = img.resize((new_width, new_height), Image.BICUBIC) | |
| return img |