Qwen2-VL是由阿里云通义千问(Qwen)团队开发并开源的第二代视觉与语言多模态人工智能模型。
Qwen2-VL结合了视觉理解和自然语言处理的能力,使得它能够处理和理解图像、视频以及文本数据。
Qwen2-VL支持多种语言,包括但不限于英语、中文、大多数欧洲语言、日语、韩语、阿拉伯语及越南语等。
Qwen2-VL模型系列包括不同规模的版本,如 2B、7B 和 72B 参数规模的模型,以适应不同的应用需求和计算资源限制。
Qwen2-VL可以处理不同分辨率和长宽比的图片,无需将图片分割成块,并且在各种视觉理解基准测试中表现出色,例如 MathVista(数学推理)、DocVQA(文档图像理解)、RealWorldQA(现实世界空间理解)以及 MTVQA(多语言理解)等。
Qwen2-VL 还能够理解时长超过20分钟的视频内容,这使得它能够在基于视频的问答、对话生成和内容创作等方面发挥作用。
GitHub项目地址:https://github.com/QwenLM/Qwen2-VL
一、环境安装
1、python环境
建议安装 3.10 及以上版本的 Python。
2、pip库安装
pip install torch==2.4.0+cu118 torchvision==0.19.0+cu118 torchaudio==2.4.0 --extra-index-url https://download.pytorch.org/whl/cu118
pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
pip install av -i https://pypi.tuna.tsinghua.edu.cn/simple
3、Qwen2-VL-7B-Instruct模型下载:
git lfs install
git clone https://www.modelscope.cn/qwen/Qwen2-VL-7B-Instruct.git
4、Qwen2-VL-2B-Instruct模型下载:
git lfs install
git clone https://www.modelscope.cn/models/qwen/Qwen2-VL-2B-Instruct.git
二、功能测试
1、运行测试:
(1)python代码调用测试
from PIL import Image
import torch
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info


class VisionLanguageModel:
    """Thin convenience wrapper around a Qwen2-VL checkpoint.

    Loads the model and its processor once, then exposes helpers that
    build a single-turn chat message, run generation, and return the
    decoded answer(s).
    """

    def __init__(self, model_dir, min_pixels, max_pixels):
        """Load the model and processor from ``model_dir``.

        Args:
            model_dir: Checkpoint name or local directory.
            min_pixels: Forwarded to ``AutoProcessor.from_pretrained``.
            max_pixels: Forwarded to ``AutoProcessor.from_pretrained``.
        """
        self.model = Qwen2VLForConditionalGeneration.from_pretrained(
            model_dir, device_map="auto", torch_dtype=torch.float16
        )
        self.processor = AutoProcessor.from_pretrained(
            model_dir, min_pixels=min_pixels, max_pixels=max_pixels
        )

    def prepare_inputs(self, messages):
        """Turn chat ``messages`` into processor outputs on the model's device.

        Args:
            messages: Chat messages in the format accepted by
                ``processor.apply_chat_template`` and ``process_vision_info``.

        Returns:
            The processor output, moved to ``self.model.device``.
        """
        text = self.processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = self.processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )
        # The model is loaded with device_map="auto", so follow wherever it
        # actually landed instead of assuming a CUDA device is present.
        return inputs.to(self.model.device)

    def generate_output(self, inputs, max_new_tokens=128):
        """Generate and decode a response for already-prepared ``inputs``.

        Args:
            inputs: Value returned by :meth:`prepare_inputs`.
            max_new_tokens: Upper bound on newly generated tokens.

        Returns:
            A list of decoded strings, one per sequence in the batch.
        """
        generated_ids = self.model.generate(**inputs, max_new_tokens=max_new_tokens)
        # generate() returns prompt + completion; keep only the completion.
        generated_ids_trimmed = [
            out_ids[len(in_ids):]
            for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        return self.processor.batch_decode(
            generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )

    def _ask(self, content):
        """Send one user turn with the given ``content`` items and return the decoded output."""
        messages = [{"role": "user", "content": content}]
        return self.generate_output(self.prepare_inputs(messages))

    def describe_image(self, image_url):
        """Ask the model to describe the image at ``image_url``."""
        return self._ask([
            {"type": "image", "image": image_url},
            {"type": "text", "text": "Describe this image."},
        ])

    def identify_similarities(self, image_paths):
        """Ask the model what the images in ``image_paths`` have in common."""
        content = [{"type": "image", "image": path} for path in image_paths]
        content.append({"type": "text", "text": "Identify the similarities between these images."})
        return self._ask(content)

    def describe_video(self, video_path):
        """Ask the model to describe the video at ``video_path``."""
        return self._ask([
            {"type": "video", "video": video_path, 'max_pixels': 360*420, 'fps': 1.0},
            {"type": "text", "text": "Describe this video."},
        ])


# Usage example
model_dir = "Qwen2-VL-7B-Instruct"
min_pixels = 256*28*28
max_pixels = 1280*28*28
vl_model = VisionLanguageModel(model_dir, min_pixels, max_pixels)

# Describe an image
image_description = vl_model.describe_image("test.jpeg")
print(image_description)

# Identify similarities between images
image_paths = ["image1.jpg", "image2.jpg"]
image_similarities = vl_model.identify_similarities(image_paths)
print(image_similarities)

# Describe a video
video_description = vl_model.describe_video("video1.mp4")
print(video_description)
(2)web端测试
import copy
import re
import gc
from argparse import ArgumentParser
from threading import Thread

import gradio as gr
import torch
from qwen_vl_utils import process_vision_info
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration, TextIteratorStreamer

DEFAULT_CKPT_PATH = 'Qwen2-VL-7B-Instruct'

# File extensions that are sent to the model as video rather than image.
_VIDEO_EXTENSIONS = ('.mp4', '.avi', '.mkv', '.mov', '.wmv', '.flv', '.webm', '.mpeg')

# Per-character escapes applied to lines inside a fenced code block so the
# chatbot widget shows them literally instead of interpreting them as
# HTML/markdown.
_CODE_ESCAPE_TABLE = str.maketrans({
    '`': r'\`',
    '<': '&lt;',
    '>': '&gt;',
    ' ': '&nbsp;',
    '*': '&ast;',
    '_': '&lowbar;',
    '-': '&#45;',
    '.': '&#46;',
    '!': '&#33;',
    '(': '&#40;',
    ')': '&#41;',
    '$': '&#36;',
})


def _get_args():
    """Parse command line arguments."""
    parser = ArgumentParser(description="Qwen2-VL WebUI Options.")
    parser.add_argument('-c', '--checkpoint-path', type=str, default=DEFAULT_CKPT_PATH,
                        help='Checkpoint name or path, default to %(default)r')
    parser.add_argument('--cpu-only', action='store_true', help='Run demo with CPU only')
    parser.add_argument('--flash-attn2', action='store_true', default=False,
                        help='Enable flash_attention_2 when loading the model.')
    parser.add_argument('--share', action='store_true', default=False,
                        help='Create a publicly shareable link for the interface.')
    parser.add_argument('--inbrowser', action='store_true', default=False,
                        help='Automatically launch the interface in a new tab on the default browser.')
    parser.add_argument('--server-port', type=int, default=7870, help='Demo server port.')
    parser.add_argument('--server-name', type=str, default='0.0.0.0', help='Demo server name.')
    return parser.parse_args()


def _load_model_processor(args):
    """Load model and processor based on provided arguments.

    Returns:
        A ``(model, processor)`` tuple.
    """
    device_map = 'cpu' if args.cpu_only else 'auto'
    model_load_args = {'torch_dtype': 'auto', 'device_map': device_map}
    if args.flash_attn2:
        model_load_args['attn_implementation'] = 'flash_attention_2'
    model = Qwen2VLForConditionalGeneration.from_pretrained(args.checkpoint_path, **model_load_args)
    processor = AutoProcessor.from_pretrained(args.checkpoint_path)
    return model, processor


def _parse_text(text):
    """Convert markdown-styled text to the HTML shown in the chatbot.

    Empty lines are dropped. A line containing a triple backtick toggles
    code mode and is replaced by the opening or closing ``<pre><code>``
    tag; every other line is prefixed with ``<br>``, and lines inside a
    code block are additionally escaped.
    """
    def html_escape(text):
        """Escape HTML/markdown special characters in one code line."""
        return text.translate(_CODE_ESCAPE_TABLE)

    # A real list is required here: the loop assigns back by index, which a
    # lazy filter object does not support.
    lines = [line for line in text.split('\n') if line]
    inside_code = False
    for i, line in enumerate(lines):
        if '```' in line:
            inside_code = not inside_code
            items = line.split('`')
            lines[i] = f'<pre><code class="language-{items[-1]}">' if inside_code else '<br></code></pre>'
        elif inside_code:
            # Lines are joined with '' below, so without a <br> consecutive
            # code lines would run together.
            lines[i] = '<br>' + html_escape(line)
        else:
            lines[i] = '<br>' + line
    return ''.join(lines)


def _remove_image_special(text):
    """Remove image-related special tags from the text."""
    text = text.replace('<ref>', '').replace('</ref>', '')
    return re.sub(r'<box>.*?(</box>|$)', '', text)


def _is_video_file(filename):
    """Check if the given filename has a known video extension (case-insensitive)."""
    return filename.lower().endswith(_VIDEO_EXTENSIONS)


def _gc():
    """Perform garbage collection and empty CUDA cache if available."""
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()


def _transform_messages(original_messages):
    """Transform the demo's message structure into typed content items.

    Each content item is a dict holding one of the keys ``image``, ``text``
    or ``video``; it is rewritten as ``{'type': <key>, <key>: <value>}``.
    Items with none of those keys are dropped.
    """
    def get_content_type(content_item):
        """Return the typed form of ``content_item``, or None if unrecognised."""
        if 'image' in content_item:
            return {'type': 'image', 'image': content_item['image']}
        elif 'text' in content_item:
            return {'type': 'text', 'text': content_item['text']}
        elif 'video' in content_item:
            return {'type': 'video', 'video': content_item['video']}
        return None

    transformed = []
    for message in original_messages:
        typed_items = (get_content_type(item) for item in message['content'])
        transformed.append({
            'role': message['role'],
            'content': [item for item in typed_items if item],
        })
    return transformed


def _build_messages(task_history):
    """Convert the task history into chat messages for the model.

    ``task_history`` holds ``(query, answer)`` pairs where ``query`` is
    either a text string or a tuple/list whose first element is an uploaded
    file path. Uploaded files are attached to the next text query so that a
    file and the question about it share one user turn; file entries carry
    no answer of their own, so they must not produce an assistant turn.
    The final assistant entry (the answer still to be generated) is removed.
    """
    messages = []
    pending_content = []  # media uploaded since the last text query
    for q, a in copy.deepcopy(task_history):
        if isinstance(q, (tuple, list)):
            file_type = 'video' if _is_video_file(q[0]) else 'image'
            pending_content.append({file_type: f'file://{q[0]}'})
            continue
        pending_content.append({'text': q})
        messages.append({'role': 'user', 'content': pending_content})
        messages.append({'role': 'assistant', 'content': [{'text': a}]})
        pending_content = []
    if messages:
        messages.pop()
    return messages


def _launch_demo(args, model, processor):
    """Launch the Gradio demo interface."""

    def call_local_model(model, processor, messages):
        """Stream the model's answer, yielding the accumulated text so far."""
        messages = _transform_messages(messages)
        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = processor(text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors='pt')
        inputs = inputs.to(model.device)
        tokenizer = processor.tokenizer
        streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
        gen_kwargs = {'max_new_tokens': 512, 'streamer': streamer, **inputs}
        # generate() blocks, so it runs in a worker thread while this
        # generator drains the streamer.
        thread = Thread(target=model.generate, kwargs=gen_kwargs)
        thread.start()
        generated_text = ''
        for new_text in streamer:
            generated_text += new_text
            yield generated_text

    def create_predict_fn():
        def predict(_chatbot, task_history):
            """Generate responses based on chat and task history."""
            chat_query = _chatbot[-1][0]
            query = task_history[-1][0]
            if len(chat_query) == 0:
                _chatbot.pop()
                task_history.pop()
                # This is a generator: a bare `return value` would emit
                # nothing, so yield the updated chat before stopping.
                yield _chatbot
                return
            print('User: ' + _parse_text(query))
            messages = _build_messages(task_history)
            response = ''  # stays empty if the model streams nothing
            for response in call_local_model(model, processor, messages):
                _chatbot[-1] = (_parse_text(chat_query), _remove_image_special(_parse_text(response)))
                yield _chatbot
            full_response = _parse_text(response)
            task_history[-1] = (query, full_response)
            print('Qwen-VL-Chat: ' + full_response)
            yield _chatbot
        return predict

    def create_regenerate_fn():
        def regenerate(_chatbot, task_history):
            """Regenerate the last response."""
            if not task_history:
                yield _chatbot
                return
            query, last_response = task_history[-1]
            if last_response is None:
                yield _chatbot
                return
            task_history[-1] = (query, None)
            last_q, _ = _chatbot[-1]
            _chatbot[-1] = (last_q, None)
            yield from predict(_chatbot, task_history)
        return regenerate

    # Bind the callbacks by name: `regenerate` looks up `predict` in this
    # enclosing scope when it runs.
    predict = create_predict_fn()
    regenerate = create_regenerate_fn()

    def add_text(history, task_history, text):
        """Add text input to histories and reset the input box."""
        task_text = text
        history = history or []
        task_history = task_history or []
        history.append((_parse_text(text), None))
        task_history.append((task_text, None))
        return history, task_history, ''

    def add_file(history, task_history, file):
        """Add file input to histories."""
        history = history or []
        task_history = task_history or []
        history.append(((file.name,), None))
        task_history.append(((file.name,), None))
        return history, task_history

    def reset_user_input():
        """Reset the user input box."""
        return gr.update(value='')

    def reset_state(_chatbot, task_history):
        """Clear history and perform garbage collection."""
        task_history.clear()
        _chatbot.clear()
        _gc()
        return []

    with gr.Blocks() as demo:
        gr.Markdown("""\
<p align="center"><img src="https://modelscope.oss-cn-beijing.aliyuncs.com/resource/qwen.png" style="height: 80px"/></p>
<center><font size=8>Qwen2-VL</center>
<center><font size=3>This WebUI is based on Qwen2-VL, developed by Alibaba Cloud.</center>
<center><font size=3>本WebUI基于Qwen2-VL。</center>
""")
        chatbot = gr.Chatbot(label='Qwen2-VL', elem_classes='control-height', height=500)
        query = gr.Textbox(lines=2, label='Input')
        task_history = gr.State([])
        with gr.Row():
            addfile_btn = gr.UploadButton('📁 Upload (上传文件)', file_types=['image', 'video'])
            submit_btn = gr.Button('🚀 Submit (发送)')
            regen_btn = gr.Button('🤔️ Regenerate (重试)')
            empty_btn = gr.Button('🧹 Clear History (清除历史)')
        # add_text returns three values (chat, history, cleared input), so
        # all three components are listed as outputs.
        submit_btn.click(add_text, [chatbot, task_history, query], [chatbot, task_history, query]).then(
            predict, [chatbot, task_history], [chatbot], show_progress=True)
        submit_btn.click(reset_user_input, [], [query])
        empty_btn.click(reset_state, [chatbot, task_history], [chatbot], show_progress=True)
        regen_btn.click(regenerate, [chatbot, task_history], [chatbot], show_progress=True)
        addfile_btn.upload(add_file, [chatbot, task_history, addfile_btn], [chatbot, task_history], show_progress=True)
        gr.Markdown("""\
<font size=2>Note: This demo is governed by the original license of Qwen2-VL. \
We strongly advise users not to knowingly generate or allow others to knowingly generate harmful content, \
including hate speech, violence, pornography, deception, etc. \
(注:本演示受Qwen2-VL的许可协议限制。我们强烈建议,用户不应传播及不应允许他人传播以下内容,\
包括但不限于仇恨言论、暴力、色情、欺诈相关的有害信息。)
""")
    demo.queue().launch(
        share=args.share,
        inbrowser=args.inbrowser,
        server_port=args.server_port,
        server_name=args.server_name,
    )


def main():
    """Main function to parse arguments, load the model and processor, and launch the demo."""
    args = _get_args()
    model, processor = _load_model_processor(args)
    _launch_demo(args, model, processor)


if __name__ == '__main__':
    main()
未完......
更多详细内容,欢迎关注:杰哥新技术