vLLM is a framework designed specifically to accelerate large language model inference. It achieves near-zero waste of KV cache memory, addressing the memory-management bottleneck.
More vLLM documentation and tutorials (in Chinese) are available at → https://vllm.hyper.ai/
*Run the vLLM getting-started tutorial online: a step-by-step guide for beginners
Source: examples/offline_inference/encoder_decoder_multimodal.py
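The example exercises two ways of passing a multimodal request to an encoder-decoder model: an implicit prompt, where vLLM derives the encoder and decoder inputs itself, and an explicit prompt, where the two are supplied separately. A minimal sketch of the two shapes, using only field names and assets taken from the listing below:

```python
from vllm.assets.image import ImageAsset

# Implicit format: a single prompt plus its multimodal data; vLLM builds
# the encoder/decoder inputs from it.
implicit = {
    "prompt": "<DETAILED_CAPTION>",
    "multi_modal_data": {"image": ImageAsset("stop_sign").pil_image},
}

# Explicit format: the encoder prompt (carrying the multimodal data) and
# the decoder prompt are specified separately.
explicit = {
    "encoder_prompt": {
        "prompt": "Describe in detail what is shown in the image.",
        "multi_modal_data": {"image": ImageAsset("cherry_blossom").pil_image},
    },
    "decoder_prompt": "",
}
```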
```python
# SPDX-License-Identifier: Apache-2.0
"""
This example shows how to use vLLM for running offline inference with
the explicit/implicit prompt format on enc-dec LMMs for text generation.
"""
import time
from collections.abc import Sequence
from dataclasses import asdict
from typing import NamedTuple

from vllm import LLM, EngineArgs, PromptType, SamplingParams
from vllm.assets.audio import AudioAsset
from vllm.assets.image import ImageAsset
from vllm.utils import FlexibleArgumentParser


class ModelRequestData(NamedTuple):
    engine_args: EngineArgs
    prompts: Sequence[PromptType]


def run_florence2():
    engine_args = EngineArgs(
        model="microsoft/Florence-2-large",
        tokenizer="facebook/bart-large",
        max_num_seqs=8,
        trust_remote_code=True,
        limit_mm_per_prompt={"image": 1},
        dtype="half",
    )

    prompts = [
        {   # implicit prompt with task token
            "prompt": "<DETAILED_CAPTION>",
            "multi_modal_data": {
                "image": ImageAsset("stop_sign").pil_image
            },
        },
        {   # explicit encoder/decoder prompt
            "encoder_prompt": {
                "prompt": "Describe in detail what is shown in the image.",
                "multi_modal_data": {
                    "image": ImageAsset("cherry_blossom").pil_image
                },
            },
            "decoder_prompt": "",
        },
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


def run_mllama():
    engine_args = EngineArgs(
        model="meta-llama/Llama-3.2-11B-Vision-Instruct",
        max_model_len=4096,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": 1},
        dtype="half",
    )

    prompts = [
        {   # Implicit prompt
            "prompt": "<|image|><|begin_of_text|>What is the content of this image?",  # noqa: E501
            "multi_modal_data": {
                "image": ImageAsset("stop_sign").pil_image,
            },
        },
        {   # Explicit prompt
            "encoder_prompt": {
                "prompt": "<|image|>",
                "multi_modal_data": {
                    "image": ImageAsset("stop_sign").pil_image,
                },
            },
            "decoder_prompt": "<|image|><|begin_of_text|>Please describe the image.",  # noqa: E501
        },
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


def run_whisper():
    engine_args = EngineArgs(
        model="openai/whisper-large-v3-turbo",
        max_model_len=448,
        max_num_seqs=16,
        limit_mm_per_prompt={"audio": 1},
        dtype="half",
    )

    prompts = [
        {   # Test implicit prompt
            "prompt": "<|startoftranscript|>",
            "multi_modal_data": {
                "audio": AudioAsset("mary_had_lamb").audio_and_sample_rate,
            },
        },
        {   # Test explicit encoder/decoder prompt
            "encoder_prompt": {
                "prompt": "",
                "multi_modal_data": {
                    "audio": AudioAsset("winning_call").audio_and_sample_rate,
                },
            },
            "decoder_prompt": "<|startoftranscript|>",
        }
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


model_example_map = {
    "florence2": run_florence2,
    "mllama": run_mllama,
    "whisper": run_whisper,
}


def main(args):
    model = args.model_type
    if model not in model_example_map:
        raise ValueError(f"Model type {model} is not supported.")

    req_data = model_example_map[model]()

    engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
    llm = LLM(**engine_args)

    prompts = req_data.prompts

    # Create a sampling params object.
    sampling_params = SamplingParams(
        temperature=0,
        top_p=1.0,
        max_tokens=64,
    )

    start = time.time()

    # Generate output tokens from the prompts. The output is a list of
    # objects that contain the prompt, the generated text, and other
    # information.
    outputs = llm.generate(prompts, sampling_params)

    # Print the outputs.
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Decoder prompt: {prompt!r}, "
              f"Generated text: {generated_text!r}")

    duration = time.time() - start

    print("Duration:", duration)
    print("RPS:", len(prompts) / duration)


if __name__ == "__main__":
    parser = FlexibleArgumentParser(
        description="Demo on using vLLM for offline inference with "
        "vision language models for text generation")
    parser.add_argument("--model-type",
                        "-m",
                        type=str,
                        default="mllama",
                        choices=model_example_map.keys(),
                        help='Huggingface "model_type".')
    parser.add_argument("--seed",
                        type=int,
                        default=None,
                        help="Set the seed when initializing `vllm.LLM`.")

    args = parser.parse_args()
    main(args)
```
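To try the script (assuming vLLM with its audio/vision dependencies is installed and the file is saved locally), select a model with `--model-type`, e.g. `python encoder_decoder_multimodal.py --model-type whisper`; the default is `mllama`, and `--seed` makes the engine initialization reproducible. Model weights are fetched from Hugging Face on first use, and gated models such as Llama-3.2 require approved access.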