
AWQ

::::{Note} Support: MiniCPM-o 4.5 ::::

Method 1 (Use the pre-quantized model with vLLM)

1. Download the Model

Download the 4-bit MiniCPM-o-4_5 model quantized with AutoAWQ from Hugging Face:

git clone https://huggingface.co/openbmb/MiniCPM-o-4_5-AWQ
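
If you prefer not to use git (the weights require git-lfs), the same snapshot can also be fetched with huggingface_hub. This is a minimal alternative sketch; the local_dir value is just an example:

from huggingface_hub import snapshot_download

# Download the quantized checkpoint into a local directory
snapshot_download(repo_id="openbmb/MiniCPM-o-4_5-AWQ", local_dir="MiniCPM-o-4_5-AWQ")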

2. Run with vLLM

from PIL import Image
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams


# Quantized model name or path
MODEL_NAME = "openbmb/MiniCPM-o-4_5-AWQ"

# List of image file paths
IMAGES = [
    "image.png",
]

# Open and convert image
image = Image.open(IMAGES[0]).convert("RGB")

# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

# Initialize LLM
llm = LLM(
    model=MODEL_NAME, 
    # gpu_memory_utilization=0.9,
    max_model_len=4096,
    trust_remote_code=True,
    # disable_mm_preprocessor_cache=True,
    # limit_mm_per_prompt={"image": 5}
)

# Build messages
messages = [{
    "role": "user",
    "content": "(<image>./</image>)\nPlease describe the content of this image",
    # "content": "(<image>./</image>)\n请描述这张图片的内容",
}]

# Apply chat template to the messages
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# Set stop token IDs
# MiniCPM-o 4.5 uses specific terminators
stop_tokens = ['<|tts_eos|>', '<|im_end|>', '</s>']
stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens if tokenizer.convert_tokens_to_ids(i) is not None]

# Set generation parameters
sampling_params = SamplingParams(
    stop_token_ids=stop_token_ids,
    temperature=0.7,
    top_p=0.8,
    top_k=100,
    repetition_penalty=1.05,
    max_tokens=1024,
)

# Get model output
outputs = llm.generate({
    "prompt": prompt,
    "multi_modal_data": {
        "image": image
    }
}, sampling_params=sampling_params)
print(outputs[0].outputs[0].text)
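
To send several images in one request, enable the commented-out limit_mm_per_prompt option when constructing LLM above and add one (<image>./</image>) placeholder per image. The following is a minimal sketch under that assumption (file names are illustrative), reusing the llm, tokenizer, and sampling_params objects from the script:

# Multiple images: one placeholder per image, images passed as a list
images = [Image.open(p).convert("RGB") for p in ["image1.png", "image2.png"]]
messages = [{
    "role": "user",
    "content": "(<image>./</image>)\n(<image>./</image>)\nPlease compare these two images",
}]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
outputs = llm.generate({
    "prompt": prompt,
    "multi_modal_data": {"image": images},
}, sampling_params=sampling_params)
print(outputs[0].outputs[0].text)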

Method 2 (Use the pre-quantized model with AutoAWQ)

1. Download the Quantized Model

Download the 4-bit quantized MiniCPM-o-4_5 model from Hugging Face:

git clone https://huggingface.co/openbmb/MiniCPM-o-4_5-AWQ

2. Download and build AutoAWQ

Because the official AutoAWQ repository is no longer maintained, clone and install our fork instead:

git clone https://github.com/tc-mb/AutoAWQ.git
cd AutoAWQ
pip install -e .
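
A quick way to confirm the editable install is importable (a minimal check; it assumes the fork exposes the same top-level awq package as upstream):

# Run in the same Python environment used for the install
import awq
from awq import AutoAWQForCausalLM
print("AutoAWQ import OK:", AutoAWQForCausalLM.__name__)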

3. Inference Script

Use the following script to run inference directly with the AWQ-quantized model.

from PIL import Image
from transformers import AutoTokenizer
from awq import AutoAWQForCausalLM

# Quantized model name or path
model_path = "openbmb/MiniCPM-o-4_5-AWQ"
device = 'cuda'
# Path to the input image
image_path = './assets/airplane.jpeg'

# Load the quantized model and move it to the GPU
model = AutoAWQForCausalLM.from_quantized(model_path, trust_remote_code=True).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

response = model.chat(
    image=Image.open(image_path).convert("RGB"),
    msgs=[
        {
            "role": "user",
            "content": "What is in this picture?"
        }
    ],
    tokenizer=tokenizer
)

print('Output:', response)
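
model.chat keeps no conversation state between calls, so a multi-turn dialogue is built by appending the earlier turns to msgs. A minimal sketch under that assumption (the follow-up question is illustrative):

# Continue the conversation about the same image
msgs = [
    {"role": "user", "content": "What is in this picture?"},
    {"role": "assistant", "content": response},
    {"role": "user", "content": "What colors stand out the most?"},
]
follow_up = model.chat(
    image=Image.open(image_path).convert("RGB"),
    msgs=msgs,
    tokenizer=tokenizer
)
print('Follow-up:', follow_up)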

Method 3 (Quantize the model yourself)

1. Download the Model

Download the original MiniCPM-o 4.5 model from Hugging Face:

git clone https://huggingface.co/openbmb/MiniCPM-o-4_5

2. Download and build AutoAWQ

Because the official AutoAWQ repository is no longer maintained, clone and install our fork instead:

git clone https://github.com/tc-mb/AutoAWQ.git
cd AutoAWQ
pip install -e .

3. Quantization Script

Run the following quantization script, replacing model_path and quant_path with the paths to the original model and the quantized output, respectively.

import os
from datasets import load_dataset
from awq import AutoAWQForCausalLM
import torch
from transformers import AutoTokenizer
import shutil

# Set the path to the original model (can be a local path or model ID)
model_path = '/openbmb/MiniCPM-o-4_5'

# Path to save the quantized model
quant_path = '/model_quantized/minicpmo4_5_awq'

# Quantization configuration: 4-bit weights, group size 128, GEMM backend
quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" } # "w_bit":4 or 8	


# Load the original model and tokenizer
model = AutoAWQForCausalLM.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

# Copy files that exist in model_path but not in quant_path (excluding weight files)
def copy_files_not_in_B(A_path, B_path):
    """
    Copies files from directory A to directory B if they exist in A but not in B.

    :param A_path: Path to the source directory (A).
    :param B_path: Path to the destination directory (B).
    """
    # Ensure source directory exists
    if not os.path.exists(A_path):
        raise FileNotFoundError(f"The directory {A_path} does not exist.")
    if not os.path.exists(B_path):
        os.makedirs(B_path)

    # List all files in directory A except weight files (e.g., .bin or safetensors)
    files_in_A = os.listdir(A_path)
    files_in_A = set([file for file in files_in_A if not (".bin" in file or "safetensors" in file )])
    # List all files in directory B
    files_in_B = set(os.listdir(B_path))

    # Determine which files need to be copied
    files_to_copy = files_in_A - files_in_B

    # Copy each missing file from A to B
    for file in files_to_copy:
        src_file = os.path.join(A_path, file)
        dst_file = os.path.join(B_path, file)
        if os.path.isfile(src_file):
            shutil.copy2(src_file, dst_file)

# Define data loading methods
# Load the Alpaca dataset
def load_alpaca():
    data = load_dataset("tatsu-lab/alpaca", split="train")

    # Convert each example into a chat-style prompt
    def concatenate_data(x):
        if x['input'] and x['instruction']:
            msgs = [
                    {"role": "system", "content": x['instruction']},
                    {"role": "user", "content": x['input']},
                    {"role": "assistant", "content": x['output']},
            ]
        elif x['input']:
            msgs = [
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": x['input']},
                {"role": "assistant", "content": x['output']}
            ]
        else:
            msgs = [
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": x['instruction']},
                {"role": "assistant", "content": x['output']}
            ]
        
        data = tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)
        return {"text": data}
    
    concatenated = data.map(concatenate_data)
    return [text for text in concatenated["text"]][:1024]

# Load the Wikitext dataset (an alternative calibration set; not used by default)
def load_wikitext():
    data = load_dataset('wikitext', 'wikitext-2-raw-v1', split="train")
    return [text for text in data["text"] if text.strip() != '' and len(text.split(' ')) > 20]


# Load calibration data (use load_wikitext() instead to calibrate on Wikitext)
calib_data = load_alpaca()
# Quantize
model.quantize(tokenizer, quant_config=quant_config, calib_data=calib_data)

# shutil.rmtree(quant_path, ignore_errors=True)

# Save the quantized model
model.save_quantized(quant_path)
tokenizer.save_pretrained(quant_path)

copy_files_not_in_B(model_path, quant_path)
print(f'Model is quantized and saved at "{quant_path}"')
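
After the script finishes, the model saved at quant_path can be loaded exactly as in Method 2. A minimal smoke test under that assumption (the image path is illustrative):

from PIL import Image
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

quant_path = '/model_quantized/minicpmo4_5_awq'
model = AutoAWQForCausalLM.from_quantized(quant_path, trust_remote_code=True).to('cuda')
tokenizer = AutoTokenizer.from_pretrained(quant_path, trust_remote_code=True)

response = model.chat(
    image=Image.open('./assets/airplane.jpeg').convert("RGB"),
    msgs=[{"role": "user", "content": "What is in this picture?"}],
    tokenizer=tokenizer
)
print('Output:', response)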