Quantize an ONNX model in Python
This video shows the steps and code to generate a QuantType.QInt8 quantized ONNX model in Python.
It quantizes gpt2_model.onnx, which was initially created using the torch.onnx.export API.
For the earlier code, refer to: https://programmerworld.co/ai-gen-ai-generative-artificial-intelligence/how-to-create-local-onnx-file-for-text-generation-in-python-using-a-ai-llm-model/
I hope you like this video. For any questions, suggestions or appreciation, please contact us at: https://programmerworld.co/contact/ or email us at: programmerworld1990@gmail.com
Details:
Python Code:
quantize_model.py
from onnxruntime.quantization import quantize_dynamic, QuantType
# Load your ONNX model
onnx_model_path = "gpt2_model.onnx"
quantized_model_path = "gpt2_model_quantized.onnx"
# Perform dynamic quantization
quantize_dynamic(onnx_model_path, quantized_model_path, weight_type=QuantType.QInt8)
print(f"The quantized model has been saved as '{quantized_model_path}'!")
generate_text_from_onnx.py
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import onnx
from onnxruntime import InferenceSession
# Load the pre-trained GPT-2 model and tokenizer
model_name = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
# Create dummy input for the model
text = "Write a story about sunflowers"
inputs = tokenizer(text, return_tensors="pt")
dummy_input = inputs['input_ids']
# Export the model to ONNX with dynamic axes
onnx_file_path = "gpt2_model.onnx"
# Quantized model produced separately by quantize_model.py
quantized_model_path = "gpt2_model_quantized.onnx"
torch.onnx.export(model, dummy_input, onnx_file_path,
                  input_names=['input_ids'],
                  output_names=['logits'],
                  dynamic_axes={'input_ids': {0: 'batch_size', 1: 'sequence_length'},
                                'logits': {0: 'batch_size', 1: 'sequence_length'}},
                  opset_version=14)
# Validate the exported ONNX model
onnx_model = onnx.load(onnx_file_path)
onnx.checker.check_model(onnx_model)
print(f"The model is valid and has been saved as '{onnx_file_path}'!")
# Perform inference using the quantized ONNX model (created by quantize_model.py) with dynamic input size
session = InferenceSession(quantized_model_path)
# Function to apply top-p (nucleus) sampling
def top_p_sampling(logits, top_p=0.9):
    sorted_logits, sorted_indices = torch.sort(logits, descending=True)
    cumulative_probs = torch.cumsum(torch.nn.functional.softmax(sorted_logits, dim=-1), dim=-1)
    # Mask out tokens whose cumulative probability exceeds top_p
    sorted_indices_to_remove = cumulative_probs > top_p
    # Shift the mask right so at least the most probable token is always kept
    sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
    sorted_indices_to_remove[..., 0] = 0
    for batch_idx in range(sorted_indices.size(0)):
        logits[batch_idx, sorted_indices[batch_idx][sorted_indices_to_remove[batch_idx]]] = -float('Inf')
    return logits
# Function to generate text iteratively with top-p sampling and a logits-scaling penalty
def generate_text(session, tokenizer, initial_input, max_length=1000, top_p=0.9, repetition_penalty=1.2):
    input_ids = initial_input
    generated_text = ""
    for _ in range(max_length):
        # Run the ONNX session on the tokens generated so far
        onnx_inputs = {"input_ids": input_ids.numpy()}
        onnx_outputs = session.run(None, onnx_inputs)
        logits = onnx_outputs[0]
        # Take the logits of the last position and scale them down
        # (dividing all logits acts like a temperature; it is not a per-token repetition penalty)
        logits = torch.tensor(logits[:, -1, :]) / repetition_penalty
        filtered_logits = top_p_sampling(logits, top_p=top_p)  # Apply top-p sampling
        probabilities = torch.nn.functional.softmax(filtered_logits, dim=-1)
        # Sample the next token; shape [1, 1] so it can be concatenated to input_ids
        next_token_id = torch.multinomial(probabilities, 1)
        input_ids = torch.cat([input_ids, next_token_id], dim=-1)
        generated_text = tokenizer.decode(input_ids[0])
        # Optionally, break if the EOS token is generated
        if next_token_id.item() == tokenizer.eos_token_id:
            break
    return generated_text
# Generate longer text
generated_text = generate_text(session, tokenizer, dummy_input, max_length=100)
print("Generated text:", generated_text)
Screenshots: