
Commit b34d859

Add int4 support (#32)
* Add int4 support
* Fix dtypes
* Add dtypes test
* Add dtype to library
* Faster i8 to i4 compression
* hotfix
* Update the profile-llm script
* Add library
* fix script
* Update readme
* Add neural compressor and demo
* Use neural compressor as the default method
* hotfix
* Quantize only quantized models
* Add tests
* fix issue #27
1 parent 5294a5c commit b34d859

File tree

22 files changed: +422 −63 lines changed


CMakeLists.txt

Lines changed: 18 additions & 4 deletions
@@ -37,20 +37,33 @@ function(get_linux_lsb_release_information)
     set(LSB_RELEASE_VERSION "${LSB_RELEASE_VERSION}" PARENT_SCOPE)
 endfunction()
 
-set(OV_VERSION_SHORT "2024.1")
-set(OV_VERSION "2024.1.0.15008.f4afc983258_x86_64")
+set(OV_VERSION_SHORT "nightly")
+set(OV_VERSION "2024.3.0.dev20240524_x86_64")
+set(OV_STORAGE_URL "https://storage.openvinotoolkit.org/repositories/openvino/packages")
+set(OV_NIGHTLY_COMMIT "2024.3.0-15502-66093834e38")
 
 if (WIN32)
     if(NOT OV_LIBRARY_URL)
-        set(OV_LIBRARY_URL "https://storage.openvinotoolkit.org/repositories/openvino/packages/${OV_VERSION_SHORT}/windows/w_openvino_toolkit_windows_${OV_VERSION}.zip")
+        if (${OV_VERSION_SHORT} STREQUAL "nightly")
+            set(OV_PLATFORM "${OV_NIGHTLY_COMMIT}")
+        else()
+            set(OV_PLATFORM "windows")
+        endif()
+        set(OV_LIBRARY_URL "${OV_STORAGE_URL}/${OV_VERSION_SHORT}/${OV_PLATFORM}/w_openvino_toolkit_windows_${OV_VERSION}.zip")
     endif()
 elseif(UNIX)
     if(NOT OV_LIBRARY_URL)
         get_linux_lsb_release_information()
         if (LSB_RELEASE_ID STREQUAL "Ubuntu")
             if (${LSB_RELEASE_VERSION} STREQUAL "18.04" OR ${LSB_RELEASE_VERSION} STREQUAL "20.04" OR ${LSB_RELEASE_VERSION} STREQUAL "22.04")
                 string(REPLACE ".04" "" LSB_RELEASE_VERSION_SHORT ${LSB_RELEASE_VERSION})
-                set(OV_LIBRARY_URL "https://storage.openvinotoolkit.org/repositories/openvino/packages/${OV_VERSION_SHORT}/linux/l_openvino_toolkit_ubuntu${LSB_RELEASE_VERSION_SHORT}_${OV_VERSION}.tgz")
+                if (${OV_VERSION_SHORT} STREQUAL "nightly")
+                    set(OV_PLATFORM "${OV_NIGHTLY_COMMIT}")
+                else()
+                    set(OV_PLATFORM "linux")
+                endif()
+
+                set(OV_LIBRARY_URL "${OV_STORAGE_URL}/${OV_VERSION_SHORT}/${OV_PLATFORM}/l_openvino_toolkit_ubuntu${LSB_RELEASE_VERSION_SHORT}_${OV_VERSION}.tgz")
             else()
                 message(FATAL_ERROR "Ubuntu version ${LSB_RELEASE_VERSION} is unsupported")
             endif()
@@ -63,6 +76,7 @@ else()
     message(FATAL_ERROR "Unsupported architecture")
 endif ()
 
+message(STATUS "OpenVINO library URL: ${OV_LIBRARY_URL}")
 
 FetchContent_Declare(
     openvino

README.md

Lines changed: 2 additions & 2 deletions
@@ -25,8 +25,8 @@ Some useful links
 In our quest to significantly improve the library's performance, we are directing our efforts toward implementing a range of key features, including:
 
 - [x] **8-bit quantization**
-- [ ] **4-bit Quantization and GPTQ**
-- [ ] **NPU-Native mixed precision inference**
+- [x] **4-bit Quantization and GPTQ**
+- [x] **NPU-Native mixed precision inference**
 - [x] **Float16 support**
 - [ ] **BFloat16 (Brain Floating Point Format)**
 - [x] **`torch.compile` support**

examples/phi-2.py

Lines changed: 2 additions & 3 deletions
@@ -7,16 +7,15 @@
 from langchain.chains import LLMChain
 from langchain.llms import HuggingFacePipeline
 from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, TextStreamer
-import intel_npu_acceleration_library
-import torch
+import intel_npu_acceleration_library as npu_lib
 
 model_id = "microsoft/Phi-2"
 
 model = AutoModelForCausalLM.from_pretrained(model_id, use_cache=True).eval()
 tokenizer = AutoTokenizer.from_pretrained(model_id, use_default_system_prompt=True)
 streamer = TextStreamer(tokenizer, skip_special_tokens=True, skip_prompt=True)
 
-npu_model = intel_npu_acceleration_library.compile(model, dtype=torch.float16)
+npu_model = npu_lib.compile(model, dtype=npu_lib.int4)
 
 pipe = pipeline(
     "text-generation",

examples/phi-3-nc.py

Lines changed: 50 additions & 0 deletions
@@ -0,0 +1,50 @@
+#
+# Copyright © 2024 Intel Corporation
+# SPDX-License-Identifier: Apache 2.0
+#
+
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, TextStreamer
+import intel_npu_acceleration_library as npu_lib
+import warnings
+
+torch.random.manual_seed(0)
+
+model = AutoModelForCausalLM.from_pretrained(
+    "microsoft/Phi-3-mini-4k-instruct",
+    torch_dtype="auto",
+    trust_remote_code=True,
+)
+
+model = npu_lib.compile(model, dtype=npu_lib.int4)
+tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
+streamer = TextStreamer(tokenizer, skip_prompt=True)
+
+messages = [
+    {
+        "role": "system",
+        "content": "You are a helpful digital assistant. Please provide safe, ethical and accurate information to the user.",
+    },
+    {
+        "role": "user",
+        "content": "Can you provide ways to eat combinations of bananas and dragonfruits?",
+    },
+]
+
+pipe = pipeline(
+    "text-generation",
+    model=model,
+    tokenizer=tokenizer,
+)
+
+generation_args = {
+    "max_new_tokens": 500,
+    "return_full_text": False,
+    "temperature": 0.0,
+    "do_sample": False,
+    "streamer": streamer,
+}
+
+with warnings.catch_warnings():
+    warnings.simplefilter("ignore")
+    pipe(messages, **generation_args)

include/intel_npu_acceleration_library/conversion.h

Lines changed: 13 additions & 0 deletions
@@ -13,6 +13,19 @@
 
 namespace intel_npu_acceleration_library {
 
+/**
+ * @brief Compress a int8 vector to I4 format.
+ *
+ * @param src pointer to the source int8 buffer
+ * @param dst pointer to the destination uint8 buffer
+ * @param size size of the src and dst buffers
+ */
+void compressToI4(const int8_t* src, uint8_t* dst, size_t size) {
+    for (size_t i = 0; i < size / 2; i++) {
+        dst[i] = (src[2 * i] & 0x0F) | ((src[2 * i + 1] & 0x0F) << 4);
+    }
+}
+
 /**
  * @brief Convert a int8 vector to fp16 given a scalar scale.
  *
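Note: the kernel above packs pairs of int8 values nibble-wise, with the even-indexed element in the low nibble and the odd-indexed element in the high nibble of each output byte. A minimal Python sketch of the same bit arithmetic, assuming the inputs already fit in the signed int4 range [-8, 7] (illustrative only, not part of this commit):

# Sketch of the compressToI4 packing scheme shown above.
def pack_pair(lo: int, hi: int) -> int:
    # Keep only the low 4 bits of each value and pack them into one byte.
    return (lo & 0x0F) | ((hi & 0x0F) << 4)

# Example: 5 -> 0x05, -3 -> 0x0D (two's-complement nibble), packed byte = 0xD5.
assert pack_pair(5, -3) == 0xD5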

intel_npu_acceleration_library/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -4,6 +4,7 @@
 #
 
 from .compiler import compile
+from .dtypes import int4, int8, float16
 
 
-__all__ = ["compile"]
+__all__ = ["compile", "int4", "int8", "float16"]

intel_npu_acceleration_library/backend/bindings.py

Lines changed: 2 additions & 0 deletions
@@ -79,6 +79,8 @@ def init_common(lib: ctypes.CDLL):
 
     lib.isNPUAvailable.restype = ctypes.c_bool
 
+    lib.compressToI4.argtypes = [c_i8_array, c_u8_array, ctypes.c_int]
+
 
 def init_network_factory(lib: ctypes.CDLL):
     """Initialize Netowrk factory bindings.
Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
+#
+# Copyright © 2024 Intel Corporation
+# SPDX-License-Identifier: Apache 2.0
+#
+
+from intel_npu_acceleration_library.backend.bindings import lib as backend_lib
+import numpy as np
+
+
+def compress_to_i4(weights: np.ndarray) -> np.ndarray:
+    """Compress a int8 array to int4.
+
+    Args:
+        weights (np.ndarray): input array
+
+    Returns:
+        np.ndarray: compressed array
+    """
+    compressed_weights = np.zeros(
+        (weights.shape[0], weights.shape[1] // 2), dtype=np.uint8
+    )
+
+    backend_lib.compressToI4(weights, compressed_weights, np.prod(weights.shape))
+    return compressed_weights
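The helper above delegates the actual packing to the native compressToI4 binding. As a cross-check, the same result can be reproduced in pure NumPy; the sketch below is a reference implementation of mine, not library code, and it assumes the int8 weights were already clamped to [-8, 7] and have an even number of columns:

import numpy as np

def compress_to_i4_reference(weights: np.ndarray) -> np.ndarray:
    # Pack consecutive int8 pairs into single uint8 bytes: even index in the
    # low nibble, odd index in the high nibble, mirroring compressToI4.
    flat = weights.astype(np.uint8).reshape(-1, 2)
    packed = (flat[:, 0] & 0x0F) | ((flat[:, 1] & 0x0F) << 4)
    return packed.astype(np.uint8).reshape(weights.shape[0], weights.shape[1] // 2)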

intel_npu_acceleration_library/backend/qlinear.py

Lines changed: 3 additions & 1 deletion
@@ -17,6 +17,7 @@ def __init__(
         batch: int,
         profile: bool = False,
         device: str = "NPU",
+        dtype: np.dtype = np.int8,
     ):
         """Initialize the QLinear class.
 
@@ -26,6 +27,7 @@ def __init__(
             batch (int): batch
             profile (bool): Enable/Disable profiling. Defaults to False.
             device (str): Target device, default to "NPU".
+            dtype (np.dtype): weights datatype. Defaults to np.int8.
 
         Raises:
             RuntimeError: Quantized matmul requires input_channel to be a multiple of 8
@@ -35,7 +37,7 @@ def __init__(
            raise RuntimeError(
                "Quantized matmul requires input_channel to be a multiple of 8"
            )
-        out = self.linear(self.input, outC, inC, bias=False, wt_dtype=np.int8)
+        out = self.linear(self.input, outC, inC, bias=False, wt_dtype=dtype)
        self.compile(out)
 
    def run(
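With the new dtype argument, the same quantized linear kernel can be built for either int8 weights or int4 weights packed into uint8. A minimal instantiation sketch, assuming QLinear is exported by intel_npu_acceleration_library.backend and using arbitrary illustrative channel sizes:

import numpy as np
from intel_npu_acceleration_library.backend import QLinear  # assumed export path

# Default int8 weights vs. int4 weights packed two-per-byte into uint8.
q_int8 = QLinear(inC=512, outC=256, batch=16)
q_int4 = QLinear(inC=512, outC=256, batch=16, dtype=np.uint8)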

intel_npu_acceleration_library/backend/qmatmul.py

Lines changed: 3 additions & 1 deletion
@@ -17,6 +17,7 @@ def __init__(
         batch: int,
         profile: bool = False,
         device: str = "NPU",
+        dtype: np.dtype = np.int8,
     ):
         """Initialize the QMatmul class.
 
@@ -26,9 +27,10 @@ def __init__(
             batch (int): batch
             profile (bool): Enable/Disable profiling. Defaults to False.
             device (str): Target device, default to "NPU".
+            dtype (np.dtype): weights datatype. Defaults to np.int8.
         """
         super().__init__(inC, outC, batch, profile, device)
-        out = self.linear(self.input, outC, inC, bias=False, wt_dtype=np.int8)
+        out = self.linear(self.input, outC, inC, bias=False, wt_dtype=dtype)
         self.compile(out)
 
     def run(self, X: np.ndarray, W: np.ndarray, scale: np.ndarray) -> np.ndarray:

intel_npu_acceleration_library/backend/runtime.py

Lines changed: 14 additions & 4 deletions
@@ -8,6 +8,7 @@
 from intel_npu_acceleration_library.backend import NNFactory
 from torch.profiler import record_function
 from typing import Optional, List, Any, Dict, Deque
+from functools import partial
 from collections import deque
 import numpy as np
 import torch
@@ -46,18 +47,27 @@ def run_matmul(
 
     outC, inC = weights.shape[-2:]
 
+    if weights.dtype == torch.uint8:
+        # In case is Int4 we need to double the input channels because weights are compressed
+        inC *= 2
+
     # Set tensors as contiguous in memory
     x = set_contiguous(x)
     weights = set_contiguous(weights)
     weights = weights.view([-1, weights.shape[-1]])
 
     if weights.dtype.is_floating_point:
         op_class = Linear if op_id is not None else MatMul
+        op_class_name = op_class.__name__
+        create_op = partial(op_class)
         op_args = [weights.to(torch.float16).numpy()]
-    elif weights.dtype == torch.int8:
+    elif weights.dtype in (torch.int8, torch.uint8):
         if scale is None:
             raise RuntimeError("Quantized weights require a not null scale")
         op_class = QLinear if op_id is not None else QMatMul
+        op_class_name = op_class.__name__
+        np_dtype = np.int8 if weights.dtype == torch.int8 else np.uint8
+        create_op = partial(op_class, dtype=np_dtype)
         if scale is None:
             raise RuntimeError(
                 f"Quantized matmul (weights dtype == {weights.dtype}) requires scale (scale = {scale})"
@@ -90,13 +100,13 @@ def run_matmul(
     else:
         batch = real_batch
 
-    key = f"{str(op_class.__name__)}_{batch}_{inC}_x_{outC}_{inC}_{x_np.dtype}"
+    key = f"{str(op_class_name)}_{batch}_{inC}_x_{outC}_{inC}_{x_np.dtype}"
     models = _model_cache.get(key, None)
 
     if models is None:
-        _model_cache[key] = deque([op_class(inC, outC, batch)])
+        _model_cache[key] = deque([create_op(inC, outC, batch)])
     elif len(models) < 1:
-        _model_cache[key].append(op_class(inC, outC, batch))
+        _model_cache[key].append(create_op(inC, outC, batch))
     else:
         _model_cache[key].rotate(1)
 
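The torch.uint8 branch above reflects the int4 packing: a compressed weight tensor of shape (outC, inC // 2) in uint8 holds two int4 values per byte, so the logical input-channel count is twice the stored width. A small shape-arithmetic sketch (tensor sizes are arbitrary illustrations, not values used by the library):

import torch

# Hypothetical packed int4 weight: 256 output channels, 512 logical input
# channels stored as 256 uint8 bytes per row (two int4 values per byte).
packed = torch.zeros(256, 256, dtype=torch.uint8)

outC, inC = packed.shape[-2:]
if packed.dtype == torch.uint8:
    inC *= 2  # recover the logical input-channel count

assert (outC, inC) == (256, 512)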

intel_npu_acceleration_library/compiler.py

Lines changed: 24 additions & 13 deletions
@@ -4,20 +4,14 @@
 #
 
 from intel_npu_acceleration_library.optimizations import horizontal_fusion_linear
+from transformers.models.llama.modeling_llama import LlamaMLP, LlamaAttention
+from transformers.models.gemma.modeling_gemma import GemmaMLP, GemmaAttention
+from neural_compressor.adaptor.torch_utils.model_wrapper import WeightOnlyLinear
+from intel_npu_acceleration_library.quantization import quantize_model
+from intel_npu_acceleration_library.dtypes import int8, int4
+import intel_npu_acceleration_library.nn as nn
 from torch._dynamo import register_backend
 from typing import Union, Callable, Any
-
-try:
-    from transformers.models.llama.modeling_llama import LlamaMLP, LlamaAttention
-    from transformers.models.gemma.modeling_gemma import GemmaMLP, GemmaAttention
-
-    is_transformers_available = True
-except ModuleNotFoundError:
-    # Transformer library is not installed
-    is_transformers_available = False
-
-
-import intel_npu_acceleration_library.nn as nn
 from typing import List
 import torch
 
@@ -38,7 +32,7 @@ def compile(
     Returns:
         torch.nn.Module: compiled NPU nn.Module
     """
-    if not (dtype.is_floating_point or dtype == torch.int8):
+    if not (dtype.is_floating_point or dtype in (int8, int4)):
        raise RuntimeError(
            f"intel-npu-acceleration-library library do not support yet the requeste datatype: {dtype}"
        )
@@ -48,6 +42,9 @@ def compile(
     # General optimizations
     apply_horizontal_fusion(model)
     optimize_llama_attention(model, dtype)
+    if dtype in (int8, int4):
+        # Quantize model
+        model = quantize_model(model, dtype)
 
     # Model lowering to NPU ops
     lower_linear(model, dtype)
@@ -102,13 +99,27 @@ def lower_linear(
         layer (torch.nn.Module): Original torch.nn.Linear module
         dtype (torch.dtype): Target datatype
 
+    Raises:
+        RuntimeError: unsupported quantization bits
+
     Returns:
         Union[torch.nn.Module, None]: Return the new NPU operator or None
     """
     if isinstance(layer, torch.nn.Linear):
         return nn.Linear.fromTorch(layer, dtype)
     if isinstance(layer, torch.nn.Conv2d):
         return nn.Conv2d.fromTorch(layer, dtype)
+    if isinstance(layer, WeightOnlyLinear):
+        if layer.bits == 4:
+            return nn.QuantizedLinear(
+                layer.qweight.to(torch.uint8), layer.scales, layer.bias
+            )
+        elif layer.bits == 8:
+            return nn.QuantizedLinear(
+                layer.qweight.view(torch.int8), layer.scales, layer.bias
+            )
+        else:
+            raise RuntimeError(f"Unsupported quantization bits: {layer.bits}")
     return None
 
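Taken together, the int4 path is: compile() validates the requested dtype, quantize_model() (backed by neural compressor) rewrites eligible layers as WeightOnlyLinear, and lower_linear() then swaps each of those for an NPU QuantizedLinear holding the packed uint8 weights. A minimal end-to-end usage sketch mirroring the examples earlier in this commit; the model name here is only an illustrative assumption:

import intel_npu_acceleration_library as npu_lib
from transformers import AutoModelForCausalLM

# Any Hugging Face causal LM can be used; a small chat model keeps the sketch light.
model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0").eval()

# Quantize to int4 with neural compressor and lower the result to NPU ops.
npu_model = npu_lib.compile(model, dtype=npu_lib.int4)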
