
Commit 27a0907

backport MM256_SET_M128I to ggml_v2, updated lite, added support for selecting the GPU for cublas
1 parent 220aa70 commit 27a0907

File tree

7 files changed: +54 -31 lines

expose.cpp
expose.h
ggml.c
gpttype_adapter.cpp
klite.embd
koboldcpp.py
otherarch/ggml_v2.c

expose.cpp

Lines changed: 5 additions & 5 deletions

@@ -47,14 +47,14 @@ extern "C"
     }
 
     //first digit is whether configured, second is platform, third is devices
-    int parseinfo = inputs.clblast_info;
+    int cl_parseinfo = inputs.clblast_info;
 
-    std::string usingclblast = "GGML_OPENCL_CONFIGURED="+std::to_string(parseinfo>0?1:0);
+    std::string usingclblast = "GGML_OPENCL_CONFIGURED="+std::to_string(cl_parseinfo>0?1:0);
     putenv((char*)usingclblast.c_str());
 
-    parseinfo = parseinfo%100; //keep last 2 digits
-    int platform = parseinfo/10;
-    int devices = parseinfo%10;
+    cl_parseinfo = cl_parseinfo%100; //keep last 2 digits
+    int platform = cl_parseinfo/10;
+    int devices = cl_parseinfo%10;
     platformenv = "GGML_OPENCL_PLATFORM="+std::to_string(platform);
     deviceenv = "GGML_OPENCL_DEVICE="+std::to_string(devices);
     putenv((char*)platformenv.c_str());
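Note: the packed encoding described by the comment above can be traced end to end: koboldcpp.py builds the value as 100 + platform*10 + device, and this file peels the digits back off. A minimal standalone C sketch of the decode (the value 110 is an assumed example, meaning configured, platform 1, device 0):

    #include <stdio.h>

    int main(void) {
        int cl_parseinfo = 110;                    // assumed example: 100 + 1*10 + 0
        int configured = cl_parseinfo > 0 ? 1 : 0; // -> GGML_OPENCL_CONFIGURED=1
        cl_parseinfo = cl_parseinfo % 100;         // keep last 2 digits -> 10
        int platform = cl_parseinfo / 10;          // -> GGML_OPENCL_PLATFORM=1
        int devices  = cl_parseinfo % 10;          // -> GGML_OPENCL_DEVICE=0
        printf("configured=%d platform=%d device=%d\n", configured, platform, devices);
        return 0;
    }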

expose.h

Lines changed: 1 addition & 0 deletions

@@ -30,6 +30,7 @@ struct load_model_inputs
     const bool use_smartcontext;
     const bool unban_tokens;
     const int clblast_info = 0;
+    const int cublas_info = 0;
     const int blasbatchsize = 512;
     const int debugmode = 0;
     const int forceversion = 0;
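Note: this struct is filled in from Python via ctypes, so the position of the new cublas_info member must match the _fields_ order in koboldcpp.py exactly, or every later field is read from the wrong offset. A simplified C fragment of the relevant slice (hypothetical name, other members omitted for brevity):

    #include <stdbool.h>

    struct load_model_inputs_fragment {
        bool use_smartcontext;
        bool unban_tokens;
        int clblast_info;  // packed CLBlast configured/platform/device digits
        int cublas_info;   // new: CUDA main-device ID, 0 by default
        int blasbatchsize;
    };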

ggml.c

Lines changed: 2 additions & 1 deletion

@@ -588,8 +588,9 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
 //
 // quantization
 //
-
+#ifndef MM256_SET_M128I
 #define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
+#endif
 
 #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
 // multiply int8_t, add results pairwise twice
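Note: the #ifndef guard lets ggml.c and otherarch/ggml_v2.c (which now carries its own copy, below) define the macro without redefinition warnings. A minimal sketch of what the macro computes, assuming an AVX-capable compiler (build with -mavx): the second argument fills the low 128 bits and the first the high 128 bits, emulating _mm256_set_m128i for toolchains that lack that intrinsic.

    #include <immintrin.h>
    #include <stdio.h>

    #ifndef MM256_SET_M128I
    #define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
    #endif

    int main(void) {
        __m128i lo = _mm_set1_epi32(1), hi = _mm_set1_epi32(2);
        __m256i v = MM256_SET_M128I(hi, lo);
        int out[8];
        _mm256_storeu_si256((__m256i *)out, v);
        printf("%d %d\n", out[0], out[7]); // prints "1 2": lo in lanes 0-3, hi in lanes 4-7
        return 0;
    }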

gpttype_adapter.cpp

Lines changed: 10 additions & 0 deletions

@@ -347,7 +347,16 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     //this is used for the mem_per_token eval, openblas needs more RAM
     bool use_scratch = ggml_cpu_has_gpublas();
 
+    int cu_parseinfo_maindevice = inputs.cublas_info<0?0:inputs.cublas_info;
+
     printf("System Info: %s\n", llama_print_system_info());
+    #if defined(GGML_USE_CUBLAS)
+    if(ggml_cpu_has_gpublas() && cu_parseinfo_maindevice>0)
+    {
+        printf("CUBLAS: Set main device to %d\n",cu_parseinfo_maindevice);
+        ggml_cuda_set_main_device(cu_parseinfo_maindevice);
+    }
+    #endif
     SetQuantsUnshuffled(false);
     if(file_format == FileFormat::GGML || file_format == FileFormat::GGHF || file_format == FileFormat::GGJT || file_format == FileFormat::GGJT_2)
     {
@@ -412,6 +421,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     llama_ctx_params.use_mmap = inputs.use_mmap;
     llama_ctx_params.use_mlock = inputs.use_mlock;
     llama_ctx_params.n_gpu_layers = inputs.gpulayers;
+    llama_ctx_params.main_gpu = cu_parseinfo_maindevice;
 
     llama_ctx_v3 = llama_init_from_file(modelname.c_str(), llama_ctx_params);
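Note: cu_parseinfo_maindevice clamps negative IDs to 0 and only calls ggml_cuda_set_main_device for a nonzero ID, since device 0 is already the CUDA default; the same value also reaches the llama.cpp path through llama_ctx_params.main_gpu. A standalone sketch of just the clamp-and-guard logic (the ggml call is left as a comment because it needs a GGML_USE_CUBLAS build):

    #include <stdio.h>

    int main(void) {
        int cublas_info = 1; // e.g. parsed from "--usecublas normal 1"
        int main_device = cublas_info < 0 ? 0 : cublas_info; // clamp negatives to GPU 0
        if (main_device > 0) {
            printf("CUBLAS: Set main device to %d\n", main_device);
            // ggml_cuda_set_main_device(main_device); // real call in the CUDA build
        }
        return 0;
    }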

klite.embd

Lines changed: 8 additions & 7 deletions
Large diffs are not rendered by default.

koboldcpp.py

Lines changed: 15 additions & 8 deletions

@@ -30,6 +30,7 @@ class load_model_inputs(ctypes.Structure):
                 ("use_smartcontext", ctypes.c_bool),
                 ("unban_tokens", ctypes.c_bool),
                 ("clblast_info", ctypes.c_int),
+                ("cublas_info", ctypes.c_int),
                 ("blasbatchsize", ctypes.c_int),
                 ("debugmode", ctypes.c_int),
                 ("forceversion", ctypes.c_int),
@@ -111,7 +112,7 @@ def init_library():
         else:
             print("Attempting to use CLBlast library for faster prompt ingestion. A compatible clblast will be required.")
             use_clblast = True
-    elif (args.usecublas and args.usecublas!=""):
+    elif (args.usecublas is not None):
         if not file_exists(lib_cublas):
             print("Warning: CuBLAS library file not found. Non-BLAS library will be used.")
         else:
@@ -166,7 +167,7 @@ def load_model(model_filename):
     inputs.batch_size = 8
     inputs.max_context_length = maxctx #initial value to use for ctx, can be overwritten
     inputs.threads = args.threads
-    inputs.low_vram = (True if args.usecublas=="lowvram" else False)
+    inputs.low_vram = (True if (args.usecublas and "lowvram" in args.usecublas) else False)
     inputs.blasthreads = args.blasthreads
     inputs.f16_kv = True
     inputs.use_mmap = (not args.nommap)
@@ -187,6 +188,11 @@ def load_model(model_filename):
     if args.useclblast:
         clblastids = 100 + int(args.useclblast[0])*10 + int(args.useclblast[1])
     inputs.clblast_info = clblastids
+    inputs.cublas_info = 0
+    if (args.usecublas and "1" in args.usecublas):
+        inputs.cublas_info = 1
+    elif (args.usecublas and "2" in args.usecublas):
+        inputs.cublas_info = 2
     inputs.executable_path = (getdirpath()+"/").encode("UTF-8")
     inputs.debugmode = args.debugmode
     ret = handle.load_model(inputs)
@@ -805,7 +811,7 @@ def changerunmode(a,b,c):
     makeslider(quick_tab, "Context Size:", contextsize_text, context_var, 0, len(contextsize_text)-1, 30, set=2)
 
     # load model
-    makefileentry(quick_tab, "Model:", "Select Model File", model_var, 40, 170)
+    makefileentry(quick_tab, "Model:", "Select GGML Model File", model_var, 40, 170)
 
     # Hardware Tab
     hardware_tab = tabcontent["Hardware"]
@@ -867,7 +873,7 @@ def togglemiro(a,b,c):
     # Model Tab
     model_tab = tabcontent["Model"]
 
-    makefileentry(model_tab, "Model:", "Select Model File", model_var, 1)
+    makefileentry(model_tab, "Model:", "Select GGML Model File", model_var, 1)
     makefileentry(model_tab, "Lora:", "Select Lora File",lora_var, 3)
     makefileentry(model_tab, "Lora Base:", "Select Lora Base File", lora_base_var, 5)
 
@@ -947,7 +953,7 @@ def switch_old_gui():
     if runopts_var.get() == runopts[1]:
         args.useclblast = [[0,0], [1,0], [0,1]][int(gpu_choice_var.get())-1]
     if runopts_var.get() == runopts[2]:
-        args.usecublas = "lowvram" if lowvram_var.get() == 1 else "normal"
+        args.usecublas = ["lowvram"] if lowvram_var.get() == 1 else ["normal"]
         if gpulayers_var.get():
             args.gpulayers = int(gpulayers_var.get())
     if runopts_var.get()==runopts[3]:
@@ -1094,7 +1100,7 @@ def onDropdownChange(event):
     if selrunchoice==runopts[3]:
         args.useclblast = [0,1]
     if selrunchoice==runopts[4]:
-        args.usecublas = True
+        args.usecublas = ["normal"]
     if selrunchoice==runopts[5]:
         args.noblas = True
     if selrunchoice==runopts[6]:
@@ -1290,7 +1296,8 @@ def main(args):
     parser.add_argument("--blasbatchsize", help="Sets the batch size used in BLAS processing (default 512). Setting it to -1 disables BLAS mode, but keeps other benefits like GPU offload.", type=int,choices=[-1,32,64,128,256,512,1024], default=512)
     parser.add_argument("--stream", help="Uses streaming when generating tokens. Only for the Kobold Lite UI.", action='store_true')
     parser.add_argument("--smartcontext", help="Reserving a portion of context to try processing less frequently.", action='store_true')
-    parser.add_argument("--unbantokens", help="Normally, KoboldAI prevents certain tokens such as EOS and Square Brackets. This flag unbans them.", action='store_true')
+    parser.add_argument("--unbantokens", help="Normally, KoboldAI prevents the EOS token from being generated. This flag unbans it.", action='store_true')
+    parser.add_argument("--bantokens", help="You can manually specify a list of token IDs that the AI cannot use.", metavar=('[elements]'), nargs='+')
     parser.add_argument("--usemirostat", help="Experimental! Replaces your samplers with mirostat. Takes 3 params = [type(0/1/2), tau(5.0), eta(0.1)].",metavar=('[type]', '[tau]', '[eta]'), type=float, nargs=3)
     parser.add_argument("--forceversion", help="If the model file format detection fails (e.g. rogue modified model) you can set this to override the detected format (enter desired version, e.g. 401 for GPTNeoX-Type2).",metavar=('[version]'), type=int, default=0)
     parser.add_argument("--nommap", help="If set, do not use mmap to load newer models", action='store_true')
@@ -1302,7 +1309,7 @@ def main(args):
     compatgroup = parser.add_mutually_exclusive_group()
     compatgroup.add_argument("--noblas", help="Do not use OpenBLAS for accelerated prompt ingestion", action='store_true')
     compatgroup.add_argument("--useclblast", help="Use CLBlast for GPU Acceleration. Must specify exactly 2 arguments, platform ID and device ID (e.g. --useclblast 1 0).", type=int, choices=range(0,9), nargs=2)
-    compatgroup.add_argument("--usecublas", help="Use CuBLAS for GPU Acceleration. Requires Nvidia GPU. Select lowvram to not allocate VRAM scratch buffer.", default='', const='normal', nargs='?', choices=['normal', 'lowvram'])
+    compatgroup.add_argument("--usecublas", help="Use CuBLAS for GPU Acceleration. Requires Nvidia GPU. Select lowvram to not allocate VRAM scratch buffer. Enter a number after to select a different main GPU.", nargs='*',metavar=('[lowvram|normal] [main GPU ID]'), choices=['normal', 'lowvram', '0', '1', '2'])
     parser.add_argument("--gpulayers", help="Set number of layers to offload to GPU when using GPU. Requires GPU.",metavar=('[GPU layers]'), type=int, default=0)
     args = parser.parse_args()
     main(args)
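Note: --usecublas now takes zero or more tokens (nargs='*'), so a lowvram/normal mode and a main-GPU digit can be combined, e.g. --usecublas lowvram 1. The Python side just tests membership in args.usecublas; a hypothetical C rendering of the same token scan (names and values are illustrative only):

    #include <stdio.h>
    #include <string.h>

    int main(void) {
        const char *usecublas[] = { "lowvram", "1" }; // as if from: --usecublas lowvram 1
        int low_vram = 0, cublas_info = 0;
        for (int i = 0; i < 2; i++) {
            if (strcmp(usecublas[i], "lowvram") == 0) low_vram = 1;   // skip VRAM scratch buffer
            else if (strcmp(usecublas[i], "1") == 0) cublas_info = 1; // main GPU 1
            else if (strcmp(usecublas[i], "2") == 0) cublas_info = 2; // main GPU 2
        }
        printf("low_vram=%d main_gpu=%d\n", low_vram, cublas_info);
        return 0;
    }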

otherarch/ggml_v2.c

Lines changed: 13 additions & 10 deletions

@@ -472,6 +472,9 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
 //
 // quantization
 //
+#ifndef MM256_SET_M128I
+#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
+#endif
 
 #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
 // multiply int8_t, add results pairwise twice
@@ -532,7 +535,7 @@ static inline __m256i bytes_from_bits_32(const uint8_t * x) {
 static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi)
 {
     const __m128i tmp = _mm_loadu_si128((const __m128i *)rsi);
-    const __m256i bytes = _mm256_set_m128i(_mm_srli_epi16(tmp, 4), tmp);
+    const __m256i bytes = MM256_SET_M128I(_mm_srli_epi16(tmp, 4), tmp);
     const __m256i lowMask = _mm256_set1_epi8( 0xF );
     return _mm256_and_si256(lowMask, bytes);
 }
@@ -605,7 +608,7 @@ static inline __m256i bytes_from_bits_32(const uint8_t * x) {
     bytesh = _mm_or_si128(bytesh, bit_mask);
     bytesl = _mm_cmpeq_epi8(bytesl, _mm_set1_epi64x(-1));
     bytesh = _mm_cmpeq_epi8(bytesh, _mm_set1_epi64x(-1));
-    return _mm256_set_m128i(bytesh, bytesl);
+    return MM256_SET_M128I(bytesh, bytesl);
 }
 
 // Unpack 32 4-bit fields into 32 bytes
@@ -618,15 +621,15 @@ static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi)
     const __m128i lowMask = _mm_set1_epi8(0xF);
     tmpl = _mm_and_si128(lowMask, tmpl);
     tmph = _mm_and_si128(lowMask, tmph);
-    return _mm256_set_m128i(tmph, tmpl);
+    return MM256_SET_M128I(tmph, tmpl);
 }
 
 // add int16_t pairwise and return as float vector
 static inline __m256 sum_i16_pairs_float(const __m128i xh, const __m128i xl) {
     const __m128i ones = _mm_set1_epi16(1);
     const __m128i summed_pairsl = _mm_madd_epi16(ones, xl);
     const __m128i summed_pairsh = _mm_madd_epi16(ones, xh);
-    const __m256i summed_pairs = _mm256_set_m128i(summed_pairsh, summed_pairsl);
+    const __m256i summed_pairs = MM256_SET_M128I(summed_pairsh, summed_pairsl);
     return _mm256_cvtepi32_ps(summed_pairs);
 }
@@ -2246,7 +2249,7 @@ static void ggml_v2_vec_dot_q4_0_q8_0(const int n, float * restrict s, const voi
         const __m128i i32_1 = mul_sum_i8_pairs(bx, by);
 
         // Convert int32_t to float
-        __m256 p = _mm256_cvtepi32_ps(_mm256_set_m128i(i32_0, i32_1));
+        __m256 p = _mm256_cvtepi32_ps(MM256_SET_M128I(i32_0, i32_1));
 
         // Apply the scale, and accumulate
         acc = _mm256_add_ps(_mm256_mul_ps( d, p ), acc);
@@ -2727,7 +2730,7 @@ static void ggml_v2_vec_dot_q5_0_q8_0(const int n, float * restrict s, const voi
         __m128i bxh = _mm256_extractf128_si256(bx, 1);
         bxl = _mm_or_si128(bxl, bxhil);
         bxh = _mm_or_si128(bxh, bxhih);
-        bx = _mm256_set_m128i(bxh, bxl);
+        bx = MM256_SET_M128I(bxh, bxl);
 
         const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
 
@@ -2989,7 +2992,7 @@ static void ggml_v2_vec_dot_q5_1_q8_1(const int n, float * restrict s, const voi
         __m128i bxh = _mm256_extractf128_si256(bx, 1);
         bxl = _mm_or_si128(bxl, bxhil);
         bxh = _mm_or_si128(bxh, bxhih);
-        bx = _mm256_set_m128i(bxh, bxl);
+        bx = MM256_SET_M128I(bxh, bxl);
 
         const __m256 dy = _mm256_broadcast_ss(&y[i].d);
         const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
@@ -17417,7 +17420,7 @@ static void ggml_v2_vec_dot_q4_0_q8_0_v2(const int n, float * restrict s, const
        }
 
        // Convert int32_t to float
-       __m256 p = _mm256_cvtepi32_ps( _mm256_set_m128i( i32[0], i32[1] ));
+       __m256 p = _mm256_cvtepi32_ps( MM256_SET_M128I( i32[0], i32[1] ));
        // Apply the scale, and accumulate
        acc = _mm256_add_ps(_mm256_mul_ps( d, p ), acc);
    }
@@ -17686,7 +17689,7 @@ static void ggml_v2_vec_dot_q4_2_q8_0_v2(const int n, float * restrict s, const
 
        __m128i bx0 = bytes_from_nibbles_16(x[2*i + 0].qs);
        __m128i bx1 = bytes_from_nibbles_16(x[2*i + 1].qs);
-       __m256i bx = _mm256_set_m128i(bx1, bx0);
+       __m256i bx = MM256_SET_M128I(bx1, bx0);
 
        // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval.
        const __m256i off = _mm256_set1_epi8(8);
@@ -17819,7 +17822,7 @@ static void ggml_v2_vec_dot_q4_3_q8_1_v2(const int n, float * restrict s, const
 
        const __m128i bx0 = bytes_from_nibbles_16(x[2*i + 0].qs);
        const __m128i bx1 = bytes_from_nibbles_16(x[2*i + 1].qs);
-       const __m256i bx = _mm256_set_m128i(bx1, bx0);
+       const __m256i bx = MM256_SET_M128I(bx1, bx0);
 
        const __m256 dy = _mm256_broadcast_ss(&y[i].d);
        const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
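Note: every direct _mm256_set_m128i call in this file is swapped for the backported macro, so ggml_v2.c builds even on toolchains that never shipped that intrinsic. Where the intrinsic does exist, the two should be bit-identical; a quick sanity sketch under that assumption (compile with -mavx):

    #include <immintrin.h>
    #include <stdio.h>
    #include <string.h>

    #ifndef MM256_SET_M128I
    #define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
    #endif

    int main(void) {
        __m128i hi = _mm_set1_epi8(0x22), lo = _mm_set1_epi8(0x11);
        __m256i a = MM256_SET_M128I(hi, lo);
        __m256i b = _mm256_set_m128i(hi, lo); // the intrinsic being emulated
        printf("%s\n", memcmp(&a, &b, sizeof a) == 0 ? "match" : "mismatch");
        return 0;
    }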
