
Commit ae32a25

Committed on Apr 6, 2023
Rewrite loading code to try to satisfy everyone
Features:

- Support all three formats (ggml, ggmf, ggjt). (However, I didn't include the hack needed to support GPT4All files without conversion. Those can still be used after converting them with convert.py from my other PR.)
- Support both mmap and read (mmap is used by default, but can be disabled with `--no-mmap`, and is automatically disabled for pre-ggjt files or on platforms where mmap is not supported).
- Support multi-file models like before, but automatically determine the number of parts rather than requiring `--n_parts`.
- Improve validation and error checking.
- Stop using the per-file type field (f16) entirely in favor of just relying on the per-tensor type/size fields. This has no immediate benefit, but makes it easier to experiment with different formats, and should make it easier to support the new GPTQ-for-LLaMa models in the future (I have some work in progress on that front).
- Support VirtualLock on Windows (using the same `--mlock` option as on Unix).
- Indicate loading progress when using mmap + mlock. (Which led me to the interesting observation that on my Linux machine, with a warm file cache, mlock actually takes some time, whereas mmap without mlock starts almost instantly...)
  - To help implement this, move mlock support from ggml to the loading code.
- madvise/PrefetchVirtualMemory support (based on #740).
- Switch from ifstream to the `fopen` family of functions to avoid unnecessary copying and, when mmap is enabled, allow reusing the same file descriptor for both metadata reads and mmap (whereas the existing implementation opens the file a second time to mmap).
- Quantization now produces a single-file output even with multi-file inputs (not really a feature as much as 'it was easier this way').

Todo:

- **VirtualLock does not work at all** on the one Windows machine I tested it on (it complains about quota). Figure out why.
- Verify that using the `fopen` family of functions actually does what I think it does, performance-wise.
- More testing.

Implementation notes:

I tried to factor the code into more discrete pieces than before. Regarding code style, I tried to follow the existing conventions, but I'm naughty and used a few advanced C++ features repeatedly:

- Destructors to make it easier to ensure everything gets cleaned up.
- Exceptions. I don't even usually use exceptions when writing C++, and I can remove them if desired... but here they make the loading code much more succinct while still properly handling a variety of errors, ranging from API calls failing to integer overflow and allocation failure.

Co-authored-by: Pavol Rusnak <[email protected]> (for the bit I copied from #740)
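As context for the format support above, here is a minimal sketch of how the three magics can be told apart, written against the `llama_file` helper added in `llama_util.h` at the end of this diff. The enum and function names are illustrative, not the commit's exact code; the magic constants are the hex encodings of the tags 'ggml' (0x67676d6c), 'ggmf' (0x67676d66) and 'ggjt' (0x67676a74).

    // Sketch only: classify a model file by its leading magic.
    enum llama_file_version {
        LLAMA_FILE_VERSION_GGML,    // original, unversioned format
        LLAMA_FILE_VERSION_GGMF_V1, // adds a version field after the magic
        LLAMA_FILE_VERSION_GGJT_V1, // adds aligned, mmap-friendly tensor data
    };

    static llama_file_version read_file_version(llama_file & file) {
        uint32_t magic = file.read_u32();
        if (magic == 0x67676d6c) {          // 'ggml': no version field follows
            return LLAMA_FILE_VERSION_GGML;
        }
        uint32_t version = file.read_u32(); // 'ggmf'/'ggjt' carry a version
        if (magic == 0x67676d66 && version == 1) {
            return LLAMA_FILE_VERSION_GGMF_V1;
        }
        if (magic == 0x67676a74 && version == 1) {
            return LLAMA_FILE_VERSION_GGJT_V1;
        }
        throw format("unknown (magic, version) combination: %08x, %08x", magic, version);
    }

Only the ggjt layout aligns tensor data within the file, which is why mmap is automatically disabled for the two older formats.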
1 parent: eeaa7b0 · commit: ae32a25

File tree

10 files changed: +1167, -814 lines
 

‎examples/common.cpp

Lines changed: 6 additions & 3 deletions

@@ -1,7 +1,5 @@
 #include "common.h"
 
-#include "ggml.h"
-
 #include <cassert>
 #include <cstring>
 #include <fstream>
@@ -154,6 +152,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
             params.use_color = true;
         } else if (arg == "--mlock") {
             params.use_mlock = true;
+        } else if (arg == "--no-mmap") {
+            params.use_mmap = false;
         } else if (arg == "--mtest") {
             params.mem_test = true;
         } else if (arg == "--verbose-prompt") {
@@ -233,9 +233,12 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stderr, "  -b N, --batch_size N  batch size for prompt processing (default: %d)\n", params.n_batch);
     fprintf(stderr, "  --perplexity          compute perplexity over the prompt\n");
     fprintf(stderr, "  --keep                number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
-    if (ggml_mlock_supported()) {
+    if (llama_mlock_supported()) {
         fprintf(stderr, "  --mlock               force system to keep model in RAM rather than swapping or compressing\n");
     }
+    if (llama_mmap_supported()) {
+        fprintf(stderr, "  --no-mmap             do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
+    }
     fprintf(stderr, "  --mtest               compute maximum memory usage\n");
     fprintf(stderr, "  --verbose-prompt      print prompt before generation\n");
     fprintf(stderr, "  -m FNAME, --model FNAME\n");

‎examples/common.h

Lines changed: 1 addition & 0 deletions

@@ -47,6 +47,7 @@ struct gpt_params {
     bool instruct       = false; // instruction mode (used for Alpaca models)
     bool ignore_eos     = false; // do not stop generating after eos
     bool perplexity     = false; // compute perplexity over the prompt
+    bool use_mmap       = true;  // use mmap for faster loads
     bool use_mlock      = false; // use mlock to keep model in memory
     bool mem_test       = false; // compute maximum memory usage
     bool verbose_prompt = false; // print prompt tokens before generation

‎examples/embedding/embedding.cpp

Lines changed: 1 addition & 0 deletions

@@ -38,6 +38,7 @@ int main(int argc, char ** argv) {
     lparams.seed       = params.seed;
     lparams.f16_kv     = params.memory_f16;
     lparams.logits_all = params.perplexity;
+    lparams.use_mmap   = params.use_mmap;
     lparams.use_mlock  = params.use_mlock;
     lparams.embedding  = params.embedding;

‎examples/main/main.cpp

Lines changed: 1 addition & 0 deletions

@@ -97,6 +97,7 @@ int main(int argc, char ** argv) {
     lparams.n_parts   = params.n_parts;
     lparams.seed      = params.seed;
     lparams.f16_kv    = params.memory_f16;
+    lparams.use_mmap  = params.use_mmap;
     lparams.use_mlock = params.use_mlock;
 
     ctx = llama_init_from_file(params.model.c_str(), lparams);

‎examples/perplexity/perplexity.cpp

Lines changed: 1 addition & 0 deletions

@@ -115,6 +115,7 @@ int main(int argc, char ** argv) {
     lparams.seed       = params.seed;
     lparams.f16_kv     = params.memory_f16;
     lparams.logits_all = params.perplexity;
+    lparams.use_mmap   = params.use_mmap;
     lparams.use_mlock  = params.use_mlock;
     lparams.embedding  = params.embedding;

‎ggml.c

Lines changed: 0 additions & 78 deletions

@@ -97,17 +97,6 @@ typedef void* thread_ret_t;
 #define static_assert(cond, msg) _Static_assert(cond, msg)
 #endif
 
-#define GGML_MLOCK_SUPPORT 0
-
-#ifdef __has_include
-#if __has_include(<sys/mman.h>)
-#undef GGML_MLOCK_SUPPORT
-#define GGML_MLOCK_SUPPORT 1
-#include <sys/mman.h>
-#endif
-#endif
-
-
 /*#define GGML_PERF*/
 #define GGML_DEBUG 0
 #define GGML_GELU_FP16
@@ -2690,21 +2679,6 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
 
 static_assert(GGML_OP_COUNT == 35, "GGML_OP_COUNT != 35");
 
-//
-// ggml object
-//
-
-struct ggml_object {
-    size_t offs;
-    size_t size;
-
-    struct ggml_object * next;
-
-    char padding[8];
-};
-
-static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
-
 static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
 static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
 
@@ -2716,7 +2690,6 @@ struct ggml_context {
     size_t mem_size;
     void * mem_buffer;
     bool   mem_buffer_owned;
-    bool   mem_buffer_mlocked;
     bool   no_alloc;
 
     int n_objects;
@@ -3003,7 +2976,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
         /*.mem_size           =*/ params.mem_size,
         /*.mem_buffer         =*/ params.mem_buffer ? params.mem_buffer : malloc(params.mem_size),
         /*.mem_buffer_owned   =*/ params.mem_buffer ? false : true,
-        /*.mem_buffer_mlocked =*/ false,
         /*.no_alloc           =*/ params.no_alloc,
         /*.n_objects          =*/ 0,
         /*.objects_begin      =*/ NULL,
@@ -3036,14 +3008,6 @@
     GGML_PRINT_DEBUG("%s: context %d with %d objects has been freed. memory used = %zu\n",
             __func__, i, ctx->n_objects, ctx->objects_end->offs + ctx->objects_end->size);
 
-#if GGML_MLOCK_SUPPORT
-    if (ctx->mem_buffer_mlocked) {
-        if (munlock(ctx->mem_buffer, ctx->mem_size)) {
-            fprintf(stderr, "%s: failed to munlock buffer: %s\n", __func__, strerror(errno));
-        }
-    }
-#endif
-
     if (ctx->mem_buffer_owned) {
         free(ctx->mem_buffer);
     }
@@ -3072,48 +3036,6 @@ size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch)
     return result;
 }
 
-#ifdef __APPLE__
-#define MLOCK_SUGGESTION \
-    "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
-    "decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l).\n"
-#else
-#define MLOCK_SUGGESTION \
-    "Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n"
-#endif
-
-bool ggml_mlock_supported(void) {
-    return GGML_MLOCK_SUPPORT;
-}
-
-bool ggml_mlock(
-        struct ggml_context * ctx,
-        const void *opt_extra_addr,
-        size_t opt_extra_len,
-        char **err_p) {
-    // TODO: Use SetProcessWorkingSetSize() + VirtualLock() on WIN32
-#if GGML_MLOCK_SUPPORT
-    if (ctx->mem_buffer_mlocked) {
-        return true;
-    }
-    if (mlock(ctx->mem_buffer, ctx->mem_size) ||
-        (opt_extra_len &&
-         mlock(opt_extra_addr, opt_extra_len))) {
-        if ((*err_p = malloc(1024))) {
-            snprintf(*err_p, 1024,
-                     "failed to mlock %zu-byte buffer: %s\n" MLOCK_SUGGESTION,
-                     ctx->mem_size + opt_extra_len,
-                     strerror(errno));
-        }
-        return false;
-    }
-    ctx->mem_buffer_mlocked = true;
-    return true;
-#else // GGML_MLOCK_SUPPORT
-    *err_p = strdup("can't mlock because it's not supported on this system");
-    return false;
-#endif // GGML_MLOCK_SUPPORT
-}
-
 ////////////////////////////////////////////////////////////////////////////////
 
 struct ggml_tensor * ggml_new_tensor_impl(

‎ggml.h

Lines changed: 13 additions & 7 deletions

@@ -253,6 +253,19 @@ enum ggml_op {
     GGML_OP_COUNT,
 };
 
+
+// ggml object
+struct ggml_object {
+    size_t offs;
+    size_t size;
+
+    struct ggml_object * next;
+
+    char padding[8];
+};
+
+static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
+
 // n-dimensional tensor
 struct ggml_tensor {
     enum ggml_type type;
@@ -344,13 +357,6 @@ size_t ggml_used_mem(const struct ggml_context * ctx);
 
 size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
 
-bool ggml_mlock_supported(void);
-bool ggml_mlock(
-    struct ggml_context * ctx,
-    const void *opt_extra_addr,
-    size_t opt_extra_len,
-    char **err_p);
-
 struct ggml_tensor * ggml_new_tensor(
         struct ggml_context * ctx,
         enum ggml_type type,

‎llama.cpp

File mode changed: 100644 → 100755
Lines changed: 763 additions & 726 deletions
Large diffs are not rendered by default.
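Since the rewritten loader itself is not rendered here, this is a rough sketch of the mmap code path it implements, per the commit description above. The function name, the metadata step and the offset variable are illustrative; only `llama_file`, `llama_mmap` and `llama_mlock` (defined in `llama_util.h` below) are real.

    // Sketch only, not the commit's actual code. In the real loader the
    // mapping and the lock live in the model object, so they outlive the
    // tensors that point into the mapping.
    void load_mmap_sketch(const char * fname, bool use_mlock) {
        llama_file file(fname, "rb"); // metadata reads go through this FILE *
        // ... read magic, hparams, vocab and tensor metadata here ...
        llama_mmap mapping(&file);    // reuses the same descriptor; no second open
        llama_mlock lock;
        if (use_mlock) {
            lock.init(mapping.addr);
        }
        // Instead of copying, each tensor's data pointer is aimed into the map:
        //     tensor->data = (uint8_t *) mapping.addr + tensor_file_offset;
        // Growing the lock as tensors are set up pins the pages, and doubles
        // as the loading-progress indicator mentioned in the commit message:
        if (use_mlock) {
            lock.grow_to(file.size);
        }
    }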

‎llama.h

Lines changed: 4 additions & 0 deletions

@@ -55,6 +55,7 @@ extern "C" {
         bool f16_kv;     // use fp16 for KV cache
         bool logits_all; // the llama_eval() call computes all logits, not just the last one
         bool vocab_only; // only load the vocabulary, no weights
+        bool use_mmap;   // use mmap if possible
         bool use_mlock;  // force system to keep model in RAM
         bool embedding;  // embedding mode only
 
@@ -66,6 +67,9 @@ extern "C" {
 
     LLAMA_API struct llama_context_params llama_context_default_params();
 
+    LLAMA_API bool llama_mmap_supported();
+    LLAMA_API bool llama_mlock_supported();
+
     // Various functions for loading a ggml llama model.
     // Allocate (almost) all memory needed for the model.
     // Return NULL on failure
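Taken together with the example programs above, caller-side use of the new flags looks roughly like this (a sketch; the model path is a placeholder, not from the commit):

    #include "llama.h"

    int main() {
        llama_context_params lparams = llama_context_default_params();
        lparams.use_mmap  = llama_mmap_supported(); // mmap by default where available
        lparams.use_mlock = false;                  // pinning stays opt-in
        // "model.bin" is a placeholder path
        llama_context * ctx = llama_init_from_file("model.bin", lparams);
        if (ctx == NULL) {
            return 1; // NULL on failure, as documented above
        }
        // ... evaluate, sample, etc. ...
        llama_free(ctx);
        return 0;
    }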

‎llama_util.h

Lines changed: 377 additions & 0 deletions (new file)

@@ -0,0 +1,377 @@
#ifndef LLAMA_UTIL_H
#define LLAMA_UTIL_H

#define _CRT_SECURE_NO_WARNINGS

#include <cstdio>
#include <cstdint>
#include <cerrno>
#include <cstring>
#include <cstdarg>
#include <cstdlib>
#include <climits>

#include <string>
#include <vector>

#ifdef __has_include
#if __has_include(<unistd.h>)
#include <unistd.h>
#if defined(_POSIX_MAPPED_FILES)
#include <sys/mman.h>
#endif
#endif
#endif

#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#define NOMINMAX
#include <Windows.h>
#include <io.h>
#include <stdio.h> // for _fseeki64
#endif

#define LLAMA_ASSERT(x) \
    do { \
        if (!(x)) { \
            fprintf(stderr, "LLAMA_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
            abort(); \
        } \
    } while (0)

#ifdef __GNUC__
__attribute__((format(printf, 1, 2)))
#endif
static std::string format(const char * fmt, ...) {
    va_list ap, ap2;
    va_start(ap, fmt);
    va_copy(ap2, ap);
    int size = vsnprintf(NULL, 0, fmt, ap);
    LLAMA_ASSERT(size >= 0 && size < INT_MAX);
    std::vector<char> buf(size + 1);
    int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
    LLAMA_ASSERT(size2 == size);
    va_end(ap2);
    va_end(ap);
    return std::string(buf.data(), size);
};

struct llama_file {
    // use FILE * so we don't have to re-open the file to mmap
    FILE * fp;
    size_t size;

    llama_file(const char * fname, const char * mode) {
        fp = std::fopen(fname, mode);
        if (fp == NULL) {
            throw format("failed to open %s: %s", fname, std::strerror(errno));
        }
        seek(0, SEEK_END);
        size = tell();
        seek(0, SEEK_SET);
    }

    size_t tell() const {
#ifdef _WIN32
        __int64 ret = _ftelli64(fp);
#else
        long ret = std::ftell(fp);
#endif
        LLAMA_ASSERT(ret != -1); // this really shouldn't fail
        return (size_t) ret;
    }

    void seek(size_t offset, int whence) {
#ifdef _WIN32
        int ret = _fseeki64(fp, (__int64) offset, whence);
#else
        int ret = std::fseek(fp, (long) offset, whence);
#endif
        LLAMA_ASSERT(ret == 0); // same
    }

    void read_raw(void * ptr, size_t size) {
        if (size == 0) {
            return;
        }
        errno = 0;
        std::size_t ret = std::fread(ptr, size, 1, fp);
        if (ferror(fp)) {
            throw format("read error: %s", strerror(errno));
        }
        if (ret != 1) {
            throw std::string("unexpectedly reached end of file");
        }
    }

    std::uint32_t read_u32() {
        std::uint32_t ret;
        read_raw(&ret, sizeof(ret));
        return ret;
    }

    std::string read_string(std::uint32_t len) {
        std::vector<char> chars(len);
        read_raw(chars.data(), len);
        return std::string(chars.data(), len);
    }

    void write_raw(const void * ptr, size_t size) {
        if (size == 0) {
            return;
        }
        errno = 0;
        size_t ret = std::fwrite(ptr, size, 1, fp);
        if (ret != 1) {
            throw format("write error: %s", strerror(errno));
        }
    }

    void write_u32(std::uint32_t val) {
        write_raw(&val, sizeof(val));
    }

    ~llama_file() {
        if (fp) {
            std::fclose(fp);
        }
    }
};

#if defined(_WIN32)
static std::string llama_format_win_err(DWORD err) {
    LPSTR buf;
    size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
                                 NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&buf, 0, NULL);
    if (!size) {
        return "FormatMessageA failed";
    }
    std::string ret(buf, size);
    LocalFree(buf);
    return ret;
}
#endif

struct llama_mmap {
    void * addr;
    size_t size;

    llama_mmap(const llama_mmap &) = delete;

#ifdef _POSIX_MAPPED_FILES
    static constexpr bool SUPPORTED = true;

    llama_mmap(struct llama_file * file) {
        size = file->size;
        int fd = fileno(file->fp);
        int flags = MAP_SHARED;
#ifdef __linux__
        flags |= MAP_POPULATE;
#endif
        addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
        close(fd);
        if (addr == MAP_FAILED) {
            throw format("mmap failed: %s", strerror(errno));
        }

        // Advise the kernel to preload the mapped memory
        if (madvise(addr, file->size, MADV_WILLNEED)) {
            fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
                    strerror(errno));
        }
    }

    ~llama_mmap() {
        munmap(addr, size);
    }
#elif defined(_WIN32)
    static constexpr bool SUPPORTED = true;

    llama_mmap(struct llama_file * file) {
        size = file->size;

        HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));

        HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
        DWORD error = GetLastError();
        CloseHandle(hFile);

        if (hMapping == NULL) {
            throw format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str());
        }

        addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
        error = GetLastError();
        CloseHandle(hMapping);

        if (addr == NULL) {
            throw format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str());
        }

        // Advise the kernel to preload the mapped memory
        WIN32_MEMORY_RANGE_ENTRY range;
        range.VirtualAddress = addr;
        range.NumberOfBytes = (SIZE_T) size;
        if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
            fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
                    llama_format_win_err(GetLastError()).c_str());
        }
    }

    ~llama_mmap() {
        if (!UnmapViewOfFile(addr)) {
            fprintf(stderr, "warning: UnmapViewOfFile failed: %s\n",
                    llama_format_win_err(GetLastError()).c_str());
        }
    }
#else
    static constexpr bool SUPPORTED = false;

    llama_mmap(struct llama_file *) {
        throw std::string("mmap not supported");
    }
#endif
};

// Represents some region of memory being locked using mlock or VirtualLock;
// will automatically unlock on destruction.
struct llama_mlock {
    void * addr = NULL;
    size_t size = 0;
    bool failed_already = false;

    llama_mlock() {}
    llama_mlock(const llama_mlock &) = delete;

    ~llama_mlock() {
        if (size) {
            raw_unlock(addr, size);
        }
    }

    void init(void * addr) {
        LLAMA_ASSERT(this->addr == NULL && this->size == 0);
        this->addr = addr;
    }

    void grow_to(size_t target_size) {
        LLAMA_ASSERT(addr);
        if (failed_already) {
            return;
        }
        size_t granularity = lock_granularity();
        target_size = (target_size + granularity - 1) & ~(granularity - 1);
        if (target_size > size) {
            if (raw_lock((uint8_t *) addr + size, target_size - size)) {
                size = target_size;
            } else {
                failed_already = true;
            }
        }
    }

#ifdef _POSIX_MEMLOCK_RANGE
    static constexpr bool SUPPORTED = true;

    size_t lock_granularity() {
        return (size_t) sysconf(_SC_PAGESIZE);
    }

    #ifdef __APPLE__
        #define MLOCK_SUGGESTION \
            "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or " \
            "decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l).\n"
    #else
        #define MLOCK_SUGGESTION \
            "Try increasing RLIMIT_MLOCK ('ulimit -l' as root).\n"
    #endif

    bool raw_lock(const void * addr, size_t size) {
        if (!mlock(addr, size)) {
            return true;
        } else {
            fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n" MLOCK_SUGGESTION,
                    size, this->size, std::strerror(errno));
            return false;
        }
    }

    #undef MLOCK_SUGGESTION

    void raw_unlock(void * addr, size_t size) {
        if (munlock(addr, size)) {
            fprintf(stderr, "warning: failed to munlock buffer: %s\n", std::strerror(errno));
        }
    }
#elif defined(_WIN32)
    static constexpr bool SUPPORTED = true;

    size_t lock_granularity() {
        SYSTEM_INFO si;
        GetSystemInfo(&si);
        return (size_t) si.dwPageSize;
    }

    bool raw_lock(void * addr, size_t size) {
        for (int tries = 1; ; tries++) {
            if (VirtualLock(addr, size)) {
                return true;
            }
            if (tries == 2) {
                fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
                        size, this->size, llama_format_win_err(GetLastError()).c_str());
                return false;
            }

            // It failed but this was only the first try; increase the working
            // set size and try again.
            SIZE_T min_ws_size, max_ws_size;
            if (!GetProcessWorkingSetSize(GetCurrentProcess(), &min_ws_size, &max_ws_size)) {
                fprintf(stderr, "warning: GetProcessWorkingSetSize failed: %s\n",
                        llama_format_win_err(GetLastError()).c_str());
                return false;
            }
            // No way to make this atomic, so hopefully nobody is doing the
            // same on other threads.
            max_ws_size += size;
            if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) {
                fprintf(stderr, "warning: SetProcessWorkingSetSize failed: %s\n",
                        llama_format_win_err(GetLastError()).c_str());
                return false;
            }
        }
    }

    void raw_unlock(void * addr, size_t size) {
        if (!VirtualUnlock(addr, size)) {
            fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n",
                    llama_format_win_err(GetLastError()).c_str());
        }
    }
#else
    static constexpr bool SUPPORTED = false;

    size_t lock_granularity() {
        return (size_t) 65536; // arbitrary; locking is unsupported on this path
    }

    bool raw_lock(const void * addr, size_t size) {
        fprintf(stderr, "warning: mlock not supported on this system\n");
        return false; // lets grow_to() compile here and record the failure
    }

    void raw_unlock(const void * addr, size_t size) {}
#endif
};

// Replacement for std::vector<uint8_t> that doesn't require zero-initialization.
struct llama_buffer {
    uint8_t * addr = NULL;
    size_t size = 0;

    void resize(size_t size) {
        delete[] addr;
        addr = new uint8_t[size];
        this->size = size;
    }

    ~llama_buffer() {
        delete[] addr;
    }
};
#endif
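A short usage sketch of these helpers (the path and the driver function are placeholders, not part of the commit; the helpers throw `std::string` on failure, matching the header above), including how `grow_to` rounds its target up to the lock granularity:

    #include <cstdio>
    #include <string>
    #include "llama_util.h"

    int demo() { // hypothetical driver
        try {
            llama_file file("model.bin", "rb"); // placeholder path
            llama_mmap mapping(&file);
            llama_mlock lock;
            lock.init(mapping.addr);
            // grow_to() rounds up to lock_granularity(): with 4 KiB pages,
            // (5000 + 4095) & ~4095 == 8192, so this pins two pages.
            lock.grow_to(5000);
        } catch (const std::string & err) {
            std::fprintf(stderr, "error: %s\n", err.c_str());
            return 1;
        }
        return 0;
    }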
