Skip to content

Commit 26c0846

Browse files
committedMar 11, 2023
Initial release
0 parents  commit 26c0846

File tree

9 files changed

+13094
-0
lines changed

9 files changed

+13094
-0
lines changed
 

‎.gitignore‎

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
# compiler intermediates
*.o
*.a

# editor / tooling state
.cache/
.vs/
.vscode/
.DS_Store

# out-of-tree build directories
build/
build-em/
build-debug/
build-release/
build-static/
build-no-accel/
build-sanitize-addr/
build-sanitize-thread/

# in-tree binaries
/main
/quantize

# generated files
arm_neon.h
compile_commands.json

‎Makefile‎

Lines changed: 203 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,203 @@
1+
ifndef UNAME_S
UNAME_S := $(shell uname -s)
endif

ifndef UNAME_P
UNAME_P := $(shell uname -p)
endif

ifndef UNAME_M
UNAME_M := $(shell uname -m)
endif

CCV := $(shell $(CC) --version | head -n 1)
CXXV := $(shell $(CXX) --version | head -n 1)

# Mac OS + Arm can report x86_64
# ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
ifeq ($(UNAME_S),Darwin)
ifneq ($(UNAME_P),arm)
SYSCTL_M := $(shell sysctl -n hw.optional.arm64)
ifeq ($(SYSCTL_M),1)
# UNAME_P := arm
# UNAME_M := arm64
warn := $(warning Your arch is announced as x86_64, but it seems to actually be ARM64. Not fixing that can lead to bad performance. For more info see: https://github.com/ggerganov/whisper.cpp/issues/66\#issuecomment-1282546789)
endif
endif
endif

#
# Compile flags
#

CFLAGS   = -I.              -O3 -DNDEBUG -std=c11   -fPIC
CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
LDFLAGS  =

# OS specific
# TODO: support Windows
# NOTE: the four previously duplicated per-OS blocks (Linux/Darwin/FreeBSD/Haiku,
# each adding exactly -pthread) are collapsed into one filter-based check;
# the resulting flags are unchanged for every system.
ifeq ($(UNAME_S),$(filter $(UNAME_S),Linux Darwin FreeBSD Haiku))
    CFLAGS   += -pthread
    CXXFLAGS += -pthread
endif

# Architecture specific
# TODO: probably these flags need to be tweaked on some architectures
#       feel free to update the Makefile for your architecture and send a pull request or issue
ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686))
ifeq ($(UNAME_S),Darwin)
    CFLAGS += -mf16c
    AVX1_M := $(shell sysctl machdep.cpu.features)
    ifneq (,$(findstring FMA,$(AVX1_M)))
        CFLAGS += -mfma
    endif
    ifneq (,$(findstring AVX1.0,$(AVX1_M)))
        CFLAGS += -mavx
    endif
    AVX2_M := $(shell sysctl machdep.cpu.leaf7_features)
    ifneq (,$(findstring AVX2,$(AVX2_M)))
        CFLAGS += -mavx2
    endif
else ifeq ($(UNAME_S),Linux)
    AVX1_M := $(shell grep "avx " /proc/cpuinfo)
    ifneq (,$(findstring avx,$(AVX1_M)))
        CFLAGS += -mavx
    endif
    AVX2_M := $(shell grep "avx2 " /proc/cpuinfo)
    ifneq (,$(findstring avx2,$(AVX2_M)))
        CFLAGS += -mavx2
    endif
    FMA_M := $(shell grep "fma " /proc/cpuinfo)
    ifneq (,$(findstring fma,$(FMA_M)))
        CFLAGS += -mfma
    endif
    F16C_M := $(shell grep "f16c " /proc/cpuinfo)
    ifneq (,$(findstring f16c,$(F16C_M)))
        CFLAGS += -mf16c
    endif
    SSE3_M := $(shell grep "sse3 " /proc/cpuinfo)
    ifneq (,$(findstring sse3,$(SSE3_M)))
        CFLAGS += -msse3
    endif
else ifeq ($(UNAME_S),Haiku)
    AVX1_M := $(shell sysinfo -cpu | grep "AVX ")
    ifneq (,$(findstring avx,$(AVX1_M)))
        CFLAGS += -mavx
    endif
    AVX2_M := $(shell sysinfo -cpu | grep "AVX2 ")
    ifneq (,$(findstring avx2,$(AVX2_M)))
        CFLAGS += -mavx2
    endif
    FMA_M := $(shell sysinfo -cpu | grep "FMA ")
    ifneq (,$(findstring fma,$(FMA_M)))
        CFLAGS += -mfma
    endif
    F16C_M := $(shell sysinfo -cpu | grep "F16C ")
    ifneq (,$(findstring f16c,$(F16C_M)))
        CFLAGS += -mf16c
    endif
else
    # unknown OS on x86: assume a modern CPU
    CFLAGS += -mfma -mf16c -mavx -mavx2
endif
endif
ifeq ($(UNAME_M),amd64)
    CFLAGS += -mavx -mavx2 -mfma -mf16c
endif
ifneq ($(filter ppc64%,$(UNAME_M)),)
    POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)
    ifneq (,$(findstring POWER9,$(POWER9_M)))
        CFLAGS += -mpower9-vector
    endif
    # Require c++23's std::byteswap for big-endian support.
    ifeq ($(UNAME_M),ppc64)
        CXXFLAGS += -std=c++23 -DGGML_BIG_ENDIAN
    endif
endif
ifndef WHISPER_NO_ACCELERATE
    # Mac M1 - include Accelerate framework
    ifeq ($(UNAME_S),Darwin)
        CFLAGS  += -DGGML_USE_ACCELERATE
        LDFLAGS += -framework Accelerate
    endif
endif
ifdef WHISPER_OPENBLAS
    CFLAGS  += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas
    LDFLAGS += -lopenblas
endif
ifdef WHISPER_GPROF
    CFLAGS   += -pg
    CXXFLAGS += -pg
endif
ifneq ($(filter aarch64%,$(UNAME_M)),)
    CFLAGS   += -mcpu=native
    CXXFLAGS += -mcpu=native
endif
ifneq ($(filter armv6%,$(UNAME_M)),)
    # Raspberry Pi 1, 2, 3
    CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
endif
ifneq ($(filter armv7%,$(UNAME_M)),)
    # Raspberry Pi 4
    CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
endif
ifneq ($(filter armv8%,$(UNAME_M)),)
    # Raspberry Pi 4
    CFLAGS += -mfp16-format=ieee -mno-unaligned-access
endif

#
# Print build information
#

$(info I llama.cpp build info: )
$(info I UNAME_S:  $(UNAME_S))
$(info I UNAME_P:  $(UNAME_P))
$(info I UNAME_M:  $(UNAME_M))
$(info I CFLAGS:   $(CFLAGS))
$(info I CXXFLAGS: $(CXXFLAGS))
$(info I LDFLAGS:  $(LDFLAGS))
$(info I CC:       $(CCV))
$(info I CXX:      $(CXXV))
$(info )

# BUG FIX: 'default' and 'clean' were not declared phony; a file named
# "clean" or "default" in the tree would silently disable those targets.
.PHONY: default clean tests

default: main quantize

#
# Build library
#

ggml.o: ggml.c ggml.h
	$(CC) $(CFLAGS) -c ggml.c -o ggml.o

utils.o: utils.cpp utils.h
	$(CXX) $(CXXFLAGS) -c utils.cpp -o utils.o

clean:
	rm -f *.o main quantize

main: main.cpp ggml.o utils.o
	$(CXX) $(CXXFLAGS) main.cpp ggml.o utils.o -o main $(LDFLAGS)
	./main -h

quantize: quantize.cpp ggml.o utils.o
	$(CXX) $(CXXFLAGS) quantize.cpp ggml.o utils.o -o quantize $(LDFLAGS)

#
# Tests
#

tests:
	bash ./tests/run-tests.sh

‎convert-pth-to-ggml.py‎

Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
# Convert a LLaMA model checkpoint to a ggml compatible file
#
# Load the model using Torch
# Iterate over all variables and write them to a binary file.
#
# For each variable, write the following:
#   - Number of dimensions (int)
#   - Name length (int)
#   - Dimensions (int[n_dims])
#   - Name (char[name_length])
#   - Data (float[n_dims])
#
# By default, the bigger matrices are converted to 16-bit floats.
# This can be disabled by adding the "use-f32" CLI argument.
#
# At the start of the ggml file we write the model parameters
# and vocabulary.
#

import sys
import json
import struct
import numpy as np
import torch

from sentencepiece import SentencePieceProcessor

if len(sys.argv) < 3:
    print("Usage: convert-ckpt-to-ggml.py dir-model ftype\n")
    print(" ftype == 0 -> float32")
    print(" ftype == 1 -> float16")
    sys.exit(1)

# output in the same directory as the model
dir_model = sys.argv[1]

fname_hparams   = dir_model + "/params.json"
fname_model     = dir_model + "/consolidated.00.pth"
fname_tokenizer = dir_model + "/../tokenizer.model"

# possible data types
#   ftype == 0 -> float32
#   ftype == 1 -> float16
#
# map from ftype to string
ftype_str = ["f32", "f16"]

# NOTE: the guard above guarantees sys.argv[2] exists, so the old
# 'if len(sys.argv) > 2' re-check was dead code and has been removed
ftype = int(sys.argv[2])
if ftype < 0 or ftype > 1:
    print("Invalid ftype: " + str(ftype))
    sys.exit(1)

fname_out = dir_model + "/ggml-model-" + ftype_str[ftype] + ".bin"

with open(fname_hparams, "r") as f:
    hparams = json.load(f)

tokenizer = SentencePieceProcessor(fname_tokenizer)

hparams.update({"vocab_size": tokenizer.vocab_size()})

print(hparams)

model = torch.load(fname_model, map_location="cpu")

fout = open(fname_out, "wb")

# file header: magic, then the model hyper-parameters
fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
fout.write(struct.pack("i", hparams["vocab_size"]))
fout.write(struct.pack("i", hparams["dim"]))
fout.write(struct.pack("i", hparams["multiple_of"]))
fout.write(struct.pack("i", hparams["n_heads"]))
fout.write(struct.pack("i", hparams["n_layers"]))
fout.write(struct.pack("i", 64)) # rot
fout.write(struct.pack("i", ftype))

# Write the vocabulary as (length, utf-8 bytes) pairs.
# GENERALIZATION: iterate over the actual vocab size rather than a hard-coded
# 32000 — the two coincide for LLaMA, but this keeps the vocab section
# consistent with the vocab_size written in the header above.
for i in range(hparams["vocab_size"]):
    # TODO: this is probably wrong - not sure how this tokenizer works
    text = tokenizer.decode([29889, i]).encode('utf-8')
    # remove the first byte (it's always '.')
    text = text[1:]
    fout.write(struct.pack("i", len(text)))
    fout.write(text)

# Write each tensor: header (n_dims, name length, element type), then the
# dimensions (innermost first), the name bytes, and the raw data.
for name, tensor in model.items():
    # skip layers.X.attention.inner_attention.rope.freqs
    if name.endswith("freqs"):
        continue

    print("Processing variable: " + name + " with shape: ", tensor.shape, " and type: ", tensor.dtype)

    data = tensor.numpy().squeeze()
    n_dims = len(data.shape)

    dshape = data.shape

    # default type is fp16; 1-D tensors and ftype==0 runs are stored as fp32
    ftype_cur = 1
    if ftype == 0 or n_dims == 1:
        print("  Converting to float32")
        data = data.astype(np.float32)
        ftype_cur = 0

    # header
    # (renamed from 'str', which shadowed the Python built-in)
    sname = name.encode('utf-8')
    fout.write(struct.pack("iii", n_dims, len(sname), ftype_cur))
    for dim in range(n_dims):
        fout.write(struct.pack("i", dshape[n_dims - 1 - dim]))
    fout.write(sname)

    # data
    data.tofile(fout)

fout.close()

print("Done. Output file: " + fname_out)
print("")

‎ggml.c‎

Lines changed: 10324 additions & 0 deletions
Large diffs are not rendered by default.

‎ggml.h‎

Lines changed: 758 additions & 0 deletions
Large diffs are not rendered by default.

‎main.cpp‎

Lines changed: 750 additions & 0 deletions
Large diffs are not rendered by default.

‎quantize.cpp‎

Lines changed: 330 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,330 @@
1+
#include "ggml.h"
2+
3+
#include "utils.h"
4+
5+
#include <cassert>
6+
#include <cmath>
7+
#include <cstdio>
8+
#include <cstring>
9+
#include <fstream>
10+
#include <map>
11+
#include <string>
12+
#include <vector>
13+
#include <regex>
14+
15+
// TODO: move somewhere else
16+
#define QK 32
17+
18+
// Default hyper-parameters for the LLaMA 7B model.
// All values except n_ctx are overwritten from the model file header.
struct llama_hparams {
    int32_t n_vocab = 32000; // vocabulary size
    int32_t n_ctx   = 512;   // context size -- this is provided as user input?
    int32_t n_embd  = 4096;  // embedding dimension
    int32_t n_mult  = 256;   // presumably a rounding multiple for the FFN size -- TODO confirm
    int32_t n_head  = 32;    // number of attention heads
    int32_t n_layer = 32;    // number of transformer layers
    int32_t n_rot   = 64;    // rotary embedding dimension
    int32_t f16     = 1;     // stored element type flag (matches the ftype written by the converter)
};
29+
30+
31+
// quantize a model
32+
bool llama_model_quantize(const std::string & fname_inp, const std::string & fname_out, int itype) {
33+
ggml_type type = GGML_TYPE_Q4_1;
34+
35+
switch (itype) {
36+
case 2: type = GGML_TYPE_Q4_0; break;
37+
case 3: type = GGML_TYPE_Q4_1; break;
38+
default: fprintf(stderr, "%s: invalid quantization type %d\n", __func__, itype); return 1;
39+
};
40+
41+
if (type != GGML_TYPE_Q4_0 && type != GGML_TYPE_Q4_1) {
42+
fprintf(stderr, "%s: invalid quantization type %d\n", __func__, type);
43+
return false;
44+
}
45+
46+
gpt_vocab vocab;
47+
48+
printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
49+
50+
auto finp = std::ifstream(fname_inp, std::ios::binary);
51+
if (!finp) {
52+
fprintf(stderr, "%s: failed to open '%s' for reading\n", __func__, fname_inp.c_str());
53+
return false;
54+
}
55+
56+
auto fout = std::ofstream(fname_out, std::ios::binary);
57+
if (!fout) {
58+
fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname_out.c_str());
59+
return false;
60+
}
61+
62+
// verify magic
63+
{
64+
uint32_t magic;
65+
finp.read((char *) &magic, sizeof(magic));
66+
if (magic != 0x67676d6c) {
67+
fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname_inp.c_str());
68+
return false;
69+
}
70+
71+
fout.write((char *) &magic, sizeof(magic));
72+
}
73+
74+
llama_hparams hparams;
75+
76+
// load hparams
77+
{
78+
finp.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
79+
//finp.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
80+
finp.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
81+
finp.read((char *) &hparams.n_mult, sizeof(hparams.n_mult));
82+
finp.read((char *) &hparams.n_head, sizeof(hparams.n_head));
83+
finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
84+
finp.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
85+
finp.read((char *) &hparams.f16, sizeof(hparams.f16));
86+
87+
printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
88+
printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
89+
printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
90+
printf("%s: n_mult = %d\n", __func__, hparams.n_mult);
91+
printf("%s: n_head = %d\n", __func__, hparams.n_head);
92+
printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
93+
printf("%s: f16 = %d\n", __func__, hparams.f16);
94+
95+
fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
96+
//fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
97+
fout.write((char *) &hparams.n_embd, sizeof(hparams.n_embd));
98+
fout.write((char *) &hparams.n_mult, sizeof(hparams.n_mult));
99+
fout.write((char *) &hparams.n_head, sizeof(hparams.n_head));
100+
fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer));
101+
fout.write((char *) &hparams.n_rot, sizeof(hparams.n_rot));
102+
fout.write((char *) &itype, sizeof(hparams.f16));
103+
}
104+
105+
// load vocab
106+
{
107+
const int32_t n_vocab = hparams.n_vocab;
108+
109+
if (n_vocab != hparams.n_vocab) {
110+
fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
111+
__func__, fname_inp.c_str(), n_vocab, hparams.n_vocab);
112+
return false;
113+
}
114+
115+
std::string word;
116+
for (int i = 0; i < n_vocab; i++) {
117+
uint32_t len;
118+
finp.read ((char *) &len, sizeof(len));
119+
fout.write((char *) &len, sizeof(len));
120+
121+
word.resize(len);
122+
finp.read ((char *) word.data(), len);
123+
fout.write((char *) word.data(), len);
124+
125+
vocab.token_to_id[word] = i;
126+
vocab.id_to_token[i] = word;
127+
}
128+
}
129+
130+
// load weights
131+
{
132+
size_t total_size_org = 0;
133+
size_t total_size_new = 0;
134+
135+
std::vector<float> work;
136+
137+
std::vector<uint8_t> data_u8;
138+
std::vector<ggml_fp16_t> data_f16;
139+
std::vector<float> data_f32;
140+
141+
std::vector<int64_t> hist_all(1 << 4, 0);
142+
143+
while (true) {
144+
int32_t n_dims;
145+
int32_t length;
146+
int32_t ftype;
147+
148+
finp.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
149+
finp.read(reinterpret_cast<char *>(&length), sizeof(length));
150+
finp.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
151+
152+
if (finp.eof()) {
153+
break;
154+
}
155+
156+
int32_t nelements = 1;
157+
int32_t ne[2] = { 1, 1 };
158+
for (int i = 0; i < n_dims; ++i) {
159+
finp.read (reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
160+
nelements *= ne[i];
161+
}
162+
163+
std::string name(length, 0);
164+
finp.read (&name[0], length);
165+
166+
{
167+
static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
168+
printf("%48s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ftype_str[ftype]);
169+
}
170+
171+
// regexes of tensor names to be quantized
172+
const std::vector<std::string> k_names = {
173+
".*weight",
174+
};
175+
176+
bool quantize = false;
177+
for (const auto & s : k_names) {
178+
if (std::regex_match(name, std::regex(s))) {
179+
quantize = true;
180+
break;
181+
}
182+
}
183+
184+
// quantize only 2D tensors
185+
quantize &= (n_dims == 2);
186+
187+
if (quantize) {
188+
if (ftype != 0 && ftype != 1) {
189+
fprintf(stderr, "%s: unsupported ftype %d for integer quantization\n", __func__, ftype);
190+
return false;
191+
}
192+
193+
if (ftype == 1) {
194+
data_f16.resize(nelements);
195+
finp.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_fp16_t));
196+
data_f32.resize(nelements);
197+
for (int i = 0; i < nelements; ++i) {
198+
data_f32[i] = ggml_fp16_to_fp32(data_f16[i]);
199+
}
200+
} else {
201+
data_f32.resize(nelements);
202+
finp.read(reinterpret_cast<char *>(data_f32.data()), nelements * sizeof(float));
203+
}
204+
205+
ftype = itype;
206+
} else {
207+
const int bpe = (ftype == 0) ? sizeof(float) : sizeof(uint16_t);
208+
209+
data_u8.resize(nelements*bpe);
210+
finp.read(reinterpret_cast<char *>(data_u8.data()), nelements * bpe);
211+
}
212+
213+
fout.write(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
214+
fout.write(reinterpret_cast<char *>(&length), sizeof(length));
215+
fout.write(reinterpret_cast<char *>(&ftype), sizeof(ftype));
216+
for (int i = 0; i < n_dims; ++i) {
217+
fout.write(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
218+
}
219+
fout.write(&name[0], length);
220+
221+
if (quantize) {
222+
printf("quantizing .. ");
223+
work.resize(nelements); // for quantization
224+
225+
size_t cur_size = 0;
226+
std::vector<int64_t> hist_cur(1 << 4, 0);
227+
228+
switch (type) {
229+
case GGML_TYPE_Q4_0:
230+
{
231+
cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], QK, hist_cur.data());
232+
} break;
233+
case GGML_TYPE_Q4_1:
234+
{
235+
cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], QK, hist_cur.data());
236+
} break;
237+
default:
238+
{
239+
fprintf(stderr, "%s: unsupported quantization type %d\n", __func__, type);
240+
return false;
241+
}
242+
}
243+
244+
fout.write(reinterpret_cast<char *>(work.data()), cur_size);
245+
total_size_new += cur_size;
246+
247+
printf("size = %8.2f MB -> %8.2f MB | hist: ", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0);
248+
for (int i = 0; i < hist_cur.size(); ++i) {
249+
hist_all[i] += hist_cur[i];
250+
}
251+
252+
for (int i = 0; i < hist_cur.size(); ++i) {
253+
printf("%5.3f ", hist_cur[i] / (float)nelements);
254+
}
255+
printf("\n");
256+
} else {
257+
printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0);
258+
fout.write(reinterpret_cast<char *>(data_u8.data()), data_u8.size());
259+
total_size_new += data_u8.size();
260+
}
261+
262+
total_size_org += nelements * sizeof(float);
263+
}
264+
265+
printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
266+
printf("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
267+
268+
{
269+
int64_t sum_all = 0;
270+
for (int i = 0; i < hist_all.size(); ++i) {
271+
sum_all += hist_all[i];
272+
}
273+
274+
printf("%s: hist: ", __func__);
275+
for (int i = 0; i < hist_all.size(); ++i) {
276+
printf("%5.3f ", hist_all[i] / (float)sum_all);
277+
}
278+
printf("\n");
279+
}
280+
}
281+
282+
finp.close();
283+
fout.close();
284+
285+
return true;
286+
}
287+
288+
// usage:
289+
// ./llama-quantize models/llama/ggml-model.bin models/llama/ggml-model-quant.bin type
290+
//
291+
int main(int argc, char ** argv) {
292+
if (argc != 4) {
293+
fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
294+
fprintf(stderr, " type = 2 - q4_0\n");
295+
fprintf(stderr, " type = 3 - q4_1\n");
296+
return 1;
297+
}
298+
299+
const std::string fname_inp = argv[1];
300+
const std::string fname_out = argv[2];
301+
302+
const int itype = atoi(argv[3]);
303+
304+
const int64_t t_main_start_us = ggml_time_us();
305+
306+
int64_t t_quantize_us = 0;
307+
308+
// load the model
309+
{
310+
const int64_t t_start_us = ggml_time_us();
311+
312+
if (!llama_model_quantize(fname_inp, fname_out, itype)) {
313+
fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
314+
return 1;
315+
}
316+
317+
t_quantize_us = ggml_time_us() - t_start_us;
318+
}
319+
320+
// report timing
321+
{
322+
const int64_t t_main_end_us = ggml_time_us();
323+
324+
printf("\n");
325+
printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0f);
326+
printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
327+
}
328+
329+
return 0;
330+
}

‎utils.cpp‎

Lines changed: 478 additions & 0 deletions
Large diffs are not rendered by default.

‎utils.h‎

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
// Various helper functions and utilities

#pragma once

#include <string>
#include <map>
#include <vector>
#include <random>
#include <thread>

//
// CLI argument parsing
//

struct gpt_params {
    int32_t seed      = -1; // RNG seed (-1: pick one at runtime)
    int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
    int32_t n_predict = 200; // new tokens to predict

    // sampling parameters
    int32_t top_k = 100;
    float   top_p = 0.95f;
    float   temp  = 0.8f;

    int32_t n_batch = 8; // batch size for prompt processing

    // NOTE(review): "lamma" looks like a typo for "llama", but the string is a
    // runtime default path and is kept byte-for-byte -- confirm before renaming
    std::string model = "models/lamma-7B/ggml-model.bin"; // model path
    std::string prompt;
};

bool gpt_params_parse(int argc, char ** argv, gpt_params & params);

void gpt_print_usage(int argc, char ** argv, const gpt_params & params);

std::string gpt_random_prompt(std::mt19937 & rng);

//
// Vocab utils
//

struct gpt_vocab {
    using id    = int32_t;
    using token = std::string;

    std::map<token, id> token_to_id; // token text -> token id
    std::map<id, token> id_to_token; // token id   -> token text
};

// presumably replaces occurrences of 'needle' in 'str' with 'replacement' --
// see utils.cpp for the exact semantics
void replace(std::string & str, const std::string & needle, const std::string & replacement);

// poor-man's JSON parsing
std::map<std::string, int32_t> json_parse(const std::string & fname);

// split text into tokens
//
// ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53
//
// Regex (Python):
//   r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
//
// Regex (C++):
//   R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)"
//
std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text);

// TODO: this is probably wrong, but I cannot figure out how this tokenizer works ..
// ref: https://github.com/google/sentencepiece
std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos);

// load the tokens from encoder.json
bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab);

// sample next token given probabilities for each embedding
//
//   - consider only the top K tokens
//   - from them, consider only the top tokens with cumulative probability > P
//
// TODO: not sure if this implementation is correct
// TODO: temperature is not implemented
//
gpt_vocab::id gpt_sample_top_k_top_p(
        const gpt_vocab & vocab,
        const float * logits,
        int    top_k,
        double top_p,
        double temp,
        std::mt19937 & rng);

//
// Quantization
//

size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t * hist);
size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist);

0 commit comments

Comments
 (0)
Please sign in to comment.