本文最后更新于 2026年5月11日。
llama2 和 mistral
C:\Users\weiyo>ollama serve
time=2024-03-20T09:41:00.966+08:00 level=INFO source=images.go:806 msg="total blobs: 24"
time=2024-03-20T09:41:00.967+08:00 level=INFO source=images.go:813 msg="total unused blobs removed: 0"
time=2024-03-20T09:41:00.968+08:00 level=INFO source=routes.go:1082 msg="Listening on 127.0.0.1:11434 (version 0.1.29)"
time=2024-03-20T09:41:00.968+08:00 level=INFO source=payload_common.go:112 msg="Extracting dynamic libraries to C:\\Users\\weiyo\\AppData\\Local\\Temp\\ollama3197242561\\runners ..."
time=2024-03-20T09:41:01.146+08:00 level=INFO source=payload_common.go:139 msg="Dynamic LLM libraries [cpu_avx2 cpu cuda_v11.3 rocm_v5.7]"
[GIN] 2024/03/20 - 09:41:47 | 204 | 0s | 127.0.0.1 | OPTIONS "/api/chat"
time=2024-03-20T09:41:47.775+08:00 level=INFO source=gpu.go:77 msg="Detecting GPU type"
time=2024-03-20T09:41:47.775+08:00 level=INFO source=gpu.go:191 msg="Searching for GPU management library nvml.dll"
time=2024-03-20T09:41:47.784+08:00 level=INFO source=gpu.go:237 msg="Discovered GPU libraries: []"
time=2024-03-20T09:41:47.784+08:00 level=INFO source=cpu_common.go:11 msg="CPU has AVX2"
time=2024-03-20T09:41:47.785+08:00 level=INFO source=cpu_common.go:11 msg="CPU has AVX2"
time=2024-03-20T09:41:47.785+08:00 level=INFO source=llm.go:85 msg="GPU not available, falling back to CPU"
time=2024-03-20T09:41:47.785+08:00 level=INFO source=assets.go:63 msg="Updating PATH to C:\\Users\\weiyo\\AppData\\Local\\Temp\\ollama3197242561\\runners\\cpu_avx2;C:\\ProgramData\\Anaconda3\\condabin;C:\\Program Files\\Common Files\\Oracle\\Java\\javapath;C:\\WINDOWS\\system32;C:\\WINDOWS;C:\\WINDOWS\\System32\\Wbem;C:\\WINDOWS\\System32\\WindowsPowerShell\\v1.0\\;C:\\WINDOWS\\System32\\OpenSSH\\;C:\\Program Files\\dotnet\\;C:\\Program Files\\s;C:\\Program Files\\MATLAB\\R2022a\\runtime\\win64;C:\\Program Files\\MATLAB\\R2022a\\bin;C:\\Program Files\\MATLAB\\R2017a\\runtime\\win64;C:\\Program Files\\MATLAB\\R2017a\\bin;C:\\ProgramData\\Anaconda3;C:\\ProgramData\\Anaconda3\\Scripts;C:\\ProgramData\\Anaconda3\\Library\\bin;C:\\Program Files\\MATLAB\\MATLAB Runtime\\v92\\runtime\\win64;C:\\MinGW\\bin;C:\\Program Files\\s\\green\\node-v18.17.1-win-x64;C:\\Program Files\\Docker\\Docker\\resources\\bin;C:\\Program Files\\Pandoc\\;C:\\Users\\weiyo\\AppData\\Roaming\\Python\\Python39\\Scripts0;C:\\Program Files\\TortoiseSVN\\bin;C:\\Program Files\\s\\green\\cmake-3.25.0-windows-x86_64\\bin;C:\\Program Files\\Microsoft Visual Studio\\2022\\Community\\VC\\Tools\\MSVC\\14.33.31629\\bin\\Hostx86\\x64;C:\\Program Files\\s\\green\\ffmpeg-4.4.1-essentials_build\\bin;C:\\Users\\weiyo\\AppData\\Roaming\\Python\\Python310\\Scripts;C:\\msys64\\mingw64\\bin;C:\\msys64\\mingw64\\share;C:\\msys64\\mingw64\\include\\gtk-4.0\\gtk;C:\\msys64\\mingw64\\include\\gtk-4.0;C:\\msys64\\mingw64\\include;C:\\msys64\\mingw64\\lib;C:\\msys64\\mingw64\\include\\glib-2.0;C:\\Program Files\\s\\green\\VSCode-win32-x64-1.76.0;C:\\msys64\\mingw64\\include\\python3.10;C:\\ProgramData\\Anaconda3\\include;C:\\Program Files\\Git\\cmd;C:\\Program Files\\s\\green\\poppler-23.08.0\\Library\\bin;C:\\Program Files (x86)\\ZeroTier\\One\\;%PNPM_HOME%;C:\\Users\\weiyo\\.cargo\\bin;C:\\Users\\weiyo\\AppData\\Local\\Microsoft\\WindowsApps;;C:\\Program Files\\JetBrains\\PyCharm Community Edition 
2023.1.2\\bin;;C:\\Users\\weiyo\\AppData\\Local\\Programs\\Ollama"
loading library C:\Users\weiyo\AppData\Local\Temp\ollama3197242561\runners\cpu_avx2\ext_server.dll
time=2024-03-20T09:41:47.792+08:00 level=INFO source=dyn_ext_server.go:90 msg="Loading Dynamic llm server: C:\\Users\\weiyo\\AppData\\Local\\Temp\\ollama3197242561\\runners\\cpu_avx2\\ext_server.dll"
time=2024-03-20T09:41:47.792+08:00 level=INFO source=dyn_ext_server.go:150 msg="Initializing llama server"
llama_model_loader: loaded meta data with 23 key-value pairs and 291 tensors from C:\Users\weiyo\.ollama\models\blobs\sha256-8934d96d3f08982e95922b2b7a2c626a1fe873d7c3b06e8e56d7bc0a1fef9246 (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv 0: general.architecture str = llama
llama_model_loader: - kv 1: general.name str = LLaMA v2
llama_model_loader: - kv 2: llama.context_length u32 = 4096
llama_model_loader: - kv 3: llama.embedding_length u32 = 4096
llama_model_loader: - kv 4: llama.block_count u32 = 32
llama_model_loader: - kv 5: llama.feed_forward_length u32 = 11008
llama_model_loader: - kv 6: llama.rope.dimension_count u32 = 128
llama_model_loader: - kv 7: llama.attention.head_count u32 = 32
llama_model_loader: - kv 8: llama.attention.head_count_kv u32 = 32
llama_model_loader: - kv 9: llama.attention.layer_norm_rms_epsilon f32 = 0.000010
llama_model_loader: - kv 10: general.file_type u32 = 2
llama_model_loader: - kv 11: tokenizer.ggml.model str = llama
llama_model_loader: - kv 12: tokenizer.ggml.tokens arr[str,32000] = ["<unk>", "<s>", "</s>", "<0x00>", "<...
llama_model_loader: - kv 13: tokenizer.ggml.scores arr[f32,32000] = [0.000000, 0.000000, 0.000000, 0.0000...
llama_model_loader: - kv 14: tokenizer.ggml.token_type arr[i32,32000] = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
llama_model_loader: - kv 15: tokenizer.ggml.merges arr[str,61249] = ["▁ t", "e r", "i n", "▁ a", "e n...
llama_model_loader: - kv 16: tokenizer.ggml.bos_token_id u32 = 1
llama_model_loader: - kv 17: tokenizer.ggml.eos_token_id u32 = 2
llama_model_loader: - kv 18: tokenizer.ggml.unknown_token_id u32 = 0
llama_model_loader: - kv 19: tokenizer.ggml.add_bos_token bool = true
llama_model_loader: - kv 20: tokenizer.ggml.add_eos_token bool = false
llama_model_loader: - kv 21: tokenizer.chat_template str = {% if messages[0]['role'] == 'system'...
llama_model_loader: - kv 22: general.quantization_version u32 = 2
llama_model_loader: - type f32: 65 tensors
llama_model_loader: - type q4_0: 225 tensors
llama_model_loader: - type q6_K: 1 tensors
llm_load_vocab: special tokens definition check successful ( 259/32000 ).
llm_load_print_meta: format = GGUF V3 (latest)
llm_load_print_meta: arch = llama
llm_load_print_meta: vocab type = SPM
llm_load_print_meta: n_vocab = 32000
llm_load_print_meta: n_merges = 0
llm_load_print_meta: n_ctx_train = 4096
llm_load_print_meta: n_embd = 4096
llm_load_print_meta: n_head = 32
llm_load_print_meta: n_head_kv = 32
llm_load_print_meta: n_layer = 32
llm_load_print_meta: n_rot = 128
llm_load_print_meta: n_embd_head_k = 128
llm_load_print_meta: n_embd_head_v = 128
llm_load_print_meta: n_gqa = 1
llm_load_print_meta: n_embd_k_gqa = 4096
llm_load_print_meta: n_embd_v_gqa = 4096
llm_load_print_meta: f_norm_eps = 0.0e+00
llm_load_print_meta: f_norm_rms_eps = 1.0e-05
llm_load_print_meta: f_clamp_kqv = 0.0e+00
llm_load_print_meta: f_max_alibi_bias = 0.0e+00
llm_load_print_meta: n_ff = 11008
llm_load_print_meta: n_expert = 0
llm_load_print_meta: n_expert_used = 0
llm_load_print_meta: pooling type = 0
llm_load_print_meta: rope type = 0
llm_load_print_meta: rope scaling = linear
llm_load_print_meta: freq_base_train = 10000.0
llm_load_print_meta: freq_scale_train = 1
llm_load_print_meta: n_yarn_orig_ctx = 4096
llm_load_print_meta: rope_finetuned = unknown
llm_load_print_meta: model type = 7B
llm_load_print_meta: model ftype = Q4_0
llm_load_print_meta: model params = 6.74 B
llm_load_print_meta: model size = 3.56 GiB (4.54 BPW)
llm_load_print_meta: general.name = LLaMA v2
llm_load_print_meta: BOS token = 1 '<s>'
llm_load_print_meta: EOS token = 2 '</s>'
llm_load_print_meta: UNK token = 0 '<unk>'
llm_load_print_meta: LF token = 13 '<0x0A>'
llm_load_tensors: ggml ctx size = 0.11 MiB
llm_load_tensors: CPU buffer size = 3647.87 MiB
..................................................................................................
llama_new_context_with_model: n_ctx = 2048
llama_new_context_with_model: freq_base = 10000.0
llama_new_context_with_model: freq_scale = 1
llama_kv_cache_init: CPU KV buffer size = 1024.00 MiB
llama_new_context_with_model: KV self size = 1024.00 MiB, K (f16): 512.00 MiB, V (f16): 512.00 MiB
llama_new_context_with_model: CPU input buffer size = 13.02 MiB
llama_new_context_with_model: CPU compute buffer size = 160.00 MiB
llama_new_context_with_model: graph splits (measure): 1
{"function":"initialize","level":"INFO","line":434,"msg":"initializing slots","n_slots":1,"tid":"19432","timestamp":1710898911}
{"function":"initialize","level":"INFO","line":446,"msg":"new slot","n_ctx_slot":2048,"slot_id":0,"tid":"19432","timestamp":1710898911}
time=2024-03-20T09:41:51.663+08:00 level=INFO source=dyn_ext_server.go:162 msg="Starting llama main loop"
{"function":"update_slots","level":"INFO","line":1584,"msg":"all slots are idle and system prompt is empty, clear the KV cache","tid":"25508","timestamp":1710898911}
{"function":"launch_slot_with_data","level":"INFO","line":827,"msg":"slot is processing task","slot_id":0,"task_id":0,"tid":"25508","timestamp":1710898911}
{"function":"update_slots","ga_i":0,"level":"INFO","line":1822,"msg":"slot progression","n_past":0,"n_past_se":0,"n_prompt_tokens_processed":48,"slot_id":0,"task_id":0,"tid":"25508","timestamp":1710898911}
{"function":"update_slots","level":"INFO","line":1846,"msg":"kv cache rm [p0, end)","p0":0,"slot_id":0,"task_id":0,"tid":"25508","timestamp":1710898911}
{"function":"print_timings","level":"INFO","line":264,"msg":"prompt eval time = 3672.08 ms / 48 tokens ( 76.50 ms per token, 13.07 tokens per second)","n_prompt_tokens_processed":48,"n_tokens_second":13.071596401389513,"slot_id":0,"t_prompt_processing":3672.084,"t_token":76.50175,"task_id":0,"tid":"25508","timestamp":1710898930}
{"function":"print_timings","level":"INFO","line":278,"msg":"generation eval time = 15504.40 ms / 89 runs ( 174.21 ms per token, 5.74 tokens per second)","n_decoded":89,"n_tokens_second":5.740307828844659,"slot_id":0,"t_token":174.20668539325843,"t_token_generation":15504.395,"task_id":0,"tid":"25508","timestamp":1710898930}
{"function":"print_timings","level":"INFO","line":287,"msg":" total time = 19176.48 ms","slot_id":0,"t_prompt_processing":3672.084,"t_token_generation":15504.395,"t_total":19176.479,"task_id":0,"tid":"25508","timestamp":1710898930}
{"function":"update_slots","level":"INFO","line":1654,"msg":"slot released","n_cache_tokens":137,"n_ctx":2048,"n_past":136,"n_system_tokens":0,"slot_id":0,"task_id":0,"tid":"25508","timestamp":1710898930,"truncated":false}
[GIN] 2024/03/20 - 09:42:10 | 200 | 23.508785s | 127.0.0.1 | POST "/api/chat"
time=2024-03-20T09:42:50.404+08:00 level=INFO source=routes.go:77 msg="changing loaded model"
time=2024-03-20T09:42:51.229+08:00 level=INFO source=cpu_common.go:11 msg="CPU has AVX2"
time=2024-03-20T09:42:51.230+08:00 level=INFO source=cpu_common.go:11 msg="CPU has AVX2"
time=2024-03-20T09:42:51.231+08:00 level=INFO source=llm.go:85 msg="GPU not available, falling back to CPU"
loading library C:\Users\weiyo\AppData\Local\Temp\ollama3197242561\runners\cpu_avx2\ext_server.dll
time=2024-03-20T09:42:51.235+08:00 level=INFO source=dyn_ext_server.go:90 msg="Loading Dynamic llm server: C:\\Users\\weiyo\\AppData\\Local\\Temp\\ollama3197242561\\runners\\cpu_avx2\\ext_server.dll"
time=2024-03-20T09:42:51.235+08:00 level=INFO source=dyn_ext_server.go:150 msg="Initializing llama server"
llama_model_loader: loaded meta data with 24 key-value pairs and 291 tensors from C:\Users\weiyo\.ollama\models\blobs\sha256-e8a35b5937a5e6d5c35d1f2a15f161e07eefe5e5bb0a3cdd42998ee79b057730 (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv 0: general.architecture str = llama
llama_model_loader: - kv 1: general.name str = mistralai
llama_model_loader: - kv 2: llama.context_length u32 = 32768
llama_model_loader: - kv 3: llama.embedding_length u32 = 4096
llama_model_loader: - kv 4: llama.block_count u32 = 32
llama_model_loader: - kv 5: llama.feed_forward_length u32 = 14336
llama_model_loader: - kv 6: llama.rope.dimension_count u32 = 128
llama_model_loader: - kv 7: llama.attention.head_count u32 = 32
llama_model_loader: - kv 8: llama.attention.head_count_kv u32 = 8
llama_model_loader: - kv 9: llama.attention.layer_norm_rms_epsilon f32 = 0.000010
llama_model_loader: - kv 10: llama.rope.freq_base f32 = 1000000.000000
llama_model_loader: - kv 11: general.file_type u32 = 2
llama_model_loader: - kv 12: tokenizer.ggml.model str = llama
llama_model_loader: - kv 13: tokenizer.ggml.tokens arr[str,32000] = ["<unk>", "<s>", "</s>", "<0x00>", "<...
llama_model_loader: - kv 14: tokenizer.ggml.scores arr[f32,32000] = [0.000000, 0.000000, 0.000000, 0.0000...
llama_model_loader: - kv 15: tokenizer.ggml.token_type arr[i32,32000] = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
llama_model_loader: - kv 16: tokenizer.ggml.merges arr[str,58980] = ["▁ t", "i n", "e r", "▁ a", "h e...
llama_model_loader: - kv 17: tokenizer.ggml.bos_token_id u32 = 1
llama_model_loader: - kv 18: tokenizer.ggml.eos_token_id u32 = 2
llama_model_loader: - kv 19: tokenizer.ggml.unknown_token_id u32 = 0
llama_model_loader: - kv 20: tokenizer.ggml.add_bos_token bool = true
llama_model_loader: - kv 21: tokenizer.ggml.add_eos_token bool = false
llama_model_loader: - kv 22: tokenizer.chat_template str = {{ bos_token }}{% for message in mess...
llama_model_loader: - kv 23: general.quantization_version u32 = 2
llama_model_loader: - type f32: 65 tensors
llama_model_loader: - type q4_0: 225 tensors
llama_model_loader: - type q6_K: 1 tensors
llm_load_vocab: special tokens definition check successful ( 259/32000 ).
llm_load_print_meta: format = GGUF V3 (latest)
llm_load_print_meta: arch = llama
llm_load_print_meta: vocab type = SPM
llm_load_print_meta: n_vocab = 32000
llm_load_print_meta: n_merges = 0
llm_load_print_meta: n_ctx_train = 32768
llm_load_print_meta: n_embd = 4096
llm_load_print_meta: n_head = 32
llm_load_print_meta: n_head_kv = 8
llm_load_print_meta: n_layer = 32
llm_load_print_meta: n_rot = 128
llm_load_print_meta: n_embd_head_k = 128
llm_load_print_meta: n_embd_head_v = 128
llm_load_print_meta: n_gqa = 4
llm_load_print_meta: n_embd_k_gqa = 1024
llm_load_print_meta: n_embd_v_gqa = 1024
llm_load_print_meta: f_norm_eps = 0.0e+00
llm_load_print_meta: f_norm_rms_eps = 1.0e-05
llm_load_print_meta: f_clamp_kqv = 0.0e+00
llm_load_print_meta: f_max_alibi_bias = 0.0e+00
llm_load_print_meta: n_ff = 14336
llm_load_print_meta: n_expert = 0
llm_load_print_meta: n_expert_used = 0
llm_load_print_meta: pooling type = 0
llm_load_print_meta: rope type = 0
llm_load_print_meta: rope scaling = linear
llm_load_print_meta: freq_base_train = 1000000.0
llm_load_print_meta: freq_scale_train = 1
llm_load_print_meta: n_yarn_orig_ctx = 32768
llm_load_print_meta: rope_finetuned = unknown
llm_load_print_meta: model type = 7B
llm_load_print_meta: model ftype = Q4_0
llm_load_print_meta: model params = 7.24 B
llm_load_print_meta: model size = 3.83 GiB (4.54 BPW)
llm_load_print_meta: general.name = mistralai
llm_load_print_meta: BOS token = 1 '<s>'
llm_load_print_meta: EOS token = 2 '</s>'
llm_load_print_meta: UNK token = 0 '<unk>'
llm_load_print_meta: LF token = 13 '<0x0A>'
llm_load_tensors: ggml ctx size = 0.11 MiB
llm_load_tensors: CPU buffer size = 3917.87 MiB
..................................................................................................
llama_new_context_with_model: n_ctx = 2048
llama_new_context_with_model: freq_base = 1000000.0
llama_new_context_with_model: freq_scale = 1
llama_kv_cache_init: CPU KV buffer size = 256.00 MiB
llama_new_context_with_model: KV self size = 256.00 MiB, K (f16): 128.00 MiB, V (f16): 128.00 MiB
llama_new_context_with_model: CPU input buffer size = 13.02 MiB
llama_new_context_with_model: CPU compute buffer size = 160.00 MiB
llama_new_context_with_model: graph splits (measure): 1
{"function":"initialize","level":"INFO","line":434,"msg":"initializing slots","n_slots":1,"tid":"10760","timestamp":1710898973}
{"function":"initialize","level":"INFO","line":446,"msg":"new slot","n_ctx_slot":2048,"slot_id":0,"tid":"10760","timestamp":1710898973}
time=2024-03-20T09:42:53.960+08:00 level=INFO source=dyn_ext_server.go:162 msg="Starting llama main loop"
{"function":"update_slots","level":"INFO","line":1584,"msg":"all slots are idle and system prompt is empty, clear the KV cache","tid":"26808","timestamp":1710898973}
{"function":"launch_slot_with_data","level":"INFO","line":827,"msg":"slot is processing task","slot_id":0,"task_id":0,"tid":"26808","timestamp":1710898973}
{"function":"update_slots","ga_i":0,"level":"INFO","line":1822,"msg":"slot progression","n_past":0,"n_past_se":0,"n_prompt_tokens_processed":38,"slot_id":0,"task_id":0,"tid":"26808","timestamp":1710898973}
{"function":"update_slots","level":"INFO","line":1846,"msg":"kv cache rm [p0, end)","p0":0,"slot_id":0,"task_id":0,"tid":"26808","timestamp":1710898973}
{"function":"print_timings","level":"INFO","line":264,"msg":"prompt eval time = 3045.29 ms / 38 tokens ( 80.14 ms per token, 12.48 tokens per second)","n_prompt_tokens_processed":38,"n_tokens_second":12.478273847541107,"slot_id":0,"t_prompt_processing":3045.293,"t_token":80.13928947368422,"task_id":0,"tid":"26808","timestamp":1710898990}
{"function":"print_timings","level":"INFO","line":278,"msg":"generation eval time = 13855.03 ms / 80 runs ( 173.19 ms per token, 5.77 tokens per second)","n_decoded":80,"n_tokens_second":5.7740741903575135,"slot_id":0,"t_token":173.1879375,"t_token_generation":13855.035,"task_id":0,"tid":"26808","timestamp":1710898990}
{"function":"print_timings","level":"INFO","line":287,"msg":" total time = 16900.33 ms","slot_id":0,"t_prompt_processing":3045.293,"t_token_generation":13855.035,"t_total":16900.328,"task_id":0,"tid":"26808","timestamp":1710898990}
{"function":"update_slots","level":"INFO","line":1654,"msg":"slot released","n_cache_tokens":118,"n_ctx":2048,"n_past":117,"n_system_tokens":0,"slot_id":0,"task_id":0,"tid":"26808","timestamp":1710898990,"truncated":false}
[GIN] 2024/03/20 - 09:43:10 | 200 | 20.4589928s | 127.0.0.1 | POST "/api/chat"