12GB-vs-16GB

2025/05/05 13:07:25 routes.go:1233: INFO server config env="map[CUDA_VISIBLE_DEVICES:1 GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_CONTEXT_LENGTH:4096 OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://0.0.0.0:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_KV_CACHE_TYPE: OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/root/.ollama/models OLLAMA_MULTIUSER_CACHE:false OLLAMA_NEW_ENGINE:false OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://* vscode-webview://* vscode-file://*] OLLAMA_SCHED_SPREAD:false ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
2025/05/05 13:49:12 routes.go:1233: INFO server config env="map[CUDA_VISIBLE_DEVICES:0 GPU_DEVICE_ORDINAL: HIP_VISIBLE_DEVICES: HSA_OVERRIDE_GFX_VERSION: HTTPS_PROXY: HTTP_PROXY: NO_PROXY: OLLAMA_CONTEXT_LENGTH:4096 OLLAMA_DEBUG:false OLLAMA_FLASH_ATTENTION:false OLLAMA_GPU_OVERHEAD:0 OLLAMA_HOST:http://0.0.0.0:11434 OLLAMA_INTEL_GPU:false OLLAMA_KEEP_ALIVE:5m0s OLLAMA_KV_CACHE_TYPE: OLLAMA_LLM_LIBRARY: OLLAMA_LOAD_TIMEOUT:5m0s OLLAMA_MAX_LOADED_MODELS:0 OLLAMA_MAX_QUEUE:512 OLLAMA_MODELS:/root/.ollama/models OLLAMA_MULTIUSER_CACHE:false OLLAMA_NEW_ENGINE:false OLLAMA_NOHISTORY:false OLLAMA_NOPRUNE:false OLLAMA_NUM_PARALLEL:0 OLLAMA_ORIGINS:[http://localhost https://localhost http://localhost:* https://localhost:* http://127.0.0.1 https://127.0.0.1 http://127.0.0.1:* https://127.0.0.1:* http://0.0.0.0 https://0.0.0.0 http://0.0.0.0:* https://0.0.0.0:* app://* file://* tauri://* vscode-webview://* vscode-file://*] OLLAMA_SCHED_SPREAD:false ROCR_VISIBLE_DEVICES: http_proxy: https_proxy: no_proxy:]"
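The two server configs above are identical except for CUDA_VISIBLE_DEVICES (1 for the 12 GB run at 13:07, 0 for the 16 GB run at 13:49). A minimal sketch of pinning the server to a single GPU the same way; the device index, and the assumption that ollama is on PATH, are machine-specific:

import os
import subprocess

# Pin Ollama to one CUDA device before it enumerates GPUs, as in the
# configs above (index 0 here; index 1 was used for the 12 GB run).
env = dict(os.environ)
env["CUDA_VISIBLE_DEVICES"] = "0"
env["OLLAMA_HOST"] = "http://0.0.0.0:11434"  # matches both logs

# The server then reports only the visible card in its
# "looking for compatible GPUs" / "inference compute" lines.
subprocess.run(["ollama", "serve"], env=env)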
time=2025-05-05T13:07:25.139Z level=INFO source=images.go:458 msg="total blobs: 13"
time=2025-05-05T13:49:12.680Z level=INFO source=images.go:458 msg="total blobs: 13"
time=2025-05-05T13:07:25.140Z level=INFO source=images.go:465 msg="total unused blobs removed: 0"
time=2025-05-05T13:49:12.681Z level=INFO source=images.go:465 msg="total unused blobs removed: 0"
time=2025-05-05T13:07:25.140Z level=INFO source=routes.go:1300 msg="Listening on [::]:11434 (version 0.6.7)"
time=2025-05-05T13:49:12.681Z level=INFO source=routes.go:1300 msg="Listening on [::]:11434 (version 0.6.7)"
time=2025-05-05T13:07:25.141Z level=INFO source=gpu.go:217 msg="looking for compatible GPUs"
time=2025-05-05T13:49:12.682Z level=INFO source=gpu.go:217 msg="looking for compatible GPUs"
time=2025-05-05T13:07:25.464Z level=INFO source=types.go:130 msg="inference compute" id=GPU-ff09c0e9-5a28-3ac2-7460-12dded00031a library=cuda variant=v12 compute=8.6 driver=12.8 name="NVIDIA GeForce RTX 3060" total="11.6 GiB" available="11.5 GiB"
time=2025-05-05T13:49:13.053Z level=INFO source=types.go:130 msg="inference compute" id=GPU-426eaf30-0b2f-f612-17a7-2aa3cc3497fb library=cuda variant=v12 compute=12.0 driver=12.8 name="NVIDIA Graphics Device" total="15.5 GiB" available="15.4 GiB"
[GIN] 2025/05/05 - 13:07:27 | 200 | 94.653µs | 127.0.0.1 | HEAD "/"
[GIN] 2025/05/05 - 13:49:14 | 200 | 149.304µs | 127.0.0.1 | HEAD "/"
[GIN] 2025/05/05 - 13:07:27 | 200 | 332.494428ms | 127.0.0.1 | POST "/api/pull"
[GIN] 2025/05/05 - 13:49:15 | 200 | 400.965038ms | 127.0.0.1 | POST "/api/pull"
[GIN] 2025/05/05 - 13:07:29 | 200 | 49.961µs | 127.0.0.1 | HEAD "/"
[GIN] 2025/05/05 - 13:49:17 | 200 | 44.732µs | 127.0.0.1 | HEAD "/"
[GIN] 2025/05/05 - 13:07:29 | 200 | 213.715969ms | 127.0.0.1 | POST "/api/pull"
[GIN] 2025/05/05 - 13:49:17 | 200 | 174.298821ms | 127.0.0.1 | POST "/api/pull"
[GIN] 2025/05/05 - 13:07:31 | 200 | 42.131µs | 127.0.0.1 | HEAD "/"
[GIN] 2025/05/05 - 13:49:18 | 200 | 31.261µs | 127.0.0.1 | HEAD "/"
[GIN] 2025/05/05 - 13:07:31 | 200 | 193.73517ms | 127.0.0.1 | POST "/api/pull"
[GIN] 2025/05/05 - 13:49:19 | 200 | 212.931989ms | 127.0.0.1 | POST "/api/pull"
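Both runs answer three POST /api/pull requests in a few hundred milliseconds each, i.e. the layers were already present locally and only verified. A sketch of the request shape behind those lines; the model tag is an assumption, since the logs show only the endpoint:

import json
import urllib.request

# Hypothetical pull; substitute the actual model tag. /api/pull streams
# one JSON status object per line until the pull (or verification) completes.
req = urllib.request.Request(
    "http://127.0.0.1:11434/api/pull",
    data=json.dumps({"model": "mistral-nemo"}).encode(),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    for line in resp:
        print(json.loads(line).get("status"))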
time=2025-05-05T13:32:21.428Z level=WARN source=ggml.go:152 msg="key not found" key=general.alignment default=32
time=2025-05-05T13:49:59.768Z level=WARN source=ggml.go:152 msg="key not found" key=general.alignment default=32
time=2025-05-05T13:32:21.640Z level=WARN source=ggml.go:152 msg="key not found" key=general.alignment default=32
time=2025-05-05T13:50:00.036Z level=WARN source=ggml.go:152 msg="key not found" key=general.alignment default=32
time=2025-05-05T13:32:21.671Z level=WARN source=ggml.go:152 msg="key not found" key=general.alignment default=32
time=2025-05-05T13:50:00.078Z level=WARN source=ggml.go:152 msg="key not found" key=general.alignment default=32
time=2025-05-05T13:32:21.671Z level=WARN source=ggml.go:152 msg="key not found" key=bert.vision.block_count default=0
time=2025-05-05T13:50:00.079Z level=WARN source=ggml.go:152 msg="key not found" key=bert.vision.block_count default=0
time=2025-05-05T13:32:21.671Z level=WARN source=ggml.go:152 msg="key not found" key=bert.attention.head_count_kv default=1
time=2025-05-05T13:50:00.079Z level=WARN source=ggml.go:152 msg="key not found" key=bert.attention.head_count_kv default=1
time=2025-05-05T13:32:21.671Z level=WARN source=ggml.go:152 msg="key not found" key=bert.attention.key_length default=64
time=2025-05-05T13:50:00.079Z level=WARN source=ggml.go:152 msg="key not found" key=bert.attention.key_length default=64
time=2025-05-05T13:32:21.671Z level=WARN source=ggml.go:152 msg="key not found" key=bert.attention.value_length default=64
time=2025-05-05T13:50:00.079Z level=WARN source=ggml.go:152 msg="key not found" key=bert.attention.value_length default=64
time=2025-05-05T13:32:21.672Z level=WARN source=ggml.go:152 msg="key not found" key=bert.attention.head_count_kv default=1
time=2025-05-05T13:50:00.079Z level=WARN source=ggml.go:152 msg="key not found" key=bert.attention.head_count_kv default=1
time=2025-05-05T13:32:21.672Z level=INFO source=sched.go:723 msg="new model will fit in available VRAM in single GPU, loading" model=/root/.ollama/models/blobs/sha256-daec91ffb5dd0c27411bd71f29932917c49cf529a641d0168496c3a501e3062c gpu=GPU-ff09c0e9-5a28-3ac2-7460-12dded00031a parallel=1 available=12384468992 required="1.6 GiB"
time=2025-05-05T13:50:00.079Z level=INFO source=sched.go:723 msg="new model will fit in available VRAM in single GPU, loading" model=/root/.ollama/models/blobs/sha256-daec91ffb5dd0c27411bd71f29932917c49cf529a641d0168496c3a501e3062c gpu=GPU-426eaf30-0b2f-f612-17a7-2aa3cc3497fb parallel=1 available=16488660992 required="1.6 GiB"
time=2025-05-05T13:32:21.839Z level=INFO source=server.go:105 msg="system memory" total="62.8 GiB" free="60.0 GiB" free_swap="0 B"
time=2025-05-05T13:50:00.236Z level=INFO source=server.go:105 msg="system memory" total="62.8 GiB" free="60.0 GiB" free_swap="0 B"
time=2025-05-05T13:32:21.839Z level=WARN source=ggml.go:152 msg="key not found" key=bert.vision.block_count default=0
time=2025-05-05T13:50:00.236Z level=WARN source=ggml.go:152 msg="key not found" key=bert.vision.block_count default=0
time=2025-05-05T13:32:21.839Z level=WARN source=ggml.go:152 msg="key not found" key=bert.attention.head_count_kv default=1
time=2025-05-05T13:50:00.236Z level=WARN source=ggml.go:152 msg="key not found" key=bert.attention.head_count_kv default=1
time=2025-05-05T13:32:21.839Z level=WARN source=ggml.go:152 msg="key not found" key=bert.attention.key_length default=64
time=2025-05-05T13:50:00.236Z level=WARN source=ggml.go:152 msg="key not found" key=bert.attention.key_length default=64
time=2025-05-05T13:32:21.839Z level=WARN source=ggml.go:152 msg="key not found" key=bert.attention.value_length default=64
time=2025-05-05T13:50:00.236Z level=WARN source=ggml.go:152 msg="key not found" key=bert.attention.value_length default=64
time=2025-05-05T13:32:21.839Z level=WARN source=ggml.go:152 msg="key not found" key=bert.attention.head_count_kv default=1
time=2025-05-05T13:50:00.238Z level=WARN source=ggml.go:152 msg="key not found" key=bert.attention.head_count_kv default=1
time=2025-05-05T13:32:21.839Z level=INFO source=server.go:138 msg=offload library=cuda layers.requested=-1 layers.model=25 layers.offload=25 layers.split="" memory.available="[11.5 GiB]" memory.gpu_overhead="0 B" memory.required.full="1.6 GiB" memory.required.partial="1.6 GiB" memory.required.kv="24.0 MiB" memory.required.allocations="[1.6 GiB]" memory.weights.total="1.0 GiB" memory.weights.repeating="577.2 MiB" memory.weights.nonrepeating="488.3 MiB" memory.graph.full="64.0 MiB" memory.graph.partial="64.0 MiB"
time=2025-05-05T13:50:00.239Z level=INFO source=server.go:138 msg=offload library=cuda layers.requested=-1 layers.model=25 layers.offload=25 layers.split="" memory.available="[15.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="1.6 GiB" memory.required.partial="1.6 GiB" memory.required.kv="24.0 MiB" memory.required.allocations="[1.6 GiB]" memory.weights.total="1.0 GiB" memory.weights.repeating="577.2 MiB" memory.weights.nonrepeating="488.3 MiB" memory.graph.full="64.0 MiB" memory.graph.partial="64.0 MiB"
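The only change between these two offload lines is memory.available (11.5 vs 15.4 GiB); everything else, including the 1.6 GiB requirement, is identical, so the 567M embedding model is fully offloaded on both cards. A rough check of how the 1.6 GiB figure relates to the itemized components (treating the remainder as un-itemized runtime overhead is an assumption):

MiB, GiB = 1024**2, 1024**3

weights = (577.2 + 488.3) * MiB   # repeating + nonrepeating weights
kv      = 24.0 * MiB              # memory.required.kv
graph   = 64.0 * MiB              # memory.graph.full
itemized = weights + kv + graph

print(f"itemized: {itemized / GiB:.2f} GiB")                 # ~1.13 GiB
print(f"residual: {(1.6 * GiB - itemized) / MiB:.0f} MiB")   # CUDA/runtime overhead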
llama_model_loader: loaded meta data with 33 key-value pairs and 389 tensors from /root/.ollama/models/blobs/sha256-daec91ffb5dd0c27411bd71f29932917c49cf529a641d0168496c3a501e3062c (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv 0: general.architecture str = bert
llama_model_loader: - kv 1: general.type str = model
llama_model_loader: - kv 2: general.size_label str = 567M
llama_model_loader: - kv 3: general.license str = mit
llama_model_loader: - kv 4: general.tags arr[str,4] = ["sentence-transformers", "feature-ex...
llama_model_loader: - kv 5: bert.block_count u32 = 24
llama_model_loader: - kv 6: bert.context_length u32 = 8192
llama_model_loader: - kv 7: bert.embedding_length u32 = 1024
llama_model_loader: - kv 8: bert.feed_forward_length u32 = 4096
llama_model_loader: - kv 9: bert.attention.head_count u32 = 16
llama_model_loader: - kv 10: bert.attention.layer_norm_epsilon f32 = 0.000010
llama_model_loader: - kv 11: general.file_type u32 = 1
llama_model_loader: - kv 12: bert.attention.causal bool = false
llama_model_loader: - kv 13: bert.pooling_type u32 = 2
llama_model_loader: - kv 14: tokenizer.ggml.model str = t5
llama_model_loader: - kv 15: tokenizer.ggml.pre str = default
llama_model_loader: - kv 16: tokenizer.ggml.tokens arr[str,250002] = ["<s>", "<pad>", "</s>", "<unk>", ","...
llama_model_loader: - kv 17: tokenizer.ggml.scores arr[f32,250002] = [0.000000, 0.000000, 0.000000, 0.0000...
llama_model_loader: - kv 18: tokenizer.ggml.token_type arr[i32,250002] = [3, 3, 3, 2, 1, 1, 1, 1, 1, 1, 1, 1, ...
llama_model_loader: - kv 19: tokenizer.ggml.add_space_prefix bool = true
llama_model_loader: - kv 20: tokenizer.ggml.token_type_count u32 = 1
llama_model_loader: - kv 21: tokenizer.ggml.remove_extra_whitespaces bool = true
llama_model_loader: - kv 22: tokenizer.ggml.precompiled_charsmap arr[u8,237539] = [0, 180, 2, 0, 0, 132, 0, 0, 0, 0, 0,...
llama_model_loader: - kv 23: tokenizer.ggml.bos_token_id u32 = 0
llama_model_loader: - kv 24: tokenizer.ggml.eos_token_id u32 = 2
llama_model_loader: - kv 25: tokenizer.ggml.unknown_token_id u32 = 3
llama_model_loader: - kv 26: tokenizer.ggml.seperator_token_id u32 = 2
llama_model_loader: - kv 27: tokenizer.ggml.padding_token_id u32 = 1
llama_model_loader: - kv 28: tokenizer.ggml.cls_token_id u32 = 0
llama_model_loader: - kv 29: tokenizer.ggml.mask_token_id u32 = 250001
llama_model_loader: - kv 30: tokenizer.ggml.add_bos_token bool = true
llama_model_loader: - kv 31: tokenizer.ggml.add_eos_token bool = true
llama_model_loader: - kv 32: general.quantization_version u32 = 2
llama_model_loader: - type f32: 244 tensors
llama_model_loader: - type f16: 145 tensors
print_info: file format = GGUF V3 (latest)
print_info: file type = F16
print_info: file size = 1.07 GiB (16.25 BPW)
load: model vocab missing newline token, using special_pad_id instead
load: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect
load: special tokens cache size = 4
load: token to piece cache size = 2.1668 MB
print_info: arch = bert
print_info: vocab_only = 1
print_info: model type = ?B
print_info: model params = 566.70 M
print_info: general.name = n/a
print_info: vocab type = UGM
print_info: n_vocab = 250002
print_info: n_merges = 0
print_info: BOS token = 0 '<s>'
print_info: EOS token = 2 '</s>'
print_info: UNK token = 3 '<unk>'
print_info: SEP token = 2 '</s>'
print_info: PAD token = 1 '<pad>'
print_info: MASK token = 250001 '[PAD250000]'
print_info: LF token = 0 '<s>'
print_info: EOG token = 2 '</s>'
print_info: max token length = 48
llama_model_load: vocab only - skipping tensors
time=2025-05-05T13:32:22.374Z level=INFO source=server.go:409 msg="starting llama server" cmd="/usr/bin/ollama runner --model /root/.ollama/models/blobs/sha256-daec91ffb5dd0c27411bd71f29932917c49cf529a641d0168496c3a501e3062c --ctx-size 4096 --batch-size 512 --n-gpu-layers 25 --threads 32 --parallel 1 --port 43807"
time=2025-05-05T13:50:00.840Z level=INFO source=server.go:409 msg="starting llama server" cmd="/usr/bin/ollama runner --model /root/.ollama/models/blobs/sha256-daec91ffb5dd0c27411bd71f29932917c49cf529a641d0168496c3a501e3062c --ctx-size 4096 --batch-size 512 --n-gpu-layers 25 --threads 32 --parallel 1 --port 45529"
time=2025-05-05T13:32:22.375Z level=INFO source=sched.go:450 msg="loaded runners" count=1
time=2025-05-05T13:50:00.841Z level=INFO source=sched.go:450 msg="loaded runners" count=1
time=2025-05-05T13:32:22.375Z level=INFO source=server.go:585 msg="waiting for llama runner to start responding"
time=2025-05-05T13:50:00.841Z level=INFO source=server.go:585 msg="waiting for llama runner to start responding"
time=2025-05-05T13:32:22.375Z level=INFO source=server.go:619 msg="waiting for server to become available" status="llm server error"
time=2025-05-05T13:50:00.841Z level=INFO source=server.go:619 msg="waiting for server to become available" status="llm server error"
time=2025-05-05T13:32:22.405Z level=INFO source=runner.go:853 msg="starting go runner"
time=2025-05-05T13:50:00.861Z level=INFO source=runner.go:853 msg="starting go runner"
load_backend: loaded CPU backend from /usr/lib/ollama/libggml-cpu-haswell.so
ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
ggml_cuda_init: found 1 CUDA devices:
Device 0: NVIDIA GeForce RTX 3060, compute capability 8.6, VMM: yes
Device 0: NVIDIA Graphics Device, compute capability 12.0, VMM: yes
load_backend: loaded CUDA backend from /usr/lib/ollama/cuda_v12/libggml-cuda.so
time=2025-05-05T13:32:22.524Z level=INFO source=ggml.go:103 msg=system CPU.0.SSE3=1 CPU.0.SSSE3=1 CPU.0.AVX=1 CPU.0.AVX2=1 CPU.0.F16C=1 CPU.0.FMA=1 CPU.0.BMI2=1 CPU.0.LLAMAFILE=1 CPU.1.LLAMAFILE=1 CUDA.0.ARCHS=500,600,610,700,750,800,860,870,890,900,1200 CUDA.0.USE_GRAPHS=1 CUDA.0.PEER_MAX_BATCH_SIZE=128 compiler=cgo(gcc)
time=2025-05-05T13:50:00.963Z level=INFO source=ggml.go:103 msg=system CPU.0.SSE3=1 CPU.0.SSSE3=1 CPU.0.AVX=1 CPU.0.AVX2=1 CPU.0.F16C=1 CPU.0.FMA=1 CPU.0.BMI2=1 CPU.0.LLAMAFILE=1 CPU.1.LLAMAFILE=1 CUDA.0.ARCHS=500,600,610,700,750,800,860,870,890,900,1200 CUDA.0.USE_GRAPHS=1 CUDA.0.PEER_MAX_BATCH_SIZE=128 compiler=cgo(gcc)
time=2025-05-05T13:32:22.524Z level=INFO source=runner.go:913 msg="Server listening on 127.0.0.1:43807"
time=2025-05-05T13:50:00.963Z level=INFO source=runner.go:913 msg="Server listening on 127.0.0.1:45529"
time=2025-05-05T13:32:22.627Z level=INFO source=server.go:619 msg="waiting for server to become available" status="llm server loading model"
llama_model_load_from_file_impl: using device CUDA0 (NVIDIA Graphics Device) - 15724 MiB free
llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3060) - 11810 MiB free
time=2025-05-05T13:50:01.094Z level=INFO source=server.go:619 msg="waiting for server to become available" status="llm server loading model"
llama_model_loader: loaded meta data with 33 key-value pairs and 389 tensors from /root/.ollama/models/blobs/sha256-daec91ffb5dd0c27411bd71f29932917c49cf529a641d0168496c3a501e3062c (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv 0: general.architecture str = bert
llama_model_loader: - kv 1: general.type str = model
llama_model_loader: - kv 2: general.size_label str = 567M
llama_model_loader: - kv 3: general.license str = mit
llama_model_loader: - kv 4: general.tags arr[str,4] = ["sentence-transformers", "feature-ex...
llama_model_loader: - kv 5: bert.block_count u32 = 24
llama_model_loader: - kv 6: bert.context_length u32 = 8192
llama_model_loader: - kv 7: bert.embedding_length u32 = 1024
llama_model_loader: - kv 8: bert.feed_forward_length u32 = 4096
llama_model_loader: - kv 9: bert.attention.head_count u32 = 16
llama_model_loader: - kv 10: bert.attention.layer_norm_epsilon f32 = 0.000010
llama_model_loader: - kv 11: general.file_type u32 = 1
llama_model_loader: - kv 12: bert.attention.causal bool = false
llama_model_loader: - kv 13: bert.pooling_type u32 = 2
llama_model_loader: - kv 14: tokenizer.ggml.model str = t5
llama_model_loader: - kv 15: tokenizer.ggml.pre str = default
llama_model_loader: - kv 16: tokenizer.ggml.tokens arr[str,250002] = ["<s>", "<pad>", "</s>", "<unk>", ","...
llama_model_loader: - kv 17: tokenizer.ggml.scores arr[f32,250002] = [0.000000, 0.000000, 0.000000, 0.0000...
llama_model_loader: - kv 18: tokenizer.ggml.token_type arr[i32,250002] = [3, 3, 3, 2, 1, 1, 1, 1, 1, 1, 1, 1, ...
llama_model_loader: - kv 19: tokenizer.ggml.add_space_prefix bool = true
llama_model_loader: - kv 20: tokenizer.ggml.token_type_count u32 = 1
llama_model_loader: - kv 21: tokenizer.ggml.remove_extra_whitespaces bool = true
llama_model_loader: - kv 22: tokenizer.ggml.precompiled_charsmap arr[u8,237539] = [0, 180, 2, 0, 0, 132, 0, 0, 0, 0, 0,...
llama_model_loader: - kv 23: tokenizer.ggml.bos_token_id u32 = 0
llama_model_loader: - kv 24: tokenizer.ggml.eos_token_id u32 = 2
llama_model_loader: - kv 25: tokenizer.ggml.unknown_token_id u32 = 3
llama_model_loader: - kv 26: tokenizer.ggml.seperator_token_id u32 = 2
llama_model_loader: - kv 27: tokenizer.ggml.padding_token_id u32 = 1
llama_model_loader: - kv 28: tokenizer.ggml.cls_token_id u32 = 0
llama_model_loader: - kv 29: tokenizer.ggml.mask_token_id u32 = 250001
llama_model_loader: - kv 30: tokenizer.ggml.add_bos_token bool = true
llama_model_loader: - kv 31: tokenizer.ggml.add_eos_token bool = true
llama_model_loader: - kv 32: general.quantization_version u32 = 2
llama_model_loader: - type f32: 244 tensors
llama_model_loader: - type f16: 145 tensors
print_info: file format = GGUF V3 (latest)
print_info: file type = F16
print_info: file size = 1.07 GiB (16.25 BPW)
load: model vocab missing newline token, using special_pad_id instead
load: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect
load: special tokens cache size = 4
load: token to piece cache size = 2.1668 MB
print_info: arch = bert
print_info: vocab_only = 0
print_info: n_ctx_train = 8192
print_info: n_embd = 1024
print_info: n_layer = 24
print_info: n_head = 16
print_info: n_head_kv = 16
print_info: n_rot = 64
print_info: n_swa = 0
print_info: n_swa_pattern = 1
print_info: n_embd_head_k = 64
print_info: n_embd_head_v = 64
print_info: n_gqa = 1
print_info: n_embd_k_gqa = 1024
print_info: n_embd_v_gqa = 1024
print_info: f_norm_eps = 1.0e-05
print_info: f_norm_rms_eps = 0.0e+00
print_info: f_clamp_kqv = 0.0e+00
print_info: f_max_alibi_bias = 0.0e+00
print_info: f_logit_scale = 0.0e+00
print_info: f_attn_scale = 0.0e+00
print_info: n_ff = 4096
print_info: n_expert = 0
print_info: n_expert_used = 0
print_info: causal attn = 0
print_info: pooling type = 2
print_info: rope type = 2
print_info: rope scaling = linear
print_info: freq_base_train = 10000.0
print_info: freq_scale_train = 1
print_info: n_ctx_orig_yarn = 8192
print_info: rope_finetuned = unknown
print_info: ssm_d_conv = 0
print_info: ssm_d_inner = 0
print_info: ssm_d_state = 0
print_info: ssm_dt_rank = 0
print_info: ssm_dt_b_c_rms = 0
print_info: model type = 335M
print_info: model params = 566.70 M
print_info: general.name = n/a
print_info: vocab type = UGM
print_info: n_vocab = 250002
print_info: n_merges = 0
print_info: BOS token = 0 '<s>'
print_info: EOS token = 2 '</s>'
print_info: UNK token = 3 '<unk>'
print_info: SEP token = 2 '</s>'
print_info: PAD token = 1 '<pad>'
print_info: MASK token = 250001 '[PAD250000]'
print_info: LF token = 0 '<s>'
print_info: EOG token = 2 '</s>'
print_info: max token length = 48
load_tensors: loading model tensors, this can take a while... (mmap = true)
load_tensors: offloading 24 repeating layers to GPU
load_tensors: offloading output layer to GPU
load_tensors: offloaded 25/25 layers to GPU
load_tensors: CUDA0 model buffer size = 577.22 MiB
load_tensors: CPU_Mapped model buffer size = 520.30 MiB
llama_context: constructing llama_context
llama_context: n_seq_max = 1
llama_context: n_ctx = 4096
llama_context: n_ctx_per_seq = 4096
llama_context: n_batch = 512
llama_context: n_ubatch = 512
llama_context: causal_attn = 0
llama_context: flash_attn = 0
llama_context: freq_base = 10000.0
llama_context: freq_scale = 1
llama_context: n_ctx_per_seq (4096) < n_ctx_train (8192) -- the full capacity of the model will not be utilized
llama_context: CUDA_Host output buffer size = 0.00 MiB
init: kv_size = 4096, offload = 1, type_k = 'f16', type_v = 'f16', n_layer = 24, can_shift = 1
init: CUDA0 KV buffer size = 384.00 MiB
llama_context: KV self size = 384.00 MiB, K (f16): 192.00 MiB, V (f16): 192.00 MiB
llama_context: CUDA0 compute buffer size = 27.01 MiB
llama_context: CUDA_Host compute buffer size = 5.01 MiB
llama_context: graph nodes = 825
llama_context: graph splits = 4 (with bs=512), 2 (with bs=1)
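The context setup above is byte-identical on both cards. The 384 MiB KV cache follows directly from the printed parameters (f16 keys and values, per layer, per context position):

n_ctx     = 4096   # kv_size in the init line
n_layer   = 24
n_embd_kv = 1024   # n_embd_k_gqa / n_embd_v_gqa
bytes_f16 = 2

k_bytes = n_ctx * n_layer * n_embd_kv * bytes_f16
print(f"K:   {k_bytes / 1024**2:.2f} MiB")        # 192.00 MiB
print(f"K+V: {2 * k_bytes / 1024**2:.2f} MiB")    # 384.00 MiB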
time=2025-05-05T13:32:23.884Z level=INFO source=server.go:624 msg="llama runner started in 1.51 seconds"
time=2025-05-05T13:50:02.351Z level=INFO source=server.go:624 msg="llama runner started in 1.51 seconds"
time=2025-05-05T13:32:23.953Z level=WARN source=ggml.go:152 msg="key not found" key=general.alignment default=32
time=2025-05-05T13:50:02.405Z level=WARN source=ggml.go:152 msg="key not found" key=general.alignment default=32
[GIN] 2025/05/05 - 13:32:24 | 200 | 3.321225965s | 172.18.0.3 | POST "/api/embed"
[GIN] 2025/05/05 - 13:50:02 | 200 | 3.264967612s | 172.18.0.3 | POST "/api/embed"
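Both cards serve the embedding request in about 3.3 s, cold start included. A sketch of the call behind the POST /api/embed lines; the model tag is an assumption, since the logs show only the blob hash of a 567M BERT-style embedder (1024-dim, 8192-token context):

import json
import urllib.request

# Hypothetical embed request; substitute the actual model tag.
req = urllib.request.Request(
    "http://127.0.0.1:11434/api/embed",
    data=json.dumps({"model": "bge-m3", "input": "hello world"}).encode(),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    emb = json.load(resp)["embeddings"][0]
print(len(emb))  # 1024, matching bert.embedding_length above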
time=2025-05-05T13:32:24.839Z level=WARN source=ggml.go:152 msg="key not found" key=general.alignment default=32
time=2025-05-05T13:50:03.126Z level=WARN source=ggml.go:152 msg="key not found" key=general.alignment default=32
time=2025-05-05T13:32:24.844Z level=WARN source=ggml.go:152 msg="key not found" key=general.alignment default=32
time=2025-05-05T13:50:03.133Z level=WARN source=ggml.go:152 msg="key not found" key=general.alignment default=32
time=2025-05-05T13:32:24.849Z level=WARN source=ggml.go:152 msg="key not found" key=general.alignment default=32
time=2025-05-05T13:50:03.133Z level=WARN source=ggml.go:152 msg="key not found" key=general.alignment default=32
time=2025-05-05T13:32:24.855Z level=WARN source=ggml.go:152 msg="key not found" key=general.alignment default=32
time=2025-05-05T13:50:03.134Z level=WARN source=ggml.go:152 msg="key not found" key=general.alignment default=32
time=2025-05-05T13:32:25.023Z level=WARN source=ggml.go:152 msg="key not found" key=general.alignment default=32
time=2025-05-05T13:50:03.310Z level=WARN source=ggml.go:152 msg="key not found" key=general.alignment default=32
time=2025-05-05T13:32:25.039Z level=WARN source=ggml.go:152 msg="key not found" key=general.alignment default=32
time=2025-05-05T13:50:03.341Z level=WARN source=ggml.go:152 msg="key not found" key=general.alignment default=32
time=2025-05-05T13:32:25.039Z level=INFO source=sched.go:512 msg="updated VRAM based on existing loaded models" gpu=GPU-ff09c0e9-5a28-3ac2-7460-12dded00031a library=cuda total="11.6 GiB" available="10.0 GiB"
time=2025-05-05T13:50:03.342Z level=INFO source=sched.go:512 msg="updated VRAM based on existing loaded models" gpu=GPU-426eaf30-0b2f-f612-17a7-2aa3cc3497fb library=cuda total="15.5 GiB" available="13.9 GiB"
time=2025-05-05T13:32:25.039Z level=WARN source=ggml.go:152 msg="key not found" key=llama.vision.block_count default=0
time=2025-05-05T13:50:03.343Z level=WARN source=ggml.go:152 msg="key not found" key=llama.vision.block_count default=0
time=2025-05-05T13:32:25.040Z level=WARN source=ggml.go:152 msg="key not found" key=llama.vision.block_count default=0
time=2025-05-05T13:50:03.343Z level=WARN source=ggml.go:152 msg="key not found" key=llama.vision.block_count default=0
time=2025-05-05T13:32:25.042Z level=WARN source=ggml.go:152 msg="key not found" key=llama.vision.block_count default=0
time=2025-05-05T13:50:03.344Z level=WARN source=ggml.go:152 msg="key not found" key=llama.vision.block_count default=0
time=2025-05-05T13:32:25.043Z level=WARN source=ggml.go:152 msg="key not found" key=llama.vision.block_count default=0
time=2025-05-05T13:50:03.345Z level=WARN source=ggml.go:152 msg="key not found" key=llama.vision.block_count default=0
time=2025-05-05T13:32:30.071Z level=WARN source=sched.go:649 msg="gpu VRAM usage didn't recover within timeout" seconds=5.027419298 model=/root/.ollama/models/blobs/sha256-daec91ffb5dd0c27411bd71f29932917c49cf529a641d0168496c3a501e3062c
time=2025-05-05T13:50:08.505Z level=WARN source=sched.go:649 msg="gpu VRAM usage didn't recover within timeout" seconds=5.15983382 model=/root/.ollama/models/blobs/sha256-daec91ffb5dd0c27411bd71f29932917c49cf529a641d0168496c3a501e3062c
time=2025-05-05T13:32:30.320Z level=WARN source=sched.go:649 msg="gpu VRAM usage didn't recover within timeout" seconds=5.276462555 model=/root/.ollama/models/blobs/sha256-daec91ffb5dd0c27411bd71f29932917c49cf529a641d0168496c3a501e3062c
time=2025-05-05T13:50:08.755Z level=WARN source=sched.go:649 msg="gpu VRAM usage didn't recover within timeout" seconds=5.409942632 model=/root/.ollama/models/blobs/sha256-daec91ffb5dd0c27411bd71f29932917c49cf529a641d0168496c3a501e3062c
time=2025-05-05T13:32:30.401Z level=WARN source=ggml.go:152 msg="key not found" key=general.alignment default=32
time=2025-05-05T13:50:08.829Z level=WARN source=ggml.go:152 msg="key not found" key=general.alignment default=32
time=2025-05-05T13:32:30.417Z level=WARN source=ggml.go:152 msg="key not found" key=general.alignment default=32
time=2025-05-05T13:50:08.845Z level=WARN source=ggml.go:152 msg="key not found" key=general.alignment default=32
time=2025-05-05T13:32:30.418Z level=WARN source=ggml.go:152 msg="key not found" key=llama.vision.block_count default=0
time=2025-05-05T13:50:08.845Z level=WARN source=ggml.go:152 msg="key not found" key=llama.vision.block_count default=0
time=2025-05-05T13:32:30.418Z level=WARN source=ggml.go:152 msg="key not found" key=llama.vision.block_count default=0
time=2025-05-05T13:50:08.846Z level=WARN source=ggml.go:152 msg="key not found" key=llama.vision.block_count default=0
time=2025-05-05T13:32:30.418Z level=WARN source=ggml.go:152 msg="key not found" key=llama.vision.block_count default=0
time=2025-05-05T13:50:08.846Z level=INFO source=sched.go:723 msg="new model will fit in available VRAM in single GPU, loading" model=/root/.ollama/models/blobs/sha256-b559938ab7a0392fc9ea9675b82280f2a15669ec3e0e0fc491c9cb0a7681cf94 gpu=GPU-426eaf30-0b2f-f612-17a7-2aa3cc3497fb parallel=1 available=16488660992 required="14.0 GiB"
time=2025-05-05T13:32:30.419Z level=WARN source=ggml.go:152 msg="key not found" key=llama.vision.block_count default=0
time=2025-05-05T13:50:09.005Z level=WARN source=sched.go:649 msg="gpu VRAM usage didn't recover within timeout" seconds=5.65960497 model=/root/.ollama/models/blobs/sha256-daec91ffb5dd0c27411bd71f29932917c49cf529a641d0168496c3a501e3062c
time=2025-05-05T13:32:30.570Z level=WARN source=sched.go:649 msg="gpu VRAM usage didn't recover within timeout" seconds=5.526962138 model=/root/.ollama/models/blobs/sha256-daec91ffb5dd0c27411bd71f29932917c49cf529a641d0168496c3a501e3062c
time=2025-05-05T13:50:09.130Z level=INFO source=server.go:105 msg="system memory" total="62.8 GiB" free="59.7 GiB" free_swap="0 B"
time=2025-05-05T13:32:30.675Z level=INFO source=server.go:105 msg="system memory" total="62.8 GiB" free="59.9 GiB" free_swap="0 B"
time=2025-05-05T13:50:09.130Z level=WARN source=ggml.go:152 msg="key not found" key=llama.vision.block_count default=0
time=2025-05-05T13:32:30.675Z level=WARN source=ggml.go:152 msg="key not found" key=llama.vision.block_count default=0
time=2025-05-05T13:50:09.131Z level=INFO source=server.go:138 msg=offload library=cuda layers.requested=-1 layers.model=41 layers.offload=41 layers.split="" memory.available="[15.4 GiB]" memory.gpu_overhead="0 B" memory.required.full="14.0 GiB" memory.required.partial="14.0 GiB" memory.required.kv="5.0 GiB" memory.required.allocations="[14.0 GiB]" memory.weights.total="6.2 GiB" memory.weights.repeating="5.7 GiB" memory.weights.nonrepeating="525.0 MiB" memory.graph.full="2.1 GiB" memory.graph.partial="2.3 GiB"
time=2025-05-05T13:32:30.677Z level=INFO source=server.go:138 msg=offload library=cuda layers.requested=-1 layers.model=41 layers.offload=31 layers.split="" memory.available="[11.5 GiB]" memory.gpu_overhead="0 B" memory.required.full="14.2 GiB" memory.required.partial="11.3 GiB" memory.required.kv="5.0 GiB" memory.required.allocations="[11.3 GiB]" memory.weights.total="6.2 GiB" memory.weights.repeating="5.7 GiB" memory.weights.nonrepeating="525.0 MiB" memory.graph.full="2.1 GiB" memory.graph.partial="2.3 GiB"
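This pair of lines is the decisive difference of the whole comparison: fully offloading the 12B model needs ~14 GiB, so the 16 GB card loads all 41 layers (40 blocks plus the output layer) while the 12 GB card stops at 31 and leaves the rest on the CPU. A rough reconstruction of the partial figure from the printed components, assuming weights and KV cache scale evenly per block:

GiB, MiB = 1024**3, 1024**2

per_layer = (5.7 + 5.0) * GiB / 40                 # repeating weights + KV over 40 blocks
partial = 31 * per_layer + 525 * MiB + 2.3 * GiB   # + nonrepeating weights + partial graph
print(f"{partial / GiB:.1f} GiB")  # ~11.1 GiB, near the scheduler's 11.3 GiB figure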
llama_model_loader: loaded meta data with 35 key-value pairs and 363 tensors from /root/.ollama/models/blobs/sha256-b559938ab7a0392fc9ea9675b82280f2a15669ec3e0e0fc491c9cb0a7681cf94 (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv 0: general.architecture str = llama
llama_model_loader: - kv 1: general.type str = model
llama_model_loader: - kv 2: general.name str = Mistral Nemo Instruct 2407
llama_model_loader: - kv 3: general.version str = 2407
llama_model_loader: - kv 4: general.finetune str = Instruct
llama_model_loader: - kv 5: general.basename str = Mistral-Nemo
llama_model_loader: - kv 6: general.size_label str = 12B
llama_model_loader: - kv 7: general.license str = apache-2.0
llama_model_loader: - kv 8: general.languages arr[str,9] = ["en", "fr", "de", "es", "it", "pt", ...
llama_model_loader: - kv 9: llama.block_count u32 = 40
llama_model_loader: - kv 10: llama.context_length u32 = 1024000
llama_model_loader: - kv 11: llama.embedding_length u32 = 5120
llama_model_loader: - kv 12: llama.feed_forward_length u32 = 14336
llama_model_loader: - kv 13: llama.attention.head_count u32 = 32
llama_model_loader: - kv 14: llama.attention.head_count_kv u32 = 8
llama_model_loader: - kv 15: llama.rope.freq_base f32 = 1000000.000000
llama_model_loader: - kv 16: llama.attention.layer_norm_rms_epsilon f32 = 0.000010
llama_model_loader: - kv 17: llama.attention.key_length u32 = 128
llama_model_loader: - kv 18: llama.attention.value_length u32 = 128
llama_model_loader: - kv 19: general.file_type u32 = 2
llama_model_loader: - kv 20: llama.vocab_size u32 = 131072
llama_model_loader: - kv 21: llama.rope.dimension_count u32 = 128
llama_model_loader: - kv 22: tokenizer.ggml.add_