llm: fix ollama ps double-counting mmap'd weights on partial offload (#16709)

* llm: fix ollama ps double-counting mmap'd weights on partial offload

  With mmap enabled, llama-server reports each CPU_Mapped model buffer as the file-offset span of its CPU-resident tensors. During partial offload that span covers nearly the whole file because the first and last tensors stay on CPU, so the parsed buffer sizes count the offloaded weights twice and ollama ps shows roughly 2x the real size with a false CPU/GPU split.

  Model weights can never exceed the model file on disk, so trim the excess over the file size from the mmap-backed portion when computing MemorySize. This makes the reported size independent of use_mmap; VRAM accounting and scheduler placement are unchanged.

* llm: exclude repacked model buffers from the mmap overlap trim

  The trim that corrects mmap double-counting computed the overlap from all model buffers, including real copies such as CPU_REPACK. On a CPU-only repacked model that inflated the excess and trimmed the repack out, undercounting by the repack size (llama3.2 reported ~1918 MiB instead of ~3218 MiB).

  Compute the overlap from file-backed buffers only: mmap views and direct device copies, whose spans can overlap the file on partial offload. Repacked or host-pinned CPU copies are separate allocations that never overlap the on-disk weights, so leave them intact.

  Adds a CPU_Mapped + CPU_REPACK regression test and corrects the Metal case to the real total.
2026-07-03 03:38:52 +00:00 · 2026-06-24 19:43:20 +01:00 · 2026-06-24 19:43:20 +01:00 · 0463940334
commit 0463940334
parent 570679c9e0
2 changed files with 209 additions and 14 deletions
--- a/llm/llama_server.go
+++ b/llm/llama_server.go
@ -110,19 +110,21 @@ func boundedNumPredict(numPredict, numCtx int) int {
 // llamaServerRunner wraps an upstream llama-server process and implements the LlamaServer interface.
 // It communicates with llama-server over HTTP.
 type llamaServerRunner struct {
-	port             int
-	cmd              *exec.Cmd
-	done             chan struct{}
-	doneErr          error
-	client           *http.Client
-	memoryMu         sync.RWMutex
-	memTotal         uint64 // actual total buffer size parsed from llama-server logs (bytes)
-	memGPU           uint64 // actual GPU buffer size parsed from llama-server logs (bytes)
-	gpuLayers        uint64 // model layers loaded on GPU, parsed from llama-server logs
-	gpuLayerOverflow int    // number of GPU-selected layers partially overflowed to CPU
-	status           *StatusWriter
-	options          api.Options
-	modelPath        string
+	port               int
+	cmd                *exec.Cmd
+	done               chan struct{}
+	doneErr            error
+	client             *http.Client
+	memoryMu           sync.RWMutex
+	memTotal           uint64 // actual total buffer size parsed from llama-server logs (bytes)
+	memGPU             uint64 // actual GPU buffer size parsed from llama-server logs (bytes)
+	memModelFileBacked uint64 // model weight bytes whose buffers mirror the on-disk file (mmap views + direct device copies); excludes repacked copies like CPU_REPACK
+	memCPUMappedModel  uint64 // model weight bytes in mmap-backed CPU buffers (e.g. CPU_Mapped), parsed from llama-server logs
+	gpuLayers          uint64 // model layers loaded on GPU, parsed from llama-server logs
+	gpuLayerOverflow   int    // number of GPU-selected layers partially overflowed to CPU
+	status             *StatusWriter
+	options            api.Options
+	modelPath          string
 	// mediaMarker must match the LLAMA_MEDIA_MARKER value passed to llama-server.
 	// llama.cpp randomizes this by default; Ollama renders stable [img-N] markers
 	// and rewrites them before forwarding the request.
@ -1030,6 +1032,8 @@ func (s *llamaServerRunner) resetLoadAccounting() {

 	s.memTotal = 0
 	s.memGPU = 0
+	s.memModelFileBacked = 0
+	s.memCPUMappedModel = 0
 	s.gpuLayers = 0
 	s.gpuLayerOverflow = 0
 	for k := range s.vramByDevice {
@ -2519,6 +2523,8 @@ func (s *llamaServerRunner) MemorySize() (total, vram uint64) {
 	s.memoryMu.RLock()
 	memTotal := s.memTotal
 	memGPU := s.memGPU
+	memModelFileBacked := s.memModelFileBacked
+	memCPUMappedModel := s.memCPUMappedModel
 	totalLayers := s.totalLayers
 	gpuLayers := s.gpuLayers
 	gpuLayerOverflow := s.gpuLayerOverflow
@ -2526,6 +2532,19 @@ func (s *llamaServerRunner) MemorySize() (total, vram uint64) {

 	if memTotal > 0 {
 		total, vram = memTotal, memGPU
+		// With mmap, llama-server reports each CPU_Mapped model buffer as the
+		// file-offset span of its CPU-resident tensors. During partial offload
+		// that span covers nearly the whole file (the first and last tensors
+		// stay on CPU), re-counting weights already held in device buffers.
+		// Only buffers that mirror the on-disk file can overlap this way;
+		// repacked copies such as CPU_REPACK are separate real allocations and
+		// must be left intact. Weights cannot exceed the model file on disk, so
+		// trim that overlap from the mmap-backed (reclaimable page cache) portion.
+		if memCPUMappedModel > 0 {
+			if info, err := os.Stat(s.modelPath); err == nil && memModelFileBacked > uint64(info.Size()) {
+				total -= min(memCPUMappedModel, memModelFileBacked-uint64(info.Size()))
+			}
+		}
 		if totalLayers > 0 && gpuLayers >= totalLayers && gpuLayerOverflow == 0 {
 			total = vram
 		}
@ -2684,11 +2703,25 @@ func (w *memoryParsingWriter) Write(b []byte) (int, error) {
 }

 func (w *memoryParsingWriter) updateRunnerMemoryLocked() {
-	var total, gpu uint64
+	var total, gpu, modelFileBacked, cpuMappedModel uint64
 	byDevice := make(map[string]uint64)

 	for key, buffer := range w.buffers {
 		total += buffer.bytes
+		if key.kind == "model" {
+			onGPU := isGPUBuffer(key.backend)
+			mmapBacked := strings.HasSuffix(key.backend, "_Mapped")
+			// Device copies and mmap views mirror the on-disk weights, so their
+			// spans can overlap and double-count on partial offload. Repacked or
+			// host-pinned CPU copies (e.g. CPU_REPACK) are separate real
+			// allocations that never overlap the file, so keep them out of the base.
+			if onGPU || mmapBacked {
+				modelFileBacked += buffer.bytes
+			}
+			if !onGPU && mmapBacked {
+				cpuMappedModel += buffer.bytes
+			}
+		}
 		if isGPUBuffer(key.backend) {
 			gpu += buffer.bytes
 			byDevice[deviceName(key.backend)] += buffer.bytes
@ -2697,6 +2730,8 @@ func (w *memoryParsingWriter) updateRunnerMemoryLocked() {

 	w.runner.memTotal = total
 	w.runner.memGPU = gpu
+	w.runner.memModelFileBacked = modelFileBacked
+	w.runner.memCPUMappedModel = cpuMappedModel
 	w.runner.vramByDevice = byDevice
 }

--- a/llm/llama_server_test.go
+++ b/llm/llama_server_test.go
@ -3051,6 +3051,166 @@ func TestMemoryParsingWriterMemorySizeFullOffload(t *testing.T) {
 	}
 }

+func TestMemoryParsingWriterMemorySizeMmapPartialOffload(t *testing.T) {
+	tests := []struct {
+		name          string
+		fileSizeBytes int64 // sparse model file size; 0 means no model file on disk
+		lines         []string
+		wantTotalMiB  float64
+		wantVRAMMiB   float64
+	}{
+		{
+			// Numbers from https://github.com/ollama/ollama/issues/16637: a
+			// 13.26 GiB MoE GGUF offloaded 48/49 layers with mmap on. The
+			// CPU_Mapped buffer spans nearly the whole file because the first
+			// and last tensors stay on CPU, re-counting the weights already
+			// accounted to the CUDA0 buffer.
+			name:          "CUDA partial offload with mmap",
+			fileSizeBytes: 13578 * 1024 * 1024, // 13.26 GiB
+			lines: []string{
+				"load_tensors: offloaded 48/49 layers to GPU\n",
+				"load_tensors:        CUDA0 model buffer size = 12900.00 MiB\n",
+				"load_tensors:   CPU_Mapped model buffer size = 13260.00 MiB\n",
+				"llama_kv_cache:      CUDA0 KV buffer size =   460.00 MiB\n",
+				"sched_reserve:      CUDA0 compute buffer size =   350.00 MiB\n",
+				"sched_reserve:  CUDA_Host compute buffer size =   270.00 MiB\n",
+			},
+			// Weights counted once (13578) + KV + compute, not ~26.6 GiB.
+			wantTotalMiB: 13578 + 460 + 350 + 270,
+			wantVRAMMiB:  12900 + 460 + 350,
+		},
+		{
+			// Captured from llama-server on Apple Silicon (SmolLM2 360M Q8_0,
+			// 368.50 MiB GGUF, -ngl 20 of 33, mmap on): CPU_Mapped and
+			// MTL0_Mapped each span nearly the whole file.
+			name:          "Metal partial offload with mmap",
+			fileSizeBytes: 386400256, // 368.50 MiB
+			lines: []string{
+				"load_tensors: offloaded 20/33 layers to GPU\n",
+				"load_tensors:   CPU_Mapped model buffer size =   364.31 MiB\n",
+				"load_tensors:   CPU_REPACK model buffer size =   129.49 MiB\n",
+				"load_tensors:  MTL0_Mapped model buffer size =   366.80 MiB\n",
+				"llama_context:        CPU  output buffer size =     0.75 MiB\n",
+				"llama_kv_cache:        CPU KV buffer size =    32.50 MiB\n",
+				"llama_kv_cache:       MTL0 KV buffer size =    47.50 MiB\n",
+				"sched_reserve:       MTL0 compute buffer size =    20.76 MiB\n",
+				"sched_reserve:        CPU compute buffer size =    24.51 MiB\n",
+			},
+			// CPU_Mapped (364.31) and MTL0_Mapped (366.80) both span the file, so
+			// the file-backed overlap is 364.31+366.80-368.50 = 362.61; only that
+			// is trimmed from the reclaimable CPU_Mapped page cache. CPU_REPACK is
+			// a real copy and is kept. Result: file once + REPACK + output + KV +
+			// compute.
+			wantTotalMiB: 368.50 + 129.49 + 0.75 + 32.50 + 47.50 + 20.76 + 24.51,
+			wantVRAMMiB:  366.80 + 47.50 + 20.76,
+		},
+		{
+			// dhiltgen's llama3.2 CPU-only case (PR #16709 review): mmap on,
+			// nothing offloaded. CPU_Mapped equals the file and CPU_REPACK is a
+			// real repacked copy. With no device buffer to overlap, the repack
+			// must not be trimmed: report file + repack, not just the file.
+			name:          "CPU-only mmap with repack is not trimmed",
+			fileSizeBytes: 1919 * 1024 * 1024, // ~1918.35 MiB file, CPU_Mapped span fits within
+			lines: []string{
+				"load_tensors: offloaded 0/29 layers to GPU\n",
+				"load_tensors:   CPU_Mapped model buffer size =  1918.35 MiB\n",
+				"load_tensors:   CPU_REPACK model buffer size =  1299.38 MiB\n",
+				"llama_kv_cache:        CPU KV buffer size =   112.00 MiB\n",
+				"sched_reserve:        CPU compute buffer size =    72.00 MiB\n",
+			},
+			wantTotalMiB: 1918.35 + 1299.38 + 112.00 + 72.00,
+			wantVRAMMiB:  0,
+		},
+		{
+			// use_mmap=false: weights are copied into plain CPU buffers and
+			// the REPACK copy legitimately exceeds the file size. No trim.
+			name:          "no mmap is unchanged",
+			fileSizeBytes: 386404992,
+			lines: []string{
+				"load_tensors: offloaded 20/33 layers to GPU\n",
+				"load_tensors:          CPU model buffer size =   234.82 MiB\n",
+				"load_tensors:   CPU_REPACK model buffer size =   129.49 MiB\n",
+				"load_tensors:         MTL0 model buffer size =   132.00 MiB\n",
+				"llama_kv_cache:       MTL0 KV buffer size =    47.50 MiB\n",
+			},
+			wantTotalMiB: 234.82 + 129.49 + 132.00 + 47.50,
+			wantVRAMMiB:  132.00 + 47.50,
+		},
+		{
+			// Model file size unknown (stat failure): keep parsed sizes as-is.
+			name:          "missing model file is unchanged",
+			fileSizeBytes: 0,
+			lines: []string{
+				"load_tensors: offloaded 48/49 layers to GPU\n",
+				"load_tensors:        CUDA0 model buffer size = 12900.00 MiB\n",
+				"load_tensors:   CPU_Mapped model buffer size = 13260.00 MiB\n",
+			},
+			wantTotalMiB: 12900 + 13260,
+			wantVRAMMiB:  12900,
+		},
+		{
+			// Mapped buffers that fit within the file budget cover disjoint
+			// file ranges: nothing is double-counted, nothing to trim.
+			name:          "mapped buffers within file size are unchanged",
+			fileSizeBytes: 13578 * 1024 * 1024,
+			lines: []string{
+				"load_tensors: offloaded 24/49 layers to GPU\n",
+				"load_tensors:        CUDA0 model buffer size =  6500.00 MiB\n",
+				"load_tensors:   CPU_Mapped model buffer size =  7000.00 MiB\n",
+			},
+			wantTotalMiB: 6500 + 7000,
+			wantVRAMMiB:  6500,
+		},
+	}
+
+	withinKiB := func(got, want uint64) bool {
+		if got > want {
+			return got-want <= 1024
+		}
+		return want-got <= 1024
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			runner := &llamaServerRunner{vramByDevice: make(map[string]uint64)}
+			if tt.fileSizeBytes > 0 {
+				modelPath := filepath.Join(t.TempDir(), "model.gguf")
+				f, err := os.Create(modelPath)
+				if err != nil {
+					t.Fatal(err)
+				}
+				if err := f.Truncate(tt.fileSizeBytes); err != nil {
+					f.Close()
+					t.Fatal(err)
+				}
+				if err := f.Close(); err != nil {
+					t.Fatal(err)
+				}
+				runner.modelPath = modelPath
+			}
+
+			w := &memoryParsingWriter{inner: io.Discard, runner: runner}
+			for _, line := range tt.lines {
+				if _, err := w.Write([]byte(line)); err != nil {
+					t.Fatal(err)
+				}
+			}
+
+			total, vram := runner.MemorySize()
+			wantTotal := uint64(tt.wantTotalMiB * 1024 * 1024)
+			wantVRAM := uint64(tt.wantVRAMMiB * 1024 * 1024)
+			if !withinKiB(total, wantTotal) {
+				t.Errorf("MemorySize total = %d (%.2f MiB), want %d (%.2f MiB)",
+					total, float64(total)/1024/1024, wantTotal, tt.wantTotalMiB)
+			}
+			if !withinKiB(vram, wantVRAM) {
+				t.Errorf("MemorySize vram = %d (%.2f MiB), want %d (%.2f MiB)",
+					vram, float64(vram)/1024/1024, wantVRAM, tt.wantVRAMMiB)
+			}
+		})
+	}
+}
+
 func TestVRAMByGPU(t *testing.T) {
 	runner := &llamaServerRunner{
 		vramByDevice: map[string]uint64{