llm: fix ollama ps double-counting mmap'd weights on partial offload (#16709)

* llm: fix ollama ps double-counting mmap'd weights on partial offload

With mmap enabled, llama-server reports each CPU_Mapped model buffer as the
file-offset span of its CPU-resident tensors. During partial offload that span
covers nearly the whole file because the first and last tensors stay on CPU, so
the parsed buffer sizes count the offloaded weights twice and ollama ps shows
roughly 2x the real size with a false CPU/GPU split. Model weights can never
exceed the model file on disk, so trim the excess over the file size from the
mmap-backed portion when computing MemorySize. This makes the reported size
independent of use_mmap; VRAM accounting and scheduler placement are unchanged.

* llm: exclude repacked model buffers from the mmap overlap trim

The trim that corrects mmap double-counting computed the overlap from all
model buffers, including real copies such as CPU_REPACK. On a CPU-only
repacked model, this inflated the excess and trimmed the repack out,
undercounting by the repack size (llama3.2 reported ~1918 MiB instead of
~3218 MiB).

Compute the overlap from file-backed buffers only: mmap views and direct
device copies, whose spans can overlap the file on partial offload.
Repacked or host-pinned CPU copies are separate allocations that never
overlap the on-disk weights, so leave them intact. Adds a CPU_Mapped +
CPU_REPACK regression test and corrects the Metal case to the real total.
This commit is contained in:
Philip Sinitsin 2026-06-24 19:43:20 +01:00 committed by GitHub
parent 570679c9e0
commit 0463940334
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 209 additions and 14 deletions

View file

@ -110,19 +110,21 @@ func boundedNumPredict(numPredict, numCtx int) int {
// llamaServerRunner wraps an upstream llama-server process and implements the LlamaServer interface.
// It communicates with llama-server over HTTP.
type llamaServerRunner struct {
port int
cmd *exec.Cmd
done chan struct{}
doneErr error
client *http.Client
memoryMu sync.RWMutex
memTotal uint64 // actual total buffer size parsed from llama-server logs (bytes)
memGPU uint64 // actual GPU buffer size parsed from llama-server logs (bytes)
gpuLayers uint64 // model layers loaded on GPU, parsed from llama-server logs
gpuLayerOverflow int // number of GPU-selected layers partially overflowed to CPU
status *StatusWriter
options api.Options
modelPath string
port int
cmd *exec.Cmd
done chan struct{}
doneErr error
client *http.Client
memoryMu sync.RWMutex
memTotal uint64 // actual total buffer size parsed from llama-server logs (bytes)
memGPU uint64 // actual GPU buffer size parsed from llama-server logs (bytes)
memModelFileBacked uint64 // model weight bytes whose buffers mirror the on-disk file (mmap views + direct device copies); excludes repacked copies like CPU_REPACK
memCPUMappedModel uint64 // model weight bytes in mmap-backed CPU buffers (e.g. CPU_Mapped), parsed from llama-server logs
gpuLayers uint64 // model layers loaded on GPU, parsed from llama-server logs
gpuLayerOverflow int // number of GPU-selected layers partially overflowed to CPU
status *StatusWriter
options api.Options
modelPath string
// mediaMarker must match the LLAMA_MEDIA_MARKER value passed to llama-server.
// llama.cpp randomizes this by default; Ollama renders stable [img-N] markers
// and rewrites them before forwarding the request.
@ -1030,6 +1032,8 @@ func (s *llamaServerRunner) resetLoadAccounting() {
s.memTotal = 0
s.memGPU = 0
s.memModelFileBacked = 0
s.memCPUMappedModel = 0
s.gpuLayers = 0
s.gpuLayerOverflow = 0
for k := range s.vramByDevice {
@ -2519,6 +2523,8 @@ func (s *llamaServerRunner) MemorySize() (total, vram uint64) {
s.memoryMu.RLock()
memTotal := s.memTotal
memGPU := s.memGPU
memModelFileBacked := s.memModelFileBacked
memCPUMappedModel := s.memCPUMappedModel
totalLayers := s.totalLayers
gpuLayers := s.gpuLayers
gpuLayerOverflow := s.gpuLayerOverflow
@ -2526,6 +2532,19 @@ func (s *llamaServerRunner) MemorySize() (total, vram uint64) {
if memTotal > 0 {
total, vram = memTotal, memGPU
// With mmap, llama-server reports each CPU_Mapped model buffer as the
// file-offset span of its CPU-resident tensors. During partial offload
// that span covers nearly the whole file (the first and last tensors
// stay on CPU), re-counting weights already held in device buffers.
// Only buffers that mirror the on-disk file can overlap this way;
// repacked copies such as CPU_REPACK are separate real allocations and
// must be left intact. Weights cannot exceed the model file on disk, so
// trim that overlap from the mmap-backed (reclaimable page cache) portion.
if memCPUMappedModel > 0 {
if info, err := os.Stat(s.modelPath); err == nil && memModelFileBacked > uint64(info.Size()) {
total -= min(memCPUMappedModel, memModelFileBacked-uint64(info.Size()))
}
}
if totalLayers > 0 && gpuLayers >= totalLayers && gpuLayerOverflow == 0 {
total = vram
}
@ -2684,11 +2703,25 @@ func (w *memoryParsingWriter) Write(b []byte) (int, error) {
}
func (w *memoryParsingWriter) updateRunnerMemoryLocked() {
var total, gpu uint64
var total, gpu, modelFileBacked, cpuMappedModel uint64
byDevice := make(map[string]uint64)
for key, buffer := range w.buffers {
total += buffer.bytes
if key.kind == "model" {
onGPU := isGPUBuffer(key.backend)
mmapBacked := strings.HasSuffix(key.backend, "_Mapped")
// Device copies and mmap views mirror the on-disk weights, so their
// spans can overlap and double-count on partial offload. Repacked or
// host-pinned CPU copies (e.g. CPU_REPACK) are separate real
// allocations that never overlap the file, so keep them out of the base.
if onGPU || mmapBacked {
modelFileBacked += buffer.bytes
}
if !onGPU && mmapBacked {
cpuMappedModel += buffer.bytes
}
}
if isGPUBuffer(key.backend) {
gpu += buffer.bytes
byDevice[deviceName(key.backend)] += buffer.bytes
@ -2697,6 +2730,8 @@ func (w *memoryParsingWriter) updateRunnerMemoryLocked() {
w.runner.memTotal = total
w.runner.memGPU = gpu
w.runner.memModelFileBacked = modelFileBacked
w.runner.memCPUMappedModel = cpuMappedModel
w.runner.vramByDevice = byDevice
}

View file

@ -3051,6 +3051,166 @@ func TestMemoryParsingWriterMemorySizeFullOffload(t *testing.T) {
}
}
// TestMemoryParsingWriterMemorySizeMmapPartialOffload replays llama-server load
// log lines through a memoryParsingWriter and checks what the runner's
// MemorySize reports. Each case optionally creates a sparse model file of a
// known size so the file-size based trim of mmap-backed buffers can be
// exercised (or shown not to apply).
func TestMemoryParsingWriterMemorySizeMmapPartialOffload(t *testing.T) {
	type offloadCase struct {
		name          string
		fileSizeBytes int64 // sparse model file size; 0 means no model file on disk
		lines         []string
		wantTotalMiB  float64
		wantVRAMMiB   float64
	}

	cases := []offloadCase{
		{
			// Figures taken from https://github.com/ollama/ollama/issues/16637:
			// a 13.26 GiB MoE GGUF with 48/49 layers offloaded and mmap on.
			// Because the first and last tensors stay on CPU, the CPU_Mapped
			// buffer spans almost the entire file and re-counts weights that
			// the CUDA0 buffer already accounts for.
			name:          "CUDA partial offload with mmap",
			fileSizeBytes: 13578 * 1024 * 1024, // 13.26 GiB
			lines: []string{
				"load_tensors: offloaded 48/49 layers to GPU\n",
				"load_tensors: CUDA0 model buffer size = 12900.00 MiB\n",
				"load_tensors: CPU_Mapped model buffer size = 13260.00 MiB\n",
				"llama_kv_cache: CUDA0 KV buffer size = 460.00 MiB\n",
				"sched_reserve: CUDA0 compute buffer size = 350.00 MiB\n",
				"sched_reserve: CUDA_Host compute buffer size = 270.00 MiB\n",
			},
			// Expect the weights once (13578) plus KV and compute, rather
			// than roughly 26.6 GiB.
			wantTotalMiB: 13578 + 460 + 350 + 270,
			wantVRAMMiB:  12900 + 460 + 350,
		},
		{
			// Log captured from llama-server on Apple Silicon (SmolLM2 360M
			// Q8_0, 368.50 MiB GGUF, -ngl 20 of 33, mmap on). Both CPU_Mapped
			// and MTL0_Mapped span nearly the whole file.
			name:          "Metal partial offload with mmap",
			fileSizeBytes: 386400256, // 368.50 MiB
			lines: []string{
				"load_tensors: offloaded 20/33 layers to GPU\n",
				"load_tensors: CPU_Mapped model buffer size = 364.31 MiB\n",
				"load_tensors: CPU_REPACK model buffer size = 129.49 MiB\n",
				"load_tensors: MTL0_Mapped model buffer size = 366.80 MiB\n",
				"llama_context: CPU output buffer size = 0.75 MiB\n",
				"llama_kv_cache: CPU KV buffer size = 32.50 MiB\n",
				"llama_kv_cache: MTL0 KV buffer size = 47.50 MiB\n",
				"sched_reserve: MTL0 compute buffer size = 20.76 MiB\n",
				"sched_reserve: CPU compute buffer size = 24.51 MiB\n",
			},
			// The file-backed overlap is 364.31+366.80-368.50 = 362.61 and
			// only that amount comes off the reclaimable CPU_Mapped page
			// cache. CPU_REPACK is a real copy and stays. Expected result:
			// file once + REPACK + output + KV + compute.
			wantTotalMiB: 368.50 + 129.49 + 0.75 + 32.50 + 47.50 + 20.76 + 24.51,
			wantVRAMMiB:  366.80 + 47.50 + 20.76,
		},
		{
			// dhiltgen's llama3.2 CPU-only case (PR #16709 review): mmap on
			// and nothing offloaded. CPU_Mapped matches the file and
			// CPU_REPACK is a real repacked copy. There is no device buffer
			// to overlap, so the repack must survive: file + repack, not
			// just the file.
			name:          "CPU-only mmap with repack is not trimmed",
			fileSizeBytes: 1919 * 1024 * 1024, // 1919 MiB file, so the 1918.35 MiB CPU_Mapped span fits within it
			lines: []string{
				"load_tensors: offloaded 0/29 layers to GPU\n",
				"load_tensors: CPU_Mapped model buffer size = 1918.35 MiB\n",
				"load_tensors: CPU_REPACK model buffer size = 1299.38 MiB\n",
				"llama_kv_cache: CPU KV buffer size = 112.00 MiB\n",
				"sched_reserve: CPU compute buffer size = 72.00 MiB\n",
			},
			wantTotalMiB: 1918.35 + 1299.38 + 112.00 + 72.00,
			wantVRAMMiB:  0,
		},
		{
			// use_mmap=false: weights are copied into plain CPU buffers, and
			// the REPACK copy may legitimately push the sum past the file
			// size. Nothing should be trimmed.
			name:          "no mmap is unchanged",
			fileSizeBytes: 386404992,
			lines: []string{
				"load_tensors: offloaded 20/33 layers to GPU\n",
				"load_tensors: CPU model buffer size = 234.82 MiB\n",
				"load_tensors: CPU_REPACK model buffer size = 129.49 MiB\n",
				"load_tensors: MTL0 model buffer size = 132.00 MiB\n",
				"llama_kv_cache: MTL0 KV buffer size = 47.50 MiB\n",
			},
			wantTotalMiB: 234.82 + 129.49 + 132.00 + 47.50,
			wantVRAMMiB:  132.00 + 47.50,
		},
		{
			// No model file on disk, so the size lookup fails and the parsed
			// sizes must be reported untouched.
			name:          "missing model file is unchanged",
			fileSizeBytes: 0,
			lines: []string{
				"load_tensors: offloaded 48/49 layers to GPU\n",
				"load_tensors: CUDA0 model buffer size = 12900.00 MiB\n",
				"load_tensors: CPU_Mapped model buffer size = 13260.00 MiB\n",
			},
			wantTotalMiB: 12900 + 13260,
			wantVRAMMiB:  12900,
		},
		{
			// Mapped buffers whose sum stays within the file size cover
			// disjoint file ranges, so there is no double-count to remove.
			name:          "mapped buffers within file size are unchanged",
			fileSizeBytes: 13578 * 1024 * 1024,
			lines: []string{
				"load_tensors: offloaded 24/49 layers to GPU\n",
				"load_tensors: CUDA0 model buffer size = 6500.00 MiB\n",
				"load_tensors: CPU_Mapped model buffer size = 7000.00 MiB\n",
			},
			wantTotalMiB: 6500 + 7000,
			wantVRAMMiB:  6500,
		},
	}

	// closeTo reports whether two byte counts differ by at most 1024 bytes,
	// absorbing rounding from the MiB figures in the log lines.
	closeTo := func(got, want uint64) bool {
		lo, hi := got, want
		if lo > hi {
			lo, hi = hi, lo
		}
		return hi-lo <= 1024
	}

	// sparseModel creates model.gguf in a per-test temp dir, extends it to
	// size bytes via Truncate, and returns its path.
	sparseModel := func(t *testing.T, size int64) string {
		path := filepath.Join(t.TempDir(), "model.gguf")
		file, err := os.Create(path)
		if err != nil {
			t.Fatal(err)
		}
		if err := file.Truncate(size); err != nil {
			file.Close()
			t.Fatal(err)
		}
		if err := file.Close(); err != nil {
			t.Fatal(err)
		}
		return path
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			runner := &llamaServerRunner{vramByDevice: make(map[string]uint64)}
			if tc.fileSizeBytes > 0 {
				runner.modelPath = sparseModel(t, tc.fileSizeBytes)
			}

			parser := &memoryParsingWriter{inner: io.Discard, runner: runner}
			for i := range tc.lines {
				if _, err := parser.Write([]byte(tc.lines[i])); err != nil {
					t.Fatal(err)
				}
			}

			gotTotal, gotVRAM := runner.MemorySize()
			expectTotal := uint64(tc.wantTotalMiB * 1024 * 1024)
			expectVRAM := uint64(tc.wantVRAMMiB * 1024 * 1024)

			if !closeTo(gotTotal, expectTotal) {
				t.Errorf("MemorySize total = %d (%.2f MiB), want %d (%.2f MiB)",
					gotTotal, float64(gotTotal)/1024/1024, expectTotal, tc.wantTotalMiB)
			}
			if !closeTo(gotVRAM, expectVRAM) {
				t.Errorf("MemorySize vram = %d (%.2f MiB), want %d (%.2f MiB)",
					gotVRAM, float64(gotVRAM)/1024/1024, expectVRAM, tc.wantVRAMMiB)
			}
		})
	}
}
func TestVRAMByGPU(t *testing.T) {
runner := &llamaServerRunner{
vramByDevice: map[string]uint64{