Commit 948f2c6

Merge branch 'docker:main' into config-autodiscovery
2 parents 2b6b2bd + 6fbd6b1 commit 948f2c6

8 files changed

Lines changed: 1509 additions & 224 deletions

CHANGELOG.md

Lines changed: 27 additions & 0 deletions
@@ -3,6 +3,31 @@
 All notable changes to this project will be documented in this file.
 
 
+## [v1.48.0] - 2026-04-20
+
+This release adds working directory configuration for MCP and LSP toolsets and improves toolset reliability with better retry handling.
+
+## What's New
+- Adds optional `working_dir` field to MCP and LSP toolset configurations to launch processes from a specific directory
+
+## Bug Fixes
+- Fixes retry behavior for MCP toolsets after tool calls within the same turn
+- Stops retrying SQLITE_CANTOPEN (14) errors that cannot be resolved
+- Fixes filepath handling to satisfy gocritic filepathJoin lint rule
+- Returns explicit error when ref-based MCP resolves to remote server with working_dir
+
+## Technical Changes
+- Documents working_dir field for MCP and LSP toolsets in configuration
+
+### Pull Requests
+
+- [#2457](https://github.com/docker/docker-agent/pull/2457) - fix(#2457): retry MCP toolsets after tool calls within the same turn
+- [#2458](https://github.com/docker/docker-agent/pull/2458) - fix: retry LSP/MCP toolsets after tool calls, covering env-wrapped commands (fixes #2457)
+- [#2460](https://github.com/docker/docker-agent/pull/2460) - feat: add optional working_dir to MCP and LSP toolset configs
+- [#2466](https://github.com/docker/docker-agent/pull/2466) - Don't retry SQLITE_CANTOPEN (14) errors
+- [#2468](https://github.com/docker/docker-agent/pull/2468) - docs: update CHANGELOG.md for v1.47.0
+
+
 ## [v1.47.0] - 2026-04-20
 
 This release fixes several issues with AI model interactions, including title generation failures with reasoning models and shell command hangs.
@@ -2038,3 +2063,5 @@ This release improves the terminal user interface with better error handling and
 [v1.46.0]: https://github.com/docker/docker-agent/releases/tag/v1.46.0
 
 [v1.47.0]: https://github.com/docker/docker-agent/releases/tag/v1.47.0
+
+[v1.48.0]: https://github.com/docker/docker-agent/releases/tag/v1.48.0
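
For context on the `working_dir` entries above: a minimal sketch of where the field sits in a toolset config. Only the field name and purpose come from this changelog; the toolset layout, command, and paths are illustrative assumptions.

```yaml
# Hypothetical cagent toolset config; only `working_dir` itself is new in v1.48.0.
toolsets:
  - type: mcp
    command: "npx"                    # assumed MCP server launcher
    args: ["-y", "example-mcp-server"]
    working_dir: "/workspace/project" # launch the MCP process from this directory
```

Per the bug-fix entry above, combining `working_dir` with a ref-based MCP entry that resolves to a remote server now returns an explicit error rather than being silently ignored.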

cmd/root/otel.go

Lines changed: 13 additions & 9 deletions
@@ -12,21 +12,14 @@ import (
 	"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp"
 	"go.opentelemetry.io/otel/sdk/resource"
 	"go.opentelemetry.io/otel/sdk/trace"
-	semconv "go.opentelemetry.io/otel/semconv/v1.37.0"
+	semconv "go.opentelemetry.io/otel/semconv/v1.40.0"
 )
 
 const AppName = "cagent"
 
 // initOTelSDK initializes OpenTelemetry SDK with OTLP exporter
 func initOTelSDK(ctx context.Context) (err error) {
-	res, err := resource.Merge(
-		resource.Default(),
-		resource.NewWithAttributes(
-			semconv.SchemaURL,
-			semconv.ServiceName(AppName),
-			semconv.ServiceVersion("dev"), // TODO: use actual version
-		),
-	)
+	res, err := newOTelResource()
 	if err != nil {
 		return fmt.Errorf("failed to create resource: %w", err)
 	}
@@ -75,6 +68,17 @@ func initOTelSDK(ctx context.Context) (err error) {
 	return nil
 }
 
+func newOTelResource() (*resource.Resource, error) {
+	return resource.Merge(
+		resource.Default(),
+		resource.NewWithAttributes(
+			semconv.SchemaURL,
+			semconv.ServiceName(AppName),
+			semconv.ServiceVersion("dev"), // TODO: use actual version
+		),
+	)
+}
+
 // isLocalhostEndpoint reports whether the given endpoint refers to a
 // loopback address so that we can safely skip TLS.
 func isLocalhostEndpoint(endpoint string) bool {
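
The semconv bump in this file is not cosmetic: `resource.Merge` refuses to combine resources whose schema URLs are non-empty and different, so the imported `semconv` version has to track the one `resource.Default()` was built against. A standalone sketch of the failure mode, assuming a recent OTel Go SDK (the error value and merge semantics belong to the SDK and may differ across versions):

```go
package main

import (
	"errors"
	"fmt"

	"go.opentelemetry.io/otel/sdk/resource"
	// Must match the semconv version used by resource.Default();
	// importing v1.37.0 here would likely reproduce the conflict.
	semconv "go.opentelemetry.io/otel/semconv/v1.40.0"
)

func main() {
	res, err := resource.Merge(
		resource.Default(),
		resource.NewWithAttributes(semconv.SchemaURL, semconv.ServiceName("cagent")),
	)
	if errors.Is(err, resource.ErrSchemaURLConflict) {
		// Raised when the two schema URLs are non-empty and different.
		fmt.Println("semconv import drifted from the SDK default:", err)
		return
	}
	fmt.Println(res.SchemaURL())
}
```

The test added in the next file guards exactly this drift by asserting the merged resource keeps the expected schema URL.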

cmd/root/otel_test.go

Lines changed: 10 additions & 0 deletions
@@ -4,8 +4,18 @@ import (
 	"testing"
 
 	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+	semconv "go.opentelemetry.io/otel/semconv/v1.40.0"
 )
 
+func TestNewOTelResourceUsesCurrentSchemaURL(t *testing.T) {
+	t.Parallel()
+
+	res, err := newOTelResource()
+	require.NoError(t, err)
+	assert.Equal(t, semconv.SchemaURL, res.SchemaURL())
+}
+
 func TestIsLocalhostEndpoint(t *testing.T) {
 	t.Parallel()
 
docs/providers/dmr/index.md

Lines changed: 94 additions & 12 deletions
@@ -64,29 +64,111 @@ models:
     model: ai/qwen3
     max_tokens: 8192
     provider_opts:
-      runtime_flags: ["--ngl=33", "--top-p=0.9"]
+      runtime_flags: ["--threads", "8"]
 ```
 
 Runtime flags also accept a single string:
 
 ```yaml
 provider_opts:
-  runtime_flags: "--ngl=33 --top-p=0.9"
+  runtime_flags: "--threads 8"
 ```
 
-## Parameter Mapping
+Use only flags your Model Runner backend allows (see `docker model configure --help` and backend docs). **Do not** put sampling parameters (`temperature`, `top_p`, penalties) in `runtime_flags` — set them on the model (`temperature`, `top_p`, etc.); they are sent **per request** via the OpenAI-compatible chat API.
 
-docker-agent model config fields map to llama.cpp flags automatically:
+## Context size
 
-| Config              | llama.cpp Flag        |
-| ------------------- | --------------------- |
-| `temperature`       | `--temp`              |
-| `top_p`             | `--top-p`             |
-| `frequency_penalty` | `--frequency-penalty` |
-| `presence_penalty`  | `--presence-penalty`  |
-| `max_tokens`        | `--context-size`      |
+`max_tokens` controls the **maximum output tokens** per chat completion request. To set the engine's **total context window**, use `provider_opts.context_size`:
 
-`runtime_flags` always take priority over derived flags on conflict.
+```yaml
+models:
+  local:
+    provider: dmr
+    model: ai/qwen3
+    max_tokens: 4096       # max output tokens (per-request)
+    provider_opts:
+      context_size: 32768  # total context window (sent via _configure)
+```
+
+If `context_size` is omitted, Model Runner uses its default. `max_tokens` is **not** used as the context window.
+
+## Thinking / reasoning budget
+
+When using the **llama.cpp** backend, `thinking_budget` is sent as structured `llamacpp.reasoning-budget` on `_configure` (maps to `--reasoning-budget`). String efforts use the same token mapping as other providers; `adaptive` maps to unlimited (`-1`).
+
+When using the **vLLM** backend, `thinking_budget` is sent as `thinking_token_budget` in each chat completion request. Effort levels map to token counts using the same scale as other providers; `adaptive` maps to unlimited (`-1`).
+
+```yaml
+models:
+  local:
+    provider: dmr
+    model: ai/qwen3
+    thinking_budget: medium  # llama.cpp: reasoning-budget=8192; vLLM: thinking_token_budget=8192
+```
+
+On **MLX** and **SGLang** backends, `thinking_budget` is silently ignored — those engines do not currently expose a per-request reasoning token budget knob.
+
+## vLLM-specific configuration
+
+When running a model on the **vLLM** backend, additional engine-level settings can be passed via `provider_opts` and are forwarded to model-runner's `_configure` endpoint:
+
+- `gpu_memory_utilization` — fraction of GPU memory (0.0–1.0) vLLM may use. Values outside this range are rejected.
+- `hf_overrides` — map of Hugging Face config overrides applied when vLLM loads the model.
+
+```yaml
+models:
+  vllm-local:
+    provider: dmr
+    model: ai/some-model-safetensors
+    provider_opts:
+      gpu_memory_utilization: 0.9
+      hf_overrides:
+        max_model_len: 8192
+        dtype: bfloat16
+```
+
+`hf_overrides` keys (including nested ones) must match `^[a-zA-Z_][a-zA-Z0-9_]*$` — the same rule model-runner enforces server-side to block injection via flags. Invalid keys are rejected at client creation time so you fail fast instead of after a round-trip.
+
+These options are ignored on non-vLLM backends.
+
+## Keeping models resident in memory (`keep_alive`)
+
+By default model-runner unloads idle models after a few minutes. Override the idle timeout via `provider_opts.keep_alive`:
+
+```yaml
+models:
+  sticky:
+    provider: dmr
+    model: ai/qwen3
+    provider_opts:
+      keep_alive: "30m"   # duration string
+      # keep_alive: "0"   # unload immediately after each request
+      # keep_alive: "-1"  # keep loaded forever
+```
+
+Accepted values: any Go duration string (`"30s"`, `"5m"`, `"1h"`, `"2h30m"`), `"0"` (immediate unload), or `"-1"` (never unload). Invalid values are rejected before the configure request is sent.
+
+## Operating mode (`mode`)
+
+Model-runner normally infers the backend mode from the request path. You can pin it explicitly via `provider_opts.mode`:
+
+```yaml
+provider_opts:
+  mode: embedding  # one of: completion, embedding, reranking, image-generation
+```
+
+Most agents don't need this — leave it unset unless you know you need it.
+
+## Raw runtime flags (`raw_runtime_flags`)
+
+`runtime_flags` (a list) is the preferred way to pass flags. If you have a pre-built command-line string you'd rather ship verbatim, use `raw_runtime_flags` instead:
+
+```yaml
+provider_opts:
+  raw_runtime_flags: "--threads 8 --batch-size 512"
+```
+
+Model-runner parses the string with shell-style word splitting. `runtime_flags` and `raw_runtime_flags` are mutually exclusive — setting both is an error.
 
 ## Speculative Decoding
 
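
The `hf_overrides` key rule quoted above is simple enough to replicate client-side. A minimal sketch of that validation; the regex is the one from the docs, while the function name and recursive walk are assumptions rather than this repo's actual implementation:

```go
package main

import (
	"fmt"
	"regexp"
)

// identRe mirrors the documented rule: every hf_overrides key, nested or
// not, must look like a plain identifier so it cannot smuggle flag syntax.
var identRe = regexp.MustCompile(`^[a-zA-Z_][a-zA-Z0-9_]*$`)

// validateHFOverrideKeys walks the override map, recursing into nested
// maps, and reports the first key that fails the identifier rule.
func validateHFOverrideKeys(overrides map[string]any) error {
	for k, v := range overrides {
		if !identRe.MatchString(k) {
			return fmt.Errorf("invalid hf_overrides key %q", k)
		}
		if nested, ok := v.(map[string]any); ok {
			if err := validateHFOverrideKeys(nested); err != nil {
				return err
			}
		}
	}
	return nil
}

func main() {
	overrides := map[string]any{
		"max_model_len": 8192,
		"rope_scaling":  map[string]any{"rope type": "linear"}, // space in nested key -> rejected
	}
	fmt.Println(validateHFOverrideKeys(overrides)) // invalid hf_overrides key "rope type"
}
```

Failing at client creation time, as the docs describe, means a typo surfaces before any `_configure` round-trip.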

pkg/model/provider/dmr/client.go

Lines changed: 59 additions & 9 deletions
@@ -7,6 +7,7 @@ import (
 	"errors"
 	"fmt"
 	"log/slog"
+	"maps"
 	"net/http"
 	"os"
 	"time"
@@ -54,6 +55,7 @@ type Client struct {
 	client     openai.Client
 	baseURL    string
 	httpClient *http.Client
+	engine     string
 }
 
 // NewClient creates a new DMR client from the provided configuration
@@ -103,18 +105,28 @@ func NewClient(ctx context.Context, cfg *latest.ModelConfig, opts ...options.Opt
 
 	clientOptions = append(clientOptions, option.WithBaseURL(baseURL), option.WithAPIKey("")) // DMR doesn't need auth
 
-	// Build runtime flags from ModelConfig and engine
-	contextSize, providerRuntimeFlags, specOpts := parseDMRProviderOpts(cfg)
-	configFlags := buildRuntimeFlagsFromModelConfig(engine, cfg)
-	finalFlags, warnings := mergeRuntimeFlagsPreferUser(configFlags, providerRuntimeFlags)
-	for _, w := range warnings {
-		slog.Warn(w)
+	parsed, err := parseDMRProviderOpts(engine, cfg)
+	if err != nil {
+		slog.Error("DMR provider_opts invalid", "error", err, "model", cfg.Model)
+		return nil, err
 	}
-	slog.Debug("DMR provider_opts parsed", "model", cfg.Model, "context_size", contextSize, "runtime_flags", finalFlags, "speculative_opts", specOpts, "engine", engine)
+	backendCfg := buildConfigureBackendConfig(parsed.contextSize, parsed.runtimeFlags, parsed.specOpts, parsed.llamaCpp, parsed.vllm, parsed.keepAlive)
+	slog.Debug("DMR provider_opts parsed",
+		"model", cfg.Model,
+		"engine", engine,
+		"context_size", derefInt64(parsed.contextSize),
+		"runtime_flags", parsed.runtimeFlags,
+		"raw_runtime_flags", parsed.rawRuntimeFlags,
+		"mode", derefString(parsed.mode),
+		"keep_alive", derefString(parsed.keepAlive),
+		"speculative_opts", parsed.specOpts,
+		"llamacpp", parsed.llamaCpp,
+		"vllm", parsed.vllm,
+	)
 	// Skip model configuration when generating titles to avoid reconfiguring the model
 	// with different settings (e.g., smaller max_tokens) that would affect the main agent.
 	if !globalOptions.GeneratingTitle() {
-		if err := configureModel(ctx, httpClient, baseURL, cfg.Model, contextSize, finalFlags, specOpts); err != nil {
+		if err := configureModel(ctx, httpClient, baseURL, cfg.Model, backendCfg, parsed.mode, parsed.rawRuntimeFlags); err != nil {
 			slog.Debug("model configure via API skipped or failed", "error", err)
 		}
 	}
@@ -129,6 +141,7 @@ func NewClient(ctx context.Context, cfg *latest.ModelConfig, opts ...options.Opt
 		client:     openai.NewClient(clientOptions...),
 		baseURL:    baseURL,
 		httpClient: httpClient,
+		engine:     engine,
 	}, nil
 }
 
@@ -214,6 +227,43 @@ func (c *Client) CreateChatCompletionStream(ctx context.Context, messages []chat
 		}
 	}
 
+	// Collect per-request extra JSON fields. SetExtraFields replaces the map
+	// wholesale, so merge all contributors before a single Set call.
+	extraFields := map[string]any{}
+
+	// NoThinking: disable reasoning at the chat-template level. llama.cpp and
+	// vLLM both honor chat_template_kwargs.enable_thinking=false for Qwen3 /
+	// Hermes / DeepSeek-R1 style templates; other engines ignore unknown keys.
+	//
+	// When the caller has also set a small MaxTokens (e.g. session title
+	// generation sets max_tokens=20), raise it to noThinkingMinOutputTokens
+	// so any residual reasoning tokens the engine/template still emits can't
+	// starve the visible output. The nil-guard is intentional: if MaxTokens
+	// is unset the caller has imposed no cap, so there is nothing to floor
+	// and we leave max_tokens off the request (letting the engine use its
+	// own output budget). Mirrors the OpenAI provider (see
+	// pkg/model/provider/openai/client.go).
+	if c.ModelOptions.NoThinking() {
+		extraFields["chat_template_kwargs"] = map[string]any{"enable_thinking": false}
+		if c.ModelConfig.MaxTokens != nil && *c.ModelConfig.MaxTokens < noThinkingMinOutputTokens {
+			params.MaxTokens = openai.Int(noThinkingMinOutputTokens)
+			slog.Debug("DMR NoThinking: bumped max_tokens floor",
+				"from", *c.ModelConfig.MaxTokens, "to", noThinkingMinOutputTokens)
+		}
+	}
+
+	// vLLM-specific per-request fields (e.g. thinking_token_budget).
+	if c.engine == engineVLLM {
+		if fields := buildVLLMRequestFields(&c.ModelConfig); fields != nil {
+			maps.Copy(extraFields, fields)
+		}
+	}
+
+	if len(extraFields) > 0 {
+		params.SetExtraFields(extraFields)
+		slog.Debug("DMR extra request fields applied", "fields", extraFields)
+	}
+
 	// Log the request in JSON format for debugging
 	if requestJSON, err := json.Marshal(params); err == nil {
 		slog.Debug("DMR chat completion request", "request", string(requestJSON))
@@ -222,7 +272,7 @@ func (c *Client) CreateChatCompletionStream(ctx context.Context, messages []chat
 	}
 
 	if structuredOutput := c.ModelOptions.StructuredOutput(); structuredOutput != nil {
-		slog.Debug("Adding structured output to DMR request", "structured_output", structuredOutput)
+		slog.Debug("Adding structured output to DMR request", "name", structuredOutput.Name, "strict", structuredOutput.Strict)
 
 		params.ResponseFormat.OfJSONSchema = &openai.ResponseFormatJSONSchemaParam{
 			JSONSchema: openai.ResponseFormatJSONSchemaJSONSchemaParam{
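
The `slog.Debug` call in `NewClient` above references `derefInt64` and `derefString`, whose definitions fall outside this diff. A plausible shape, assumed here only so the hunk reads standalone; the commit's real definitions may differ:

```go
// Assumed logging helpers: turn optional config values into zero-valued
// scalars so slog prints something readable instead of a pointer address.
func derefInt64(p *int64) int64 {
	if p == nil {
		return 0
	}
	return *p
}

func derefString(p *string) string {
	if p == nil {
		return ""
	}
	return *p
}
```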

0 commit comments