|
| 1 | +package runtime |
| 2 | + |
| 3 | +import ( |
| 4 | + "context" |
| 5 | + "log/slog" |
| 6 | + "slices" |
| 7 | + |
| 8 | + "github.com/docker/docker-agent/pkg/chat" |
| 9 | + "github.com/docker/docker-agent/pkg/hooks" |
| 10 | +) |
| 11 | + |
const (
	// BuiltinStripUnsupportedModalities names the runtime-shipped
	// before_llm_call message transform that removes image content from
	// outgoing messages when the agent's current model does not list
	// image among its input modalities. Like [BuiltinCacheResponse]
	// (a stop hook), it ships with the runtime; the name is mainly
	// useful for log filtering and diagnostics.
	//
	// Text-only models reject image input with hard provider errors
	// (HTTP 400 from OpenAI, "image input is not supported" from
	// Anthropic text variants, etc.). Registering the strip as a
	// transform replaces an inline branch in runStreamLoop and leaves
	// room for further message-mutating transforms (redactors,
	// scrubbers, ...).
	BuiltinStripUnsupportedModalities = "strip_unsupported_modalities"

	// modalityImage is the canonical models.dev modality name for image
	// input. Using a named constant rather than a string literal turns a
	// misspelling into a compile error and makes the contract with
	// [modelsdev.Modalities.Input] visible from the runtime side.
	modalityImage = "image"
)
| 32 | + |
| 33 | +// stripUnsupportedModalitiesTransform is the [MessageTransform] |
| 34 | +// registered under [BuiltinStripUnsupportedModalities]. It looks up |
| 35 | +// the model definition from [hooks.Input.ModelID] (populated by the |
| 36 | +// runtime with the actual model the loop chose, including per-tool |
| 37 | +// overrides and alloy-mode selection) and applies |
| 38 | +// [stripImageContent] when image is missing from the model's input |
| 39 | +// modalities. |
| 40 | +// |
| 41 | +// The transform is a no-op for every "we don't know enough to act" |
| 42 | +// case (missing ModelID, models.dev miss, empty modalities, image |
| 43 | +// already supported): erring on the side of "send the messages |
| 44 | +// as-is" matches the previous inline behavior in runStreamLoop, |
| 45 | +// where an unknown model also fell through. Each fall-through emits |
| 46 | +// a Debug log so operators can tell strip_unsupported_modalities |
| 47 | +// from a transform that's silently inactive. |
| 48 | +func (r *LocalRuntime) stripUnsupportedModalitiesTransform( |
| 49 | + ctx context.Context, |
| 50 | + in *hooks.Input, |
| 51 | + msgs []chat.Message, |
| 52 | +) ([]chat.Message, error) { |
| 53 | + if in == nil || in.ModelID == "" { |
| 54 | + slog.Debug("strip_unsupported_modalities: skipping, no ModelID on input") |
| 55 | + return msgs, nil |
| 56 | + } |
| 57 | + m, err := r.modelsStore.GetModel(ctx, in.ModelID) |
| 58 | + if err != nil || m == nil { |
| 59 | + // Unknown model: keep the previous (inline) behavior of |
| 60 | + // passing messages through untouched. The model call will |
| 61 | + // surface any modality mismatch as a provider error. |
| 62 | + slog.Debug("strip_unsupported_modalities: skipping, model definition unavailable", |
| 63 | + "model_id", in.ModelID, "error", err) |
| 64 | + return msgs, nil |
| 65 | + } |
| 66 | + if len(m.Modalities.Input) == 0 || slices.Contains(m.Modalities.Input, modalityImage) { |
| 67 | + return msgs, nil |
| 68 | + } |
| 69 | + return stripImageContent(msgs), nil |
| 70 | +} |
| 71 | + |
| 72 | +// stripImageContent returns a copy of messages with all image-related |
| 73 | +// content removed. Text content is preserved; image parts in |
| 74 | +// [chat.Message.MultiContent] are filtered out, and file attachments |
| 75 | +// with image MIME types are dropped. |
| 76 | +// |
| 77 | +// Lives next to [stripUnsupportedModalitiesTransform] (rather than in |
| 78 | +// streaming.go where it originated) so the builtin's storage, |
| 79 | +// transform, and helper are co-located. Kept as an unexported helper |
| 80 | +// because the only legitimate caller is the transform itself — direct |
| 81 | +// use bypasses the modality check. |
| 82 | +func stripImageContent(messages []chat.Message) []chat.Message { |
| 83 | + result := make([]chat.Message, len(messages)) |
| 84 | + for i, msg := range messages { |
| 85 | + result[i] = msg |
| 86 | + |
| 87 | + if len(msg.MultiContent) == 0 { |
| 88 | + continue |
| 89 | + } |
| 90 | + |
| 91 | + var filtered []chat.MessagePart |
| 92 | + for _, part := range msg.MultiContent { |
| 93 | + switch part.Type { |
| 94 | + case chat.MessagePartTypeImageURL: |
| 95 | + // Drop image URL parts entirely. |
| 96 | + continue |
| 97 | + case chat.MessagePartTypeFile: |
| 98 | + // Drop file parts that are images. |
| 99 | + if part.File != nil && chat.IsImageMimeType(part.File.MimeType) { |
| 100 | + continue |
| 101 | + } |
| 102 | + } |
| 103 | + filtered = append(filtered, part) |
| 104 | + } |
| 105 | + |
| 106 | + if len(filtered) != len(msg.MultiContent) { |
| 107 | + result[i].MultiContent = filtered |
| 108 | + slog.Debug("Stripped image content from message", |
| 109 | + "role", msg.Role, |
| 110 | + "original_parts", len(msg.MultiContent), |
| 111 | + "remaining_parts", len(filtered)) |
| 112 | + } |
| 113 | + } |
| 114 | + return result |
| 115 | +} |
0 commit comments