package config import ( "encoding/json" "fmt" "os" "regexp" "slices" "strings" "text/template" "github.com/mudler/LocalAI/core/schema" "github.com/mudler/LocalAI/pkg/downloader" "github.com/mudler/LocalAI/pkg/functions" "github.com/mudler/LocalAI/pkg/reasoning" "github.com/mudler/cogito" "gopkg.in/yaml.v3" ) const ( RAND_SEED = -1 ) // @Description TTS configuration type TTSConfig struct { // Voice wav path or id Voice string `yaml:"voice,omitempty" json:"voice,omitempty"` AudioPath string `yaml:"audio_path,omitempty" json:"audio_path,omitempty"` } // @Description ModelConfig represents a model configuration type ModelConfig struct { modelConfigFile string `yaml:"-" json:"-"` modelTemplate string `yaml:"-" json:"-"` schema.PredictionOptions `yaml:"parameters,omitempty" json:"parameters,omitempty"` Name string `yaml:"name,omitempty" json:"name,omitempty"` F16 *bool `yaml:"f16,omitempty" json:"f16,omitempty"` Threads *int `yaml:"threads,omitempty" json:"threads,omitempty"` Debug *bool `yaml:"debug,omitempty" json:"debug,omitempty"` Roles map[string]string `yaml:"roles,omitempty" json:"roles,omitempty"` Embeddings *bool `yaml:"embeddings,omitempty" json:"embeddings,omitempty"` Backend string `yaml:"backend,omitempty" json:"backend,omitempty"` TemplateConfig TemplateConfig `yaml:"template,omitempty" json:"template,omitempty"` KnownUsecaseStrings []string `yaml:"known_usecases,omitempty" json:"known_usecases,omitempty"` KnownUsecases *ModelConfigUsecase `yaml:"-" json:"-"` Pipeline Pipeline `yaml:"pipeline,omitempty" json:"pipeline,omitempty"` PromptStrings, InputStrings []string `yaml:"-" json:"-"` InputToken [][]int `yaml:"-" json:"-"` functionCallString, functionCallNameString string `yaml:"-" json:"-"` ResponseFormat string `yaml:"-" json:"-"` ResponseFormatMap map[string]any `yaml:"-" json:"-"` // MediaMarker is the runtime-discovered multimodal marker the backend expects // in the prompt (e.g. "<__media__>" or a random "<__media___>" picked by // llama.cpp). 
Populated on first successful ModelMetadata call. Empty until // then — callers must fall back to templates.DefaultMultiMediaMarker. MediaMarker string `yaml:"-" json:"-"` FunctionsConfig functions.FunctionsConfig `yaml:"function,omitempty" json:"function,omitempty"` ReasoningConfig reasoning.Config `yaml:"reasoning,omitempty" json:"reasoning,omitempty"` FeatureFlag FeatureFlag `yaml:"feature_flags,omitempty" json:"feature_flags,omitempty"` // Feature Flag registry. We move fast, and features may break on a per model/backend basis. Registry for (usually temporary) flags that indicate aborting something early. // LLM configs (GPT4ALL, Llama.cpp, ...) LLMConfig `yaml:",inline" json:",inline"` // Diffusers Diffusers Diffusers `yaml:"diffusers,omitempty" json:"diffusers,omitempty"` Step int `yaml:"step,omitempty" json:"step,omitempty"` // GRPC Options GRPC GRPC `yaml:"grpc,omitempty" json:"grpc,omitempty"` // TTS specifics TTSConfig `yaml:"tts,omitempty" json:"tts,omitempty"` // CUDA // Explicitly enable CUDA or not (some backends might need it) CUDA bool `yaml:"cuda,omitempty" json:"cuda,omitempty"` DownloadFiles []File `yaml:"download_files,omitempty" json:"download_files,omitempty"` Description string `yaml:"description,omitempty" json:"description,omitempty"` Usage string `yaml:"usage,omitempty" json:"usage,omitempty"` Disabled *bool `yaml:"disabled,omitempty" json:"disabled,omitempty"` Pinned *bool `yaml:"pinned,omitempty" json:"pinned,omitempty"` // ConcurrencyGroups declares per-node mutual-exclusion groups: the model // cannot be loaded alongside another model that shares any group name. // See docs/content/advanced/vram-management.md for usage. 
ConcurrencyGroups []string `yaml:"concurrency_groups,omitempty" json:"concurrency_groups,omitempty"` Options []string `yaml:"options,omitempty" json:"options,omitempty"` Overrides []string `yaml:"overrides,omitempty" json:"overrides,omitempty"` MCP MCPConfig `yaml:"mcp,omitempty" json:"mcp,omitempty"` Agent AgentConfig `yaml:"agent,omitempty" json:"agent,omitempty"` PII PIIConfig `yaml:"pii,omitempty" json:"pii,omitempty"` Router RouterConfig `yaml:"router,omitempty" json:"router,omitempty"` Proxy ProxyConfig `yaml:"proxy,omitempty" json:"proxy,omitempty"` MITM MITMModelConfig `yaml:"mitm,omitempty" json:"mitm,omitempty"` Limits LimitsConfig `yaml:"limits,omitempty" json:"limits,omitempty"` } // @Description Admission-control limits applied per request. The // admission middleware enforces these before invoking the handler; // requests that exceed a limit get 503 with a Retry-After hint so // clients back off rather than pile on. Per-model so cloud passthroughs // can have a stricter ceiling than local models. type LimitsConfig struct { // MaxConcurrent caps simultaneous in-flight requests for this // model. 0 = unlimited (default). Useful for cloud-passthrough // configs where the upstream rate-limits aggressively, or for // local backends whose memory budget tops out before LocalAI's // queue depth would. MaxConcurrent int `yaml:"max_concurrent,omitempty" json:"max_concurrent,omitempty"` // RetryAfterSeconds advises clients how long to wait before // retrying when admission rejects. 0 defaults to 1s — enough to // let an in-flight request finish on a busy local model. The // value is sent verbatim in the Retry-After response header. RetryAfterSeconds int `yaml:"retry_after_seconds,omitempty" json:"retry_after_seconds,omitempty"` } // @Description MITM intercept binding for the model. When the cloudproxy // MITM listener is enabled and any host listed here appears in a CONNECT, // the proxy uses THIS model config's pii: settings to filter the // intercepted body. 
// Strict 1-to-1: a host claimed by two configs is a
// configuration error and disables the MITM listener until resolved.
//
// Lets an admin pair a host (api.anthropic.com) with the model's
// PII overrides without maintaining a parallel per-host map.
type MITMModelConfig struct {
	// Hosts is the list of hostnames this model claims for MITM
	// interception. Each entry must be unique across all model configs.
	Hosts []string `yaml:"hosts,omitempty" json:"hosts,omitempty"`
}

// @Description Cloud proxy configuration. The cloud-proxy backend
// forwards a model's traffic to an external provider. Two modes:
//
//   - mode: passthrough — client and upstream must speak the same wire
//     format; the backend ships the raw request body to the upstream
//     URL and streams the response back untouched. The streaming PII
//     filter still runs because it operates on extracted token text.
//
//   - mode: translate — the backend converts LocalAI's internal proto
//     to the provider's wire format and back. Unlocks cross-provider
//     routing (OpenAI client → Anthropic upstream, etc.) at the cost
//     of dropping provider-specific extensions that the internal proto
//     doesn't model.
type ProxyConfig struct {
	// UpstreamURL is the full POST endpoint, e.g.
	// https://api.openai.com/v1/chat/completions or
	// https://api.anthropic.com/v1/messages. Required.
	UpstreamURL string `yaml:"upstream_url,omitempty" json:"upstream_url,omitempty"`

	// Mode selects passthrough (wire-perfect) or translate (full
	// control via internal proto). Empty defaults to passthrough.
	Mode string `yaml:"mode,omitempty" json:"mode,omitempty"`

	// Provider identifies the upstream's wire format for translate
	// mode (openai, anthropic). Ignored in passthrough mode — the
	// wire format there is whatever the client sent.
	Provider string `yaml:"provider,omitempty" json:"provider,omitempty"`

	// APIKeyEnv names the environment variable holding the upstream
	// API key. Mutually exclusive with APIKeyFile. Both empty is
	// allowed (no-auth upstreams).
	APIKeyEnv string `yaml:"api_key_env,omitempty" json:"api_key_env,omitempty"`

	// APIKeyFile is a path to a file whose contents are the upstream
	// API key. Trailing whitespace is trimmed. Mutually exclusive
	// with APIKeyEnv. The integration point for K8s secret mounts,
	// Vault agent files, and similar external-secret workflows.
	APIKeyFile string `yaml:"api_key_file,omitempty" json:"api_key_file,omitempty"`

	// UpstreamModel overrides the model name sent to the upstream.
	// Useful when the LocalAI-facing model alias differs from the
	// upstream's canonical name (e.g. local "claude-strict" maps to
	// upstream "claude-3-5-sonnet-20241022"). Empty means forward
	// the client's model field unchanged.
	UpstreamModel string `yaml:"upstream_model,omitempty" json:"upstream_model,omitempty"`

	// RequestTimeoutSeconds caps the upstream request duration. 0
	// means no per-request timeout (only the request context, which
	// is bound to the client connection, applies).
	RequestTimeoutSeconds int `yaml:"request_timeout_seconds,omitempty" json:"request_timeout_seconds,omitempty"`
}

// Proxy mode names. Validate() normalises an empty Mode to
// ProxyModePassthrough so downstream code only sees concrete values.
//
// NOTE(review): the comment on IsCloudProxyBackendPassthrough below
// attributes this normalisation to SetDefaults and says Validate merely
// accepts an empty Mode — confirm which of the two is accurate.
const (
	ProxyModePassthrough = "passthrough"
	ProxyModeTranslate   = "translate"
)

// Proxy provider names. Only meaningful in translate mode, where the
// cloud-proxy backend picks the wire format to use against the
// upstream URL.
const (
	ProxyProviderOpenAI    = "openai"
	ProxyProviderAnthropic = "anthropic"
)

// IsCloudProxyBackendPassthrough reports whether this model uses the
// cloud-proxy gRPC backend in passthrough mode. Empty Mode counts as
// passthrough (SetDefaults normalises it, but Validate accepts empty
// too — handlers should not rely on a particular call order).
func (c *ModelConfig) IsCloudProxyBackendPassthrough() bool { if c.Backend != "cloud-proxy" { return false } return c.Proxy.Mode == "" || c.Proxy.Mode == ProxyModePassthrough } // @Description Intelligent routing configuration. When a model declares // a Router block, requests addressed to it are reclassified at runtime // and dispatched to one of the named candidates. The router rewrites // input.Model in-place, then the standard model-resolution path picks // up the resolved config — meaning ACL checks, disabled-state, and // per-model PII still run against the chosen target. // // Depth-1 invariant: candidates must NOT themselves carry a Router // block. The router's "smart-router → claude-strict → cloud-proxy" // chain is fine, but "router-A → router-B → claude" is rejected at // config load to keep the dispatch graph acyclic and predictable. The // middleware also asserts depth ≤ 1 at runtime as a defensive check. type RouterConfig struct { // Classifier picks the implementation. Only "score" ships today: // it asks the classifier model to score every Policy label as a // continuation of the routing prompt and reads off the // distribution. Empty defaults to "score". Classifier string `yaml:"classifier,omitempty" json:"classifier,omitempty"` // Policies is the label vocabulary the classifier scores over. // Each policy carries a natural-language description that ends up // in the system prompt the classifier model sees — short, action- // oriented sentences work best ("writing or debugging code", // "small talk", ...). The Score classifier picks the subset of // labels whose softmax probability passes ActivationThreshold. Policies []RouterPolicy `yaml:"policies,omitempty" json:"policies,omitempty"` // Candidates is the routing table — each entry binds a downstream // model to a set of labels it can serve. The middleware picks the // FIRST candidate whose Labels are a superset of the active label // set from the classifier. 
Admins order this list smallest → // largest so a query that needs one label routes to the smallest // capable model, while a query that needs multiple falls to a // bigger candidate that covers them all. Candidates []RouterCandidate `yaml:"candidates,omitempty" json:"candidates,omitempty"` // Fallback is the model used when no candidate matches the active // label set, or when the classifier returns nothing above // threshold. Empty fallback means router failures bubble up as // 500 — fail-fast, not silent-bypass. Fallback string `yaml:"fallback,omitempty" json:"fallback,omitempty"` // ClassifierModel names the model the Score classifier scores // against (Arch-Router-1.5B is the canonical choice). ClassifierModel string `yaml:"classifier_model,omitempty" json:"classifier_model,omitempty"` // ClassifierCacheSize bounds the per-prompt memo cache that // amortises the classifier round-trip across repeat probes. // 0 disables the cache. Default 1024. ClassifierCacheSize int `yaml:"classifier_cache_size,omitempty" json:"classifier_cache_size,omitempty"` // ActivationThreshold is the softmax-probability floor a policy // must clear to be considered "active" for the request. 0 // defaults to a sensible value (~0.15) inside the classifier. // Higher → narrower routes (single-label dominant); lower → // more multi-label activations. ActivationThreshold float64 `yaml:"activation_threshold,omitempty" json:"activation_threshold,omitempty"` // ClassifierSystemTemplate overrides the routing system prompt // the score classifier feeds to its classifier_model. Go // text/template + Sprig, executed with `.Policies []ScorePolicy` // (Label + Description fields). Empty falls back to the built-in // Arch-Router-shaped template (route-listing block + JSON output // schema). Override when the classifier model was trained on a // different schema (e.g. bare label output, XML route block) or // when the routing instructions need to be in a different // language. 
The candidate format scored against the model is // fixed at `{"route": "