SIGN IN SIGN UP

Finalize backend eviction with retries (#1895)

Requests that would previously fail after a 429 can now succeed on the
retry by failing over to the fallback provider within the same request.
This improves the UX because users see fewer transient failures and need
to retry manually less often.

In order to get this behavior, backend health is updated in the retry
loop so that the retry target selection uses current health instead of
stale health.

Tested locally; this is the new behavior:

1. Apply the fake providers and the eviction policy config:
```
  apiVersion: v1
  kind: ConfigMap
  metadata:
    name: mock-llm-primary-nginx
    namespace: default
  data:
    nginx.conf: |
      events {}
      http {
        server {
          listen 9234;
          location = /v1/chat/completions {
            default_type application/json;
            return 429 '{"error":{"message":"rate limited","type":"rate_limit_error"}}';
          }
          location / {
            return 404;
          }
        }
      }
  ---
  apiVersion: apps/v1
  kind: Deployment
  metadata:
    name: mock-llm-primary
    namespace: default
  spec:
    replicas: 1
    selector:
      matchLabels:
        app: mock-llm-primary
    template:
      metadata:
        labels:
          app: mock-llm-primary
      spec:
        containers:
        - name: nginx
          image: nginx:1.27-alpine
          ports:
          - containerPort: 9234
          volumeMounts:
          - name: nginx-conf
            mountPath: /etc/nginx/nginx.conf
            subPath: nginx.conf
        volumes:
        - name: nginx-conf
          configMap:
            name: mock-llm-primary-nginx
  ---
  apiVersion: v1
  kind: Service
  metadata:
    name: mock-llm-primary
    namespace: default
  spec:
    selector:
      app: mock-llm-primary
    ports:
    - name: http
      port: 9234
      targetPort: 9234
  ---
  apiVersion: v1
  kind: ConfigMap
  metadata:
    name: mock-llm-fallback-nginx
    namespace: default
  data:
    nginx.conf: |
      events {}
      http {
        server {
          listen 9234;
          location = /v1/chat/completions {
            default_type application/json;
            return 200 '{"id":"chatcmpl-fallback","object":"chat.completion","created":1710000000,"model":"gpt-4o-mini","choices":[{"index":0,"message":{"role":"assistant","content":"fallback worked"},"finish_reason":"stop"}],"usage":{"prompt_tokens":5,"completion_tokens":2,"total_tokens":7}}';
          }
          location / {
            return 404;
          }
        }
      }
  ---
  apiVersion: apps/v1
  kind: Deployment
  metadata:
    name: mock-llm-fallback
    namespace: default
  spec:
    replicas: 1
    selector:
      matchLabels:
        app: mock-llm-fallback
    template:
      metadata:
        labels:
          app: mock-llm-fallback
      spec:
        containers:
        - name: nginx
          image: nginx:1.27-alpine
          ports:
          - containerPort: 9234
          volumeMounts:
          - name: nginx-conf
            mountPath: /etc/nginx/nginx.conf
            subPath: nginx.conf
        volumes:
        - name: nginx-conf
          configMap:
            name: mock-llm-fallback-nginx
  ---
  apiVersion: v1
  kind: Service
  metadata:
    name: mock-llm-fallback
    namespace: default
  spec:
    selector:
      app: mock-llm-fallback
    ports:
    - name: http
      port: 9234
      targetPort: 9234
  ---
  apiVersion: gateway.networking.k8s.io/v1
  kind: Gateway
  metadata:
    name: gateway
    namespace: agentgateway-system
  spec:
    gatewayClassName: agentgateway
    listeners:
    - name: http
      protocol: HTTP
      port: 8080
      allowedRoutes:
        namespaces:
          from: All
  ---
  apiVersion: gateway.networking.k8s.io/v1
  kind: HTTPRoute
  metadata:
    name: mock-ratelimit-failover
    namespace: default
  spec:
    parentRefs:
    - group: gateway.networking.k8s.io
      kind: Gateway
      name: gateway
      namespace: agentgateway-system
    rules:
    - backendRefs:
      - group: agentgateway.dev
        kind: AgentgatewayBackend
        name: mock-ratelimit-failover
      matches:
      - path:
          type: PathPrefix
          value: /v1/chat/completions
        headers:
        - type: Exact
          name: x-test-failover
          value: "1"
  ---
  apiVersion: agentgateway.dev/v1alpha1
  kind: AgentgatewayBackend
  metadata:
    name: mock-ratelimit-failover
    namespace: default
  spec:
    ai:
      groups:
      - providers:
        - name: primary
          openai:
            model: gpt-4o-mini
          host: mock-llm-primary.default.svc.cluster.local
          port: 9234
      - providers:
        - name: fallback
          openai:
            model: gpt-4o-mini
          host: mock-llm-fallback.default.svc.cluster.local
          port: 9234
    policies:
      auth:
        key: dummy-key
  ---
  apiVersion: agentgateway.dev/v1alpha1
  kind: AgentgatewayPolicy
  metadata:
    name: retry-policy
    namespace: default
  spec:
    targetRefs:
    - group: gateway.networking.k8s.io
      kind: HTTPRoute
      name: mock-ratelimit-failover
    traffic:
      retry:
        attempts: 1
        backoff: 50ms
        codes:
        - 429
    backend:
      health:
        unhealthyCondition: "response.code == 429"
        eviction:
          duration: 120s
          consecutiveFailures: 1
          restoreHealth: 0
```

2. Send a curl request; you should get a 200 OK immediately:
```
❯ curl -si http://127.0.0.1:8080/v1/chat/completions \
    -H 'content-type: application/json' \
    -H 'x-test-failover: 1' \
    -d '{"model":"gpt-4o-mini","messages":[{"role":"user","content":"say hi"}]}'
HTTP/1.1 200 OK
server: nginx/1.27.5
date: Wed, 20 May 2026 19:04:59 GMT
content-type: application/json
connection: keep-alive
content-length: 269

{"model":"gpt-4o-mini","usage":{"prompt_tokens":5,"completion_tokens":2,"total_tokens":7},"choices":[{"message":{"content":"fallback worked","role":"assistant"},"index":0,"finish_reason":"stop"}],"id":"chatcmpl-fallback","object":"chat.completion","created":1710000000}%   
```

With the old behavior, the gateway returned `http.status=429` and the logs
showed the final request ending on the primary, not the fallback.

3. Check the logs to confirm that both backends are being hit:


```
❯  kubectl -n default logs deploy/mock-llm-primary --since=10m
10.244.0.9 - - [20/May/2026:19:04:55 +0000] "POST /v1/chat/completions HTTP/1.1" 429 62 "-" "curl/8.13.0"
```

```
❯  kubectl -n default logs deploy/mock-llm-fallback --since=10m
10.244.0.9 - - [20/May/2026:19:04:55 +0000] "POST /v1/chat/completions HTTP/1.1" 200 269 "-" "curl/8.13.0"
10.244.0.9 - - [20/May/2026:19:04:59 +0000] "POST /v1/chat/completions HTTP/1.1" 200 269 "-" "curl/8.13.0"
```

The gateway's log shows `retry.attempt=1` on the first request:
```
2026-05-20T19:04:55.252812Z     info    request gateway=agentgateway-system/gateway listener=http route=default/mock-ratelimit-failover endpoint=mock-llm-fallback.default.svc.cluster.local:9234 src.addr=127.0.0.1:47846 http.method=POST http.host=127.0.0.1 http.path=/v1/chat/completions http.version=HTTP/1.1 http.status=200 protocol=llm gen_ai.operation.name=chat gen_ai.provider.name=openai gen_ai.request.model=gpt-4o-mini gen_ai.response.model=gpt-4o-mini gen_ai.usage.input_tokens=5 gen_ai.usage.output_tokens=2 retry.attempt=1 duration=72ms
2026-05-20T19:04:59.793919Z     info    request gateway=agentgateway-system/gateway listener=http route=default/mock-ratelimit-failover endpoint=mock-llm-fallback.default.svc.cluster.local:9234 src.addr=127.0.0.1:47948 http.method=POST http.host=127.0.0.1 http.path=/v1/chat/completions http.version=HTTP/1.1 http.status=200 protocol=llm gen_ai.operation.name=chat gen_ai.provider.name=openai gen_ai.request.model=gpt-4o-mini gen_ai.response.model=gpt-4o-mini gen_ai.usage.input_tokens=5 gen_ai.usage.output_tokens=2 duration=0ms
```

---------

Signed-off-by: npolshakova <nina.polshakova@solo.io>
N
Nina Polshakova committed
04755166d2db3a93c134c40dbdb6cfffcb206143
Parent: 8bc0a98
Committed by GitHub <noreply@github.com> on 5/29/2026, 7:32:32 PM