Finalize backend eviction with retries (#1895)
Requests that would previously fail after a 429 can now succeed on the
retry by failing over to the fallback provider within the same request.
This improves the UX because users see fewer transient failures and have
less need to retry manually.
In order to get this behavior, backend health is updated in the retry
loop so that the retry target selection uses current health instead of
stale health.
Tested locally; this is the new behavior:
1. Apply the fake providers + eviction policy config:
```
apiVersion: v1
kind: ConfigMap
metadata:
name: mock-llm-primary-nginx
namespace: default
data:
nginx.conf: |
events {}
http {
server {
listen 9234;
location = /v1/chat/completions {
default_type application/json;
return 429 '{"error":{"message":"rate limited","type":"rate_limit_error"}}';
}
location / {
return 404;
}
}
}
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: mock-llm-primary
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: mock-llm-primary
template:
metadata:
labels:
app: mock-llm-primary
spec:
containers:
- name: nginx
image: nginx:1.27-alpine
ports:
- containerPort: 9234
volumeMounts:
- name: nginx-conf
mountPath: /etc/nginx/nginx.conf
subPath: nginx.conf
volumes:
- name: nginx-conf
configMap:
name: mock-llm-primary-nginx
---
apiVersion: v1
kind: Service
metadata:
name: mock-llm-primary
namespace: default
spec:
selector:
app: mock-llm-primary
ports:
- name: http
port: 9234
targetPort: 9234
---
apiVersion: v1
kind: ConfigMap
metadata:
name: mock-llm-fallback-nginx
namespace: default
data:
nginx.conf: |
events {}
http {
server {
listen 9234;
location = /v1/chat/completions {
default_type application/json;
return 200 '{"id":"chatcmpl-fallback","object":"chat.completion","created":1710000000,"model":"gpt-4o-mini","choices":[{"index":0,"message":{"role":"assistant","content":"fallback worked"},"finish_reason":"stop"}],"usage":{"prompt_tokens":5,"completion_tokens":2,"total_tokens":7}}';
}
location / {
return 404;
}
}
}
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: mock-llm-fallback
namespace: default
spec:
replicas: 1
selector:
matchLabels:
app: mock-llm-fallback
template:
metadata:
labels:
app: mock-llm-fallback
spec:
containers:
- name: nginx
image: nginx:1.27-alpine
ports:
- containerPort: 9234
volumeMounts:
- name: nginx-conf
mountPath: /etc/nginx/nginx.conf
subPath: nginx.conf
volumes:
- name: nginx-conf
configMap:
name: mock-llm-fallback-nginx
---
apiVersion: v1
kind: Service
metadata:
name: mock-llm-fallback
namespace: default
spec:
selector:
app: mock-llm-fallback
ports:
- name: http
port: 9234
targetPort: 9234
---
apiVersion: gateway.networking.k8s.io/v1
kind: Gateway
metadata:
name: gateway
namespace: agentgateway-system
spec:
gatewayClassName: agentgateway
listeners:
- name: http
protocol: HTTP
port: 8080
allowedRoutes:
namespaces:
from: All
---
apiVersion: gateway.networking.k8s.io/v1
kind: HTTPRoute
metadata:
name: mock-ratelimit-failover
namespace: default
spec:
parentRefs:
- group: gateway.networking.k8s.io
kind: Gateway
name: gateway
namespace: agentgateway-system
rules:
- backendRefs:
- group: agentgateway.dev
kind: AgentgatewayBackend
name: mock-ratelimit-failover
matches:
- path:
type: PathPrefix
value: /v1/chat/completions
headers:
- type: Exact
name: x-test-failover
value: "1"
---
apiVersion: agentgateway.dev/v1alpha1
kind: AgentgatewayBackend
metadata:
name: mock-ratelimit-failover
namespace: default
spec:
ai:
groups:
- providers:
- name: primary
openai:
model: gpt-4o-mini
host: mock-llm-primary.default.svc.cluster.local
port: 9234
- providers:
- name: fallback
openai:
model: gpt-4o-mini
host: mock-llm-fallback.default.svc.cluster.local
port: 9234
policies:
auth:
key: dummy-key
---
apiVersion: agentgateway.dev/v1alpha1
kind: AgentgatewayPolicy
metadata:
name: retry-policy
namespace: default
spec:
targetRefs:
- group: gateway.networking.k8s.io
kind: HTTPRoute
name: mock-ratelimit-failover
traffic:
retry:
attempts: 1
backoff: 50ms
codes:
- 429
backend:
health:
unhealthyCondition: "response.code == 429"
eviction:
duration: 120s
consecutiveFailures: 1
restoreHealth: 0
```
2. Send a curl request; you should get a 200 OK immediately:
```
❯ curl -si http://127.0.0.1:8080/v1/chat/completions \
-H 'content-type: application/json' \
-H 'x-test-failover: 1' \
-d '{"model":"gpt-4o-mini","messages":[{"role":"user","content":"say hi"}]}'
HTTP/1.1 200 OK
server: nginx/1.27.5
date: Wed, 20 May 2026 19:04:59 GMT
content-type: application/json
connection: keep-alive
content-length: 269
{"model":"gpt-4o-mini","usage":{"prompt_tokens":5,"completion_tokens":2,"total_tokens":7},"choices":[{"message":{"content":"fallback worked","role":"assistant"},"index":0,"finish_reason":"stop"}],"id":"chatcmpl-fallback","object":"chat.completion","created":1710000000}%
```
With the old behavior, `http.status=429` was returned and the logs showed
the final request ending on the primary, not the fallback.
3. Check the logs to confirm that both backends are getting hit:
```
❯ kubectl -n default logs deploy/mock-llm-primary --since=10m
10.244.0.9 - - [20/May/2026:19:04:55 +0000] "POST /v1/chat/completions HTTP/1.1" 429 62 "-" "curl/8.13.0"
```
```
❯ kubectl -n default logs deploy/mock-llm-fallback --since=10m
10.244.0.9 - - [20/May/2026:19:04:55 +0000] "POST /v1/chat/completions HTTP/1.1" 200 269 "-" "curl/8.13.0"
10.244.0.9 - - [20/May/2026:19:04:59 +0000] "POST /v1/chat/completions HTTP/1.1" 200 269 "-" "curl/8.13.0"
```
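The gateway's request log shows `retry.attempt=1` for the first request, which failed over to the fallback; the second request goes straight to the fallback with no retry: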
```
2026-05-20T19:04:55.252812Z info request gateway=agentgateway-system/gateway listener=http route=default/mock-ratelimit-failover endpoint=mock-llm-fallback.default.svc.cluster.local:9234 src.addr=127.0.0.1:47846 http.method=POST http.host=127.0.0.1 http.path=/v1/chat/completions http.version=HTTP/1.1 http.status=200 protocol=llm gen_ai.operation.name=chat gen_ai.provider.name=openai gen_ai.request.model=gpt-4o-mini gen_ai.response.model=gpt-4o-mini gen_ai.usage.input_tokens=5 gen_ai.usage.output_tokens=2 retry.attempt=1 duration=72ms
2026-05-20T19:04:59.793919Z info request gateway=agentgateway-system/gateway listener=http route=default/mock-ratelimit-failover endpoint=mock-llm-fallback.default.svc.cluster.local:9234 src.addr=127.0.0.1:47948 http.method=POST http.host=127.0.0.1 http.path=/v1/chat/completions http.version=HTTP/1.1 http.status=200 protocol=llm gen_ai.operation.name=chat gen_ai.provider.name=openai gen_ai.request.model=gpt-4o-mini gen_ai.response.model=gpt-4o-mini gen_ai.usage.input_tokens=5 gen_ai.usage.output_tokens=2 duration=0ms
```
---------
Signed-off-by: npolshakova <nina.polshakova@solo.io> N
Nina Polshakova committed
04755166d2db3a93c134c40dbdb6cfffcb206143
Parent: 8bc0a98
Committed by GitHub <noreply@github.com>
on 5/29/2026, 7:32:32 PM