Skip to content

Commit 42493fd

Browse files
authored
Add support for sending grpc server backend metrics via ORCA (#18282)
Signed-off-by: twthorn <[email protected]>
1 parent d003465 commit 42493fd

22 files changed

+721
-3
lines changed

go.mod

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,7 @@ require (
103103
github.com/kr/text v0.2.0
104104
github.com/mitchellh/mapstructure v1.5.1-0.20231216201459-8508981c8b6c
105105
github.com/nsf/jsondiff v0.0.0-20210926074059-1e845ec5d249
106+
github.com/shirou/gopsutil/v4 v4.25.4
106107
github.com/spf13/afero v1.14.0
107108
github.com/spf13/jwalterweatherman v1.1.0
108109
github.com/xlab/treeprint v1.2.0
@@ -113,6 +114,16 @@ require (
113114
modernc.org/sqlite v1.37.0
114115
)
115116

117+
require (
118+
github.com/cilium/ebpf v0.16.0 // indirect
119+
github.com/containerd/log v0.1.0 // indirect
120+
github.com/docker/go-units v0.5.0 // indirect
121+
github.com/godbus/dbus/v5 v5.1.0 // indirect
122+
github.com/moby/sys/userns v0.1.0 // indirect
123+
github.com/opencontainers/runtime-spec v1.2.0 // indirect
124+
github.com/sirupsen/logrus v1.9.3 // indirect
125+
)
126+
116127
require (
117128
cel.dev/expr v0.24.0 // indirect
118129
cloud.google.com/go v0.121.1 // indirect
@@ -148,6 +159,8 @@ require (
148159
github.com/aws/aws-sdk-go-v2/service/sts v1.33.19 // indirect
149160
github.com/beorn7/perks v1.0.1 // indirect
150161
github.com/cncf/xds/go v0.0.0-20250501225837-2ac532fd4443 // indirect
162+
github.com/containerd/cgroups v1.1.0
163+
github.com/containerd/cgroups/v3 v3.0.5
151164
github.com/coreos/go-semver v0.3.1 // indirect
152165
github.com/coreos/go-systemd/v22 v22.5.0 // indirect
153166
github.com/cpuguy83/go-md2man/v2 v2.0.7 // indirect
@@ -161,6 +174,7 @@ require (
161174
github.com/go-jose/go-jose/v4 v4.1.0 // indirect
162175
github.com/go-logr/logr v1.4.2 // indirect
163176
github.com/go-logr/stdr v1.2.2 // indirect
177+
github.com/go-ole/go-ole v1.2.6 // indirect
164178
github.com/go-viper/mapstructure/v2 v2.2.1 // indirect
165179
github.com/gogo/protobuf v1.3.2 // indirect
166180
github.com/google/s2a-go v0.1.9 // indirect
@@ -177,6 +191,7 @@ require (
177191
github.com/hashicorp/go-sockaddr v1.0.7 // indirect
178192
github.com/hashicorp/golang-lru v1.0.2 // indirect
179193
github.com/inconshreveable/mousetrap v1.1.0 // indirect
194+
github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 // indirect
180195
github.com/mattn/go-colorable v0.1.14 // indirect
181196
github.com/mattn/go-ieproxy v0.0.12 // indirect
182197
github.com/mattn/go-isatty v0.0.20 // indirect
@@ -189,6 +204,7 @@ require (
189204
github.com/outcaste-io/ristretto v0.2.3 // indirect
190205
github.com/pelletier/go-toml/v2 v2.2.4 // indirect
191206
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
207+
github.com/power-devops/perfstat v0.0.0-20210106213030-5aafc221ea8c // indirect
192208
github.com/prometheus/client_model v0.6.2 // indirect
193209
github.com/prometheus/procfs v0.16.1 // indirect
194210
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect
@@ -204,6 +220,9 @@ require (
204220
github.com/subosito/gotenv v1.6.0 // indirect
205221
github.com/tidwall/match v1.1.1 // indirect
206222
github.com/tidwall/pretty v1.2.1 // indirect
223+
github.com/tklauser/go-sysconf v0.3.12 // indirect
224+
github.com/tklauser/numcpus v0.6.1 // indirect
225+
github.com/yusufpapurcu/wmi v1.2.4 // indirect
207226
github.com/zeebo/errs v1.4.0 // indirect
208227
go.opentelemetry.io/auto/sdk v1.1.0 // indirect
209228
go.opentelemetry.io/contrib/detectors/gcp v1.35.0 // indirect

go.sum

Lines changed: 48 additions & 0 deletions
Large diffs are not rendered by default.

go/flags/endtoend/mysqlctld.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ Flags:
7272
--grpc-crl string path to a certificate revocation list in PEM format, client certificates will be further verified against this file during TLS handshake
7373
--grpc-dial-concurrency-limit int Maximum concurrency of grpc dial operations. This should be less than the golang max thread limit of 10000. (default 1024)
7474
--grpc-enable-optional-tls enable optional TLS mode when a server accepts both TLS and plain-text connections on the same port
75+
--grpc-enable-orca-metrics gRPC server option to enable sending ORCA metrics to clients for load balancing
7576
--grpc-initial-conn-window-size int gRPC initial connection window size
7677
--grpc-initial-window-size int gRPC initial window size
7778
--grpc-keepalive-time duration After a duration of this time, if the client doesn't see any activity, it pings the server to see if the transport is still alive. (default 10s)

go/flags/endtoend/vtcombo.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,7 @@ Flags:
141141
--grpc-cert string server certificate to use for gRPC connections, requires grpc-key, enables TLS
142142
--grpc-crl string path to a certificate revocation list in PEM format, client certificates will be further verified against this file during TLS handshake
143143
--grpc-enable-optional-tls enable optional TLS mode when a server accepts both TLS and plain-text connections on the same port
144+
--grpc-enable-orca-metrics gRPC server option to enable sending ORCA metrics to clients for load balancing
144145
--grpc-enable-tracing Enable gRPC tracing.
145146
--grpc-key string server private key to use for gRPC connections, requires grpc-cert, enables TLS
146147
--grpc-max-connection-age duration Maximum age of a client connection before GoAway is sent. (default 2562047h47m16.854775807s)

go/flags/endtoend/vtctld.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ Flags:
6868
--grpc-crl string path to a certificate revocation list in PEM format, client certificates will be further verified against this file during TLS handshake
6969
--grpc-dial-concurrency-limit int Maximum concurrency of grpc dial operations. This should be less than the golang max thread limit of 10000. (default 1024)
7070
--grpc-enable-optional-tls enable optional TLS mode when a server accepts both TLS and plain-text connections on the same port
71+
--grpc-enable-orca-metrics gRPC server option to enable sending ORCA metrics to clients for load balancing
7172
--grpc-enable-tracing Enable gRPC tracing.
7273
--grpc-initial-conn-window-size int gRPC initial connection window size
7374
--grpc-initial-window-size int gRPC initial window size

go/flags/endtoend/vtgate.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ Flags:
7878
--grpc-crl string path to a certificate revocation list in PEM format, client certificates will be further verified against this file during TLS handshake
7979
--grpc-dial-concurrency-limit int Maximum concurrency of grpc dial operations. This should be less than the golang max thread limit of 10000. (default 1024)
8080
--grpc-enable-optional-tls enable optional TLS mode when a server accepts both TLS and plain-text connections on the same port
81+
--grpc-enable-orca-metrics gRPC server option to enable sending ORCA metrics to clients for load balancing
8182
--grpc-enable-tracing Enable gRPC tracing.
8283
--grpc-initial-conn-window-size int gRPC initial connection window size
8384
--grpc-initial-window-size int gRPC initial window size

go/flags/endtoend/vtgateclienttest.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ Flags:
2525
--grpc-crl string path to a certificate revocation list in PEM format, client certificates will be further verified against this file during TLS handshake
2626
--grpc-dial-concurrency-limit int Maximum concurrency of grpc dial operations. This should be less than the golang max thread limit of 10000. (default 1024)
2727
--grpc-enable-optional-tls enable optional TLS mode when a server accepts both TLS and plain-text connections on the same port
28+
--grpc-enable-orca-metrics gRPC server option to enable sending ORCA metrics to clients for load balancing
2829
--grpc-enable-tracing Enable gRPC tracing.
2930
--grpc-initial-conn-window-size int gRPC initial connection window size
3031
--grpc-initial-window-size int gRPC initial window size

go/flags/endtoend/vttablet.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,7 @@ Flags:
170170
--grpc-crl string path to a certificate revocation list in PEM format, client certificates will be further verified against this file during TLS handshake
171171
--grpc-dial-concurrency-limit int Maximum concurrency of grpc dial operations. This should be less than the golang max thread limit of 10000. (default 1024)
172172
--grpc-enable-optional-tls enable optional TLS mode when a server accepts both TLS and plain-text connections on the same port
173+
--grpc-enable-orca-metrics gRPC server option to enable sending ORCA metrics to clients for load balancing
173174
--grpc-enable-tracing Enable gRPC tracing.
174175
--grpc-initial-conn-window-size int gRPC initial connection window size
175176
--grpc-initial-window-size int gRPC initial window size

go/flags/endtoend/vttestserver.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ Flags:
5454
--grpc-crl string path to a certificate revocation list in PEM format, client certificates will be further verified against this file during TLS handshake
5555
--grpc-dial-concurrency-limit int Maximum concurrency of grpc dial operations. This should be less than the golang max thread limit of 10000. (default 1024)
5656
--grpc-enable-optional-tls enable optional TLS mode when a server accepts both TLS and plain-text connections on the same port
57+
--grpc-enable-orca-metrics gRPC server option to enable sending ORCA metrics to clients for load balancing
5758
--grpc-enable-tracing Enable gRPC tracing.
5859
--grpc-initial-conn-window-size int gRPC initial connection window size
5960
--grpc-initial-window-size int gRPC initial window size

go/vt/servenv/grpc_server.go

Lines changed: 45 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ import (
3232
"google.golang.org/grpc/health"
3333
healthpb "google.golang.org/grpc/health/grpc_health_v1"
3434
"google.golang.org/grpc/keepalive"
35+
"google.golang.org/grpc/orca"
3536
"google.golang.org/grpc/reflection"
3637

3738
"vitess.io/vitess/go/trace"
@@ -65,6 +66,9 @@ var (
6566
// GRPCServer is the global server to serve gRPC.
6667
GRPCServer *grpc.Server
6768

69+
// GRPCServerMetricsRecorder is the ORCA server metrics recorder; createGRPCServer creates it when ORCA metrics are enabled.
70+
GRPCServerMetricsRecorder orca.ServerMetricsRecorder
71+
6872
authPlugin Authenticator
6973
)
7074

@@ -101,10 +105,18 @@ var (
101105
// there are no active streams, server will send GOAWAY and close the connection.
102106
gRPCKeepAliveEnforcementPolicyPermitWithoutStream bool
103107

108+
// Enable ORCA metrics to be sent from the server to the client to be used for load balancing.
109+
gRPCEnableOrcaMetrics bool
110+
104111
gRPCKeepaliveTime = 10 * time.Second
105112
gRPCKeepaliveTimeout = 10 * time.Second
106113
)
107114

115+
// Injectable behavior for testing.
116+
var (
117+
orcaRegisterFunc = orca.Register
118+
)
119+
108120
// TLS variables.
109121
var (
110122
// gRPCCert is the cert to use if TLS is enabled.
@@ -130,7 +142,6 @@ var (
130142
// ParseFlags(WithArgs)? if they wish to run a gRPC server.
131143
func RegisterGRPCServerFlags() {
132144
OnParse(func(fs *pflag.FlagSet) {
133-
134145
utils.SetFlagIntVar(fs, &gRPCPort, "grpc-port", gRPCPort, "Port to listen on for gRPC calls. If zero, do not listen.")
135146
utils.SetFlagStringVar(fs, &gRPCBindAddress, "grpc-bind-address", gRPCBindAddress, "Bind address for gRPC calls. If empty, listen on all addresses.")
136147
utils.SetFlagDurationVar(fs, &gRPCMaxConnectionAge, "grpc-max-connection-age", gRPCMaxConnectionAge, "Maximum age of a client connection before GoAway is sent.")
@@ -139,6 +150,7 @@ func RegisterGRPCServerFlags() {
139150
utils.SetFlagIntVar(fs, &gRPCInitialWindowSize, "grpc-server-initial-window-size", gRPCInitialWindowSize, "gRPC server initial window size")
140151
utils.SetFlagDurationVar(fs, &gRPCKeepAliveEnforcementPolicyMinTime, "grpc-server-keepalive-enforcement-policy-min-time", gRPCKeepAliveEnforcementPolicyMinTime, "gRPC server minimum keepalive time")
141152
utils.SetFlagBoolVar(fs, &gRPCKeepAliveEnforcementPolicyPermitWithoutStream, "grpc-server-keepalive-enforcement-policy-permit-without-stream", gRPCKeepAliveEnforcementPolicyPermitWithoutStream, "gRPC server permit client keepalive pings even when there are no active streams (RPCs)")
153+
utils.SetFlagBoolVar(fs, &gRPCEnableOrcaMetrics, "grpc-enable-orca-metrics", gRPCEnableOrcaMetrics, "gRPC server option to enable sending ORCA metrics to clients for load balancing")
142154

143155
utils.SetFlagStringVar(fs, &gRPCCert, "grpc-cert", gRPCCert, "server certificate to use for gRPC connections, requires grpc-key, enables TLS")
144156
utils.SetFlagStringVar(fs, &gRPCKey, "grpc-key", gRPCKey, "server private key to use for gRPC connections, requires grpc-cert, enables TLS")
@@ -226,6 +238,11 @@ func createGRPCServer() {
226238
opts = append(opts, grpc.MaxRecvMsgSize(msgSize))
227239
opts = append(opts, grpc.MaxSendMsgSize(msgSize))
228240

241+
if gRPCEnableOrcaMetrics {
242+
GRPCServerMetricsRecorder = orca.NewServerMetricsRecorder()
243+
opts = append(opts, orca.CallMetricsServerOption(GRPCServerMetricsRecorder))
244+
}
245+
229246
if gRPCInitialConnWindowSize != 0 {
230247
log.Infof("Setting grpc server initial conn window size to %d", int32(gRPCInitialConnWindowSize))
231248
opts = append(opts, grpc.InitialConnWindowSize(int32(gRPCInitialConnWindowSize)))
@@ -289,6 +306,10 @@ func serveGRPC() {
289306
return
290307
}
291308

309+
if gRPCEnableOrcaMetrics {
310+
registerOrca()
311+
}
312+
292313
// register reflection to support list calls :)
293314
reflection.Register(GRPCServer)
294315

@@ -327,6 +348,29 @@ func serveGRPC() {
327348
})
328349
}
329350

351+
func registerOrca() {
352+
if err := orcaRegisterFunc(GRPCServer, orca.ServiceOptions{
353+
// The minimum interval of orca is 30 seconds, unless we enable a testing flag.
354+
MinReportingInterval: 30 * time.Second,
355+
ServerMetricsProvider: GRPCServerMetricsRecorder,
356+
}); err != nil {
357+
log.Exitf("Failed to register ORCA service: %v", err)
358+
}
359+
360+
// Initialize the server metrics values.
361+
GRPCServerMetricsRecorder.SetCPUUtilization(getCpuUsage())
362+
GRPCServerMetricsRecorder.SetMemoryUtilization(getMemoryUsage())
363+
364+
go func() {
365+
ticker := time.NewTicker(30 * time.Second)
366+
defer ticker.Stop()
367+
for range ticker.C {
368+
GRPCServerMetricsRecorder.SetCPUUtilization(getCpuUsage())
369+
GRPCServerMetricsRecorder.SetMemoryUtilization(getMemoryUsage())
370+
}
371+
}()
372+
}
373+
330374
// GRPCCheckServiceMap returns if we should register a gRPC service
331375
// (and also logs how to enable / disable it)
332376
func GRPCCheckServiceMap(name string) bool {

go/vt/servenv/grpc_server_test.go

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,14 @@ limitations under the License.
1717
package servenv
1818

1919
import (
20+
"fmt"
21+
"net"
2022
"testing"
2123

2224
"context"
2325

2426
"google.golang.org/grpc"
27+
"google.golang.org/grpc/orca"
2528
)
2629

2730
func TestEmpty(t *testing.T) {
@@ -61,6 +64,65 @@ func TestDoubleInterceptor(t *testing.T) {
6164
}
6265
}
6366

67+
func TestOrcaRecorder(t *testing.T) {
68+
recorder := orca.NewServerMetricsRecorder()
69+
70+
recorder.SetCPUUtilization(0.25)
71+
recorder.SetMemoryUtilization(0.5)
72+
73+
snap := recorder.ServerMetrics()
74+
75+
if snap.CPUUtilization != 0.25 {
76+
t.Errorf("expected cpu 0.25, got %v", snap.CPUUtilization)
77+
}
78+
if snap.MemUtilization != 0.5 {
79+
t.Errorf("expected memory 0.5, got %v", snap.MemUtilization)
80+
}
81+
}
82+
83+
func TestReportedOrca(t *testing.T) {
84+
// Set the port to enable gRPC server.
85+
withTempVar(&gRPCPort, getFreePort())
86+
withTempVar(&gRPCEnableOrcaMetrics, true)
87+
withTempVar(&GRPCServerMetricsRecorder, nil)
88+
89+
createGRPCServer()
90+
if GRPCServerMetricsRecorder == nil {
91+
t.Errorf("GRPCServerMetricsRecorder should be initialized when gRPCEnableOrcaMetrics is false")
92+
}
93+
94+
serveGRPC()
95+
serverMetrics := GRPCServerMetricsRecorder.ServerMetrics()
96+
cpuUsage := serverMetrics.CPUUtilization
97+
if cpuUsage < 0 {
98+
t.Errorf("CPU Utilization is not set %.2f", cpuUsage)
99+
}
100+
t.Logf("CPU Utilization is %.2f", cpuUsage)
101+
102+
memUsage := serverMetrics.MemUtilization
103+
if memUsage < 0 {
104+
t.Errorf("Mem Utilization is not set %.2f", memUsage)
105+
}
106+
t.Logf("Memory utilization is %.2f", memUsage)
107+
}
108+
109+
// getFreePort returns a TCP port number that was free at the time of the
// call. It binds a listener to port 0 so the OS picks the port, reads the
// chosen port back, and closes the listener before returning; the port is
// therefore not reserved and could be taken again by the time it is used.
//
// It panics if the listener cannot be opened.
func getFreePort() int {
	listener, err := net.Listen("tcp", ":0")
	if err != nil {
		panic(fmt.Sprintf("could not get free port: %v", err))
	}
	defer listener.Close()

	tcpAddr := listener.Addr().(*net.TCPAddr)
	return tcpAddr.Port
}
117+
118+
// withTempVar overwrites *set with temp and returns a function that writes
// the value *set held before the call back into it. Nothing is restored
// unless the caller invokes the returned function.
func withTempVar[T any](set *T, temp T) (restore func()) {
	previous := *set
	restore = func() { *set = previous }
	*set = temp
	return restore
}
125+
64126
type FakeInterceptor struct {
65127
name string
66128
streamSeen any

go/vt/servenv/metrics.go

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
/*
2+
Copyright 2025 The Vitess Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package servenv
18+
19+
func getCpuUsage() float64 {
20+
value, err := getCgroupCpu()
21+
if err == nil {
22+
return value
23+
}
24+
value, err = getHostCpuUsage()
25+
if err == nil {
26+
return value
27+
}
28+
return -1
29+
}
30+
31+
func getMemoryUsage() float64 {
32+
value, err := getCgroupMemory()
33+
if err == nil {
34+
return value
35+
}
36+
value, err = getHostMemoryUsage()
37+
if err == nil {
38+
return value
39+
}
40+
return -1
41+
}

0 commit comments

Comments
 (0)