-
Notifications
You must be signed in to change notification settings - Fork 692
TraceQL metrics sampling for faster performance #5469
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
e5e195e
eb35491
fbd0835
6753a8b
0585b10
0d904b6
2beb4b1
d7ec4ff
99c6ccf
75749fb
287e2b2
5c86b46
647c0aa
a8b2aee
7b91cfe
c9a1648
080dbdb
88e1285
a660931
5b36bfb
6b0e74c
a2619dd
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -922,6 +922,45 @@ func (e *Engine) CompileMetricsQueryRange(req *tempopb.QueryRangeRequest, exempl | |
| exemplars = v | ||
| } | ||
|
|
||
| // Debug sampling hints, remove once we settle on approach. | ||
|
joe-elliott marked this conversation as resolved.
|
||
| if traceSample, traceSampleOk := expr.Hints.GetFloat(HintTraceSample, allowUnsafeQueryHints); traceSampleOk { | ||
| storageReq.TraceSampler = newProbablisticSampler(traceSample) | ||
| } | ||
| if spanSample, spanSampleOk := expr.Hints.GetFloat(HintSpanSample, allowUnsafeQueryHints); spanSampleOk { | ||
| storageReq.SpanSampler = newProbablisticSampler(spanSample) | ||
| } | ||
|
|
||
| if sample, sampleOk := expr.Hints.GetBool(HintSample, allowUnsafeQueryHints); sampleOk && sample { | ||
| // Automatic sampling | ||
| // Get other params | ||
| s := newAdaptiveSampler() | ||
| if debug, ok := expr.Hints.GetBool(HintDebug, allowUnsafeQueryHints); ok { | ||
| s.debug = debug | ||
| } | ||
| if info, ok := expr.Hints.GetBool(HintInfo, allowUnsafeQueryHints); ok { | ||
| s.info = info | ||
| } | ||
|
|
||
| // Classify the query and determine if it needs to be at the trace-level or can be at span-level (better) | ||
| if expr.NeedsFullTrace() { | ||
| storageReq.TraceSampler = s | ||
| } else { | ||
| storageReq.SpanSampler = s | ||
| } | ||
| } | ||
|
|
||
| if sampleFraction, ok := expr.Hints.GetFloat(HintSample, allowUnsafeQueryHints); ok && sampleFraction > 0 && sampleFraction < 1 { | ||
| // Fixed sampling rate. | ||
| s := newProbablisticSampler(sampleFraction) | ||
|
|
||
| // Classify the query and determine if it needs to be at the trace-level or can be at span-level (better) | ||
| if expr.NeedsFullTrace() { | ||
| storageReq.TraceSampler = s | ||
| } else { | ||
| storageReq.SpanSampler = s | ||
| } | ||
| } | ||
|
|
||
| // This initializes all step buffers, counters, etc | ||
| metricsPipeline.init(req, AggregateModeRaw) | ||
|
|
||
|
|
@@ -1148,19 +1187,28 @@ func (e *MetricsEvaluator) Do(ctx context.Context, f SpansetFetcher, fetcherStar | |
|
|
||
| e.mtx.Lock() | ||
|
|
||
| if e.storageReq.TraceSampler != nil { | ||
| e.storageReq.TraceSampler.Measured() | ||
| } | ||
|
|
||
| var validSpansCount int | ||
| var randomSpanIndex int | ||
|
|
||
| needExemplar := e.maxExemplars > 0 && e.sampleExemplar(ss.TraceID) | ||
|
|
||
| for i, s := range ss.Spans { | ||
|
|
||
| if e.checkTime { | ||
| st := s.StartTimeUnixNanos() | ||
| if st <= e.start || st > e.end { | ||
| continue | ||
| } | ||
| } | ||
|
|
||
| if e.storageReq.SpanSampler != nil { | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. How does calling Measured() after checking the start/end time impact the final results? Does this mean that blocks that barely overlap the time range will be downsampled less? So the edges of the graph will have more accurate results? Should we call Measured() before we filter due to the time range?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Good catch, this was something I tried both ways. Calling Measured() after the check is more accurate, because it's capturing the true rate of data being found. The edges of a block are where this most often occurs. It prevents the false signal of data looking more common than it really is. So this way the sampling probability stays high (starts at 100%), until there are enough spans passing the time check to make a real determination. |
||
| e.storageReq.SpanSampler.Measured() | ||
| } | ||
|
|
||
| validSpansCount++ | ||
| e.metricsPipeline.observe(s) | ||
|
|
||
|
|
@@ -1209,12 +1257,28 @@ func (e *MetricsEvaluator) Metrics() (uint64, uint64, uint64) { | |
| } | ||
|
|
||
| func (e *MetricsEvaluator) Results() SeriesSet { | ||
| e.mtx.Lock() | ||
| defer e.mtx.Unlock() | ||
|
|
||
| spanMultiplier := 1.0 | ||
| if e.storageReq.SpanSampler != nil { | ||
| spanMultiplier = e.storageReq.SpanSampler.FinalScalingFactor() | ||
| } | ||
| traceMultiplier := 1.0 | ||
| if e.storageReq.TraceSampler != nil { | ||
| traceMultiplier = e.storageReq.TraceSampler.FinalScalingFactor() | ||
| } | ||
|
|
||
| multiplier := spanMultiplier * traceMultiplier | ||
|
|
||
| // NOTE: skip processing of second stage because not all first stage functions can be pushed down. | ||
| // for example: if query has avg_over_time(), then we can't push it down to second stage, and second stage | ||
| // can only be processed on the frontend. | ||
| // we could do this but it would require knowing if the first stage functions | ||
| // can be pushed down to second stage or not so we are skipping it for now, and will handle it later. | ||
| return e.metricsPipeline.result() | ||
| ss := e.metricsPipeline.result(multiplier) | ||
|
|
||
| return ss | ||
| } | ||
|
|
||
| func (e *MetricsEvaluator) sampleExemplar(id []byte) bool { | ||
|
|
@@ -1255,7 +1319,8 @@ func (m *MetricsFrontendEvaluator) Results() SeriesSet { | |
| m.mtx.Lock() | ||
| defer m.mtx.Unlock() | ||
|
|
||
| results := m.metricsPipeline.result() | ||
| // Job results are not scaled by sampling, but this is here for the interface. | ||
| results := m.metricsPipeline.result(1.0) | ||
|
|
||
| if m.metricsSecondStage != nil { | ||
| // metrics second stage is only set when query has second stage function and mode = final | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.