diff --git a/CHANGELOG.md b/CHANGELOG.md index e5eb3b35d49..80ed4867372 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ * [CHANGE] **BREAKING CHANGE** Remove duplicate "compaction" prefix from CompactorConfig CLI flags. Affected flags: `compaction.block-retention`, `compaction.max-objects-per-block`, `compaction.max-block-bytes`, `compaction.compaction-window`. [#6909](https://github.com/grafana/tempo/pull/6909) (@electron0zero) * [ENHANCEMENT] Support OR conditions for tag name and tag value autocomplete (search tags v2) [#6827](https://github.com/grafana/tempo/pull/6827) (@ie-pham) * [ENHANCEMENT] Expose MinIO retry settings via S3 config [#6561](https://github.com/grafana/tempo/pull/6561) (@rwhitty) +* [ENHANCEMENT] Reduce default livestore WAL size and align query defaults: `max_block_duration` `1m` to `30s`, `max_block_bytes` `100MiB` to `50MiB`, `complete_block_timeout` `1h` to `20m`, metrics `query_backend_after` `30m` to `15m`. [#6974](https://github.com/grafana/tempo/pull/6974) (@zhxiaogg) * [CHANGE] **BREAKING CHANGE** Centralize block and WAL config: `block_builder` and `live_store` now always use `storage.trace.block` settings; per-module block config fields are removed. [#6647](https://github.com/grafana/tempo/pull/6647) (@stoewer) * [CHANGE] **BREAKING CHANGE** Remove Opencensus receiver [#6523](https://github.com/grafana/tempo/pull/6523) (@javiermolinar) * [CHANGE] Upgrade Tempo to Go 1.26.0 [#6443](https://github.com/grafana/tempo/pull/6443) (@stoewer) diff --git a/docs/sources/tempo/configuration/_index.md b/docs/sources/tempo/configuration/_index.md index 7eb924a98e0..abf3d6340b9 100644 --- a/docs/sources/tempo/configuration/_index.md +++ b/docs/sources/tempo/configuration/_index.md @@ -421,7 +421,7 @@ live_store: [commit_interval: | default = 5s] # How often to sweep all tenants and move traces from live -> wal -> completed blocks. 
- [flush_check_period: | default = 10s] + [flush_check_period: | default = 5s] # Amount of time a trace must be idle before flushing it to the WAL. [max_trace_idle: | default = 5s] @@ -433,13 +433,13 @@ live_store: [max_live_traces_bytes: | default = 250000000] # Maximum size of a block before cutting it. - [max_block_bytes: | default = 104857600] + [max_block_bytes: | default = 52428800] # Maximum length of time before cutting a block. - [max_block_duration: | default = 30m] + [max_block_duration: | default = 30s] # Duration to keep blocks in the live-store after they have been completed. - [complete_block_timeout: | default = 1h] + [complete_block_timeout: | default = 20m] # Target consumer lag threshold before the live-store is considered ready to serve queries. # Set to 0 to disable readiness waiting. @@ -862,7 +862,7 @@ query_frontend: # query_backend_after controls where the query-frontend searches for traces. # Time ranges older than query_backend_after will be searched in the backend/object storage only. # Time ranges between query_backend_after and now will be queried from the metrics-generators. - [query_backend_after: | default = 30m ] + [query_backend_after: | default = 15m ] # The target length of time for each job to handle when querying the backend. 
[interval: | default = 5m ] diff --git a/docs/sources/tempo/configuration/manifest.md b/docs/sources/tempo/configuration/manifest.md index 1eaa8c1da38..87a8b46c5d1 100644 --- a/docs/sources/tempo/configuration/manifest.md +++ b/docs/sources/tempo/configuration/manifest.md @@ -341,7 +341,7 @@ query_frontend: concurrent_jobs: 1000 target_bytes_per_job: 104857600 max_duration: 24h0m0s - query_backend_after: 30m0s + query_backend_after: 15m0s interval: 5m0s max_exemplars: 100 streaming_shards: 200 @@ -1032,7 +1032,7 @@ live_store: path: /var/tempo/live-store/traces ingestion_time_range_slack: 2m0s query_block_concurrency: 10 - complete_block_timeout: 1h0m0s + complete_block_timeout: 20m0s complete_block_concurrency: 2 shutdown_marker_dir: /var/tempo/live-store/shutdown-marker flush_check_period: 5s @@ -1040,8 +1040,8 @@ live_store: max_trace_live: 30s max_trace_idle: 5s max_live_traces_bytes: 250000000 - max_block_duration: 1m0s - max_block_bytes: 104857600 + max_block_duration: 30s + max_block_bytes: 52428800 readiness_target_lag: 0s readiness_max_wait: 30m0s fail_on_high_lag: false diff --git a/docs/sources/tempo/metrics-from-traces/metrics-queries/configure-traceql-metrics.md b/docs/sources/tempo/metrics-from-traces/metrics-queries/configure-traceql-metrics.md index f726f77bb6d..28b684a1406 100644 --- a/docs/sources/tempo/metrics-from-traces/metrics-queries/configure-traceql-metrics.md +++ b/docs/sources/tempo/metrics-from-traces/metrics-queries/configure-traceql-metrics.md @@ -52,7 +52,7 @@ This is different to the default TraceQL maximum time range of 168 hours (7 days {{< /admonition >}} The `query_frontend.metrics.query_backend_after` parameter controls the boundary between querying the live-store and backend storage. -Time ranges older than `query_backend_after` (default `30m`) are searched in backend/object storage only, while more recent data is queried from the live-store. 
+Time ranges older than `query_backend_after` (default `15m`) are searched in backend/object storage only, while more recent data is queried from the live-store. For example, in a cloud environment, smaller jobs with more concurrency may be desired due to the nature of scale on the backend. diff --git a/docs/sources/tempo/set-up-for-tracing/setup-tempo/upgrade.md b/docs/sources/tempo/set-up-for-tracing/setup-tempo/upgrade.md index 9c63274c701..24fc0676e67 100644 --- a/docs/sources/tempo/set-up-for-tracing/setup-tempo/upgrade.md +++ b/docs/sources/tempo/set-up-for-tracing/setup-tempo/upgrade.md @@ -129,14 +129,17 @@ storage: - { scope: resource, name: service.name, type: string } ``` -### Live-store flush defaults updated +### Live-store and query defaults reduced -The default values for two live-store flush settings have changed to limit the amount of data replayed from the WAL, which prevents long startup and shutdown times. [[PR 6650](https://github.com/grafana/tempo/pull/6650)] +The default values for several live-store and query-frontend settings have been reduced to produce smaller WAL blocks, release completed blocks sooner, and align the metrics query backend boundary with search. [[PR 6650](https://github.com/grafana/tempo/pull/6650), [PR 6974](https://github.com/grafana/tempo/pull/6974)] -| Setting | Previous default | New default | -| ------------------------------- | ---------------- | ----------- | -| `live_store.flush_check_period` | `10s` | `5s` | -| `live_store.max_block_duration` | `30m` | `1m` | +| Setting | Previous default | New default | +| ------------------------------------------------ | ---------------- | ----------- | +| `live_store.flush_check_period` | `10s` | `5s` | +| `live_store.max_block_duration` | `30m` | `30s` | +| `live_store.max_block_bytes` | `100 MiB` | `50 MiB` | +| `live_store.complete_block_timeout` | `1h` | `20m` | +| `query_frontend.metrics.query_backend_after` | `30m` | `15m` | If you explicitly set these values in your configuration, no action is needed. 
diff --git a/modules/frontend/config.go b/modules/frontend/config.go index 4b8e2ac4b5c..5787f798f0b 100644 --- a/modules/frontend/config.go +++ b/modules/frontend/config.go @@ -108,7 +108,7 @@ func (cfg *Config) RegisterFlagsAndApplyDefaults(string, *flag.FlagSet) { cfg.Metrics = MetricsConfig{ Sharder: QueryRangeSharderConfig{ MaxDuration: 24 * time.Hour, - QueryBackendAfter: 30 * time.Minute, + QueryBackendAfter: 15 * time.Minute, ConcurrentRequests: defaultConcurrentRequests, TargetBytesPerRequest: defaultTargetBytesPerRequest, Interval: 5 * time.Minute, diff --git a/modules/livestore/config.go b/modules/livestore/config.go index e02f4bcff12..b3173fac4a1 100644 --- a/modules/livestore/config.go +++ b/modules/livestore/config.go @@ -11,7 +11,7 @@ import ( "github.com/grafana/tempo/tempodb/wal" ) -const defaultCompleteBlockTimeout = time.Hour +const defaultCompleteBlockTimeout = 20 * time.Minute type Config struct { Ring ring.Config `yaml:"ring,omitempty"` @@ -103,8 +103,8 @@ func (cfg *Config) RegisterFlagsAndApplyDefaults(prefix string, f *flag.FlagSet) cfg.MaxTraceLive = 30 * time.Second cfg.MaxTraceIdle = 5 * time.Second cfg.MaxLiveTracesBytes = 250_000_000 // 250MB - cfg.MaxBlockDuration = 1 * time.Minute - cfg.MaxBlockBytes = 100 * 1024 * 1024 + cfg.MaxBlockDuration = 30 * time.Second + cfg.MaxBlockBytes = 50 * 1024 * 1024 cfg.CommitInterval = 5 * time.Second cfg.ConsumeFromKafka = true diff --git a/modules/livestore/live_store_test.go b/modules/livestore/live_store_test.go index 4125d69c113..3549fef459b 100644 --- a/modules/livestore/live_store_test.go +++ b/modules/livestore/live_store_test.go @@ -1213,7 +1213,7 @@ func TestIsLagged(t *testing.T) { ctx := user.InjectOrgID(t.Context(), testTenantID) resp, err := ls.QueryRange(ctx, &tempopb.QueryRangeRequest{ Query: "{} | rate()", - Start: uint64(now.Add(-30 * time.Minute).UnixNano()), + Start: uint64(now.Add(-defaultCompleteBlockTimeout).UnixNano()), End: uint64(tc.end.UnixNano()), Step: 
uint64(time.Second), })