[clickhouse] Implement FindTraceIDs for ClickHouse Storage For Primitive Parameters#7648
Conversation
Signed-off-by: Mahad Zaryab <mahadzaryab1@gmail.com>
Signed-off-by: Mahad Zaryab <mahadzaryab1@gmail.com>
Codecov Report✅ All modified and coverable lines are covered by tests. Additional details and impacted files@@ Coverage Diff @@
## main #7648 +/- ##
==========================================
+ Coverage 95.54% 95.58% +0.03%
==========================================
Files 307 307
Lines 15392 15440 +48
==========================================
+ Hits 14707 14759 +52
+ Misses 537 534 -3
+ Partials 148 147 -1
Flags with carried forward coverage won't be shown. Click here to find out more. ☔ View full report in Codecov by Sentry. 🚀 New features to boost your workflow:
|
Metrics Comparison SummaryTotal changes across all snapshots: 53 Detailed changes per snapshotsummary_metrics_snapshot_cassandra📊 Metrics Diff SummaryTotal Changes: 53
❌ Removed Metrics
View diff sample-http_server_request_body_size_bytes{http_request_method="GET",http_response_status_code="503",le="+Inf",network_protocol_name="http",network_protocol_version="1.1",otel_scope_name="go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp",otel_scope_schema_url="",otel_scope_version="0.64.0",server_address="localhost",server_port="13133",url_scheme="http"}
-http_server_request_body_size_bytes{http_request_method="GET",http_response_status_code="503",le="0",network_protocol_name="http",network_protocol_version="1.1",otel_scope_name="go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp",otel_scope_schema_url="",otel_scope_version="0.64.0",server_address="localhost",server_port="13133",url_scheme="http"}
-http_server_request_body_size_bytes{http_request_method="GET",http_response_status_code="503",le="10",network_protocol_name="http",network_protocol_version="1.1",otel_scope_name="go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp",otel_scope_schema_url="",otel_scope_version="0.64.0",server_address="localhost",server_port="13133",url_scheme="http"}
-http_server_request_body_size_bytes{http_request_method="GET",http_response_status_code="503",le="100",network_protocol_name="http",network_protocol_version="1.1",otel_scope_name="go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp",otel_scope_schema_url="",otel_scope_version="0.64.0",server_address="localhost",server_port="13133",url_scheme="http"}
-http_server_request_body_size_bytes{http_request_method="GET",http_response_status_code="503",le="1000",network_protocol_name="http",network_protocol_version="1.1",otel_scope_name="go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp",otel_scope_schema_url="",otel_scope_version="0.64.0",server_address="localhost",server_port="13133",url_scheme="http"}
-http_server_request_body_size_bytes{http_request_method="GET",http_response_status_code="503",le="10000",network_protocol_name="http",network_protocol_version="1.1",otel_scope_name="go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp",otel_scope_schema_url="",otel_scope_version="0.64.0",server_address="localhost",server_port="13133",url_scheme="http"}
-http_server_request_body_size_bytes{http_request_method="GET",http_response_status_code="503",le="25",network_protocol_name="http",network_protocol_version="1.1",otel_scope_name="go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp",otel_scope_schema_url="",otel_scope_version="0.64.0",server_address="localhost",server_port="13133",url_scheme="http"}
...View diff sample-http_server_request_duration_seconds{http_request_method="GET",http_response_status_code="503",le="+Inf",network_protocol_name="http",network_protocol_version="1.1",otel_scope_name="go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp",otel_scope_schema_url="",otel_scope_version="0.64.0",server_address="localhost",server_port="13133",url_scheme="http"}
-http_server_request_duration_seconds{http_request_method="GET",http_response_status_code="503",le="0.005",network_protocol_name="http",network_protocol_version="1.1",otel_scope_name="go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp",otel_scope_schema_url="",otel_scope_version="0.64.0",server_address="localhost",server_port="13133",url_scheme="http"}
-http_server_request_duration_seconds{http_request_method="GET",http_response_status_code="503",le="0.01",network_protocol_name="http",network_protocol_version="1.1",otel_scope_name="go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp",otel_scope_schema_url="",otel_scope_version="0.64.0",server_address="localhost",server_port="13133",url_scheme="http"}
-http_server_request_duration_seconds{http_request_method="GET",http_response_status_code="503",le="0.025",network_protocol_name="http",network_protocol_version="1.1",otel_scope_name="go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp",otel_scope_schema_url="",otel_scope_version="0.64.0",server_address="localhost",server_port="13133",url_scheme="http"}
-http_server_request_duration_seconds{http_request_method="GET",http_response_status_code="503",le="0.05",network_protocol_name="http",network_protocol_version="1.1",otel_scope_name="go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp",otel_scope_schema_url="",otel_scope_version="0.64.0",server_address="localhost",server_port="13133",url_scheme="http"}
-http_server_request_duration_seconds{http_request_method="GET",http_response_status_code="503",le="0.075",network_protocol_name="http",network_protocol_version="1.1",otel_scope_name="go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp",otel_scope_schema_url="",otel_scope_version="0.64.0",server_address="localhost",server_port="13133",url_scheme="http"}
-http_server_request_duration_seconds{http_request_method="GET",http_response_status_code="503",le="0.1",network_protocol_name="http",network_protocol_version="1.1",otel_scope_name="go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp",otel_scope_schema_url="",otel_scope_version="0.64.0",server_address="localhost",server_port="13133",url_scheme="http"}
...View diff sample-http_server_response_body_size_bytes{http_request_method="GET",http_response_status_code="503",le="+Inf",network_protocol_name="http",network_protocol_version="1.1",otel_scope_name="go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp",otel_scope_schema_url="",otel_scope_version="0.64.0",server_address="localhost",server_port="13133",url_scheme="http"}
-http_server_response_body_size_bytes{http_request_method="GET",http_response_status_code="503",le="0",network_protocol_name="http",network_protocol_version="1.1",otel_scope_name="go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp",otel_scope_schema_url="",otel_scope_version="0.64.0",server_address="localhost",server_port="13133",url_scheme="http"}
-http_server_response_body_size_bytes{http_request_method="GET",http_response_status_code="503",le="10",network_protocol_name="http",network_protocol_version="1.1",otel_scope_name="go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp",otel_scope_schema_url="",otel_scope_version="0.64.0",server_address="localhost",server_port="13133",url_scheme="http"}
-http_server_response_body_size_bytes{http_request_method="GET",http_response_status_code="503",le="100",network_protocol_name="http",network_protocol_version="1.1",otel_scope_name="go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp",otel_scope_schema_url="",otel_scope_version="0.64.0",server_address="localhost",server_port="13133",url_scheme="http"}
-http_server_response_body_size_bytes{http_request_method="GET",http_response_status_code="503",le="1000",network_protocol_name="http",network_protocol_version="1.1",otel_scope_name="go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp",otel_scope_schema_url="",otel_scope_version="0.64.0",server_address="localhost",server_port="13133",url_scheme="http"}
-http_server_response_body_size_bytes{http_request_method="GET",http_response_status_code="503",le="10000",network_protocol_name="http",network_protocol_version="1.1",otel_scope_name="go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp",otel_scope_schema_url="",otel_scope_version="0.64.0",server_address="localhost",server_port="13133",url_scheme="http"}
-http_server_response_body_size_bytes{http_request_method="GET",http_response_status_code="503",le="25",network_protocol_name="http",network_protocol_version="1.1",otel_scope_name="go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp",otel_scope_schema_url="",otel_scope_version="0.64.0",server_address="localhost",server_port="13133",url_scheme="http"}
... |
Signed-off-by: Mahad Zaryab <mahadzaryab1@gmail.com>
Signed-off-by: Mahad Zaryab <mahadzaryab1@gmail.com>
Signed-off-by: Mahad Zaryab <mahadzaryab1@gmail.com>
Signed-off-by: Mahad Zaryab <mahadzaryab1@gmail.com>
Signed-off-by: Mahad Zaryab <mahadzaryab1@gmail.com>
Signed-off-by: Mahad Zaryab <mahadzaryab1@gmail.com>
Signed-off-by: Mahad Zaryab <mahadzaryab1@gmail.com>
yurishkuro
left a comment
There was a problem hiding this comment.
We need to revisit the schema & index design. Looking at the spans table, all I see is
) ENGINE = MergeTree PRIMARY KEY (trace_id)
which doesn't make sense to me. Do we have a design doc about how we want CH to lay out the data?
At minimum there needs to be ORDER BY. The typical way to organize telemetry data is to group data by time range.
internal/storage/v2/clickhouse/sql/create_service_name_index.sql
Outdated
Show resolved
Hide resolved
internal/storage/v2/clickhouse/sql/create_service_name_index.sql
Outdated
Show resolved
Hide resolved
Signed-off-by: Mahad Zaryab <mahadzaryab1@gmail.com>
Signed-off-by: Mahad Zaryab <mahadzaryab1@gmail.com>
Signed-off-by: Mahad Zaryab <mahadzaryab1@gmail.com>
Signed-off-by: Mahad Zaryab <mahadzaryab1@gmail.com>
Signed-off-by: Mahad Zaryab <mahadzaryab1@gmail.com>
| ) | ||
|
|
||
| const ( | ||
| defualtProtocol = "native" |
There was a problem hiding this comment.
Typo in constant name: defualtProtocol should be defaultProtocol. This typo is propagated to the test file (config_test.go line 90) and should be fixed in both places.
Spotted by Graphite Agent
Is this helpful? React 👍 or 👎 to let us know.
Signed-off-by: Mahad Zaryab <mahadzaryab1@gmail.com>
@yurishkuro For the most part - the primary key and the sort key are the same (https://clickhouse.com/docs/best-practices/choosing-a-primary-key). A primary key in ClickHouse doesn't define uniqueness. I did change this to be the sort key however since that looks to be the convention in ClickHouse. I'm going to create a design document to optimize the schema via the sort key and data-skipping indexes. I can make those optimizations in a follow-up PR. |
| b, err := hex.DecodeString(traceID) | ||
| if err != nil { | ||
| if !yield(nil, fmt.Errorf("failed to decode trace ID: %w", err)) { | ||
| return | ||
| } | ||
| continue | ||
| } | ||
|
|
||
| if !yield([]tracestore.FoundTraceID{{TraceID: pcommon.TraceID(b)}}, nil) { |
There was a problem hiding this comment.
Type conversion error: Cannot convert []byte to pcommon.TraceID (which is [16]byte). The hex.DecodeString(traceID) returns a slice []byte, but pcommon.TraceID expects a fixed-size array [16]byte. This will cause a compilation error.
Fix by copying the slice into a fixed-size array:
b, err := hex.DecodeString(traceID)
if err != nil {
if !yield(nil, fmt.Errorf("failed to decode trace ID: %w", err)) {
return
}
continue
}
var traceIDArray [16]byte
if len(b) != 16 {
if !yield(nil, fmt.Errorf("invalid trace ID length: expected 16 bytes, got %d", len(b))) {
return
}
continue
}
copy(traceIDArray[:], b)
if !yield([]tracestore.FoundTraceID{{TraceID: pcommon.TraceID(traceIDArray)}}, nil) {
return
}| b, err := hex.DecodeString(traceID) | |
| if err != nil { | |
| if !yield(nil, fmt.Errorf("failed to decode trace ID: %w", err)) { | |
| return | |
| } | |
| continue | |
| } | |
| if !yield([]tracestore.FoundTraceID{{TraceID: pcommon.TraceID(b)}}, nil) { | |
| b, err := hex.DecodeString(traceID) | |
| if err != nil { | |
| if !yield(nil, fmt.Errorf("failed to decode trace ID: %w", err)) { | |
| return | |
| } | |
| continue | |
| } | |
| var traceIDArray [16]byte | |
| if len(b) != 16 { | |
| if !yield(nil, fmt.Errorf("invalid trace ID length: expected 16 bytes, got %d", len(b))) { | |
| return | |
| } | |
| continue | |
| } | |
| copy(traceIDArray[:], b) | |
| if !yield([]tracestore.FoundTraceID{{TraceID: pcommon.TraceID(traceIDArray)}}, nil) { | |
| return | |
| } |
Spotted by Graphite Agent
Is this helpful? React 👍 or 👎 to let us know.
Signed-off-by: Mahad Zaryab <43658574+mahadzaryab1@users.noreply.github.com>
Signed-off-by: Mahad Zaryab <mahadzaryab1@gmail.com>
| for rows.Next() { | ||
| var traceID string | ||
| if err := rows.Scan(&traceID); err != nil { | ||
| if !yield(nil, fmt.Errorf("failed to scan row: %w", err)) { |
There was a problem hiding this comment.
nit: it's distracting that there are so many places you need to call yield(). I would recommend moving the logic of reading the row into a helper function such that here you simply do
traceID, err := readRowIntoTraceID(row)
if !yield(traceID, err) {
return
}
Signed-off-by: Mahad Zaryab <mahadzaryab1@gmail.com>
c0b1db9
Which problem is this PR solving?
Description of the changes
FindTraceIDsfunction for ClickHouse Storage. It only adds the primitive query parameters (service name and operation name). The other query parameters will be added in follow-up PRs.How was this change tested?
Checklist
jaeger:make lint testjaeger-ui:npm run lintandnpm run test