feat: implement automatic region failure detector registrations #6370

Merged 7 commits on Jun 24, 2025
Changes from 6 commits
1 change: 1 addition & 0 deletions config/config.md
@@ -323,6 +323,7 @@
| `selector` | String | `round_robin` | Datanode selector type.<br/>- `round_robin` (default value)<br/>- `lease_based`<br/>- `load_based`<br/>For details, please see "https://docs.greptime.com/developer-guide/metasrv/selector". |
| `use_memory_store` | Bool | `false` | Store data in memory. |
| `enable_region_failover` | Bool | `false` | Whether to enable region failover.<br/>This feature is only available on GreptimeDB running on cluster mode and<br/>- Using Remote WAL<br/>- Using shared storage (e.g., s3). |
| `region_failure_detector_initialization_delay` | String | `10m` | Delay before initializing region failure detectors.<br/><br/>This delay helps prevent premature initialization of region failure detectors in cases where<br/>cluster maintenance mode is enabled right after metasrv starts, especially when the cluster<br/>is not deployed via the recommended GreptimeDB Operator. Without this delay, early detector registration<br/>may trigger unnecessary region failovers during datanode startup. |
| `allow_region_failover_on_local_wal` | Bool | `false` | Whether to allow region failover on local WAL.<br/>**This option is not recommended to be set to true, because it may lead to data loss during failover.** |
| `node_max_idle_time` | String | `24hours` | Max allowed idle time before removing node info from metasrv memory. |
| `enable_telemetry` | Bool | `true` | Whether to enable greptimedb telemetry. Enabled by default. |
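
The new `region_failure_detector_initialization_delay` value is a humantime-style duration string. As a quick illustration (a minimal sketch assuming the `humantime` crate, whose notation these option values follow), the `10m` default parses to ten minutes:

```rust
use std::time::Duration;

fn main() {
    // "10m" in humantime notation means ten minutes; values such as "30s"
    // or "1h" follow the same format.
    let delay = humantime::parse_duration("10m").expect("valid duration string");
    assert_eq!(delay, Duration::from_secs(10 * 60));
}
```
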
8 changes: 8 additions & 0 deletions config/metasrv.example.toml
@@ -43,6 +43,14 @@ use_memory_store = false
## - Using shared storage (e.g., s3).
enable_region_failover = false

## Delay before initializing region failure detectors.
##
## This delay helps prevent premature initialization of region failure detectors in cases where
## cluster maintenance mode is enabled right after metasrv starts, especially when the cluster
## is not deployed via the recommended GreptimeDB Operator. Without this delay, early detector registration
## may trigger unnecessary region failovers during datanode startup.
region_failure_detector_initialization_delay = '10m'

## Whether to allow region failover on local WAL.
## **This option is not recommended to be set to true, because it may lead to data loss during failover.**
allow_region_failover_on_local_wal = false
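
A minimal sketch of how a TOML value like the one above could be deserialized into a typed options struct via `humantime_serde`, the same mechanism the `MetasrvOptions` change below relies on. `FailoverOptions` here is an illustrative stand-in, not the real struct:

```rust
use std::time::Duration;

use serde::Deserialize;

#[derive(Debug, Deserialize)]
struct FailoverOptions {
    enable_region_failover: bool,
    /// Parsed from humantime strings such as "10m" or "30s".
    #[serde(with = "humantime_serde")]
    region_failure_detector_initialization_delay: Duration,
}

fn main() {
    let toml_src = r#"
        enable_region_failover = false
        region_failure_detector_initialization_delay = '10m'
    "#;
    let options: FailoverOptions = toml::from_str(toml_src).expect("valid TOML");
    assert_eq!(
        options.region_failure_detector_initialization_delay,
        Duration::from_secs(10 * 60)
    );
    println!("{options:?}");
}
```
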
5 changes: 5 additions & 0 deletions src/common/meta/src/key/table_route.rs
@@ -48,6 +48,11 @@ impl TableRouteKey {
    pub fn new(table_id: TableId) -> Self {
        Self { table_id }
    }

    /// Returns the range prefix of the table route key.
    pub fn range_prefix() -> Vec<u8> {
        format!("{}/", TABLE_ROUTE_PREFIX).into_bytes()
    }
}

#[derive(Debug, PartialEq, Serialize, Deserialize, Clone)]
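
`range_prefix()` yields the key prefix (`TABLE_ROUTE_PREFIX` plus the `/` separator) under which all table route entries are stored, so a caller can range-scan every table route in one pass. A conceptual sketch with a `BTreeMap` standing in for the kv backend; the literal prefix string and key layout are illustrative assumptions:

```rust
use std::collections::BTreeMap;

fn main() {
    // Stand-in for the kv backend; keys follow "<prefix>/<table_id>".
    // "__table_route" is an illustrative prefix, not necessarily the real value.
    let mut kv: BTreeMap<Vec<u8>, Vec<u8>> = BTreeMap::new();
    kv.insert(b"__table_route/1024".to_vec(), b"route-a".to_vec());
    kv.insert(b"__table_route/1025".to_vec(), b"route-b".to_vec());
    kv.insert(b"__table_info/1024".to_vec(), b"info".to_vec());

    // Equivalent of TableRouteKey::range_prefix(): the prefix plus the separator.
    let prefix = b"__table_route/".to_vec();

    // Range scan: walk keys in order from the prefix and stop once a key no
    // longer begins with it.
    let routes: Vec<_> = kv
        .range(prefix.clone()..)
        .take_while(|(key, _)| key.starts_with(prefix.as_slice()))
        .map(|(_, value)| value.clone())
        .collect();
    assert_eq!(routes.len(), 2);
}
```

In this PR the supervisor presumably combines this prefix with the leader-cached kv backend (threaded through the builder below) to enumerate existing table routes when it registers region failure detectors automatically.
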
9 changes: 0 additions & 9 deletions src/meta-srv/src/error.rs
@@ -54,14 +54,6 @@ pub enum Error {
        peer_id: u64,
    },

    #[snafu(display("Failed to lookup peer: {}", peer_id))]
    LookupPeer {
        #[snafu(implicit)]
        location: Location,
        source: common_meta::error::Error,
        peer_id: u64,
    },

    #[snafu(display("Another migration procedure is running for region: {}", region_id))]
    MigrationRunning {
        #[snafu(implicit)]
@@ -1033,7 +1025,6 @@ impl ErrorExt for Error {
            }

            Error::Other { source, .. } => source.status_code(),
            Error::LookupPeer { source, .. } => source.status_code(),
            Error::NoEnoughAvailableNode { .. } => StatusCode::RuntimeResourcesExhausted,

            #[cfg(feature = "pg_kvbackend")]
9 changes: 9 additions & 0 deletions src/meta-srv/src/metasrv.rs
@@ -110,6 +110,14 @@ pub struct MetasrvOptions {
    pub use_memory_store: bool,
    /// Whether to enable region failover.
    pub enable_region_failover: bool,
    /// Delay before initializing region failure detectors.
    ///
    /// This delay helps prevent premature initialization of region failure detectors in cases where
    /// cluster maintenance mode is enabled right after metasrv starts, especially when the cluster
    /// is not deployed via the recommended GreptimeDB Operator. Without this delay, early detector registration
    /// may trigger unnecessary region failovers during datanode startup.
    #[serde(with = "humantime_serde")]
    pub region_failure_detector_initialization_delay: Duration,
    /// Whether to allow region failover on local WAL.
    ///
    /// If it's true, the region failover will be allowed even if the local WAL is used.
@@ -219,6 +227,7 @@ impl Default for MetasrvOptions {
            selector: SelectorType::default(),
            use_memory_store: false,
            enable_region_failover: false,
            region_failure_detector_initialization_delay: Duration::from_secs(10 * 60),
            allow_region_failover_on_local_wal: false,
            grpc: GrpcOptions {
                bind_addr: format!("127.0.0.1:{}", DEFAULT_METASRV_ADDR_PORT),
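
The `Duration::from_secs(10 * 60)` default mirrors the documented `10m`. A quick consistency check (again assuming the `humantime` crate, which `humantime_serde` builds on):

```rust
use std::time::Duration;

fn main() {
    let default_delay = Duration::from_secs(10 * 60);
    // 600 seconds renders as "10m", matching config/config.md and
    // config/metasrv.example.toml above.
    assert_eq!(humantime::format_duration(default_delay).to_string(), "10m");
}
```
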
5 changes: 4 additions & 1 deletion src/meta-srv/src/metasrv/builder.rs
@@ -64,7 +64,7 @@ use crate::procedure::wal_prune::manager::{WalPruneManager, WalPruneTicker};
use crate::procedure::wal_prune::Context as WalPruneContext;
use crate::region::supervisor::{
    HeartbeatAcceptor, RegionFailureDetectorControl, RegionSupervisor, RegionSupervisorSelector,
    RegionSupervisorTicker, DEFAULT_TICK_INTERVAL,
    RegionSupervisorTicker, DEFAULT_INITIALIZATION_RETRY_PERIOD, DEFAULT_TICK_INTERVAL,
};
use crate::selector::lease_based::LeaseBasedSelector;
use crate::selector::round_robin::RoundRobinSelector;
@@ -299,6 +299,8 @@ impl MetasrvBuilder {
            Arc::new(RegionFailureDetectorControl::new(tx.clone())) as _,
            Some(Arc::new(RegionSupervisorTicker::new(
                DEFAULT_TICK_INTERVAL,
                options.region_failure_detector_initialization_delay,
                DEFAULT_INITIALIZATION_RETRY_PERIOD,
                tx.clone(),
            ))),
        )
@@ -341,6 +343,7 @@ impl MetasrvBuilder {
            region_migration_manager.clone(),
            maintenance_mode_manager.clone(),
            peer_lookup_service.clone(),
            leader_cached_kv_backend.clone(),
        );

        Some(RegionFailureHandler::new(
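
The ticker is now constructed with three timing parameters: the regular tick interval, the initialization delay from the new option, and a retry period for the initialization step. A rough sketch of how such a ticker might sequence these phases with `tokio` timers; the `Event` type, the acknowledgement channel, and the control flow are illustrative assumptions, not the actual `RegionSupervisorTicker` implementation:

```rust
use std::time::Duration;

use tokio::sync::{mpsc, oneshot};
use tokio::time::{interval, sleep};

enum Event {
    /// Ask the supervisor to register failure detectors for all known regions.
    InitializeAllRegions(oneshot::Sender<()>),
    /// Regular failure-detection tick.
    Tick,
}

async fn run_ticker(
    tick_interval: Duration,
    initialization_delay: Duration,
    initialization_retry_period: Duration,
    sender: mpsc::Sender<Event>,
) {
    // Phase 1: wait out the delay so detectors are not registered while the
    // cluster is still starting up or being put into maintenance mode.
    sleep(initialization_delay).await;

    // Phase 2: request detector initialization, retrying until the supervisor
    // acknowledges it (it may decline, e.g. while maintenance mode is on).
    loop {
        let (ack_tx, ack_rx) = oneshot::channel();
        if sender
            .send(Event::InitializeAllRegions(ack_tx))
            .await
            .is_err()
        {
            return; // supervisor has shut down
        }
        if ack_rx.await.is_ok() {
            break;
        }
        sleep(initialization_retry_period).await;
    }

    // Phase 3: the regular tick loop driving failure detection.
    let mut ticker = interval(tick_interval);
    loop {
        ticker.tick().await;
        if sender.send(Event::Tick).await.is_err() {
            return;
        }
    }
}

#[tokio::main]
async fn main() {
    let (tx, mut rx) = mpsc::channel(16);
    tokio::spawn(run_ticker(
        Duration::from_secs(1),
        Duration::from_millis(100),
        Duration::from_millis(500),
        tx,
    ));
    // Acknowledge the initialization request, then observe a few ticks.
    if let Some(Event::InitializeAllRegions(ack)) = rx.recv().await {
        let _ = ack.send(());
    }
    for _ in 0..3 {
        rx.recv().await;
    }
}
```

In the diff itself, the delay comes from `options.region_failure_detector_initialization_delay` and the retry period from the new `DEFAULT_INITIALIZATION_RETRY_PERIOD` constant.
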
31 changes: 21 additions & 10 deletions src/meta-srv/src/procedure/region_migration/manager.rs
@@ -23,7 +23,7 @@ use common_meta::key::table_route::TableRouteValue;
use common_meta::peer::Peer;
use common_meta::rpc::router::RegionRoute;
use common_procedure::{watcher, ProcedureId, ProcedureManagerRef, ProcedureWithId};
use common_telemetry::{error, info};
use common_telemetry::{error, info, warn};
use snafu::{ensure, OptionExt, ResultExt};
use store_api::storage::RegionId;
use table::table_name::TableName;
@@ -253,10 +253,12 @@ impl RegionMigrationManager {
    }

    /// Throws an error if `leader_peer` is not the `from_peer`.
    ///
    /// If `from_peer` is unknown, use the leader peer as the `from_peer`.
    fn verify_region_leader_peer(
        &self,
        region_route: &RegionRoute,
        task: &RegionMigrationProcedureTask,
        task: &mut RegionMigrationProcedureTask,
    ) -> Result<()> {
        let leader_peer = region_route
            .leader_peer
@@ -275,6 +277,15 @@
            }
        );

        if task.from_peer.addr.is_empty() {
            warn!(
                "The `from_peer` is unknown, use the leader peer({}) as the `from_peer`, region: {}",
                leader_peer, task.region_id
            );
            // The peer id is the same as the leader peer id.
            task.from_peer = leader_peer.clone();
        }

        Ok(())
    }

@@ -300,7 +311,7 @@
    /// Submits a new region migration procedure.
    pub async fn submit_procedure(
        &self,
        task: RegionMigrationProcedureTask,
        mut task: RegionMigrationProcedureTask,
    ) -> Result<Option<ProcedureId>> {
        let Some(guard) = self.insert_running_procedure(&task) else {
            return error::MigrationRunningSnafu {
Expand Down Expand Up @@ -333,20 +344,14 @@ impl RegionMigrationManager {
            .fail();
        }

        self.verify_region_leader_peer(&region_route, &task)?;
        self.verify_region_leader_peer(&region_route, &mut task)?;
        self.verify_region_follower_peers(&region_route, &task)?;
        let table_info = self.retrieve_table_info(region_id).await?;
        let TableName {
            catalog_name,
            schema_name,
            ..
        } = table_info.table_name();
        METRIC_META_REGION_MIGRATION_DATANODES
            .with_label_values(&["src", &task.from_peer.id.to_string()])
            .inc();
        METRIC_META_REGION_MIGRATION_DATANODES
            .with_label_values(&["desc", &task.to_peer.id.to_string()])
            .inc();
        let RegionMigrationProcedureTask {
            region_id,
            from_peer,
Expand Down Expand Up @@ -377,6 +382,12 @@ impl RegionMigrationManager {
                    return;
                }
            };
            METRIC_META_REGION_MIGRATION_DATANODES
                .with_label_values(&["src", &task.from_peer.id.to_string()])
                .inc();
            METRIC_META_REGION_MIGRATION_DATANODES
                .with_label_values(&["desc", &task.to_peer.id.to_string()])
                .inc();

            if let Err(e) = watcher::wait(watcher).await {
                error!(e; "Failed to wait region migration procedure {procedure_id} for {task}");
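
Two behavioral changes are visible in this file: `verify_region_leader_peer` now takes the task mutably and, when the task's `from_peer` carries only a peer id (its address is empty), adopts the region's current leader peer as the migration source; and the migration-datanode counters are incremented only after the procedure has actually been submitted. A standalone sketch of the `from_peer` fallback; the `Peer` struct and error type are simplified stand-ins for the real types:

```rust
#[derive(Clone, Debug, PartialEq)]
struct Peer {
    id: u64,
    addr: String,
}

/// Verify that the migration source matches the region's current leader and
/// fill in the full leader peer when only the peer id is known.
fn verify_from_peer(leader_peer: &Peer, from_peer: &mut Peer) -> Result<(), String> {
    // The source of a leader migration must be the current region leader.
    if from_peer.id != leader_peer.id {
        return Err(format!(
            "from_peer {} is not the leader peer {}",
            from_peer.id, leader_peer.id
        ));
    }
    // A task that carries only a peer id has an empty address; adopt the
    // leader peer, which shares the same id (mirroring the comment in the diff).
    if from_peer.addr.is_empty() {
        *from_peer = leader_peer.clone();
    }
    Ok(())
}

fn main() {
    let leader = Peer {
        id: 42,
        addr: "10.0.0.5:4001".to_string(),
    };
    let mut from_peer = Peer {
        id: 42,
        addr: String::new(),
    };
    verify_from_peer(&leader, &mut from_peer).unwrap();
    assert_eq!(from_peer, leader);
}
```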