diff --git a/config/config.md b/config/config.md
index 37cd5d1bacb1..3250728e9e81 100644
--- a/config/config.md
+++ b/config/config.md
@@ -323,6 +323,7 @@
| `selector` | String | `round_robin` | Datanode selector type. - `round_robin` (default value) - `lease_based` - `load_based` For details, please see "https://docs.greptime.com/developer-guide/metasrv/selector". |
| `use_memory_store` | Bool | `false` | Store data in memory. |
| `enable_region_failover` | Bool | `false` | Whether to enable region failover. This feature is only available on GreptimeDB running on cluster mode and - Using Remote WAL - Using shared storage (e.g., s3). |
+| `region_failure_detector_initialization_delay` | String | `10m` | Delay before initializing region failure detectors. This delay helps prevent premature initialization of region failure detectors in cases where cluster maintenance mode is enabled right after metasrv starts, especially when the cluster is not deployed via the recommended GreptimeDB Operator. Without this delay, early detector registration may trigger unnecessary region failovers during datanode startup. |
| `allow_region_failover_on_local_wal` | Bool | `false` | Whether to allow region failover on local WAL. **This option is not recommended to be set to true, because it may lead to data loss during failover.** |
| `node_max_idle_time` | String | `24hours` | Max allowed idle time before removing node info from metasrv memory. |
| `enable_telemetry` | Bool | `true` | Whether to enable greptimedb telemetry. Enabled by default. |
diff --git a/config/metasrv.example.toml b/config/metasrv.example.toml
index 81213b2a1268..63ff0155520b 100644
--- a/config/metasrv.example.toml
+++ b/config/metasrv.example.toml
@@ -43,6 +43,13 @@ use_memory_store = false
## - Using shared storage (e.g., s3).
enable_region_failover = false
+## Delay before initializing region failure detectors.
+## This delay helps prevent premature initialization of region failure detectors in cases where
+## cluster maintenance mode is enabled right after metasrv starts, especially when the cluster
+## is not deployed via the recommended GreptimeDB Operator. Without this delay, early detector registration
+## may trigger unnecessary region failovers during datanode startup.
+region_failure_detector_initialization_delay = '10m'
+
## Whether to allow region failover on local WAL.
## **This option is not recommended to be set to true, because it may lead to data loss during failover.**
allow_region_failover_on_local_wal = false
diff --git a/src/common/meta/src/key/table_route.rs b/src/common/meta/src/key/table_route.rs
index 94d2a0bf07b3..dbf87adf2f9e 100644
--- a/src/common/meta/src/key/table_route.rs
+++ b/src/common/meta/src/key/table_route.rs
@@ -48,6 +48,11 @@ impl TableRouteKey {
pub fn new(table_id: TableId) -> Self {
Self { table_id }
}
+
+ /// Returns the range prefix of the table route key: `TABLE_ROUTE_PREFIX` followed by `/`, as bytes.
+ pub fn range_prefix() -> Vec<u8> {
+ format!("{}/", TABLE_ROUTE_PREFIX).into_bytes()
+ }
}
#[derive(Debug, PartialEq, Serialize, Deserialize, Clone)]
diff --git a/src/meta-srv/src/error.rs b/src/meta-srv/src/error.rs
index b1ea799e5fce..7abf5193f6f0 100644
--- a/src/meta-srv/src/error.rs
+++ b/src/meta-srv/src/error.rs
@@ -54,14 +54,6 @@ pub enum Error {
peer_id: u64,
},
- #[snafu(display("Failed to lookup peer: {}", peer_id))]
- LookupPeer {
- #[snafu(implicit)]
- location: Location,
- source: common_meta::error::Error,
- peer_id: u64,
- },
-
#[snafu(display("Another migration procedure is running for region: {}", region_id))]
MigrationRunning {
#[snafu(implicit)]
@@ -1033,7 +1025,6 @@ impl ErrorExt for Error {
}
Error::Other { source, .. } => source.status_code(),
- Error::LookupPeer { source, .. } => source.status_code(),
Error::NoEnoughAvailableNode { .. } => StatusCode::RuntimeResourcesExhausted,
#[cfg(feature = "pg_kvbackend")]
diff --git a/src/meta-srv/src/metasrv.rs b/src/meta-srv/src/metasrv.rs
index 50797d44c480..91d0b22caf63 100644
--- a/src/meta-srv/src/metasrv.rs
+++ b/src/meta-srv/src/metasrv.rs
@@ -110,6 +110,14 @@ pub struct MetasrvOptions {
pub use_memory_store: bool,
/// Whether to enable region failover.
pub enable_region_failover: bool,
+ /// Delay before initializing region failure detectors.
+ ///
+ /// This delay helps prevent premature initialization of region failure detectors in cases where
+ /// cluster maintenance mode is enabled right after metasrv starts, especially when the cluster
+ /// is not deployed via the recommended GreptimeDB Operator. Without this delay, early detector registration
+ /// may trigger unnecessary region failovers during datanode startup.
+ #[serde(with = "humantime_serde")]
+ pub region_failure_detector_initialization_delay: Duration,
/// Whether to allow region failover on local WAL.
///
/// If it's true, the region failover will be allowed even if the local WAL is used.
@@ -219,6 +227,7 @@ impl Default for MetasrvOptions {
selector: SelectorType::default(),
use_memory_store: false,
enable_region_failover: false,
+ region_failure_detector_initialization_delay: Duration::from_secs(10 * 60),
allow_region_failover_on_local_wal: false,
grpc: GrpcOptions {
bind_addr: format!("127.0.0.1:{}", DEFAULT_METASRV_ADDR_PORT),
diff --git a/src/meta-srv/src/metasrv/builder.rs b/src/meta-srv/src/metasrv/builder.rs
index 167c5afd8ea3..85e50c8669f4 100644
--- a/src/meta-srv/src/metasrv/builder.rs
+++ b/src/meta-srv/src/metasrv/builder.rs
@@ -64,7 +64,7 @@ use crate::procedure::wal_prune::manager::{WalPruneManager, WalPruneTicker};
use crate::procedure::wal_prune::Context as WalPruneContext;
use crate::region::supervisor::{
HeartbeatAcceptor, RegionFailureDetectorControl, RegionSupervisor, RegionSupervisorSelector,
- RegionSupervisorTicker, DEFAULT_TICK_INTERVAL,
+ RegionSupervisorTicker, DEFAULT_INITIALIZATION_RETRY_PERIOD, DEFAULT_TICK_INTERVAL,
};
use crate::selector::lease_based::LeaseBasedSelector;
use crate::selector::round_robin::RoundRobinSelector;
@@ -299,6 +299,8 @@ impl MetasrvBuilder {
Arc::new(RegionFailureDetectorControl::new(tx.clone())) as _,
Some(Arc::new(RegionSupervisorTicker::new(
DEFAULT_TICK_INTERVAL,
+ options.region_failure_detector_initialization_delay,
+ DEFAULT_INITIALIZATION_RETRY_PERIOD,
tx.clone(),
))),
)
@@ -341,6 +343,7 @@ impl MetasrvBuilder {
region_migration_manager.clone(),
maintenance_mode_manager.clone(),
peer_lookup_service.clone(),
+ leader_cached_kv_backend.clone(),
);
Some(RegionFailureHandler::new(
diff --git a/src/meta-srv/src/procedure/region_migration/manager.rs b/src/meta-srv/src/procedure/region_migration/manager.rs
index b277bd3e23b6..adf6c0732b7e 100644
--- a/src/meta-srv/src/procedure/region_migration/manager.rs
+++ b/src/meta-srv/src/procedure/region_migration/manager.rs
@@ -23,7 +23,7 @@ use common_meta::key::table_route::TableRouteValue;
use common_meta::peer::Peer;
use common_meta::rpc::router::RegionRoute;
use common_procedure::{watcher, ProcedureId, ProcedureManagerRef, ProcedureWithId};
-use common_telemetry::{error, info};
+use common_telemetry::{error, info, warn};
use snafu::{ensure, OptionExt, ResultExt};
use store_api::storage::RegionId;
use table::table_name::TableName;
@@ -253,10 +253,12 @@ impl RegionMigrationManager {
}
/// Throws an error if `leader_peer` is not the `from_peer`.
+ ///
+ /// If `from_peer` is unknown, use the leader peer as the `from_peer`.
fn verify_region_leader_peer(
&self,
region_route: &RegionRoute,
- task: &RegionMigrationProcedureTask,
+ task: &mut RegionMigrationProcedureTask,
) -> Result<()> {
let leader_peer = region_route
.leader_peer
@@ -275,6 +277,15 @@ impl RegionMigrationManager {
}
);
+ if task.from_peer.addr.is_empty() {
+ warn!(
+ "The `from_peer` is unknown, use the leader peer({}) as the `from_peer`, region: {}",
+ leader_peer, task.region_id
+ );
+ // The peer id is the same as the leader peer id.
+ task.from_peer = leader_peer.clone();
+ }
+
Ok(())
}
@@ -300,7 +311,7 @@ impl RegionMigrationManager {
/// Submits a new region migration procedure.
pub async fn submit_procedure(
&self,
- task: RegionMigrationProcedureTask,
+ mut task: RegionMigrationProcedureTask,
) -> Result