@@ -554,7 +554,12 @@ public void onResponse(RepositoryData repositoryData) {
554
554
555
555
@ Override
556
556
public void onFailure (Exception e ) {
557
- logger .error ("Failed to upload files to snapshot repo {} for snapshot-v2 {} due to {} " , repositoryName , snapshotName , e );
557
+ logger .error (
558
+ "Failed to upload files to snapshot repo {} for snapshot-v2 {} due to {} " ,
559
+ repositoryName ,
560
+ snapshotName ,
561
+ e
562
+ );
558
563
listener .onFailure (e );
559
564
}
560
565
}
@@ -594,6 +599,8 @@ public void createSnapshotV2(final CreateSnapshotRequest request, final ActionLi
594
599
595
600
private Snapshot snapshot ;
596
601
602
+ boolean enteredLoop ;
603
+
597
604
@ Override
598
605
public ClusterState execute (ClusterState currentState ) {
599
606
// move to in progress
@@ -625,13 +632,7 @@ public ClusterState execute(ClusterState currentState) {
625
632
626
633
final SnapshotsInProgress snapshots = currentState .custom (SnapshotsInProgress .TYPE , SnapshotsInProgress .EMPTY );
627
634
final List <SnapshotsInProgress .Entry > runningSnapshots = snapshots .entries ();
628
- if (tryEnterRepoLoop (repositoryName ) == false ) {
629
- throw new ConcurrentSnapshotExecutionException (
630
- repositoryName ,
631
- snapshotName ,
632
- "cannot start snapshot-v2 while a repository is in finalization state"
633
- );
634
- }
635
+
635
636
final List <IndexId > indexIds = repositoryData .resolveNewIndices (
636
637
indices ,
637
638
getInFlightIndexIds (runningSnapshots , repositoryName ),
@@ -662,6 +663,15 @@ public ClusterState execute(ClusterState currentState) {
662
663
);
663
664
final List <SnapshotsInProgress .Entry > newEntries = new ArrayList <>(runningSnapshots );
664
665
newEntries .add (newEntry );
666
+
667
+ enteredLoop = tryEnterRepoLoop (repositoryName );
668
+ if (enteredLoop == false ) {
669
+ throw new ConcurrentSnapshotExecutionException (
670
+ repositoryName ,
671
+ snapshotName ,
672
+ "cannot start snapshot-v2 while a repository is in finalization state"
673
+ );
674
+ }
665
675
return ClusterState .builder (currentState )
666
676
.putCustom (SnapshotsInProgress .TYPE , SnapshotsInProgress .of (new ArrayList <>(newEntries )))
667
677
.build ();
@@ -671,7 +681,7 @@ public ClusterState execute(ClusterState currentState) {
671
681
public void onFailure (String source , Exception e ) {
672
682
logger .warn (() -> new ParameterizedMessage ("[{}][{}] failed to create snapshot-v2" , repositoryName , snapshotName ), e );
673
683
listener .onFailure (e );
674
- if (( e instanceof ConcurrentSnapshotExecutionException ) == false ) {
684
+ if (enteredLoop ) {
675
685
leaveRepoLoop (repositoryName );
676
686
}
677
687
@@ -685,7 +695,6 @@ public void clusterStateProcessed(String source, ClusterState oldState, final Cl
685
695
newEntry .indices (),
686
696
repositoryData
687
697
);
688
-
689
698
final List <String > dataStreams = indexNameExpressionResolver .dataStreamNames (
690
699
newState ,
691
700
request .indicesOptions (),
@@ -705,11 +714,18 @@ public void clusterStateProcessed(String source, ClusterState oldState, final Cl
705
714
true ,
706
715
pinnedTimestamp
707
716
);
717
+ // if (snapshotName.contains("snapshot-concurrent-")) {
718
+ // try {
719
+ // listener.onResponse(snapshotInfo);
720
+ // leaveRepoLoop(repositoryName);
721
+ // return;
722
+ // } catch (Exception e) {
723
+ // }
724
+ // }
725
+
708
726
final Version version = minCompatibleVersion (newState .nodes ().getMinNodeVersion (), repositoryData , null );
709
727
final StepListener <RepositoryData > pinnedTimestampListener = new StepListener <>();
710
- pinnedTimestampListener .whenComplete (repoData -> {
711
- listener .onResponse (snapshotInfo );
712
- }, listener ::onFailure );
728
+ pinnedTimestampListener .whenComplete (repoData -> { listener .onResponse (snapshotInfo ); }, listener ::onFailure );
713
729
repository .finalizeSnapshot (
714
730
shardGenerations ,
715
731
repositoryData .getGenId (),
@@ -721,7 +737,7 @@ public void clusterStateProcessed(String source, ClusterState oldState, final Cl
721
737
new ActionListener <RepositoryData >() {
722
738
@ Override
723
739
public void onResponse (RepositoryData repositoryData ) {
724
- if (! clusterService .state ().nodes ().isLocalNodeElectedClusterManager ()) {
740
+ if (clusterService .state ().nodes ().isLocalNodeElectedClusterManager () == false ) {
725
741
failSnapshotCompletionListeners (
726
742
snapshot ,
727
743
new SnapshotException (snapshot , "Aborting snapshot-v2, no longer cluster manager" )
@@ -732,7 +748,7 @@ public void onResponse(RepositoryData repositoryData) {
732
748
733
749
return ;
734
750
}
735
- logger . info ( "Process it now" );
751
+ endingSnapshots . remove ( snapshot );
736
752
leaveRepoLoop (repositoryName );
737
753
updateSnapshotPinnedTimestamp (repositoryData , snapshot , pinnedTimestamp , pinnedTimestampListener );
738
754
}
@@ -1667,9 +1683,10 @@ public void applyClusterState(ClusterChangedEvent event) {
1667
1683
SnapshotsInProgress snapshotsInProgress = event .state ().custom (SnapshotsInProgress .TYPE , SnapshotsInProgress .EMPTY );
1668
1684
final boolean newClusterManager = event .previousState ().nodes ().isLocalNodeElectedClusterManager () == false ;
1669
1685
if (newClusterManager && snapshotsInProgress .entries ().isEmpty () == false ) {
1670
- logger .info ("Cleaning it now" );
1671
- // clean up snapshot v2 in progress or clone v2 present
1672
- stateWithoutSnapshotv2 (event .state ());
1686
+ // clean up snapshot v2 in progress or clone v2 present.
1687
+ // Snapshot v2 create and clone are synchronous operations. If the cluster manager fails midway, we won't
1688
+ // send an ack to the caller and won't continue on the new cluster manager. The caller will need to retry.
1689
+ stateWithoutSnapshotV2 (event .state ());
1673
1690
}
1674
1691
processExternalChanges (
1675
1692
newClusterManager || removedNodesCleanupNeeded (snapshotsInProgress , event .nodesDelta ().removedNodes ()),
@@ -1782,7 +1799,14 @@ private void processExternalChanges(boolean changedNodes, boolean startShards) {
1782
1799
@ Override
1783
1800
public ClusterState execute (ClusterState currentState ) {
1784
1801
RoutingTable routingTable = currentState .routingTable ();
1785
- final SnapshotsInProgress snapshots = currentState .custom (SnapshotsInProgress .TYPE , SnapshotsInProgress .EMPTY );
1802
+ SnapshotsInProgress snapshots = currentState .custom (SnapshotsInProgress .TYPE , SnapshotsInProgress .EMPTY );
1803
+ // Removing shallow snapshots v2 as we take care of these in stateWithoutSnapshotV2()
1804
+ snapshots = SnapshotsInProgress .of (
1805
+ snapshots .entries ()
1806
+ .stream ()
1807
+ .filter (snapshot -> snapshot .remoteStoreIndexShallowCopyV2 () == false )
1808
+ .collect (Collectors .toList ())
1809
+ );
1786
1810
DiscoveryNodes nodes = currentState .nodes ();
1787
1811
boolean changed = false ;
1788
1812
final EnumSet <State > statesToUpdate ;
@@ -1839,7 +1863,7 @@ public ClusterState execute(ClusterState currentState) {
1839
1863
changed = true ;
1840
1864
logger .debug ("[{}] was found in dangling INIT or ABORTED state" , snapshot );
1841
1865
} else {
1842
- if (snapshot .state ().completed () || completed (snapshot .shards ().values ())) {
1866
+ if (( snapshot .state ().completed () || completed (snapshot .shards ().values () ))) {
1843
1867
finishedSnapshots .add (snapshot );
1844
1868
}
1845
1869
updatedSnapshotEntries .add (snapshot );
@@ -2365,9 +2389,8 @@ private static ClusterState stateWithoutSnapshot(ClusterState state, Snapshot sn
2365
2389
return readyDeletions (result ).v1 ();
2366
2390
}
2367
2391
2368
- private ClusterState stateWithoutSnapshotv2 (ClusterState state ) {
2392
+ private void stateWithoutSnapshotV2 (ClusterState state ) {
2369
2393
SnapshotsInProgress snapshots = state .custom (SnapshotsInProgress .TYPE , SnapshotsInProgress .EMPTY );
2370
- ClusterState result = state ;
2371
2394
boolean changed = false ;
2372
2395
ArrayList <SnapshotsInProgress .Entry > entries = new ArrayList <>();
2373
2396
for (SnapshotsInProgress .Entry entry : snapshots .entries ()) {
@@ -2378,32 +2401,44 @@ private ClusterState stateWithoutSnapshotv2(ClusterState state) {
2378
2401
}
2379
2402
}
2380
2403
if (changed ) {
2381
- result = ClusterState .builder (state )
2382
- .putCustom (SnapshotsInProgress .TYPE , SnapshotsInProgress .of (unmodifiableList (entries )))
2383
- .build ();
2384
-
2385
- ClusterState finalResult = result ;
2386
- clusterService .submitStateUpdateTask ("update snapshot v2 after cluster manager switch" , new ClusterStateUpdateTask () {
2387
-
2388
- @ Override
2389
- public ClusterState execute (ClusterState currentState ) throws Exception {
2390
- return finalResult ;
2391
- }
2404
+ clusterService .submitStateUpdateTask (
2405
+ "remove in progress snapshot v2 after cluster manager switch" ,
2406
+ new ClusterStateUpdateTask () {
2407
+ @ Override
2408
+ public ClusterState execute (ClusterState currentState ) {
2409
+ SnapshotsInProgress snapshots = state .custom (SnapshotsInProgress .TYPE , SnapshotsInProgress .EMPTY );
2410
+ boolean changed = false ;
2411
+ ArrayList <SnapshotsInProgress .Entry > entries = new ArrayList <>();
2412
+ for (SnapshotsInProgress .Entry entry : snapshots .entries ()) {
2413
+ if (entry .remoteStoreIndexShallowCopyV2 ()) {
2414
+ changed = true ;
2415
+ } else {
2416
+ entries .add (entry );
2417
+ }
2418
+ }
2419
+ if (changed ) {
2420
+ return ClusterState .builder (currentState )
2421
+ .putCustom (SnapshotsInProgress .TYPE , SnapshotsInProgress .of (unmodifiableList (entries )))
2422
+ .build ();
2423
+ } else {
2424
+ return currentState ;
2425
+ }
2426
+ }
2392
2427
2393
- @ Override
2394
- public void onFailure (String source , Exception e ) {
2395
- // execute never fails today, so we should never hit this.
2396
- logger .warn (
2397
- () -> new ParameterizedMessage (
2398
- "failed to remove in progress snapshot v2 state after cluster manager switch" ,
2399
- source
2400
- ),
2401
- e
2402
- );
2428
+ @ Override
2429
+ public void onFailure (String source , Exception e ) {
2430
+ // execute never fails , so we should never hit this.
2431
+ logger .warn (
2432
+ () -> new ParameterizedMessage (
2433
+ "failed to remove in progress snapshot v2 state after cluster manager switch" ,
2434
+ source
2435
+ ),
2436
+ e
2437
+ );
2438
+ }
2403
2439
}
2404
- } );
2440
+ );
2405
2441
}
2406
- return result ;
2407
2442
}
2408
2443
2409
2444
/**
@@ -3556,6 +3591,9 @@ public boolean assertAllListenersResolved() {
3556
3591
+ " on ["
3557
3592
+ localNode
3558
3593
+ "]" ;
3594
+ if (repositoryOperations .isEmpty () == false ) {
3595
+ logger .info ("Not empty" );
3596
+ }
3559
3597
assert repositoryOperations .isEmpty () : "Found leaked snapshots to finalize " + repositoryOperations + " on [" + localNode + "]" ;
3560
3598
return true ;
3561
3599
}
0 commit comments