117
117
import org .opensearch .index .snapshots .blobstore .RemoteStoreShardShallowCopySnapshot ;
118
118
import org .opensearch .index .snapshots .blobstore .SlicedInputStream ;
119
119
import org .opensearch .index .snapshots .blobstore .SnapshotFiles ;
120
+ import org .opensearch .index .store .RemoteSegmentStoreDirectory ;
120
121
import org .opensearch .index .store .RemoteSegmentStoreDirectoryFactory ;
121
122
import org .opensearch .index .store .Store ;
122
123
import org .opensearch .index .store .StoreFileMetadata ;
@@ -237,6 +238,8 @@ public abstract class BlobStoreRepository extends AbstractLifecycleComponent imp
237
238
Setting .Property .Deprecated
238
239
);
239
240
241
+ private static final Logger staticLogger = LogManager .getLogger (BlobStoreRepository .class );
242
+
240
243
/**
241
244
* Setting to disable caching of the latest repository data.
242
245
*/
@@ -1161,6 +1164,78 @@ private void asyncCleanupUnlinkedShardLevelBlobs(
1161
1164
}
1162
1165
}
1163
1166
1167
+ public static void remoteDirectoryCleanupAsync (
1168
+ RemoteSegmentStoreDirectoryFactory remoteDirectoryFactory ,
1169
+ ThreadPool threadpool ,
1170
+ String remoteStoreRepoForIndex ,
1171
+ String indexUUID ,
1172
+ ShardId shardId ,
1173
+ String threadPoolName
1174
+ ) {
1175
+ threadpool .executor (threadPoolName )
1176
+ .execute (
1177
+ new RemoteStoreShardCleanupTask (
1178
+ () -> RemoteSegmentStoreDirectory .remoteDirectoryCleanup (
1179
+ remoteDirectoryFactory ,
1180
+ remoteStoreRepoForIndex ,
1181
+ indexUUID ,
1182
+ shardId
1183
+ ),
1184
+ indexUUID ,
1185
+ shardId
1186
+ )
1187
+ );
1188
+ }
1189
+
1190
+ protected void releaseRemoteStoreLockAndCleanup (
1191
+ String shardId ,
1192
+ String shallowSnapshotUUID ,
1193
+ BlobContainer shardContainer ,
1194
+ RemoteStoreLockManagerFactory remoteStoreLockManagerFactory
1195
+ ) throws IOException {
1196
+ if (remoteStoreLockManagerFactory == null ) {
1197
+ return ;
1198
+ }
1199
+
1200
+ RemoteStoreShardShallowCopySnapshot remoteStoreShardShallowCopySnapshot = REMOTE_STORE_SHARD_SHALLOW_COPY_SNAPSHOT_FORMAT .read (
1201
+ shardContainer ,
1202
+ shallowSnapshotUUID ,
1203
+ namedXContentRegistry
1204
+ );
1205
+ String indexUUID = remoteStoreShardShallowCopySnapshot .getIndexUUID ();
1206
+ String remoteStoreRepoForIndex = remoteStoreShardShallowCopySnapshot .getRemoteStoreRepository ();
1207
+ // Releasing lock file before deleting the shallow-snap-UUID file because in case of any failure while
1208
+ // releasing the lock file, we would still have the shallow-snap-UUID file and that would be used during
1209
+ // next delete operation for releasing this lock file
1210
+ RemoteStoreLockManager remoteStoreMetadataLockManager = remoteStoreLockManagerFactory .newLockManager (
1211
+ remoteStoreRepoForIndex ,
1212
+ indexUUID ,
1213
+ shardId
1214
+ );
1215
+ remoteStoreMetadataLockManager .release (FileLockInfo .getLockInfoBuilder ().withAcquirerId (shallowSnapshotUUID ).build ());
1216
+ logger .debug ("Successfully released lock for shard {} of index with uuid {}" , shardId , indexUUID );
1217
+ if (!isIndexPresent (clusterService , indexUUID )) {
1218
+ // Note: this is a temporary solution where snapshot deletion triggers remote store side cleanup if
1219
+ // index is already deleted. shard cleanup will still happen asynchronously using REMOTE_PURGE
1220
+ // threadpool. if it fails, it could leave some stale files in remote directory. this issue could
1221
+ // even happen in cases of shard level remote store data cleanup which also happens asynchronously.
1222
+ // in long term, we have plans to implement remote store GC poller mechanism which will take care of
1223
+ // such stale data. related issue: https://github.com/opensearch-project/OpenSearch/issues/8469
1224
+ RemoteSegmentStoreDirectoryFactory remoteDirectoryFactory = new RemoteSegmentStoreDirectoryFactory (
1225
+ remoteStoreLockManagerFactory .getRepositoriesService (),
1226
+ threadPool
1227
+ );
1228
+ remoteDirectoryCleanupAsync (
1229
+ remoteDirectoryFactory ,
1230
+ threadPool ,
1231
+ remoteStoreRepoForIndex ,
1232
+ indexUUID ,
1233
+ new ShardId (Index .UNKNOWN_INDEX_NAME , indexUUID , Integer .parseInt (shardId )),
1234
+ ThreadPool .Names .REMOTE_PURGE
1235
+ );
1236
+ }
1237
+ }
1238
+
1164
1239
// When remoteStoreLockManagerFactory is non-null, while deleting the files, lock files are also released before deletion of respective
1165
1240
// shallow-snap-UUID files. And if it is null, we just delete the stale shard blobs.
1166
1241
private void executeStaleShardDelete (
@@ -1172,53 +1247,34 @@ private void executeStaleShardDelete(
1172
1247
if (filesToDelete != null ) {
1173
1248
threadPool .executor (ThreadPool .Names .SNAPSHOT ).execute (ActionRunnable .wrap (listener , l -> {
1174
1249
try {
1175
- if (remoteStoreLockManagerFactory != null ) {
1176
- for (String fileToDelete : filesToDelete ) {
1177
- if (fileToDelete .contains (SHALLOW_SNAPSHOT_PREFIX )) {
1178
- String [] fileToDeletePath = fileToDelete .split ("/" );
1179
- String indexId = fileToDeletePath [1 ];
1180
- String shardId = fileToDeletePath [2 ];
1181
- String shallowSnapBlob = fileToDeletePath [3 ];
1182
- String snapshotUUID = extractShallowSnapshotUUID (shallowSnapBlob ).orElseThrow ();
1183
- BlobContainer shardContainer = blobStore ().blobContainer (indicesPath ().add (indexId ).add (shardId ));
1184
- RemoteStoreShardShallowCopySnapshot remoteStoreShardShallowCopySnapshot =
1185
- REMOTE_STORE_SHARD_SHALLOW_COPY_SNAPSHOT_FORMAT .read (
1186
- shardContainer ,
1187
- snapshotUUID ,
1188
- namedXContentRegistry
1189
- );
1190
- String indexUUID = remoteStoreShardShallowCopySnapshot .getIndexUUID ();
1191
- String remoteStoreRepoForIndex = remoteStoreShardShallowCopySnapshot .getRemoteStoreRepository ();
1192
- // Releasing lock file before deleting the shallow-snap-UUID file because in case of any failure while
1193
- // releasing the lock file, we would still have the shallow-snap-UUID file and that would be used during
1194
- // next delete operation for releasing this lock file
1195
- RemoteStoreLockManager remoteStoreMetadataLockManager = remoteStoreLockManagerFactory .newLockManager (
1196
- remoteStoreRepoForIndex ,
1197
- indexUUID ,
1198
- shardId
1199
- );
1200
- remoteStoreMetadataLockManager .release (
1201
- FileLockInfo .getLockInfoBuilder ().withAcquirerId (snapshotUUID ).build ()
1250
+ // filtering files for which remote store lock release and cleanup succeeded,
1251
+ // remaining files for which it failed will be retried in next snapshot delete run.
1252
+ List <String > eligibleFilesToDelete = new ArrayList <>();
1253
+ for (String fileToDelete : filesToDelete ) {
1254
+ if (fileToDelete .contains (SHALLOW_SNAPSHOT_PREFIX )) {
1255
+ String [] fileToDeletePath = fileToDelete .split ("/" );
1256
+ String indexId = fileToDeletePath [1 ];
1257
+ String shardId = fileToDeletePath [2 ];
1258
+ String shallowSnapBlob = fileToDeletePath [3 ];
1259
+ String snapshotUUID = extractShallowSnapshotUUID (shallowSnapBlob ).orElseThrow ();
1260
+ BlobContainer shardContainer = blobStore ().blobContainer (indicesPath ().add (indexId ).add (shardId ));
1261
+ try {
1262
+ releaseRemoteStoreLockAndCleanup (shardId , snapshotUUID , shardContainer , remoteStoreLockManagerFactory );
1263
+ eligibleFilesToDelete .add (fileToDelete );
1264
+ } catch (Exception e ) {
1265
+ logger .error (
1266
+ "Failed to release lock or cleanup shard for indexID {}, shardID {} " + "and snapshot {}" ,
1267
+ indexId ,
1268
+ shardId ,
1269
+ snapshotUUID
1202
1270
);
1203
- if (!isIndexPresent (clusterService , indexUUID )) {
1204
- // this is a temporary solution where snapshot deletion triggers remote store side
1205
- // cleanup if index is already deleted. We will add a poller in future to take
1206
- // care of remote store side cleanup.
1207
- // see https://github.com/opensearch-project/OpenSearch/issues/8469
1208
- new RemoteSegmentStoreDirectoryFactory (
1209
- remoteStoreLockManagerFactory .getRepositoriesService (),
1210
- threadPool
1211
- ).newDirectory (
1212
- remoteStoreRepoForIndex ,
1213
- indexUUID ,
1214
- new ShardId (Index .UNKNOWN_INDEX_NAME , indexUUID , Integer .valueOf (shardId ))
1215
- ).close ();
1216
- }
1217
1271
}
1272
+ } else {
1273
+ eligibleFilesToDelete .add (fileToDelete );
1218
1274
}
1219
1275
}
1220
1276
// Deleting the shard blobs
1221
- deleteFromContainer (blobContainer (), filesToDelete );
1277
+ deleteFromContainer (blobContainer (), eligibleFilesToDelete );
1222
1278
l .onResponse (null );
1223
1279
} catch (Exception e ) {
1224
1280
logger .warn (
@@ -1651,39 +1707,12 @@ private void executeOneStaleIndexDelete(
1651
1707
for (String blob : shardBlob .getValue ().listBlobs ().keySet ()) {
1652
1708
final Optional <String > snapshotUUID = extractShallowSnapshotUUID (blob );
1653
1709
if (snapshotUUID .isPresent ()) {
1654
- RemoteStoreShardShallowCopySnapshot remoteStoreShardShallowCopySnapshot =
1655
- REMOTE_STORE_SHARD_SHALLOW_COPY_SNAPSHOT_FORMAT .read (
1656
- shardBlob .getValue (),
1657
- snapshotUUID .get (),
1658
- namedXContentRegistry
1659
- );
1660
- String indexUUID = remoteStoreShardShallowCopySnapshot .getIndexUUID ();
1661
- String remoteStoreRepoForIndex = remoteStoreShardShallowCopySnapshot .getRemoteStoreRepository ();
1662
- // Releasing lock files before deleting the shallow-snap-UUID file because in case of any failure
1663
- // while releasing the lock file, we would still have the corresponding shallow-snap-UUID file
1664
- // and that would be used during next delete operation for releasing this stale lock file
1665
- RemoteStoreLockManager remoteStoreMetadataLockManager = remoteStoreLockManagerFactory .newLockManager (
1666
- remoteStoreRepoForIndex ,
1667
- indexUUID ,
1668
- shardBlob .getKey ()
1669
- );
1670
- remoteStoreMetadataLockManager .release (
1671
- FileLockInfo .getLockInfoBuilder ().withAcquirerId (snapshotUUID .get ()).build ()
1710
+ releaseRemoteStoreLockAndCleanup (
1711
+ shardBlob .getKey (),
1712
+ snapshotUUID .get (),
1713
+ shardBlob .getValue (),
1714
+ remoteStoreLockManagerFactory
1672
1715
);
1673
- if (!isIndexPresent (clusterService , indexUUID )) {
1674
- // this is a temporary solution where snapshot deletion triggers remote store side
1675
- // cleanup if index is already deleted. We will add a poller in future to take
1676
- // care of remote store side cleanup.
1677
- // see https://github.com/opensearch-project/OpenSearch/issues/8469
1678
- new RemoteSegmentStoreDirectoryFactory (
1679
- remoteStoreLockManagerFactory .getRepositoriesService (),
1680
- threadPool
1681
- ).newDirectory (
1682
- remoteStoreRepoForIndex ,
1683
- indexUUID ,
1684
- new ShardId (Index .UNKNOWN_INDEX_NAME , indexUUID , Integer .parseInt (shardBlob .getKey ()))
1685
- ).close ();
1686
- }
1687
1716
}
1688
1717
}
1689
1718
}
0 commit comments