Skip to content

Commit b61a4ad

Browse files
fwyzard and Annnnya committed
Introduce a mode where all processes are in MPI_COMM_WORLD
Co-authored-by: Anna Polova <[email protected]>
1 parent 95714a3 commit b61a4ad

File tree

2 files changed

+176
-48
lines changed

2 files changed

+176
-48
lines changed

HeterogeneousCore/MPICore/plugins/MPIController.cc

Lines changed: 81 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
#include "FWCore/Framework/interface/one/EDProducer.h"
1717
#include "FWCore/MessageLogger/interface/MessageLogger.h"
1818
#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
19+
#include "FWCore/ParameterSet/interface/EmptyGroupDescription.h"
1920
#include "FWCore/ParameterSet/interface/ParameterDescriptionNode.h"
2021
#include "FWCore/ParameterSet/interface/ParameterSet.h"
2122
#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
@@ -60,41 +61,93 @@ class MPIController : public edm::one::EDProducer<edm::one::WatchRuns, edm::one:
6061
static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
6162

6263
private:
64+
enum Mode { kInvalid = 0, kCommWorld, kIntercommunicator };
65+
static constexpr const char* ModeDescription[] = {"Invalid", "CommWorld", "Intercommunicator"};
66+
Mode parseMode(std::string const& label) {
67+
if (label == ModeDescription[kCommWorld])
68+
return kCommWorld;
69+
else if (label == ModeDescription[kIntercommunicator])
70+
return kIntercommunicator;
71+
else
72+
return kInvalid;
73+
}
74+
6375
MPI_Comm comm_ = MPI_COMM_NULL;
6476
MPIChannel channel_;
6577
edm::EDPutTokenT<MPIToken> token_;
78+
Mode mode_;
6679
};
6780

6881
MPIController::MPIController(edm::ParameterSet const& config)
69-
: token_(produces<MPIToken>()) //
82+
: token_(produces<MPIToken>()),
83+
mode_(parseMode(config.getUntrackedParameter<std::string>("mode"))) //
7084
{
7185
// make sure that MPI is initialised
7286
MPIService::required();
7387

74-
// FIXME move into the MPIService ?
7588
// make sure the EDM MPI types are available
7689
EDM_MPI_build_types();
7790

78-
// look up the "server" port
79-
char port[MPI_MAX_PORT_NAME];
80-
MPI_Lookup_name("server", MPI_INFO_NULL, port);
81-
edm::LogAbsolute("MPI") << "Trying to connect to the MPI server on port " << port;
82-
83-
// connect to the server
84-
int size;
85-
MPI_Comm_connect(port, MPI_INFO_NULL, 0, MPI_COMM_SELF, &comm_);
86-
MPI_Comm_remote_size(comm_, &size);
87-
edm::LogAbsolute("MPI") << "Client connected to " << size << (size == 1 ? " server" : " servers");
88-
if (size > 1) {
89-
throw cms::Exception("UnsupportedFeature")
90-
<< "MPIController supports only a single follower, but it was connected to " << size << " followers";
91+
if (mode_ == kCommWorld) {
92+
// All processes are in MPI_COMM_WORLD.
93+
// The current implementation supports only two processes: one controller and one source.
94+
edm::LogAbsolute("MPI") << "MPIController in " << ModeDescription[mode_] << " mode.";
95+
96+
// Check how many processes are there in MPI_COMM_WORLD
97+
int size;
98+
MPI_Comm_size(MPI_COMM_WORLD, &size);
99+
if (size != 2) {
100+
throw edm::Exception(edm::errors::Configuration)
101+
<< "The current implementation supports only two processes: one controller and one source.";
102+
}
103+
104+
// Check the rank of this process, and determine the rank of the other process.
105+
int rank;
106+
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
107+
edm::LogAbsolute("MPI") << "MPIController has rank " << rank << " in MPI_COMM_WORLD.";
108+
int other_rank = 1 - rank;
109+
comm_ = MPI_COMM_WORLD;
110+
channel_ = MPIChannel(comm_, other_rank);
111+
} else if (mode_ == kIntercommunicator) {
112+
// Use an intercommunicator to let two groups of processes communicate with each other.
113+
// The current implementation supports only two processes: one controller and one source.
114+
edm::LogAbsolute("MPI") << "MPISource in " << ModeDescription[mode_] << " mode.";
115+
116+
// Check how many processes are there in MPI_COMM_WORLD
117+
int size;
118+
MPI_Comm_size(MPI_COMM_WORLD, &size);
119+
if (size != 1) {
120+
throw edm::Exception(edm::errors::Configuration)
121+
<< "The current implementation supports only two processes: one controller and one source.";
122+
}
123+
124+
// Look for the port under the name given by the parameter "name" (default: "server").
125+
std::string name = config.getUntrackedParameter<std::string>("name", "server");
126+
char port[MPI_MAX_PORT_NAME];
127+
MPI_Lookup_name(name.c_str(), MPI_INFO_NULL, port);
128+
edm::LogAbsolute("MPI") << "Trying to connect to the MPI server on port " << port;
129+
130+
// Create an intercommunicator and connect to the server.
131+
MPI_Comm_connect(port, MPI_INFO_NULL, 0, MPI_COMM_SELF, &comm_);
132+
MPI_Comm_remote_size(comm_, &size);
133+
if (size != 1) {
134+
throw edm::Exception(edm::errors::Configuration)
135+
<< "The current implementation supports only two processes: one controller and one source.";
136+
}
137+
edm::LogAbsolute("MPI") << "Client connected to " << size << (size == 1 ? " server" : " servers");
138+
channel_ = MPIChannel(comm_, 0);
139+
} else {
140+
// Invalid mode.
141+
throw edm::Exception(edm::errors::Configuration)
142+
<< "Invalid mode \"" << config.getUntrackedParameter<std::string>("mode") << "\"";
91143
}
92-
channel_ = MPIChannel(comm_, 0);
93144
}
94145

95146
MPIController::~MPIController() {
96-
// close the intercommunicator
97-
MPI_Comm_disconnect(&comm_);
147+
// Close the intercommunicator.
148+
if (mode_ == kIntercommunicator) {
149+
MPI_Comm_disconnect(&comm_);
150+
}
98151
}
99152

100153
void MPIController::beginJob() {
@@ -219,10 +272,18 @@ void MPIController::produce(edm::Event& event, edm::EventSetup const& setup) {
219272

220273
void MPIController::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
221274
descriptions.setComment(
222-
"This module connects to an \"MPISource\" in a separate CMSSW job, and transmits all "
223-
"Runs, LuminosityBlocks and Events from the current process to the remote one.");
275+
"This module connects to an \"MPISource\" in a separate CMSSW job, and transmits all Runs, LuminosityBlocks and "
276+
"Events from the current process to the remote one.");
224277

225278
edm::ParameterSetDescription desc;
279+
desc.ifValue(
280+
edm::ParameterDescription<std::string>("mode", "CommWorld", false),
281+
ModeDescription[kCommWorld] >> edm::EmptyGroupDescription() or
282+
ModeDescription[kIntercommunicator] >> edm::ParameterDescription<std::string>("name", "server", false))
283+
->setComment(
284+
"Valid modes are CommWorld (use MPI_COMM_WORLD) and Intercommunicator (use an MPI name server to setup an "
285+
"intercommunicator).");
286+
226287
descriptions.addWithDefaultLabel(desc);
227288
}
228289

HeterogeneousCore/MPICore/plugins/MPISource.cc

Lines changed: 95 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -29,10 +29,12 @@
2929
#include "FWCore/MessageLogger/interface/ErrorObj.h"
3030
#include "FWCore/MessageLogger/interface/MessageLogger.h"
3131
#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
32+
#include "FWCore/ParameterSet/interface/EmptyGroupDescription.h"
33+
#include "FWCore/ParameterSet/interface/ParameterSet.h"
3234
#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
3335
#include "FWCore/ParameterSet/interface/ParameterSetDescriptionFiller.h"
34-
#include "FWCore/ParameterSet/interface/ParameterSet.h"
3536
#include "FWCore/Sources/interface/ProducerSourceBase.h"
37+
#include "FWCore/Utilities/interface/EDMException.h"
3638
#include "HeterogeneousCore/MPICore/interface/MPIToken.h"
3739
#include "HeterogeneousCore/MPIServices/interface/MPIService.h"
3840

@@ -54,10 +56,22 @@ class MPISource : public edm::ProducerSourceBase {
5456
bool setRunAndEventInfo(edm::EventID& id, edm::TimeValue_t& time, edm::EventAuxiliary::ExperimentType&) override;
5557
void produce(edm::Event&) override;
5658

59+
enum Mode { kInvalid = 0, kCommWorld, kIntercommunicator };
60+
static constexpr const char* ModeDescription[] = {"Invalid", "CommWorld", "Intercommunicator"};
61+
Mode parseMode(std::string const& label) {
62+
if (label == ModeDescription[kCommWorld])
63+
return kCommWorld;
64+
else if (label == ModeDescription[kIntercommunicator])
65+
return kIntercommunicator;
66+
else
67+
return kInvalid;
68+
}
69+
5770
char port_[MPI_MAX_PORT_NAME];
5871
MPI_Comm comm_ = MPI_COMM_NULL;
5972
MPIChannel channel_;
6073
edm::EDPutTokenT<MPIToken> token_;
74+
Mode mode_;
6175

6276
edm::ProcessHistory history_;
6377
};
@@ -67,48 +81,91 @@ MPISource::MPISource(edm::ParameterSet const& config, edm::InputSourceDescriptio
6781
// effectively be ignored, because this ConfigurableSource will explicitly set the run, lumi, and event
6882
// numbers, the timestamp, and the event type
6983
edm::ProducerSourceBase(config, desc, false),
70-
token_(produces<MPIToken>()) //
84+
token_(produces<MPIToken>()),
85+
mode_(parseMode(config.getUntrackedParameter<std::string>("mode"))) //
7186
{
7287
// make sure that MPI is initialised
7388
MPIService::required();
7489

75-
// FIXME move into the MPIService ?
76-
// make sure the EDM MPI types are available
90+
// Make sure the EDM MPI types are available.
7791
EDM_MPI_build_types();
7892

79-
// open a server-side port
80-
MPI_Open_port(MPI_INFO_NULL, port_);
93+
if (mode_ == kCommWorld) {
94+
// All processes are in MPI_COMM_WORLD.
95+
// The current implementation supports only two processes: one controller and one source.
96+
edm::LogAbsolute("MPI") << "MPISource in " << ModeDescription[mode_] << " mode.";
97+
98+
// Check how many processes are there in MPI_COMM_WORLD
99+
int size;
100+
MPI_Comm_size(MPI_COMM_WORLD, &size);
101+
if (size != 2) {
102+
throw edm::Exception(edm::errors::Configuration)
103+
<< "The current implementation supports only two processes: one controller and one source.";
104+
}
81105

82-
// publish the port under the name "server"
83-
MPI_Info port_info;
84-
MPI_Info_create(&port_info);
85-
MPI_Info_set(port_info, "ompi_global_scope", "true");
86-
MPI_Info_set(port_info, "ompi_unique", "true");
87-
MPI_Publish_name("server", port_info, port_);
106+
// Check the rank of this process, and determine the rank of the other process.
107+
int rank;
108+
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
109+
edm::LogAbsolute("MPI") << "MPISource has rank " << rank << " in MPI_COMM_WORLD.";
110+
int other_rank = 1 - rank;
111+
comm_ = MPI_COMM_WORLD;
112+
channel_ = MPIChannel(comm_, other_rank);
113+
} else if (mode_ == kIntercommunicator) {
114+
// Use an intercommunicator to let two groups of processes communicate with each other.
115+
// The current implementation supports only two processes: one controller and one source.
116+
edm::LogAbsolute("MPI") << "MPISource in " << ModeDescription[mode_] << " mode.";
117+
118+
// Check how many processes are there in MPI_COMM_WORLD
119+
int size;
120+
MPI_Comm_size(MPI_COMM_WORLD, &size);
121+
if (size != 1) {
122+
throw edm::Exception(edm::errors::Configuration)
123+
<< "The current implementation supports only two processes: one controller and one source.";
124+
}
88125

89-
// create an intercommunicator and accept a client connection
90-
edm::LogAbsolute("MPI") << "waiting for a connection to the MPI server at port " << port_;
91-
MPI_Comm_accept(port_, MPI_INFO_NULL, 0, MPI_COMM_WORLD, &comm_);
92-
channel_ = MPIChannel(comm_, 0);
126+
// Open a server-side port.
127+
MPI_Open_port(MPI_INFO_NULL, port_);
128+
129+
// Publish the port under the name given by the parameter "name" (default: "server").
130+
std::string name = config.getUntrackedParameter<std::string>("name", "server");
131+
MPI_Info port_info;
132+
MPI_Info_create(&port_info);
133+
MPI_Info_set(port_info, "ompi_global_scope", "true");
134+
MPI_Info_set(port_info, "ompi_unique", "true");
135+
MPI_Publish_name(name.c_str(), port_info, port_);
136+
137+
// Create an intercommunicator and accept a client connection.
138+
edm::LogAbsolute("MPI") << "Waiting for a connection to the MPI server at port " << port_;
139+
140+
MPI_Comm_accept(port_, MPI_INFO_NULL, 0, MPI_COMM_SELF, &comm_);
141+
edm::LogAbsolute("MPI") << "Connection accepted.";
142+
channel_ = MPIChannel(comm_, 0);
143+
} else {
144+
// Invalid mode.
145+
throw edm::Exception(edm::errors::Configuration)
146+
<< "Invalid mode \"" << config.getUntrackedParameter<std::string>("mode") << "\"";
147+
}
93148

94-
// wait for a client to connect
149+
// Wait for a client to connect.
95150
MPI_Status status;
96151
EDM_MPI_Empty_t buffer;
97152
MPI_Recv(&buffer, 1, EDM_MPI_Empty, MPI_ANY_SOURCE, EDM_MPI_Connect, comm_, &status);
98153
edm::LogAbsolute("MPI") << "connected from " << status.MPI_SOURCE;
99154
}
100155

101156
MPISource::~MPISource() {
102-
// close the intercommunicator
103-
MPI_Comm_disconnect(&comm_);
104-
105-
// unpublish and close the port
106-
MPI_Info port_info;
107-
MPI_Info_create(&port_info);
108-
MPI_Info_set(port_info, "ompi_global_scope", "true");
109-
MPI_Info_set(port_info, "ompi_unique", "true");
110-
MPI_Unpublish_name("server", port_info, port_);
111-
MPI_Close_port(port_);
157+
if (mode_ == kIntercommunicator) {
158+
// Close the intercommunicator.
159+
MPI_Comm_disconnect(&comm_);
160+
161+
// Unpublish and close the port.
162+
MPI_Info port_info;
163+
MPI_Info_create(&port_info);
164+
MPI_Info_set(port_info, "ompi_global_scope", "true");
165+
MPI_Info_set(port_info, "ompi_unique", "true");
166+
MPI_Unpublish_name("server", port_info, port_);
167+
MPI_Close_port(port_);
168+
}
112169
}
113170

114171
//MPISource::ItemTypeInfo MPISource::getNextItemType() {
@@ -255,9 +312,19 @@ void MPISource::produce(edm::Event& event) {
255312
}
256313

257314
void MPISource::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
315+
descriptions.setComment(
316+
"This module connects to an \"MPIController\" in a separate CMSSW job, receives all Runs, LuminosityBlocks and "
317+
"Events from the remote process and reproduces them in the local one.");
318+
258319
edm::ParameterSetDescription desc;
259-
desc.setComment("Comunicate with another cmsRun process over MPI.");
260320
edm::ProducerSourceBase::fillDescription(desc);
321+
desc.ifValue(
322+
edm::ParameterDescription<std::string>("mode", "CommWorld", false),
323+
ModeDescription[kCommWorld] >> edm::EmptyGroupDescription() or
324+
ModeDescription[kIntercommunicator] >> edm::ParameterDescription<std::string>("name", "server", false))
325+
->setComment(
326+
"Valid modes are CommWorld (use MPI_COMM_WORLD) and Intercommunicator (use an MPI name server to setup an "
327+
"intercommunicator).");
261328

262329
descriptions.add("source", desc);
263330
}

0 commit comments

Comments
 (0)