mirror of
https://github.com/FairRootGroup/FairMQ.git
synced 2025-10-15 09:31:45 +00:00
PMIx: Add commands to plugin and command ui
This commit is contained in:
committed by
Dennis Klein
parent
fd2bac3e22
commit
c290c16896
@@ -8,8 +8,15 @@
|
||||
|
||||
#include "PMIxPlugin.h"
|
||||
|
||||
#include <fairmq/sdk/commands/Commands.h>
|
||||
#include <fairmq/Tools.h>
|
||||
|
||||
#include <sstream>
|
||||
#include <stdexcept>
|
||||
#include <cstdint> // UINT32_MAX
|
||||
|
||||
using namespace std;
|
||||
using namespace fair::mq::sdk::cmd;
|
||||
|
||||
namespace fair
|
||||
{
|
||||
@@ -18,47 +25,99 @@ namespace mq
|
||||
namespace plugins
|
||||
{
|
||||
|
||||
PMIxPlugin::PMIxPlugin(const std::string& name,
|
||||
PMIxPlugin::PMIxPlugin(const string& name,
|
||||
const Plugin::Version version,
|
||||
const std::string& maintainer,
|
||||
const std::string& homepage,
|
||||
const string& maintainer,
|
||||
const string& homepage,
|
||||
PluginServices* pluginServices)
|
||||
: Plugin(name, version, maintainer, homepage, pluginServices)
|
||||
, fProcess(Init())
|
||||
, fPid(getpid())
|
||||
, fPMIxClient(tools::ToString("PMIx client(pid=", fPid, ") "))
|
||||
, fDeviceId(string(fProcess.nspace) + "_" + to_string(fProcess.rank))
|
||||
, fCommands(fProcess)
|
||||
, fLastExternalController(UINT32_MAX)
|
||||
, fExitingAckedByLastExternalController(false)
|
||||
, fCurrentState(DeviceState::Idle)
|
||||
, fLastState(DeviceState::Idle)
|
||||
{
|
||||
Init();
|
||||
SetProperty<std::string>("id", std::string(fProc.nspace) + "_" + std::to_string(fProc.rank));
|
||||
Fence();
|
||||
TakeDeviceControl();
|
||||
LOG(debug) << PMIxClient() << "pmix::init() OK: " << fProcess << ", version=" << pmix::get_version();
|
||||
SetProperty<string>("id", fDeviceId);
|
||||
|
||||
SubscribeToDeviceStateChange([&](DeviceState newState) {
|
||||
Fence("pmix::init");
|
||||
SubscribeForCommands();
|
||||
Fence("subscribed");
|
||||
|
||||
// fCommands.Send("test1");
|
||||
// fCommands.Send("test2", 0);
|
||||
// fCommands.Send("test3", 0);
|
||||
|
||||
// LOG(info) << "PMIX_EXTERNAL_ERR_BASE: " << PMIX_EXTERNAL_ERR_BASE;
|
||||
|
||||
// job level infos
|
||||
// LOG(info) << "PMIX_SESSION_ID: " << pmix::getInfo(PMIX_SESSION_ID, fProcess);
|
||||
// LOG(info) << "PMIX_UNIV_SIZE: " << pmix::getInfo(PMIX_UNIV_SIZE, fProcess);
|
||||
// LOG(info) << "PMIX_JOB_SIZE: " << pmix::getInfo(PMIX_JOB_SIZE, fProcess);
|
||||
// LOG(info) << "PMIX_JOB_NUM_APPS: " << pmix::getInfo(PMIX_JOB_NUM_APPS, fProcess);
|
||||
// LOG(info) << "PMIX_APP_SIZE: " << pmix::getInfo(PMIX_APP_SIZE, fProcess);
|
||||
// LOG(info) << "PMIX_MAX_PROCS: " << pmix::getInfo(PMIX_MAX_PROCS, fProcess);
|
||||
// LOG(info) << "PMIX_NUM_NODES: " << pmix::getInfo(PMIX_NUM_NODES, fProcess);
|
||||
// LOG(info) << "PMIX_CLUSTER_ID: " << pmix::getInfo(PMIX_CLUSTER_ID, fProcess);
|
||||
// LOG(info) << "PMIX_NSPACE: " << pmix::getInfo(PMIX_NSPACE, fProcess);
|
||||
// LOG(info) << "PMIX_JOBID: " << pmix::getInfo(PMIX_JOBID, fProcess);
|
||||
// LOG(info) << "PMIX_NODE_LIST: " << pmix::getInfo(PMIX_NODE_LIST, fProcess);
|
||||
// LOG(info) << "PMIX_ALLOCATED_NODELIST: " << pmix::getInfo(PMIX_ALLOCATED_NODELIST, fProcess);
|
||||
// LOG(info) << "PMIX_NPROC_OFFSET: " << pmix::getInfo(PMIX_NPROC_OFFSET, fProcess);
|
||||
// LOG(info) << "PMIX_LOCALLDR: " << pmix::getInfo(PMIX_LOCALLDR, fProcess);
|
||||
// LOG(info) << "PMIX_APPLDR: " << pmix::getInfo(PMIX_APPLDR, fProcess);
|
||||
|
||||
// // per-node information
|
||||
// LOG(info) << "PMIX_NODE_SIZE: " << pmix::getInfo(PMIX_NODE_SIZE, fProcess);
|
||||
// LOG(info) << "PMIX_LOCAL_SIZE: " << pmix::getInfo(PMIX_LOCAL_SIZE, fProcess);
|
||||
// LOG(info) << "PMIX_AVAIL_PHYS_MEMORY: " << pmix::getInfo(PMIX_AVAIL_PHYS_MEMORY, fProcess);
|
||||
|
||||
// // per-process information
|
||||
// LOG(info) << "PMIX_PROCID: " << pmix::getInfo(PMIX_PROCID, fProcess);
|
||||
// LOG(info) << "PMIX_APPNUM: " << pmix::getInfo(PMIX_APPNUM, fProcess);
|
||||
// LOG(info) << "PMIX_LOCAL_RANK: " << pmix::getInfo(PMIX_LOCAL_RANK, fProcess);
|
||||
// LOG(info) << "PMIX_NODE_RANK: " << pmix::getInfo(PMIX_NODE_RANK, fProcess);
|
||||
// LOG(info) << "PMIX_RANK: " << pmix::getInfo(PMIX_RANK, fProcess);
|
||||
// LOG(info) << "PMIX_GLOBAL_RANK: " << pmix::getInfo(PMIX_GLOBAL_RANK, fProcess);
|
||||
// LOG(info) << "PMIX_APP_RANK: " << pmix::getInfo(PMIX_APP_RANK, fProcess);
|
||||
|
||||
SubscribeToDeviceStateChange([this](DeviceState newState) {
|
||||
switch (newState) {
|
||||
case DeviceState::Idle:
|
||||
Fence();
|
||||
break;
|
||||
case DeviceState::Bound:
|
||||
Publish();
|
||||
Fence();
|
||||
break;
|
||||
case DeviceState::Connecting:
|
||||
Lookup();
|
||||
break;
|
||||
case DeviceState::DeviceReady:
|
||||
Fence();
|
||||
break;
|
||||
case DeviceState::Ready:
|
||||
Fence();
|
||||
break;
|
||||
case DeviceState::Exiting:
|
||||
UnsubscribeFromDeviceStateChange();
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
case DeviceState::Bound:
|
||||
Publish();
|
||||
break;
|
||||
case DeviceState::Connecting:
|
||||
Lookup();
|
||||
break;
|
||||
case DeviceState::Exiting:
|
||||
ReleaseDeviceControl();
|
||||
UnsubscribeFromDeviceStateChange();
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
lock_guard<mutex> lock{fStateChangeSubscriberMutex};
|
||||
fLastState = fCurrentState;
|
||||
fCurrentState = newState;
|
||||
for (auto subscriberId : fStateChangeSubscribers) {
|
||||
LOG(debug) << "Publishing state-change: " << fLastState << "->" << newState << " to " << subscriberId;
|
||||
Cmds cmds(make<StateChange>(fDeviceId, 0, fLastState, fCurrentState));
|
||||
fCommands.Send(cmds.Serialize(Format::JSON), static_cast<pmix::rank>(subscriberId));
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
PMIxPlugin::~PMIxPlugin()
|
||||
{
|
||||
LOG(debug) << "Destroying PMIxPlugin";
|
||||
ReleaseDeviceControl();
|
||||
fCommands.Unsubscribe();
|
||||
while (pmix::initialized()) {
|
||||
try {
|
||||
pmix::finalize();
|
||||
@@ -69,33 +128,112 @@ PMIxPlugin::~PMIxPlugin()
|
||||
}
|
||||
}
|
||||
|
||||
auto PMIxPlugin::PMIxClient() const -> std::string
|
||||
auto PMIxPlugin::SubscribeForCommands() -> void
|
||||
{
|
||||
std::stringstream ss;
|
||||
ss << "PMIx client(pid=" << fPid << ") ";
|
||||
return ss.str();
|
||||
fCommands.Subscribe([this](const string& cmdStr, const pmix::proc& sender) {
|
||||
// LOG(info) << "PMIx Plugin received message: '" << cmdStr << "', from " << sender;
|
||||
|
||||
Cmds inCmds;
|
||||
inCmds.Deserialize(cmdStr, Format::JSON);
|
||||
|
||||
for (const auto& cmd : inCmds) {
|
||||
LOG(info) << "Received command type: '" << cmd->GetType() << "' from " << sender;
|
||||
switch (cmd->GetType()) {
|
||||
case Type::check_state:
|
||||
fCommands.Send(Cmds(make<CurrentState>(fDeviceId, GetCurrentDeviceState()))
|
||||
.Serialize(Format::JSON),
|
||||
{sender});
|
||||
break;
|
||||
case Type::change_state: {
|
||||
Transition transition = static_cast<ChangeState&>(*cmd).GetTransition();
|
||||
if (ChangeDeviceState(transition)) {
|
||||
fCommands.Send(
|
||||
Cmds(make<TransitionStatus>(fDeviceId, Result::Ok, transition))
|
||||
.Serialize(Format::JSON),
|
||||
{sender});
|
||||
} else {
|
||||
fCommands.Send(
|
||||
Cmds(make<TransitionStatus>(fDeviceId, Result::Failure, transition))
|
||||
.Serialize(Format::JSON),
|
||||
{sender});
|
||||
}
|
||||
{
|
||||
lock_guard<mutex> lock{fStateChangeSubscriberMutex};
|
||||
fLastExternalController = sender.rank;
|
||||
}
|
||||
}
|
||||
break;
|
||||
case Type::subscribe_to_state_change: {
|
||||
{
|
||||
lock_guard<mutex> lock{fStateChangeSubscriberMutex};
|
||||
fStateChangeSubscribers.insert(sender.rank);
|
||||
}
|
||||
|
||||
LOG(debug) << "Publishing state-change: " << fLastState << "->" << fCurrentState
|
||||
<< " to " << sender;
|
||||
Cmds outCmds(make<StateChangeSubscription>(fDeviceId, Result::Ok),
|
||||
make<StateChange>(fDeviceId, 0, fLastState, fCurrentState));
|
||||
fCommands.Send(outCmds.Serialize(Format::JSON), {sender});
|
||||
}
|
||||
break;
|
||||
case Type::unsubscribe_from_state_change: {
|
||||
{
|
||||
lock_guard<mutex> lock{fStateChangeSubscriberMutex};
|
||||
fStateChangeSubscribers.erase(sender.rank);
|
||||
}
|
||||
fCommands.Send(Cmds(make<StateChangeUnsubscription>(fDeviceId, Result::Ok))
|
||||
.Serialize(Format::JSON),
|
||||
{sender});
|
||||
}
|
||||
break;
|
||||
case Type::state_change_exiting_received: {
|
||||
{
|
||||
lock_guard<mutex> lock{fStateChangeSubscriberMutex};
|
||||
if (fLastExternalController == sender.rank) {
|
||||
fExitingAckedByLastExternalController = true;
|
||||
}
|
||||
}
|
||||
fExitingAcked.notify_one();
|
||||
}
|
||||
break;
|
||||
case Type::dump_config: {
|
||||
stringstream ss;
|
||||
for (const auto& k: GetPropertyKeys()) {
|
||||
ss << fDeviceId << ": " << k << " -> " << GetPropertyAsString(k) << "\n";
|
||||
}
|
||||
fCommands.Send(Cmds(make<Config>(fDeviceId, ss.str())).Serialize(Format::JSON),
|
||||
{sender});
|
||||
}
|
||||
break;
|
||||
default:
|
||||
LOG(warn) << "Unexpected/unknown command received: " << cmdStr;
|
||||
LOG(warn) << "Origin: " << sender;
|
||||
break;
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
auto PMIxPlugin::Init() -> void
|
||||
auto PMIxPlugin::Init() -> pmix::proc
|
||||
{
|
||||
if (!pmix::initialized()) {
|
||||
fProc = pmix::init();
|
||||
LOG(debug) << PMIxClient() << "pmix::init() OK: " << fProc
|
||||
<< ",version=" << pmix::get_version();
|
||||
return pmix::init();
|
||||
} else {
|
||||
throw runtime_error("trying to initialize PMIx while it is already initialized");
|
||||
}
|
||||
}
|
||||
|
||||
auto PMIxPlugin::Publish() -> void
|
||||
{
|
||||
auto channels(GetChannelInfo());
|
||||
std::vector<pmix::info> info;
|
||||
vector<pmix::info> info;
|
||||
|
||||
for (const auto& c : channels) {
|
||||
std::string methodKey{"chans." + c.first + "." + std::to_string(c.second - 1) + ".method"};
|
||||
if (GetProperty<std::string>(methodKey) == "bind") {
|
||||
string methodKey("chans." + c.first + "." + to_string(c.second - 1) + ".method");
|
||||
if (GetProperty<string>(methodKey) == "bind") {
|
||||
for (int i = 0; i < c.second; ++i) {
|
||||
std::string addressKey{"chans." + c.first + "." + std::to_string(i) + ".address"};
|
||||
info.emplace_back(addressKey, GetProperty<std::string>(addressKey));
|
||||
string addressKey("chans." + c.first + "." + to_string(i) + ".address");
|
||||
info.emplace_back(addressKey, GetProperty<string>(addressKey));
|
||||
LOG(debug) << PMIxClient() << info.back();
|
||||
}
|
||||
}
|
||||
@@ -103,32 +241,37 @@ auto PMIxPlugin::Publish() -> void
|
||||
|
||||
if (info.size() > 0) {
|
||||
pmix::publish(info);
|
||||
LOG(debug) << PMIxClient() << "pmix::publish() OK: published "
|
||||
<< info.size() << " binding channels.";
|
||||
LOG(debug) << PMIxClient() << "pmix::publish() OK: published " << info.size()
|
||||
<< " binding channels.";
|
||||
}
|
||||
}
|
||||
|
||||
auto PMIxPlugin::Fence() -> void
|
||||
{
|
||||
pmix::proc all(fProc);
|
||||
pmix::proc all(fProcess);
|
||||
all.rank = pmix::rank::wildcard;
|
||||
|
||||
pmix::fence({all});
|
||||
LOG(debug) << PMIxClient() << "pmix::fence() OK";
|
||||
}
|
||||
|
||||
auto PMIxPlugin::Fence(const std::string& label) -> void
|
||||
{
|
||||
Fence(label);
|
||||
LOG(debug) << PMIxClient() << "pmix::fence() [" << label << "] OK";
|
||||
}
|
||||
|
||||
auto PMIxPlugin::Lookup() -> void
|
||||
{
|
||||
auto channels(GetChannelInfo());
|
||||
for (const auto& c : channels) {
|
||||
std::string methodKey{"chans." + c.first + "." + std::to_string(c.second - 1) + ".method"};
|
||||
if (GetProperty<std::string>(methodKey) == "connect") {
|
||||
string methodKey("chans." + c.first + "." + to_string(c.second - 1) + ".method");
|
||||
if (GetProperty<string>(methodKey) == "connect") {
|
||||
for (int i = 0; i < c.second; ++i) {
|
||||
std::vector<pmix::pdata> pdata;
|
||||
std::string addressKey{"chans." + c.first + "." + std::to_string(i) + ".address"};
|
||||
vector<pmix::pdata> pdata;
|
||||
string addressKey("chans." + c.first + "." + to_string(i) + ".address");
|
||||
pdata.emplace_back();
|
||||
pdata.back().set_key(addressKey);
|
||||
std::vector<pmix::info> info;
|
||||
vector<pmix::info> info;
|
||||
info.emplace_back(PMIX_WAIT, static_cast<int>(pdata.size()));
|
||||
|
||||
if (pdata.size() > 0) {
|
||||
@@ -141,11 +284,11 @@ auto PMIxPlugin::Lookup() -> void
|
||||
LOG(debug) << PMIxClient() << "pmix::lookup() not found: key=" << p.key;
|
||||
} else if (p.value.type == PMIX_STRING) {
|
||||
LOG(debug) << PMIxClient() << "pmix::lookup() found:"
|
||||
<< " key=" << p.key << ",value=" << p.value.data.string;
|
||||
SetProperty<std::string>(p.key, p.value.data.string);
|
||||
<< " key=" << p.key << ",value=" << p.value.data.string;
|
||||
SetProperty<string>(p.key, p.value.data.string);
|
||||
} else {
|
||||
LOG(debug) << PMIxClient() << "pmix::lookup() wrong type returned: "
|
||||
<< "key=" << p.key << ",type=" << p.value.type;
|
||||
<< "key=" << p.key << ",type=" << p.value.type;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -153,6 +296,14 @@ auto PMIxPlugin::Lookup() -> void
|
||||
}
|
||||
}
|
||||
|
||||
auto PMIxPlugin::WaitForExitingAck() -> void
|
||||
{
|
||||
unique_lock<mutex> lock(fStateChangeSubscriberMutex);
|
||||
fExitingAcked.wait_for(lock, chrono::milliseconds(1000), [this]() {
|
||||
return fExitingAckedByLastExternalController;
|
||||
});
|
||||
}
|
||||
|
||||
} /* namespace plugins */
|
||||
} /* namespace mq */
|
||||
} /* namespace fair */
|
||||
|
Reference in New Issue
Block a user