PMIx: Add commands to plugin and command ui

This commit is contained in:
Alexey Rybalchenko
2020-01-06 19:48:03 +01:00
committed by Dennis Klein
parent fd2bac3e22
commit c290c16896
8 changed files with 864 additions and 82 deletions

View File

@@ -8,8 +8,15 @@
#include "PMIxPlugin.h"
#include <fairmq/sdk/commands/Commands.h>
#include <fairmq/Tools.h>
#include <sstream>
#include <stdexcept>
#include <cstdint> // UINT32_MAX
using namespace std;
using namespace fair::mq::sdk::cmd;
namespace fair
{
@@ -18,47 +25,99 @@ namespace mq
namespace plugins
{
PMIxPlugin::PMIxPlugin(const std::string& name,
PMIxPlugin::PMIxPlugin(const string& name,
const Plugin::Version version,
const std::string& maintainer,
const std::string& homepage,
const string& maintainer,
const string& homepage,
PluginServices* pluginServices)
: Plugin(name, version, maintainer, homepage, pluginServices)
, fProcess(Init())
, fPid(getpid())
, fPMIxClient(tools::ToString("PMIx client(pid=", fPid, ") "))
, fDeviceId(string(fProcess.nspace) + "_" + to_string(fProcess.rank))
, fCommands(fProcess)
, fLastExternalController(UINT32_MAX)
, fExitingAckedByLastExternalController(false)
, fCurrentState(DeviceState::Idle)
, fLastState(DeviceState::Idle)
{
Init();
SetProperty<std::string>("id", std::string(fProc.nspace) + "_" + std::to_string(fProc.rank));
Fence();
TakeDeviceControl();
LOG(debug) << PMIxClient() << "pmix::init() OK: " << fProcess << ", version=" << pmix::get_version();
SetProperty<string>("id", fDeviceId);
SubscribeToDeviceStateChange([&](DeviceState newState) {
Fence("pmix::init");
SubscribeForCommands();
Fence("subscribed");
// fCommands.Send("test1");
// fCommands.Send("test2", 0);
// fCommands.Send("test3", 0);
// LOG(info) << "PMIX_EXTERNAL_ERR_BASE: " << PMIX_EXTERNAL_ERR_BASE;
// job level infos
// LOG(info) << "PMIX_SESSION_ID: " << pmix::getInfo(PMIX_SESSION_ID, fProcess);
// LOG(info) << "PMIX_UNIV_SIZE: " << pmix::getInfo(PMIX_UNIV_SIZE, fProcess);
// LOG(info) << "PMIX_JOB_SIZE: " << pmix::getInfo(PMIX_JOB_SIZE, fProcess);
// LOG(info) << "PMIX_JOB_NUM_APPS: " << pmix::getInfo(PMIX_JOB_NUM_APPS, fProcess);
// LOG(info) << "PMIX_APP_SIZE: " << pmix::getInfo(PMIX_APP_SIZE, fProcess);
// LOG(info) << "PMIX_MAX_PROCS: " << pmix::getInfo(PMIX_MAX_PROCS, fProcess);
// LOG(info) << "PMIX_NUM_NODES: " << pmix::getInfo(PMIX_NUM_NODES, fProcess);
// LOG(info) << "PMIX_CLUSTER_ID: " << pmix::getInfo(PMIX_CLUSTER_ID, fProcess);
// LOG(info) << "PMIX_NSPACE: " << pmix::getInfo(PMIX_NSPACE, fProcess);
// LOG(info) << "PMIX_JOBID: " << pmix::getInfo(PMIX_JOBID, fProcess);
// LOG(info) << "PMIX_NODE_LIST: " << pmix::getInfo(PMIX_NODE_LIST, fProcess);
// LOG(info) << "PMIX_ALLOCATED_NODELIST: " << pmix::getInfo(PMIX_ALLOCATED_NODELIST, fProcess);
// LOG(info) << "PMIX_NPROC_OFFSET: " << pmix::getInfo(PMIX_NPROC_OFFSET, fProcess);
// LOG(info) << "PMIX_LOCALLDR: " << pmix::getInfo(PMIX_LOCALLDR, fProcess);
// LOG(info) << "PMIX_APPLDR: " << pmix::getInfo(PMIX_APPLDR, fProcess);
// // per-node information
// LOG(info) << "PMIX_NODE_SIZE: " << pmix::getInfo(PMIX_NODE_SIZE, fProcess);
// LOG(info) << "PMIX_LOCAL_SIZE: " << pmix::getInfo(PMIX_LOCAL_SIZE, fProcess);
// LOG(info) << "PMIX_AVAIL_PHYS_MEMORY: " << pmix::getInfo(PMIX_AVAIL_PHYS_MEMORY, fProcess);
// // per-process information
// LOG(info) << "PMIX_PROCID: " << pmix::getInfo(PMIX_PROCID, fProcess);
// LOG(info) << "PMIX_APPNUM: " << pmix::getInfo(PMIX_APPNUM, fProcess);
// LOG(info) << "PMIX_LOCAL_RANK: " << pmix::getInfo(PMIX_LOCAL_RANK, fProcess);
// LOG(info) << "PMIX_NODE_RANK: " << pmix::getInfo(PMIX_NODE_RANK, fProcess);
// LOG(info) << "PMIX_RANK: " << pmix::getInfo(PMIX_RANK, fProcess);
// LOG(info) << "PMIX_GLOBAL_RANK: " << pmix::getInfo(PMIX_GLOBAL_RANK, fProcess);
// LOG(info) << "PMIX_APP_RANK: " << pmix::getInfo(PMIX_APP_RANK, fProcess);
SubscribeToDeviceStateChange([this](DeviceState newState) {
switch (newState) {
case DeviceState::Idle:
Fence();
break;
case DeviceState::Bound:
Publish();
Fence();
break;
case DeviceState::Connecting:
Lookup();
break;
case DeviceState::DeviceReady:
Fence();
break;
case DeviceState::Ready:
Fence();
break;
case DeviceState::Exiting:
UnsubscribeFromDeviceStateChange();
break;
default:
break;
case DeviceState::Bound:
Publish();
break;
case DeviceState::Connecting:
Lookup();
break;
case DeviceState::Exiting:
ReleaseDeviceControl();
UnsubscribeFromDeviceStateChange();
break;
default:
break;
}
lock_guard<mutex> lock{fStateChangeSubscriberMutex};
fLastState = fCurrentState;
fCurrentState = newState;
for (auto subscriberId : fStateChangeSubscribers) {
LOG(debug) << "Publishing state-change: " << fLastState << "->" << newState << " to " << subscriberId;
Cmds cmds(make<StateChange>(fDeviceId, 0, fLastState, fCurrentState));
fCommands.Send(cmds.Serialize(Format::JSON), static_cast<pmix::rank>(subscriberId));
}
});
}
PMIxPlugin::~PMIxPlugin()
{
LOG(debug) << "Destroying PMIxPlugin";
ReleaseDeviceControl();
fCommands.Unsubscribe();
while (pmix::initialized()) {
try {
pmix::finalize();
@@ -69,33 +128,112 @@ PMIxPlugin::~PMIxPlugin()
}
}
auto PMIxPlugin::PMIxClient() const -> std::string
auto PMIxPlugin::SubscribeForCommands() -> void
{
std::stringstream ss;
ss << "PMIx client(pid=" << fPid << ") ";
return ss.str();
fCommands.Subscribe([this](const string& cmdStr, const pmix::proc& sender) {
// LOG(info) << "PMIx Plugin received message: '" << cmdStr << "', from " << sender;
Cmds inCmds;
inCmds.Deserialize(cmdStr, Format::JSON);
for (const auto& cmd : inCmds) {
LOG(info) << "Received command type: '" << cmd->GetType() << "' from " << sender;
switch (cmd->GetType()) {
case Type::check_state:
fCommands.Send(Cmds(make<CurrentState>(fDeviceId, GetCurrentDeviceState()))
.Serialize(Format::JSON),
{sender});
break;
case Type::change_state: {
Transition transition = static_cast<ChangeState&>(*cmd).GetTransition();
if (ChangeDeviceState(transition)) {
fCommands.Send(
Cmds(make<TransitionStatus>(fDeviceId, Result::Ok, transition))
.Serialize(Format::JSON),
{sender});
} else {
fCommands.Send(
Cmds(make<TransitionStatus>(fDeviceId, Result::Failure, transition))
.Serialize(Format::JSON),
{sender});
}
{
lock_guard<mutex> lock{fStateChangeSubscriberMutex};
fLastExternalController = sender.rank;
}
}
break;
case Type::subscribe_to_state_change: {
{
lock_guard<mutex> lock{fStateChangeSubscriberMutex};
fStateChangeSubscribers.insert(sender.rank);
}
LOG(debug) << "Publishing state-change: " << fLastState << "->" << fCurrentState
<< " to " << sender;
Cmds outCmds(make<StateChangeSubscription>(fDeviceId, Result::Ok),
make<StateChange>(fDeviceId, 0, fLastState, fCurrentState));
fCommands.Send(outCmds.Serialize(Format::JSON), {sender});
}
break;
case Type::unsubscribe_from_state_change: {
{
lock_guard<mutex> lock{fStateChangeSubscriberMutex};
fStateChangeSubscribers.erase(sender.rank);
}
fCommands.Send(Cmds(make<StateChangeUnsubscription>(fDeviceId, Result::Ok))
.Serialize(Format::JSON),
{sender});
}
break;
case Type::state_change_exiting_received: {
{
lock_guard<mutex> lock{fStateChangeSubscriberMutex};
if (fLastExternalController == sender.rank) {
fExitingAckedByLastExternalController = true;
}
}
fExitingAcked.notify_one();
}
break;
case Type::dump_config: {
stringstream ss;
for (const auto& k: GetPropertyKeys()) {
ss << fDeviceId << ": " << k << " -> " << GetPropertyAsString(k) << "\n";
}
fCommands.Send(Cmds(make<Config>(fDeviceId, ss.str())).Serialize(Format::JSON),
{sender});
}
break;
default:
LOG(warn) << "Unexpected/unknown command received: " << cmdStr;
LOG(warn) << "Origin: " << sender;
break;
}
}
});
}
auto PMIxPlugin::Init() -> void
auto PMIxPlugin::Init() -> pmix::proc
{
if (!pmix::initialized()) {
fProc = pmix::init();
LOG(debug) << PMIxClient() << "pmix::init() OK: " << fProc
<< ",version=" << pmix::get_version();
return pmix::init();
} else {
throw runtime_error("trying to initialize PMIx while it is already initialized");
}
}
auto PMIxPlugin::Publish() -> void
{
auto channels(GetChannelInfo());
std::vector<pmix::info> info;
vector<pmix::info> info;
for (const auto& c : channels) {
std::string methodKey{"chans." + c.first + "." + std::to_string(c.second - 1) + ".method"};
if (GetProperty<std::string>(methodKey) == "bind") {
string methodKey("chans." + c.first + "." + to_string(c.second - 1) + ".method");
if (GetProperty<string>(methodKey) == "bind") {
for (int i = 0; i < c.second; ++i) {
std::string addressKey{"chans." + c.first + "." + std::to_string(i) + ".address"};
info.emplace_back(addressKey, GetProperty<std::string>(addressKey));
string addressKey("chans." + c.first + "." + to_string(i) + ".address");
info.emplace_back(addressKey, GetProperty<string>(addressKey));
LOG(debug) << PMIxClient() << info.back();
}
}
@@ -103,32 +241,37 @@ auto PMIxPlugin::Publish() -> void
if (info.size() > 0) {
pmix::publish(info);
LOG(debug) << PMIxClient() << "pmix::publish() OK: published "
<< info.size() << " binding channels.";
LOG(debug) << PMIxClient() << "pmix::publish() OK: published " << info.size()
<< " binding channels.";
}
}
auto PMIxPlugin::Fence() -> void
{
pmix::proc all(fProc);
pmix::proc all(fProcess);
all.rank = pmix::rank::wildcard;
pmix::fence({all});
LOG(debug) << PMIxClient() << "pmix::fence() OK";
}
auto PMIxPlugin::Fence(const std::string& label) -> void
{
Fence(label);
LOG(debug) << PMIxClient() << "pmix::fence() [" << label << "] OK";
}
auto PMIxPlugin::Lookup() -> void
{
auto channels(GetChannelInfo());
for (const auto& c : channels) {
std::string methodKey{"chans." + c.first + "." + std::to_string(c.second - 1) + ".method"};
if (GetProperty<std::string>(methodKey) == "connect") {
string methodKey("chans." + c.first + "." + to_string(c.second - 1) + ".method");
if (GetProperty<string>(methodKey) == "connect") {
for (int i = 0; i < c.second; ++i) {
std::vector<pmix::pdata> pdata;
std::string addressKey{"chans." + c.first + "." + std::to_string(i) + ".address"};
vector<pmix::pdata> pdata;
string addressKey("chans." + c.first + "." + to_string(i) + ".address");
pdata.emplace_back();
pdata.back().set_key(addressKey);
std::vector<pmix::info> info;
vector<pmix::info> info;
info.emplace_back(PMIX_WAIT, static_cast<int>(pdata.size()));
if (pdata.size() > 0) {
@@ -141,11 +284,11 @@ auto PMIxPlugin::Lookup() -> void
LOG(debug) << PMIxClient() << "pmix::lookup() not found: key=" << p.key;
} else if (p.value.type == PMIX_STRING) {
LOG(debug) << PMIxClient() << "pmix::lookup() found:"
<< " key=" << p.key << ",value=" << p.value.data.string;
SetProperty<std::string>(p.key, p.value.data.string);
<< " key=" << p.key << ",value=" << p.value.data.string;
SetProperty<string>(p.key, p.value.data.string);
} else {
LOG(debug) << PMIxClient() << "pmix::lookup() wrong type returned: "
<< "key=" << p.key << ",type=" << p.value.type;
<< "key=" << p.key << ",type=" << p.value.type;
}
}
}
@@ -153,6 +296,14 @@ auto PMIxPlugin::Lookup() -> void
}
}
auto PMIxPlugin::WaitForExitingAck() -> void
{
unique_lock<mutex> lock(fStateChangeSubscriberMutex);
fExitingAcked.wait_for(lock, chrono::milliseconds(1000), [this]() {
return fExitingAckedByLastExternalController;
});
}
} /* namespace plugins */
} /* namespace mq */
} /* namespace fair */