shm: optimize monitor heartbeats

This commit is contained in:
Alexey Rybalchenko 2021-06-09 12:54:40 +02:00
parent ab54668aee
commit 28a887a457
5 changed files with 43 additions and 51 deletions

View File

@ -124,6 +124,15 @@ struct EventCounter
std::atomic<uint64_t> fCount; std::atomic<uint64_t> fCount;
}; };
/// Heartbeat counter object.
///
/// In this change a single instance is created in the management segment
/// (find_or_construct<Heartbeat>(unique_instance)(0)), incremented by the
/// Manager's heartbeat thread, and read by the monitor, which treats any
/// change of the value as a sign of life.
struct Heartbeat
{
    /// @param c initial counter value.
    /// explicit: a bare integer must not silently convert into a Heartbeat.
    explicit Heartbeat(uint64_t c)
        : fCount(c)
    {}

    /// Current heartbeat count; atomic because writer and reader access it
    /// without sharing a lock.
    std::atomic<uint64_t> fCount;
};
struct RegionCounter struct RegionCounter
{ {
RegionCounter(uint16_t c) RegionCounter(uint16_t c)

View File

@ -80,7 +80,7 @@ class Manager
, fMsgCounterNew(0) , fMsgCounterNew(0)
, fMsgCounterDelete(0) , fMsgCounterDelete(0)
#endif #endif
, fSendHeartbeats(true) , fBeatTheHeart(true)
, fThrowOnBadAlloc(config ? config->GetProperty<bool>("shm-throw-bad-alloc", true) : true) , fThrowOnBadAlloc(config ? config->GetProperty<bool>("shm-throw-bad-alloc", true) : true)
, fNoCleanup(config ? config->GetProperty<bool>("shm-no-cleanup", false) : false) , fNoCleanup(config ? config->GetProperty<bool>("shm-no-cleanup", false) : false)
{ {
@ -106,7 +106,7 @@ class Manager
StartMonitor(fShmId); StartMonitor(fShmId);
} }
fHeartbeatThread = std::thread(&Manager::SendHeartbeats, this); fHeartbeatThread = std::thread(&Manager::Heartbeats, this);
{ {
std::stringstream ss; std::stringstream ss;
@ -544,23 +544,15 @@ class Manager
void DecrementShmMsgCounter(uint16_t segmentId) { --((*fShmMsgCounters)[segmentId].fCount); } void DecrementShmMsgCounter(uint16_t segmentId) { --((*fShmMsgCounters)[segmentId].fCount); }
#endif #endif
void SendHeartbeats() void Heartbeats()
{ {
std::string controlQueueName("fmq_" + fShmId + "_cq"); using namespace boost::interprocess;
Heartbeat* hb = fManagementSegment.find_or_construct<Heartbeat>(unique_instance)(0);
std::unique_lock<std::mutex> lock(fHeartbeatsMtx); std::unique_lock<std::mutex> lock(fHeartbeatsMtx);
while (fSendHeartbeats) { while (fBeatTheHeart) {
try { (hb->fCount)++;
boost::interprocess::message_queue mq(boost::interprocess::open_only, controlQueueName.c_str()); fHeartbeatsCV.wait_for(lock, std::chrono::milliseconds(100), [&]() { return !fBeatTheHeart; });
boost::posix_time::ptime sndTill = boost::posix_time::microsec_clock::universal_time() + boost::posix_time::milliseconds(100);
if (mq.timed_send(fDeviceId.c_str(), fDeviceId.size(), 0, sndTill)) {
fHeartbeatsCV.wait_for(lock, std::chrono::milliseconds(100), [&]() { return !fSendHeartbeats; });
} else {
LOG(debug) << "control queue timeout";
}
} catch (boost::interprocess::interprocess_exception& ie) {
fHeartbeatsCV.wait_for(lock, std::chrono::milliseconds(500), [&]() { return !fSendHeartbeats; });
// LOG(debug) << "no " << controlQueueName << " found";
}
} }
} }
@ -678,7 +670,7 @@ class Manager
{ {
std::unique_lock<std::mutex> lock(fHeartbeatsMtx); std::unique_lock<std::mutex> lock(fHeartbeatsMtx);
fSendHeartbeats = false; fBeatTheHeart = false;
} }
fHeartbeatsCV.notify_one(); fHeartbeatsCV.notify_one();
if (fHeartbeatThread.joinable()) { if (fHeartbeatThread.joinable()) {
@ -744,14 +736,12 @@ class Manager
#endif #endif
std::thread fHeartbeatThread; std::thread fHeartbeatThread;
bool fSendHeartbeats; bool fBeatTheHeart;
std::mutex fHeartbeatsMtx; std::mutex fHeartbeatsMtx;
std::condition_variable fHeartbeatsCV; std::condition_variable fHeartbeatsCV;
bool fThrowOnBadAlloc; bool fThrowOnBadAlloc;
bool fNoCleanup; bool fNoCleanup;
}; };
} // namespace fair::mq::shmem } // namespace fair::mq::shmem

View File

@ -87,7 +87,6 @@ Monitor::Monitor(string shmId, bool selfDestruct, bool interactive, bool viewOnl
, fTimeoutInMS(timeoutInMS) , fTimeoutInMS(timeoutInMS)
, fIntervalInMS(intervalInMS) , fIntervalInMS(intervalInMS)
, fShmId(std::move(shmId)) , fShmId(std::move(shmId))
, fControlQueueName("fmq_" + fShmId + "_cq")
, fTerminating(false) , fTerminating(false)
, fHeartbeatTriggered(false) , fHeartbeatTriggered(false)
, fLastHeartbeat(chrono::high_resolution_clock::now()) , fLastHeartbeat(chrono::high_resolution_clock::now())
@ -132,8 +131,7 @@ void Monitor::Run()
{ {
thread heartbeatThread; thread heartbeatThread;
if (!fViewOnly) { if (!fViewOnly) {
RemoveQueue(fControlQueueName); heartbeatThread = thread(&Monitor::CheckHeartbeats, this);
heartbeatThread = thread(&Monitor::ReceiveHeartbeats, this);
} }
if (fInteractive) { if (fInteractive) {
@ -158,7 +156,7 @@ void Monitor::Watch()
fSeenOnce = true; fSeenOnce = true;
auto now = chrono::high_resolution_clock::now(); auto now = chrono::high_resolution_clock::now();
unsigned int duration = chrono::duration_cast<chrono::milliseconds>(now - fLastHeartbeat).count(); unsigned int duration = chrono::duration_cast<chrono::milliseconds>(now - fLastHeartbeat.load()).count();
if (fHeartbeatTriggered && duration > fTimeoutInMS) { if (fHeartbeatTriggered && duration > fTimeoutInMS) {
// memory is present, but no heartbeats since timeout duration // memory is present, but no heartbeats since timeout duration
@ -181,7 +179,7 @@ void Monitor::Watch()
} else { } else {
// if self-destruct is requested, and no segment has ever been observed, quit after double timeout duration // if self-destruct is requested, and no segment has ever been observed, quit after double timeout duration
auto now = chrono::high_resolution_clock::now(); auto now = chrono::high_resolution_clock::now();
unsigned int duration = chrono::duration_cast<chrono::milliseconds>(now - fLastHeartbeat).count(); unsigned int duration = chrono::duration_cast<chrono::milliseconds>(now - fLastHeartbeat.load()).count();
if (duration > fTimeoutInMS * 2) { if (duration > fTimeoutInMS * 2) {
Cleanup(ShmId{fShmId}); Cleanup(ShmId{fShmId});
@ -305,31 +303,30 @@ void Monitor::ListAll(const std::string& path)
} }
} }
void Monitor::ReceiveHeartbeats() void Monitor::CheckHeartbeats()
{ {
try { using namespace boost::interprocess;
bipc::message_queue mq(bipc::open_or_create, fControlQueueName.c_str(), 1000, 256);
unsigned int priority = 0; uint64_t localHb = 0;
bipc::message_queue::size_type recvdSize = 0;
char msg[256] = {0};
while (!fTerminating) { while (!fTerminating) {
bpt::ptime rcvTill = bpt::microsec_clock::universal_time() + bpt::milliseconds(100); std::this_thread::sleep_for(std::chrono::milliseconds(200));
if (mq.timed_receive(&msg, sizeof(msg), recvdSize, priority, rcvTill)) { try {
fHeartbeatTriggered = true; managed_shared_memory managementSegment(open_read_only, std::string("fmq_" + fShmId + "_mng").c_str());
fLastHeartbeat = chrono::high_resolution_clock::now(); Heartbeat* hb = managementSegment.find<Heartbeat>(unique_instance).first;
string deviceId(msg, recvdSize);
fDeviceHeartbeats[deviceId] = fLastHeartbeat; if (hb) {
} else { uint64_t globalHb = hb->fCount;
// LOG(info) << "control queue timeout"; if (localHb != globalHb) {
fHeartbeatTriggered = true;
fLastHeartbeat.store(chrono::high_resolution_clock::now());
localHb = globalHb;
}
} }
} catch (bie&) {
// management segment not found, simply retry.
} }
} catch (bie& ie) {
LOG(info) << ie.what();
} }
RemoveQueue(fControlQueueName);
} }
void Monitor::Interactive() void Monitor::Interactive()
@ -629,7 +626,6 @@ std::vector<std::pair<std::string, bool>> Monitor::CleanupFull(const ShmId& shmI
{ {
auto result = Cleanup(shmId, verbose); auto result = Cleanup(shmId, verbose);
result.emplace_back(RunRemoval(Monitor::RemoveMutex, "fmq_" + shmId.shmId + "_ms", verbose)); result.emplace_back(RunRemoval(Monitor::RemoveMutex, "fmq_" + shmId.shmId + "_ms", verbose));
result.emplace_back(RunRemoval(Monitor::RemoveQueue, "fmq_" + shmId.shmId + "_cq", verbose));
return result; return result;
} }

View File

@ -117,7 +117,7 @@ class Monitor
private: private:
void PrintHelp(); void PrintHelp();
void Watch(); void Watch();
void ReceiveHeartbeats(); void CheckHeartbeats();
void CheckSegment(); void CheckSegment();
void Interactive(); void Interactive();
void SignalMonitor(); void SignalMonitor();
@ -131,12 +131,10 @@ class Monitor
unsigned int fTimeoutInMS; unsigned int fTimeoutInMS;
unsigned int fIntervalInMS; unsigned int fIntervalInMS;
std::string fShmId; std::string fShmId;
std::string fControlQueueName;
std::atomic<bool> fTerminating; std::atomic<bool> fTerminating;
std::atomic<bool> fHeartbeatTriggered; std::atomic<bool> fHeartbeatTriggered;
std::chrono::high_resolution_clock::time_point fLastHeartbeat; std::atomic<std::chrono::high_resolution_clock::time_point> fLastHeartbeat;
std::thread fSignalThread; std::thread fSignalThread;
std::unordered_map<std::string, std::chrono::high_resolution_clock::time_point> fDeviceHeartbeats;
}; };
} // namespace fair::mq::shmem } // namespace fair::mq::shmem

View File

@ -19,7 +19,6 @@ FairMQ Shared Memory currently uses the following names to register shared memor
| `fmq_<shmId>_rg_<index>` | unmanaged region(s) | one of the devices | devices with unmanaged regions | | `fmq_<shmId>_rg_<index>` | unmanaged region(s) | one of the devices | devices with unmanaged regions |
| `fmq_<shmId>_rgq_<index>` | unmanaged region queue(s) | one of the devices | devices with unmanaged regions | | `fmq_<shmId>_rgq_<index>` | unmanaged region queue(s) | one of the devices | devices with unmanaged regions |
| `fmq_<shmId>_ms` | shmmonitor status | shmmonitor | devices, shmmonitor | | `fmq_<shmId>_ms` | shmmonitor status | shmmonitor | devices, shmmonitor |
| `fmq_<shmId>_cq` | message queue between transport and shmmonitor | shmmonitor | devices, shmmonitor |
The shmId is generated out of session id and user id. The shmId is generated out of session id and user id.