Compare commits

..

10 Commits

Author SHA1 Message Date
Alexey Rybalchenko
af0d668951 Shm: fix region init with external regions 2022-09-09 15:40:33 +02:00
Alexey Rybalchenko
072d7cb744 shm: add some debug output 2022-09-09 15:40:33 +02:00
Alexey Rybalchenko
f5c46ce018 region example: add options for testing with externally-created regions 2022-09-09 15:40:33 +02:00
Alexey Rybalchenko
d105960444 fix(shm): Fix incorrect parameters when mapping regions 2022-09-06 08:09:47 +02:00
Dennis Klein
3aae5bae58 build: Add ORCID for Christian Tacke 2022-09-06 08:08:42 +02:00
Dennis Klein
9031029d2c build: Add ORCID for Dennis Klein 2022-09-06 08:08:34 +02:00
Dennis Klein
d478e050ba build: HTML-Format desc field in zenodo.org config 2022-09-06 08:08:14 +02:00
Dennis Klein
06b2b9b01f build: Add license hint to zenodo.org config 2022-09-06 08:08:05 +02:00
Dennis Klein
b3fa4f6e7e build: Add config for zenodo.org import 2022-09-06 08:07:46 +02:00
Alexey Rybalchenko
da5cb34416 fix(shm): race/deadlock in region locks 2022-08-21 18:32:24 +02:00
10 changed files with 207 additions and 75 deletions

86
.zenodo.json Normal file
View File

@@ -0,0 +1,86 @@
{
"creators": [
{
"name": "Al-Turany, Mohammad"
},
{
"orcid": "0000-0003-3787-1910",
"name": "Klein, Dennis"
},
{
"name": "Kollegger, Thorsten"
},
{
"name": "Rybalchenko, Alexey"
},
{
"name": "Winckler, Nicolas"
}
],
"contributors": [
{
"type": "Other",
"name": "Aphecetche, Laurent"
},
{
"type": "Other",
"name": "Binet, Sebastien"
},
{
"type": "Other",
"name": "Eulisse, Giulio"
},
{
"type": "Other",
"name": "Karabowicz, Radoslaw"
},
{
"type": "Other",
"name": "Kretz, Matthias"
},
{
"type": "Other",
"name": "Krzewicki, Mikolaj"
},
{
"type": "Other",
"name": "Lebedev, Andrey"
},
{
"type": "Other",
"name": "Mrnjavac, Teo"
},
{
"type": "Other",
"name": "Neskovic, Gvozden"
},
{
"type": "Other",
"name": "Richter, Matthias"
},
{
"type": "Other",
"orcid": "0000-0002-5321-8404",
"name": "Tacke, Christian"
},
{
"type": "Other",
"name": "Uhlig, Florian"
},
{
"type": "Other",
"name": "Wenzel, Sandro"
}
],
"description": "<p>C++ Message Queuing Library and Framework</p>",
"related_identifiers": [
{
"identifier": "https://github.com/FairRootGroup/FairMQ/",
"relation": "isSupplementTo",
"resource_type": "software",
"scheme": "url"
}
],
"title": "FairMQ",
"license": "LGPL-3.0-only"
}

View File

@@ -1,5 +1,5 @@
Al-Turany, Mohammad
Klein, Dennis
Klein, Dennis [https://orcid.org/0000-0003-3787-1910]
Kollegger, Thorsten
Rybalchenko, Alexey
Winckler, Nicolas

View File

@@ -8,6 +8,6 @@ Lebedev, Andrey
Mrnjavac, Teo <teo.m@cern.ch>
Neskovic, Gvozden
Richter, Matthias
Tacke, Christian
Tacke, Christian [https://orcid.org/0000-0002-5321-8404]
Uhlig, Florian
Wenzel, Sandro

View File

@@ -18,7 +18,8 @@
{
"@type": "Person",
"givenName": "Dennis",
"familyName": "Klein"
"familyName": "Klein",
"@id": "https://orcid.org/0000-0003-3787-1910"
},
{
"@type": "Person",
@@ -92,7 +93,8 @@
{
"@type": "Person",
"givenName": "Christian",
"familyName": "Tacke"
"familyName": "Tacke",
"@id": "https://orcid.org/0000-0002-5321-8404"
},
{
"@type": "Person",

View File

@@ -19,6 +19,10 @@ SAMPLER+=" --severity debug"
SAMPLER+=" --msg-size $msgSize"
# SAMPLER+=" --rate 10"
SAMPLER+=" --transport $transport"
# SAMPLER+=" --external-region true"
# SAMPLER+=" --shm-no-cleaup true"
# SAMPLER+=" --shm-monitor false"
# SAMPLER+=" --shmid 1"
SAMPLER+=" --channel-config name=data,type=push,method=bind,address=tcp://127.0.0.1:7777,sndKernelSize=212992"
xterm -geometry 120x60+0+0 -hold -e @EX_BIN_DIR@/$SAMPLER &
@@ -26,5 +30,8 @@ SINK="fairmq-ex-region-sink"
SINK+=" --id sink1"
SINK+=" --severity debug"
SINK+=" --transport $transport"
# SINK+=" --shm-no-cleaup true"
# SINK+=" --shm-monitor false"
# SINK+=" --shmid 1"
SINK+=" --channel-config name=data,type=pull,method=connect,address=tcp://127.0.0.1:7777,rcvKernelSize=212992"
xterm -geometry 120x60+750+0 -hold -e @EX_BIN_DIR@/$SINK &

View File

@@ -19,6 +19,7 @@ struct Sampler : fair::mq::Device
{
void InitTask() override
{
fExternalRegion = fConfig->GetProperty<bool>("external-region");
fMsgSize = fConfig->GetProperty<int>("msg-size");
fLinger = fConfig->GetProperty<uint32_t>("region-linger");
fMaxIterations = fConfig->GetProperty<uint64_t>("max-iterations");
@@ -34,9 +35,18 @@ struct Sampler : fair::mq::Device
fair::mq::RegionConfig regionCfg;
regionCfg.linger = fLinger; // delay in ms before region destruction to collect outstanding events
// options for testing with an externally-created -region
if (fExternalRegion) {
regionCfg.id = 1;
regionCfg.removeOnDestruction = false;
regionCfg.lock = false; // mlock region after creation
regionCfg.lock = false; // mlock region after creation
} else {
regionCfg.lock = true; // mlock region after creation
regionCfg.zero = true; // zero region content after creation
fRegion = fair::mq::UnmanagedRegionPtr(NewUnmanagedRegionFor("data", // region is created using the transport of this channel...
}
fRegion = fair::mq::UnmanagedRegionPtr(NewUnmanagedRegionFor(
"data", // region is created using the transport of this channel...
0, // ... and this sub-channel
10000000, // region size
[this](const std::vector<fair::mq::RegionBlock>& blocks) { // callback to be called when message buffers no longer needed by transport
@@ -45,7 +55,9 @@ struct Sampler : fair::mq::Device
if (fMaxIterations > 0) {
LOG(info) << "Received " << blocks.size() << " acks";
}
}, regionCfg));
},
regionCfg
));
}
bool ConditionalRun() override
@@ -76,6 +88,8 @@ struct Sampler : fair::mq::Device
void ResetTask() override
{
// give some time for acks to be received
std::this_thread::sleep_for(std::chrono::milliseconds(250));
fRegion.reset();
{
std::lock_guard<std::mutex> lock(fMtx);
@@ -89,6 +103,7 @@ struct Sampler : fair::mq::Device
}
private:
int fExternalRegion = false;
int fMsgSize = 10000;
uint32_t fLinger = 100;
uint64_t fMaxIterations = 0;
@@ -103,7 +118,8 @@ void addCustomOptions(bpo::options_description& options)
options.add_options()
("msg-size", bpo::value<int>()->default_value(1000), "Message size in bytes")
("region-linger", bpo::value<uint32_t>()->default_value(100), "Linger period for regions")
("max-iterations", bpo::value<uint64_t>()->default_value(0), "Maximum number of iterations of Run/ConditionalRun/OnData (0 - infinite)");
("max-iterations", bpo::value<uint64_t>()->default_value(0), "Maximum number of iterations of Run/ConditionalRun/OnData (0 - infinite)")
("external-region", bpo::value<bool>()->default_value(false), "Use region created by another process");
}
std::unique_ptr<fair::mq::Device> getDevice(fair::mq::ProgOptions& /*config*/)

View File

@@ -395,20 +395,10 @@ class Manager
const uint16_t id = cfg.id.value();
UnmanagedRegion* region = nullptr;
bool newRegionCreated = false;
{
std::lock_guard<std::mutex> lock(fLocalRegionsMtx);
auto res = fRegions.emplace(id, std::make_unique<UnmanagedRegion>(fShmId, size, false, cfg));
newRegionCreated = res.second;
region = res.first->second.get();
}
auto& region = fRegions[id] = std::make_unique<UnmanagedRegion>(fShmId, size, false, cfg);
// LOG(debug) << "Created region with id '" << id << "', path: '" << cfg.path << "', flags: '" << cfg.creationFlags << "'";
if (!newRegionCreated) {
region->fRemote = false; // TODO: this should be more clear, refactor it.
}
// start ack receiver only if a callback has been provided.
if (callback || bulkCallback) {
region->SetCallbacks(callback, bulkCallback);
@@ -416,7 +406,7 @@ class Manager
region->StartAckSender();
region->StartAckReceiver();
}
result.first = region;
result.first = region.get();
result.second = id;
}
fRegionsGen += 1; // signal TL cache invalidation
@@ -429,7 +419,7 @@ class Manager
}
}
UnmanagedRegion* GetRegion(uint16_t id)
UnmanagedRegion* GetRegionFromCache(uint16_t id)
{
// NOTE: gcc optimizations. Prevent loading tls addresses many times in the fast path
const auto &lTlCache = fTlRegionCache;
@@ -443,41 +433,40 @@ class Manager
}
}
boost::interprocess::scoped_lock<boost::interprocess::interprocess_mutex> shmLock(*fShmMtx);
// slow path: check invalidation
if (lTlCacheGen != fRegionsGen) {
fTlRegionCache.fRegionsTLCache.clear();
}
std::lock_guard<std::mutex> lock(fLocalRegionsMtx);
auto* lRegion = GetRegionUnsafe(id, shmLock);
auto* lRegion = GetRegion(id);
fTlRegionCache.fRegionsTLCache.emplace_back(std::make_tuple(lRegion, id, fShmId64));
fTlRegionCache.fRegionsTLCacheGen = fRegionsGen;
return lRegion;
}
UnmanagedRegion* GetRegionUnsafe(uint16_t id, boost::interprocess::scoped_lock<boost::interprocess::interprocess_mutex>& lockedShmLock)
UnmanagedRegion* GetRegion(uint16_t id)
{
std::lock_guard<std::mutex> lock(fLocalRegionsMtx);
// remote region could actually be a local one if a message originates from this device (has been sent out and returned)
auto it = fRegions.find(id);
if (it != fRegions.end()) {
return it->second.get();
} else {
try {
// get region info
RegionInfo regionInfo = fShmRegions->at(id);
// safe to unlock now - no shm container accessed after this
lockedShmLock.unlock();
RegionConfig cfg;
// get region info
{
boost::interprocess::scoped_lock<boost::interprocess::interprocess_mutex> shmLock(*fShmMtx);
RegionInfo regionInfo = fShmRegions->at(id);
cfg.id = id;
cfg.creationFlags = regionInfo.fCreationFlags;
cfg.path = regionInfo.fPath.c_str();
}
// LOG(debug) << "Located remote region with id '" << id << "', path: '" << cfg.path << "', flags: '" << cfg.creationFlags << "'";
auto r = fRegions.emplace(id, std::make_unique<UnmanagedRegion>(fShmId, 0, true, std::move(cfg)));
r.first->second->InitializeQueues();
r.first->second->StartAckSender();
lockedShmLock.lock();
return r.first->second.get();
} catch (std::out_of_range& oor) {
LOG(error) << "Could not get remote region with id '" << id << "'. Does the region creator run with the same session id?";
@@ -493,10 +482,10 @@ class Manager
void RemoveRegion(uint16_t id)
{
try {
boost::interprocess::scoped_lock<boost::interprocess::interprocess_mutex> shmLock(*fShmMtx);
std::lock_guard<std::mutex> lock(fLocalRegionsMtx);
fRegions.at(id)->StopAcks();
{
boost::interprocess::scoped_lock<boost::interprocess::interprocess_mutex> shmLock(*fShmMtx);
if (fRegions.at(id)->RemoveOnDestruction()) {
fShmRegions->at(id).fDestroyed = true;
(fEventCounter->fCount)++;
@@ -512,45 +501,74 @@ class Manager
std::vector<fair::mq::RegionInfo> GetRegionInfo()
{
std::vector<fair::mq::RegionInfo> result;
std::map<uint64_t, RegionConfig> regionCfgs;
{
boost::interprocess::scoped_lock<boost::interprocess::interprocess_mutex> shmLock(*fShmMtx);
for (const auto& e : *fShmSegments) {
for (const auto& [segmentId, segmentInfo] : *fShmSegments) {
// make sure any segments in the session are found
GetSegment(e.first);
GetSegment(segmentId);
try {
fair::mq::RegionInfo info;
info.managed = true;
info.id = e.first;
info.id = segmentId;
info.event = RegionEvent::created;
info.ptr = boost::apply_visitor(SegmentAddress(), fSegments.at(e.first));
info.size = boost::apply_visitor(SegmentSize(), fSegments.at(e.first));
info.ptr = boost::apply_visitor(SegmentAddress(), fSegments.at(segmentId));
info.size = boost::apply_visitor(SegmentSize(), fSegments.at(segmentId));
result.push_back(info);
} catch (const std::out_of_range& oor) {
LOG(error) << "could not find segment with id " << e.first;
LOG(error) << "could not find segment with id " << segmentId;
LOG(error) << oor.what();
}
}
for (const auto& e : *fShmRegions) {
for (const auto& [regionId, regionInfo] : *fShmRegions) {
fair::mq::RegionInfo info;
info.managed = false;
info.id = e.first;
info.flags = e.second.fUserFlags;
info.event = e.second.fDestroyed ? RegionEvent::destroyed : RegionEvent::created;
info.id = regionId;
info.flags = regionInfo.fUserFlags;
info.event = regionInfo.fDestroyed ? RegionEvent::destroyed : RegionEvent::created;
if (info.event == RegionEvent::created) {
auto region = GetRegionUnsafe(info.id, shmLock);
if (region) {
info.ptr = region->GetData();
info.size = region->GetSize();
} else {
throw std::runtime_error(tools::ToString("GetRegionInfo() could not get region with id '", info.id, "'"));
}
RegionConfig cfg;
cfg.id = info.id;
cfg.creationFlags = regionInfo.fCreationFlags;
cfg.path = regionInfo.fPath.c_str();
regionCfgs.emplace(info.id, cfg);
// fill the ptr+size info after shmLock is released, to avoid constructing local region under it
} else {
info.ptr = nullptr;
info.size = 0;
}
result.push_back(info);
}
}
// do another iteration outside of shm lock, to fill ptr+size of unmanaged regions
for (auto& info : result) {
if (!info.managed && info.event == RegionEvent::created) {
auto cfgIt = regionCfgs.find(info.id);
if (cfgIt != regionCfgs.end()) {
UnmanagedRegion* region = nullptr;
std::lock_guard<std::mutex> lock(fLocalRegionsMtx);
auto it = fRegions.find(info.id);
if (it != fRegions.end()) {
region = it->second.get();
} else {
auto r = fRegions.emplace(cfgIt->first, std::make_unique<UnmanagedRegion>(fShmId, 0, true, cfgIt->second));
region = r.first->second.get();
region->InitializeQueues();
region->StartAckSender();
}
info.ptr = region->GetData();
info.size = region->GetSize();
} else {
info.ptr = nullptr;
info.size = 0;
}
}
}
return result;
}

View File

@@ -195,7 +195,7 @@ class Message final : public fair::mq::Message
fLocalPtr = nullptr;
}
} else {
fRegionPtr = fManager.GetRegion(fMeta.fRegionId);
fRegionPtr = fManager.GetRegionFromCache(fMeta.fRegionId);
if (fRegionPtr) {
fLocalPtr = reinterpret_cast<char*>(fRegionPtr->GetData()) + fMeta.fHandle;
} else {
@@ -365,7 +365,7 @@ class Message final : public fair::mq::Message
void ReleaseUnmanagedRegionBlock()
{
if (!fRegionPtr) {
fRegionPtr = fManager.GetRegion(fMeta.fRegionId);
fRegionPtr = fManager.GetRegionFromCache(fMeta.fRegionId);
}
if (fRegionPtr) {

View File

@@ -670,7 +670,7 @@ std::vector<std::pair<std::string, bool>> Monitor::Cleanup(const ShmId& shmIdT,
string path = info.fPath.c_str();
int flags = info.fCreationFlags;
if (verbose) {
LOG(info) << "Found RegionInfo with path: '" << path << "', flags: " << flags << ", fDestroyed: " << info.fDestroyed << ".";
LOG(info) << "Found UnmanagedRegion with id: " << id << ", path: '" << path << "', flags: " << flags << ", fDestroyed: " << info.fDestroyed << ".";
}
if (!path.empty()) {
result.emplace_back(Remove<bipc::file_mapping>(path + "fmq_" + shmId + "_rg_" + to_string(id), verbose));

View File

@@ -74,6 +74,8 @@ struct UnmanagedRegion
// TODO: refactor this
cfg.size = size;
LOG(debug) << "UnmanagedRegion(): " << fName << " | remote: " << remote << ".";
if (!cfg.path.empty()) {
fName = std::string(cfg.path + fName);
@@ -171,6 +173,7 @@ struct UnmanagedRegion
~UnmanagedRegion()
{
LOG(debug) << "~UnmanagedRegion(): " << fName << " | remote: " << fRemote << ".";
fStopAcks = true;
if (fAcksSender.joinable()) {