FairMQ: Fix missing ofi completion events

This commit is contained in:
Dennis Klein 2018-03-08 03:07:26 +01:00 committed by Mohammad Al-Turany
parent 144aa912d7
commit c5072ea425
5 changed files with 53 additions and 33 deletions

View File

@ -122,15 +122,12 @@ auto Context::GetBoostVersion() const -> std::string
return tools::ToString(BOOST_VERSION / 100000, ".", BOOST_VERSION / 100 % 1000, ".", BOOST_VERSION % 100); return tools::ToString(BOOST_VERSION / 100000, ".", BOOST_VERSION / 100 % 1000, ".", BOOST_VERSION % 100);
} }
auto Context::InitOfi(ConnectionType type, std::string addr) -> void auto Context::InitOfi(ConnectionType type, Address addr) -> void
{ {
auto addr2 = ConvertAddress(addr);
if (addr2.Protocol != "tcp")
throw ContextError{"Wrong protocol: Supplied address must be in format tcp://ip:port"};
if (!fOfiInfo) { if (!fOfiInfo) {
sockaddr_in* sa = static_cast<sockaddr_in*>(malloc(sizeof(sockaddr_in))); sockaddr_in* sa = static_cast<sockaddr_in*>(malloc(sizeof(sockaddr_in)));
auto sa2 = ConvertAddress(addr2); addr.Port = 0;
auto sa2 = ConvertAddress(addr);
memcpy(sa, &sa2, sizeof(sockaddr_in)); memcpy(sa, &sa2, sizeof(sockaddr_in));
// Prepare fi_getinfo query // Prepare fi_getinfo query
@ -143,16 +140,15 @@ auto Context::InitOfi(ConnectionType type, std::string addr) -> void
ofi_hints->domain_attr->threading = FI_THREAD_SAFE; ofi_hints->domain_attr->threading = FI_THREAD_SAFE;
ofi_hints->domain_attr->control_progress = FI_PROGRESS_AUTO; ofi_hints->domain_attr->control_progress = FI_PROGRESS_AUTO;
ofi_hints->domain_attr->data_progress = FI_PROGRESS_AUTO; ofi_hints->domain_attr->data_progress = FI_PROGRESS_AUTO;
// if (type == ConnectionType::Bind) { ofi_hints->tx_attr->op_flags = FI_COMPLETION;
// ofi_hints->src_addr = sa; ofi_hints->rx_attr->op_flags = FI_COMPLETION;
// ofi_hints->src_addrlen = sizeof(sockaddr_in); ofi_hints->src_addr = sa;
// } else { ofi_hints->src_addrlen = sizeof(sockaddr_in);
// ofi_hints->dest_addr = sa; ofi_hints->dest_addr = nullptr;
// ofi_hints->dest_addrlen = sizeof(sockaddr_in); ofi_hints->dest_addrlen = 0;
// }
// Query fi_getinfo for fabric to use // Query fi_getinfo for fabric to use
auto res = fi_getinfo(FI_VERSION(1, 5), strdup(addr2.Ip.c_str()), 0, 0, ofi_hints.get(), &fOfiInfo); auto res = fi_getinfo(FI_VERSION(1, 5), nullptr, nullptr, 0, ofi_hints.get(), &fOfiInfo);
if (res != 0) throw ContextError{tools::ToString("Failed querying fi_getinfo, reason: ", fi_strerror(res))}; if (res != 0) throw ContextError{tools::ToString("Failed querying fi_getinfo, reason: ", fi_strerror(res))};
if (!fOfiInfo) throw ContextError{"Could not find any ofi compatible fabric."}; if (!fOfiInfo) throw ContextError{"Could not find any ofi compatible fabric."};
@ -286,7 +282,12 @@ auto Context::InsertAddressVector(sockaddr_in address) -> fi_addr_t
if (ret != 1) if (ret != 1)
throw ContextError{tools::ToString("Failed to insert address into ofi address vector")}; throw ContextError{tools::ToString("Failed to insert address into ofi address vector")};
return ret; return mappedAddress;
}
auto Context::AddressVectorLookup(fi_addr_t address) -> sockaddr_in
{
throw ContextError("Not yet implemented");
} }
auto Context::ConvertAddress(std::string address) -> Address auto Context::ConvertAddress(std::string address) -> Address

View File

@ -41,7 +41,6 @@ class Context
Context(int numberIoThreads = 2); Context(int numberIoThreads = 2);
~Context(); ~Context();
auto InitOfi(ConnectionType type, std::string address) -> void;
auto CreateOfiEndpoint() -> fid_ep*; auto CreateOfiEndpoint() -> fid_ep*;
auto CreateOfiCompletionQueue(Direction dir) -> fid_cq*; auto CreateOfiCompletionQueue(Direction dir) -> fid_cq*;
auto GetZmqVersion() const -> std::string; auto GetZmqVersion() const -> std::string;
@ -51,12 +50,14 @@ class Context
auto GetZmqContext() const -> void* { return fZmqContext; } auto GetZmqContext() const -> void* { return fZmqContext; }
auto GetIoContext() -> boost::asio::io_service& { return fIoContext; } auto GetIoContext() -> boost::asio::io_service& { return fIoContext; }
auto InsertAddressVector(sockaddr_in address) -> fi_addr_t; auto InsertAddressVector(sockaddr_in address) -> fi_addr_t;
auto AddressVectorLookup(fi_addr_t address) -> sockaddr_in;
struct Address { struct Address {
std::string Protocol; std::string Protocol;
std::string Ip; std::string Ip;
unsigned int Port; unsigned int Port;
friend auto operator<<(std::ostream& os, const Address& a) -> std::ostream& { return os << a.Protocol << "://" << a.Ip << ":" << a.Port; } friend auto operator<<(std::ostream& os, const Address& a) -> std::ostream& { return os << a.Protocol << "://" << a.Ip << ":" << a.Port; }
}; };
auto InitOfi(ConnectionType type, Address address) -> void;
static auto ConvertAddress(std::string address) -> Address; static auto ConvertAddress(std::string address) -> Address;
static auto ConvertAddress(Address address) -> sockaddr_in; static auto ConvertAddress(Address address) -> sockaddr_in;
static auto ConvertAddress(sockaddr_in address) -> Address; static auto ConvertAddress(sockaddr_in address) -> Address;

View File

@ -85,7 +85,7 @@ auto Socket::Bind(const string& address) -> bool
try { try {
auto addr = Context::VerifyAddress(address); auto addr = Context::VerifyAddress(address);
BindControlSocket(addr); BindControlSocket(addr);
fContext.InitOfi(ConnectionType::Bind, address); fContext.InitOfi(ConnectionType::Bind, addr);
InitDataEndpoint(); InitDataEndpoint();
fWaitingForControlPeer = true; fWaitingForControlPeer = true;
return true; return true;
@ -106,7 +106,7 @@ auto Socket::Connect(const string& address) -> void
{ {
auto addr = Context::VerifyAddress(address); auto addr = Context::VerifyAddress(address);
ConnectControlSocket(addr); ConnectControlSocket(addr);
fContext.InitOfi(ConnectionType::Connect, address); fContext.InitOfi(ConnectionType::Connect, addr);
InitDataEndpoint(); InitDataEndpoint();
fWaitingForControlPeer = true; fWaitingForControlPeer = true;
} }
@ -207,7 +207,7 @@ try {
auto Socket::SendControlMessage(unique_ptr<ControlMessage> ctrl) -> void auto Socket::SendControlMessage(unique_ptr<ControlMessage> ctrl) -> void
{ {
assert(fControlSocket); assert(fControlSocket);
LOG(debug) << "About to send control message: " << ctrl->DebugString(); // LOG(debug) << "About to send control message: " << ctrl->DebugString();
// Serialize // Serialize
string* str = new string(); string* str = new string();
@ -217,8 +217,10 @@ auto Socket::SendControlMessage(unique_ptr<ControlMessage> ctrl) -> void
assert(ret == 0); assert(ret == 0);
// Send // Send
if (zmq_msg_send(&msg, fControlSocket, 0) == -1) if (zmq_msg_send(&msg, fControlSocket, 0) == -1) {
zmq_msg_close(&msg);
throw SocketError(tools::ToString("Failed to send control message, reason: ", zmq_strerror(errno))); throw SocketError(tools::ToString("Failed to send control message, reason: ", zmq_strerror(errno)));
}
} }
auto Socket::ReceiveControlMessage() -> unique_ptr<ControlMessage> auto Socket::ReceiveControlMessage() -> unique_ptr<ControlMessage>
@ -229,14 +231,17 @@ auto Socket::ReceiveControlMessage() -> unique_ptr<ControlMessage>
zmq_msg_t msg; zmq_msg_t msg;
auto ret = zmq_msg_init(&msg); auto ret = zmq_msg_init(&msg);
assert(ret == 0); assert(ret == 0);
if (zmq_msg_recv(&msg, fControlSocket, 0) == -1) if (zmq_msg_recv(&msg, fControlSocket, 0) == -1) {
zmq_msg_close(&msg);
throw SocketError(tools::ToString("Failed to receive control message, reason: ", zmq_strerror(errno))); throw SocketError(tools::ToString("Failed to receive control message, reason: ", zmq_strerror(errno)));
}
// Deserialize // Deserialize
auto ctrl = tools::make_unique<ControlMessage>(); auto ctrl = tools::make_unique<ControlMessage>();
ctrl->ParseFromArray(zmq_msg_data(&msg), zmq_msg_size(&msg)); ctrl->ParseFromArray(zmq_msg_data(&msg), zmq_msg_size(&msg));
LOG(debug) << "Received control message: " << ctrl->DebugString(); zmq_msg_close(&msg);
// LOG(debug) << "Received control message: " << ctrl->DebugString();
return ctrl; return ctrl;
} }
@ -269,6 +274,9 @@ auto Socket::WaitForControlPeer() -> void
string remoteIp(inet_ntoa(remoteAddr.sin_addr)); string remoteIp(inet_ntoa(remoteAddr.sin_addr));
int remotePort = ntohs(remoteAddr.sin_port); int remotePort = ntohs(remoteAddr.sin_port);
LOG(debug) << "Accepted control peer connection from " << remoteIp << ":" << remotePort; LOG(debug) << "Accepted control peer connection from " << remoteIp << ":" << remotePort;
// sucks, but the above event does not guarantee the socket is operational ...
std::this_thread::sleep_for(std::chrono::milliseconds(200));
} else if (event == ZMQ_EVENT_CONNECTED) { } else if (event == ZMQ_EVENT_CONNECTED) {
LOG(debug) << "Connected successfully to control peer"; LOG(debug) << "Connected successfully to control peer";
} else { } else {
@ -318,7 +326,7 @@ try {
throw SocketError(tools::ToString("Failed posting ofi send buffer, reason: ", fi_strerror(ret))); throw SocketError(tools::ToString("Failed posting ofi send buffer, reason: ", fi_strerror(ret)));
fi_cq_err_entry cqEntry; fi_cq_err_entry cqEntry;
ret = fi_cq_sread(fDataCompletionQueueTx, &cqEntry, 1, nullptr, 1000); ret = fi_cq_sread(fDataCompletionQueueTx, &cqEntry, 1, nullptr, -1);
if (ret != 1) if (ret != 1)
throw SocketError(tools::ToString("Failed reading ofi tx completion queue event, reason: ", fi_strerror(ret))); throw SocketError(tools::ToString("Failed reading ofi tx completion queue event, reason: ", fi_strerror(ret)));
} }
@ -350,12 +358,13 @@ try {
assert(ctrl->has_post_buffer()); assert(ctrl->has_post_buffer());
auto postBuffer = ctrl->post_buffer(); auto postBuffer = ctrl->post_buffer();
auto size = postBuffer.size(); auto size = postBuffer.size();
LOG(debug) << "Received post buffer control message with size: " << size;
// Receive data // Receive data
if (size) { if (size) {
msg->Rebuild(size); msg->Rebuild(size);
auto ret = fi_recv(fDataEndpoint, msg->GetData(), msg->GetSize(), nullptr, fRemoteDataAddr, nullptr); auto buf = msg->GetData();
auto size2 = msg->GetSize();
auto ret = fi_recv(fDataEndpoint, buf, size2, nullptr, fRemoteDataAddr, nullptr);
if (ret != FI_SUCCESS) if (ret != FI_SUCCESS)
throw SocketError(tools::ToString("Failed posting ofi receive buffer, reason: ", fi_strerror(ret))); throw SocketError(tools::ToString("Failed posting ofi receive buffer, reason: ", fi_strerror(ret)));
@ -368,9 +377,11 @@ try {
SendControlMessage(move(ctrl2)); SendControlMessage(move(ctrl2));
fi_cq_err_entry cqEntry; fi_cq_err_entry cqEntry;
ret = fi_cq_sread(fDataCompletionQueueRx, &cqEntry, 1, nullptr, 1000); ret = fi_cq_sread(fDataCompletionQueueRx, &cqEntry, 1, nullptr, -1);
if (ret != 1) if (ret != 1)
throw SocketError(tools::ToString("Failed reading ofi rx completion queue event, reason: ", fi_strerror(ret))); throw SocketError(tools::ToString("Failed reading ofi rx completion queue event, reason: ", fi_strerror(ret)));
assert(cqEntry.len == size2);
assert(cqEntry.buf == buf);
} }
return size; return size;

View File

@ -45,13 +45,17 @@ class PairLeft : public FairMQDevice
// Simple message with short text data // Simple message with short text data
auto msg5{NewSimpleMessageFor("data", 0, "testdata1234")}; auto msg5{NewSimpleMessageFor("data", 0, "testdata1234")};
LOG(info) << "Will send msg5";
if (Send(msg5, "data") >= 0) counter++; if (Send(msg5, "data") >= 0) counter++;
LOG(info) << "Sent msg5"; auto msg6{NewMessageFor("data", 0)};
if (counter == 5) LOG(info) << "Simple message with short text data successfull"; auto ret = Receive(msg6, "data");
if (ret > 0) {
auto content = std::string{static_cast<char*>(msg6->GetData()), msg6->GetSize()};
LOG(info) << ret << ", " << msg6->GetSize() << ", '" << content << "'";
if (msg6->GetSize() == ret && content == "testdata1234") counter++;
}
if (counter == 6) LOG(info) << "Simple message with short text data successfull";
std::this_thread::sleep_for(std::chrono::milliseconds(1000)); assert(counter == 6);
assert(counter == 5);
}; };
}; };

View File

@ -52,9 +52,12 @@ class PairRight : public FairMQDevice
LOG(info) << ret << ", " << msg5->GetSize() << ", '" << content << "'"; LOG(info) << ret << ", " << msg5->GetSize() << ", '" << content << "'";
if (msg5->GetSize() == ret && content == "testdata1234") counter++; if (msg5->GetSize() == ret && content == "testdata1234") counter++;
} }
if (counter == 5) LOG(info) << "Simple message with short text data successfull"; auto msg6{NewSimpleMessageFor("data", 0, "testdata1234")};
if (Send(msg6, "data") >= 0) counter++;
if (counter == 6) LOG(info) << "Simple message with short text data successfull";
if (counter == 5) LOG(info) << "PAIR test successfull."; std::this_thread::sleep_for(std::chrono::milliseconds(1000));
if (counter == 6) LOG(info) << "PAIR test successfull.";
}; };
}; };