From 2c31bf79b0c6d83106110431be6c72e2368095d0 Mon Sep 17 00:00:00 2001 From: Horst Schirmeier <horst.schirmeier@tu-dortmund.de> Date: Mon, 20 Jan 2014 21:02:04 +0100 Subject: [PATCH] jobclient: expect communication failures This change makes the JobClient act properly on communication aborts. Change-Id: I0a76489f117e9721546215e3b627002605e25452 --- src/core/efw/JobClient.cc | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/src/core/efw/JobClient.cc b/src/core/efw/JobClient.cc index 34732338..01f673f2 100644 --- a/src/core/efw/JobClient.cc +++ b/src/core/efw/JobClient.cc @@ -149,10 +149,16 @@ FailControlMessage_Command JobClient::tryToGetExperimentData(ExperimentData& exp ExperimentData* temp_exp = new ExperimentData(exp.getMessage().New()); if (!SocketComm::rcvMsg(m_sockfd, temp_exp->getMessage())) { - // Failed to receive message? Retry. - close(m_sockfd); + // looks like we won't receive more jobs now, cleanup + delete &temp_exp->getMessage(); delete temp_exp; - return FailControlMessage::COME_AGAIN; + // did a previous loop iteration succeed? + if (m_parameters.size() > 0) { + break; + } else { + // nothing to do now, retry later + return FailControlMessage::COME_AGAIN; + } } temp_exp->setWorkloadID(ctrlmsg.workloadid(i)); //Store workload id of experiment data @@ -262,10 +268,16 @@ bool JobClient::sendResultsToServer() cout << "]"; // TODO: Log-level? - SocketComm::sendMsg(m_sockfd, ctrlmsg); + if (!SocketComm::sendMsg(m_sockfd, ctrlmsg)) { + close(m_sockfd); + return false; + } for (i = 0; i < ctrlmsg.job_size() ; i++) { - SocketComm::sendMsg(m_sockfd, m_results.front()->getMessage()); + if (!SocketComm::sendMsg(m_sockfd, m_results.front()->getMessage())) { + close(m_sockfd); + return false; + } delete &m_results.front()->getMessage(); delete m_results.front(); m_results.pop_front(); -- GitLab