From 2c31bf79b0c6d83106110431be6c72e2368095d0 Mon Sep 17 00:00:00 2001
From: Horst Schirmeier <horst.schirmeier@tu-dortmund.de>
Date: Mon, 20 Jan 2014 21:02:04 +0100
Subject: [PATCH] jobclient: expect communication failures

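This change makes the JobClient act properly on communication aborts:

- tryToGetExperimentData(): if receiving experiment data fails, the
  temporary message object is freed.  If m_parameters already holds
  entries, the receive loop is left early; otherwise COME_AGAIN is
  returned so the caller retries later.
- sendResultsToServer(): the return value of SocketComm::sendMsg() is
  now checked; on failure the socket is closed and false is returned.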

Change-Id: I0a76489f117e9721546215e3b627002605e25452
---
 src/core/efw/JobClient.cc | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/src/core/efw/JobClient.cc b/src/core/efw/JobClient.cc
index 34732338..01f673f2 100644
--- a/src/core/efw/JobClient.cc
+++ b/src/core/efw/JobClient.cc
@@ -149,10 +149,16 @@ FailControlMessage_Command JobClient::tryToGetExperimentData(ExperimentData& exp
 				ExperimentData* temp_exp = new ExperimentData(exp.getMessage().New());
 
 				if (!SocketComm::rcvMsg(m_sockfd, temp_exp->getMessage())) {
-					// Failed to receive message?  Retry.
-					close(m_sockfd);
+					// looks like we won't receive more jobs now, cleanup
+					delete &temp_exp->getMessage();
 					delete temp_exp;
-					return FailControlMessage::COME_AGAIN;
+					// did a previous loop iteration succeed?
+					if (m_parameters.size() > 0) {
+						break;
+					} else {
+						// nothing to do now, retry later
+						return FailControlMessage::COME_AGAIN;
+					}
 				}
 
 				temp_exp->setWorkloadID(ctrlmsg.workloadid(i)); //Store workload id of experiment data
@@ -262,10 +268,16 @@ bool JobClient::sendResultsToServer()
 		cout << "]";
 
 		// TODO: Log-level?
-		SocketComm::sendMsg(m_sockfd, ctrlmsg);
+		if (!SocketComm::sendMsg(m_sockfd, ctrlmsg)) {
+			close(m_sockfd);
+			return false;
+		}
 
 		for (i = 0; i < ctrlmsg.job_size() ; i++) {
-			SocketComm::sendMsg(m_sockfd, m_results.front()->getMessage());
+			if (!SocketComm::sendMsg(m_sockfd, m_results.front()->getMessage())) {
+				close(m_sockfd);
+				return false;
+			}
 			delete &m_results.front()->getMessage();
 			delete m_results.front();
 			m_results.pop_front();
-- 
GitLab